use NumpyTupleDataset for padding pattern

chainer · Sep 19, 2019 · 624f732
1 parent 4a51521
commit 624f732
Show file tree

Hide file tree

Showing 8 changed files with 21 additions and 172 deletions.
diff --git a/chainer_chemistry/dataset/preprocessors/gin_preprocessor.py b/chainer_chemistry/dataset/preprocessors/gin_preprocessor.py
@@ -51,13 +51,6 @@ def get_input_features(self, mol):
         adj_array = construct_adj_matrix(mol, out_size=self.out_size)
         return atom_array, adj_array
 
-    def create_dataset(self, *args, **kwargs):
-        # args: (atom_array, adj_array, label_array)
-        data_list = [
-            PaddingGraphData(x=x, adj=adj, y=y) for (x, adj, y) in zip(*args)
-        ]
-        return PaddingGraphDataset(data_list)
-
 
 class GINSparsePreprocessor(MolPreprocessor):
     def __init__(self, max_atoms=-1, out_size=-1, add_Hs=False):

diff --git a/chainer_chemistry/dataset/preprocessors/mol_preprocessor.py b/chainer_chemistry/dataset/preprocessors/mol_preprocessor.py
@@ -1,6 +1,7 @@
 from rdkit import Chem
 
 from chainer_chemistry.dataset.preprocessors.base_preprocessor import BasePreprocessor  # NOQA
+from chainer_chemistry.datasets.numpy_tuple_dataset import NumpyTupleDataset  # NOQA
 
 
 class MolPreprocessor(BasePreprocessor):
@@ -94,7 +95,7 @@ def get_input_features(self, mol):
         raise NotImplementedError
 
     def create_dataset(self, *args, **kwargs):
-        raise NotImplementedError
+        return NumpyTupleDataset(*args)
 
     def process(self, filepath):
         # Not used now...

diff --git a/chainer_chemistry/datasets/numpy_tuple_dataset.py b/chainer_chemistry/datasets/numpy_tuple_dataset.py
@@ -4,6 +4,7 @@
 import numpy
 
 from chainer_chemistry.dataset.indexers.numpy_tuple_dataset_feature_indexer import NumpyTupleDatasetFeatureIndexer  # NOQA
+from chainer_chemistry.dataset.converters import concat_mols
 
 
 class NumpyTupleDataset(object):
@@ -48,6 +49,10 @@ def __len__(self):
     def get_datasets(self):
         return self._datasets
 
+    @property
+    def converter(self):
+        return concat_mols
+
     @property
     def features(self):
         """Extract features according to the specified index.

diff --git a/chainer_chemistry/models/gin.py b/chainer_chemistry/models/gin.py
@@ -73,7 +73,7 @@ def __init__(self, out_dim, node_embedding=False, hidden_channels=16,
         self.weight_tying = weight_tying
         self.n_edge_types = n_edge_types
 
-    def __call__(self, batch, is_real_node=None):
+    def __call__(self, atom_array, adj, is_real_node=None):
         """forward propagation
 
         Args:
@@ -91,8 +91,6 @@ def __call__(self, batch, is_real_node=None):
         Returns:
             numpy.ndarray: final molecule representation
         """
-        atom_array, adj = batch.x, batch.adj
-
         if atom_array.dtype == self.xp.int32:
             h = self.embed(atom_array)  # (minibatch, max_num_atoms)
         else:

diff --git a/chainer_chemistry/models/prediction/graph_conv_predictor.py b/chainer_chemistry/models/prediction/graph_conv_predictor.py
@@ -41,8 +41,8 @@ def __init__(
             self.label_scaler = label_scaler
         self.postprocess_fn = postprocess_fn or chainer.functions.identity
 
-    def __call__(self, dataset):
-        x = self.graph_conv(dataset)
+    def __call__(self, *args, **kwargs):
+        x = self.graph_conv(*args, **kwargs)
 
         if self.mlp:
             x = self.mlp(x)

diff --git a/chainer_chemistry/models/prediction/regressor.py b/chainer_chemistry/models/prediction/regressor.py
@@ -5,6 +5,7 @@
 from chainer import cuda, Variable  # NOQA
 from chainer import reporter
 from chainer_chemistry.models.prediction.base import BaseForwardModel
+from chainer_chemistry.dataset.graph_dataset.base_graph_data import BaseGraphData  # NOQA
 
 
 class Regressor(BaseForwardModel):
@@ -102,7 +103,10 @@ def __call__(self, *args, **kwargs):
         """
 
         # --- Separate `args` and `t` ---
-        if isinstance(self.label_key, int):
+        if isinstance(args[0], BaseGraphData):
+            # for graph dataset
+            t = args[0].y
+        elif isinstance(self.label_key, int):
             if not (-len(args) <= self.label_key < len(args)):
                 msg = 'Label key %d is out of bounds' % self.label_key
                 raise ValueError(msg)

diff --git a/chainer_chemistry/models/prediction/regressor2.py b/chainer_chemistry/models/prediction/regressor2.py
diff --git a/examples/qm9/train_qm9.py b/examples/qm9/train_qm9.py
@@ -14,7 +14,7 @@
 from chainer_chemistry import datasets as D
 from chainer_chemistry.datasets import NumpyTupleDataset
 from chainer_chemistry.links.scaler.standard_scaler import StandardScaler
-from chainer_chemistry.models.prediction.regressor2 import Regressor
+from chainer_chemistry.models.prediction.regressor import Regressor
 from chainer_chemistry.models.prediction import set_up_predictor
 from chainer_chemistry.utils import run_train
 
@@ -122,9 +122,11 @@ def main():
     if args.scale == 'standardize':
         print('Fit StandardScaler to the labels.')
         scaler = StandardScaler()
-        y = numpy.array([data.y for data in dataset])
-        print('y', y.shape)
-        scaler.fit(y)
+        if isinstance(dataset, NumpyTupleDataset):
+            scaler.fit(dataset.get_datasets()[-1])
+        else:
+            y = numpy.array([data.y for data in dataset])
+            scaler.fit(y)
     else:
         print('No standard scaling was selected.')
         scaler = None