Commit a8697c6

Refactor code2class model
1 parent 850ff0b commit a8697c6

9 files changed: 108 additions & 94 deletions

code2seq/code2class_wrapper.py

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ def train_code2class(config: DictConfig):
     print_config(config, fields=["model", "data", "train", "optimizer"])
 
     # Load data module
-    data_module = PathContextDataModule(config.data_folder, config.data)
+    data_module = PathContextDataModule(config.data_folder, config.data, is_class=True)
     data_module.prepare_data()
     data_module.setup()
 

code2seq/data/path_context_data_module.py

Lines changed: 3 additions & 2 deletions

@@ -20,11 +20,12 @@ class PathContextDataModule(LightningDataModule):
 
     _vocabulary: Optional[Vocabulary] = None
 
-    def __init__(self, data_dir: str, config: DictConfig):
+    def __init__(self, data_dir: str, config: DictConfig, is_class: bool = False):
         super().__init__()
         self._config = config
         self._data_dir = data_dir
         self._name = basename(data_dir)
+        self._is_class = is_class
 
     @property
     def vocabulary(self) -> Vocabulary:
@@ -45,7 +46,7 @@ def setup(self, stage: Optional[str] = None):
             print("Can't find vocabulary, collect it from train holdout")
             build_from_scratch(join(self._data_dir, f"{self._train}.c2s"), Vocabulary)
         vocabulary_path = join(self._data_dir, Vocabulary.vocab_filename)
-        self._vocabulary = Vocabulary(vocabulary_path, self._config.max_labels, self._config.max_tokens)
+        self._vocabulary = Vocabulary(vocabulary_path, self._config.max_labels, self._config.max_tokens, self._is_class)
 
     @staticmethod
     def collate_wrapper(batch: List[Optional[LabeledPathContext]]) -> BatchedLabeledPathContext:
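
With the new flag, the data module can be built in classification mode. A minimal usage sketch (not part of the commit), assuming the repository is importable and the config/code2class-poj104.yaml file added below:

from omegaconf import OmegaConf
from code2seq.data.path_context_data_module import PathContextDataModule

config = OmegaConf.load("config/code2class-poj104.yaml")
data_module = PathContextDataModule(config.data_folder, config.data, is_class=True)
data_module.prepare_data()  # presumably downloads the dataset from config.data.url if it is missing locally
data_module.setup()         # builds or loads the vocabulary, forwarding is_class to Vocabulary
num_classes = len(data_module.vocabulary.label_to_id)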

code2seq/data/path_context_dataset.py

Lines changed: 30 additions & 19 deletions

@@ -23,8 +23,6 @@ def __init__(self, data_file: str, config: DictConfig, vocabulary: Vocabulary, r
         self._vocab = vocabulary
         self._random_context = random_context
 
-        self._label_unk = vocabulary.label_to_id[vocabulary.UNK]
-
         self._line_offsets = get_lines_offsets(data_file)
         self._n_samples = len(self._line_offsets)
 
@@ -49,7 +47,10 @@ def __getitem__(self, index) -> Optional[LabeledPathContext]:
         raw_path_contexts = raw_path_contexts[:n_contexts]
 
         # Tokenize label
-        label = self._tokenize_label(raw_label)
+        if self._config.max_label_parts == 1:
+            label = self.tokenize_class(raw_label, self._vocab.label_to_id)
+        else:
+            label = self.tokenize_label(raw_label, self._vocab.label_to_id, self._config.max_label_parts)
 
         # Tokenize paths
         try:
@@ -61,30 +62,40 @@ def __getitem__(self, index) -> Optional[LabeledPathContext]:
 
         return LabeledPathContext(label, paths)
 
-    def _tokenize_label(self, raw_label: str) -> torch.Tensor:
-        label = torch.full((self._config.max_label_parts + 1,), self._vocab.label_to_id[self._vocab.PAD])
-        label[0] = self._vocab.label_to_id[self._vocab.SOS]
-        sublabels = raw_label.split(self._separator)[: self._config.max_label_parts]
-        label[1 : len(sublabels) + 1] = torch.tensor(
-            [self._vocab.label_to_id.get(sl, self._label_unk) for sl in sublabels]
-        )
-        if len(sublabels) < self._config.max_label_parts:
-            label[len(sublabels) + 1] = self._vocab.label_to_id[self._vocab.EOS]
+    @staticmethod
+    def tokenize_class(raw_class: str, vocab: Dict[str, int]) -> torch.Tensor:
+        return torch.tensor([vocab[raw_class]], dtype=torch.long)
+
+    @staticmethod
+    def tokenize_label(raw_label: str, vocab: Dict[str, int], max_parts: Optional[int]) -> torch.Tensor:
+        sublabels = raw_label.split(PathContextDataset._separator)
+        max_parts = max_parts or len(sublabels)
+        label_unk = vocab[Vocabulary.UNK]
+
+        label = torch.full((max_parts + 1,), vocab[Vocabulary.PAD], dtype=torch.long)
+        label[0] = vocab[Vocabulary.SOS]
+        sub_tokens_ids = [vocab.get(st, label_unk) for st in sublabels[:max_parts]]
+        label[1 : len(sub_tokens_ids) + 1] = torch.tensor(sub_tokens_ids)
+
+        if len(sublabels) < max_parts:
+            label[len(sublabels) + 1] = vocab[Vocabulary.EOS]
+
         return label
 
-    def _tokenize_token(self, token: str, vocab: Dict[str, int], max_parts: Optional[int]) -> torch.Tensor:
-        sub_tokens = token.split(self._separator)
+    @staticmethod
+    def tokenize_token(token: str, vocab: Dict[str, int], max_parts: Optional[int]) -> torch.Tensor:
+        sub_tokens = token.split(PathContextDataset._separator)
         max_parts = max_parts or len(sub_tokens)
-        token_unk = vocab[self._vocab.UNK]
+        token_unk = vocab[Vocabulary.UNK]
 
-        result = torch.full((max_parts,), vocab[self._vocab.PAD], dtype=torch.long)
+        result = torch.full((max_parts,), vocab[Vocabulary.PAD], dtype=torch.long)
         sub_tokens_ids = [vocab.get(st, token_unk) for st in sub_tokens[:max_parts]]
         result[: len(sub_tokens_ids)] = torch.tensor(sub_tokens_ids)
         return result
 
     def _get_path(self, raw_path: List[str]) -> Path:
         return Path(
-            from_token=self._tokenize_token(raw_path[0], self._vocab.token_to_id, self._config.max_token_parts),
-            path_node=self._tokenize_token(raw_path[1], self._vocab.node_to_id, self._config.path_length),
-            to_token=self._tokenize_token(raw_path[2], self._vocab.token_to_id, self._config.max_token_parts),
+            from_token=self.tokenize_token(raw_path[0], self._vocab.token_to_id, self._config.max_token_parts),
+            path_node=self.tokenize_token(raw_path[1], self._vocab.node_to_id, self._config.path_length),
+            to_token=self.tokenize_token(raw_path[2], self._vocab.token_to_id, self._config.max_token_parts),
         )
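
The two label paths behave differently: tokenize_class maps the whole raw label to a single class id, while tokenize_label splits it on the sub-token separator ("|" in the .c2s format) and frames the parts with SOS/EOS in a PAD-filled tensor. A toy illustration with an assumed label vocabulary:

from code2seq.data.path_context_dataset import PathContextDataset
from code2seq.data.vocabulary import Vocabulary

# Assumed toy mapping; real vocabularies are built by Vocabulary and contain these special tokens.
label_to_id = {Vocabulary.PAD: 0, Vocabulary.SOS: 1, Vocabulary.EOS: 2, Vocabulary.UNK: 3, "get": 4, "name": 5}

PathContextDataset.tokenize_class("get", label_to_id)          # tensor([4]): one class id, no framing
PathContextDataset.tokenize_label("get|name", label_to_id, 3)  # tensor([1, 4, 5, 2]): [SOS, get, name, EOS]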

code2seq/data/typed_path_context_dataset.py

Lines changed: 5 additions & 5 deletions

@@ -14,9 +14,9 @@ def __init__(self, data_file: str, config: DictConfig, vocabulary: TypedVocabula
 
     def _get_path(self, raw_path: List[str]) -> TypedPath:
         return TypedPath(
-            from_type=self._tokenize_token(raw_path[0], self._vocab.type_to_id, self._config.max_type_parts),
-            from_token=self._tokenize_token(raw_path[1], self._vocab.token_to_id, self._config.max_token_parts),
-            path_node=self._tokenize_token(raw_path[2], self._vocab.node_to_id, self._config.path_length),
-            to_token=self._tokenize_token(raw_path[3], self._vocab.token_to_id, self._config.max_token_parts),
-            to_type=self._tokenize_token(raw_path[4], self._vocab.type_to_id, self._config.max_type_parts),
+            from_type=self.tokenize_token(raw_path[0], self._vocab.type_to_id, self._config.max_type_parts),
+            from_token=self.tokenize_token(raw_path[1], self._vocab.token_to_id, self._config.max_token_parts),
+            path_node=self.tokenize_token(raw_path[2], self._vocab.node_to_id, self._config.path_length),
+            to_token=self.tokenize_token(raw_path[3], self._vocab.token_to_id, self._config.max_token_parts),
+            to_type=self.tokenize_token(raw_path[4], self._vocab.type_to_id, self._config.max_type_parts),
         )

code2seq/data/vocabulary.py

Lines changed: 13 additions & 0 deletions

@@ -8,6 +8,19 @@
 
 
 class Vocabulary(BaseVocabulary):
+    def __init__(
+        self,
+        vocabulary_file: str,
+        max_labels: Optional[int] = None,
+        max_tokens: Optional[int] = None,
+        is_class: bool = False,
+    ):
+        super().__init__(vocabulary_file, max_labels, max_tokens)
+        if is_class:
+            self._label_to_id = {
+                token[0]: i for i, token in enumerate(self._counters[self.LABEL].most_common(max_labels))
+            }
+
     @staticmethod
     def _process_raw_sample(raw_sample: str, counters: Dict[str, CounterType[str]], context_seq: List[str]):
         label, *path_contexts = raw_sample.split(" ")
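
With is_class=True the label vocabulary is rebuilt as a plain class-name-to-id mapping ordered by frequency, so it holds no SOS/EOS/PAD/UNK entries and its size equals the number of classes. A hedged construction sketch; the vocabulary path is a placeholder:

from code2seq.data.vocabulary import Vocabulary

vocab = Vocabulary("data/poj-104/vocabulary.pkl", max_labels=None, max_tokens=190000, is_class=True)
num_classes = len(vocab.label_to_id)  # one contiguous id per class label seen in the data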

code2seq/model/code2class.py

Lines changed: 7 additions & 6 deletions

@@ -28,15 +28,15 @@ def __init__(self, model_config: DictConfig, optimizer_config: DictConfig, vocab
             vocabulary.node_to_id[Vocabulary.PAD],
         )
 
-        self._classifier = Classifier(model_config, self._num_classes)
+        self._classifier = Classifier(model_config, len(vocabulary.label_to_id))
 
         metrics: Dict[str, Metric] = {
            f"{holdout}_acc": Accuracy(num_classes=len(vocabulary.label_to_id)) for holdout in ["train", "val", "test"]
         }
         self.__metrics = MetricCollection(metrics)
 
     def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
-        return configure_optimizers_alon(self._config.hyper_parameters, self.parameters())
+        return configure_optimizers_alon(self._optim_config, self.parameters())
 
     def forward( # type: ignore
         self,
@@ -45,7 +45,7 @@ def forward( # type: ignore
         to_token: torch.Tensor,
         contexts_per_label: torch.Tensor,
     ) -> torch.Tensor:
-        encoded_paths = self.__encoder(from_token, path_nodes, to_token)
+        encoded_paths = self._encoder(from_token, path_nodes, to_token)
         output_logits = self._classifier(encoded_paths, contexts_per_label)
         return output_logits
 
@@ -54,11 +54,12 @@ def forward( # type: ignore
     def _shared_step(self, batch: BatchedLabeledPathContext, step: str) -> Dict:
         # [batch size; num_classes]
         logits = self(batch.from_token, batch.path_nodes, batch.to_token, batch.contexts_per_label)
-        loss = torch.nn.functional.cross_entropy(logits, batch.labels.squeeze(0))
+        labels = batch.labels.squeeze(0)
+        loss = torch.nn.functional.cross_entropy(logits, labels)
 
         with torch.no_grad():
             predictions = logits.argmax(-1)
-            accuracy = self.__metrics[f"{step}_acc"](predictions, batch.labels)
+            accuracy = self.__metrics[f"{step}_acc"](predictions, labels)
 
         return {f"{step}/loss": loss, f"{step}/accuracy": accuracy}
 
@@ -78,7 +79,7 @@ def test_step(self, batch: BatchedLabeledPathContext, batch_idx: int) -> Dict:
 
     def _shared_epoch_end(self, outputs: List[Dict], step: str):
         with torch.no_grad():
-            mean_loss = torch.stack([out["loss"] for out in outputs]).mean()
+            mean_loss = torch.stack([out[f"{step}/loss"] for out in outputs]).mean()
             accuracy = self.__metrics[f"{step}_acc"].compute()
             log = {f"{step}/loss": mean_loss, f"{step}/accuracy": accuracy}
             self.log_dict(log, on_step=False, on_epoch=True)
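
Since tokenize_class yields a single id per sample, batch.labels appears to be stacked with one row per label part ([1; batch size] here), which is why _shared_step squeezes dim 0 before handing the [batch size] targets to cross entropy and the Accuracy metric alongside [batch size; num_classes] logits. A shape sketch with assumed toy sizes:

import torch
import torch.nn.functional as F

batch_size, num_classes = 4, 104                              # toy sizes; POJ-104 has 104 classes
logits = torch.randn(batch_size, num_classes)                 # [batch size; num_classes]
batched_labels = torch.randint(num_classes, (1, batch_size))  # stacked class ids, [1; batch size]

labels = batched_labels.squeeze(0)                            # [batch size]
loss = F.cross_entropy(logits, labels)
predictions = logits.argmax(-1)                               # [batch size]
accuracy = (predictions == labels).float().mean()             # what the torchmetrics Accuracy tracks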

code2seq/model/modules/path_encoder.py

Lines changed: 8 additions & 2 deletions

@@ -32,8 +32,14 @@ def __init__(
 
         concat_size = self._calculate_concat_size(config.embedding_size, config.encoder_rnn_size, self.num_directions)
         self.embedding_dropout = nn.Dropout(config.encoder_dropout)
-        self.linear = nn.Linear(concat_size, config.decoder_size, bias=False)
-        self.norm = nn.LayerNorm(config.decoder_size)
+        if "decoder_size" in config:
+            out_size = config["decoder_size"]
+        elif "classifier_size" in config:
+            out_size = config["classifier_size"]
+        else:
+            raise ValueError("Specify out size of encoder")
+        self.linear = nn.Linear(concat_size, out_size, bias=False)
+        self.norm = nn.LayerNorm(out_size)
 
     @staticmethod
     def _calculate_concat_size(embedding_size: int, rnn_size: int, num_directions: int) -> int:
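
The encoder projection width now follows whichever key the model config defines: decoder_size for the code2seq decoder or classifier_size for the code2class classifier (128 in the config below); a config with neither raises a ValueError. A small sketch of the selection with assumed OmegaConf configs:

from omegaconf import OmegaConf

for cfg in (OmegaConf.create({"decoder_size": 320}), OmegaConf.create({"classifier_size": 128})):
    if "decoder_size" in cfg:
        out_size = cfg["decoder_size"]
    elif "classifier_size" in cfg:
        out_size = cfg["classifier_size"]
    else:
        raise ValueError("Specify out size of encoder")
    print(out_size)  # 320, then 128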

config/code2class-poj104.yaml

Lines changed: 39 additions & 57 deletions

@@ -1,72 +1,54 @@
-hydra:
-  run:
-    dir: .
-  output_subdir: null
-  job_logging: null
-  hydra_logging: null
+data_folder: ../data/poj-104/poj-104-code2seq
 
-name: code2class
+checkpoint: null
 
 seed: 7
-num_workers: 2
-log_offline: false
+log_offline: true
+# Training in notebooks (e.g. Google Colab) may crash with too small value
+progress_bar_refresh_rate: 1
+print_config: true
 
-# data keys
-data_folder: /permanent-data
-vocabulary_name: vocabulary.pkl
-train_holdout: train
-val_holdout: val
-test_holdout: test
+data:
+  url: https://s3.eu-west-1.amazonaws.com/datasets.ml.labs.aws.intellij.net/poj-104/poj-104-code2seq.tar.gz
+  num_workers: 0
 
-save_every_epoch: 1
-val_every_epoch: 1
-log_every_epoch: 10
-progress_bar_refresh_rate: 1
+  max_labels: null
+  max_label_parts: 1
+  max_tokens: 190000
+  max_token_parts: 5
+  path_length: 9
 
-hyper_parameters:
-  n_epochs: 3000
-  patience: 10
-  batch_size: 512
-  test_batch_size: 512
-  clip_norm: 5
   max_context: 200
   random_context: true
-  shuffle_data: true
-
-  optimizer: "Momentum"
-  nesterov: true
-  learning_rate: 0.01
-  weight_decay: 0
-  decay_gamma: 0.95
 
-dataset:
-  name: poj_104
-  target:
-    max_parts: 1
-    is_wrapped: false
-    is_splitted: false
-    vocabulary_size: 27000
-  token:
-    max_parts: 5
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: 190000
-  path:
-    max_parts: 9
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: null
+  batch_size: 512
+  test_batch_size: 768
 
-encoder:
+model:
+  # Encoder
   embedding_size: 128
-  rnn_size: 128
+  encoder_dropout: 0.25
+  encoder_rnn_size: 128
   use_bi_rnn: true
-  embedding_dropout: 0.25
   rnn_num_layers: 1
-  rnn_dropout: 0.5
 
-classifier:
-  n_hidden_layers: 2
-  hidden_size: 128
-  classifier_input_size: 256
+  # Classifier
+  classifier_layers: 2
+  classifier_size: 128
   activation: relu
+
+optimizer:
+  optimizer: "Momentum"
+  nesterov: true
+  lr: 0.01
+  weight_decay: 0
+  decay_gamma: 0.95
+
+train:
+  n_epochs: 10
+  patience: 10
+  clip_norm: 5
+  teacher_forcing: 1.0
+  val_every_epoch: 1
+  save_every_epoch: 1
+  log_every_n_steps: 10
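
A hedged sketch of consuming the new flat config; the exact CLI entry point may differ, but train_code2class from code2class_wrapper.py above accepts the loaded DictConfig with its data, model, optimizer, and train groups:

from omegaconf import OmegaConf
from code2seq.code2class_wrapper import train_code2class

config = OmegaConf.load("config/code2class-poj104.yaml")
train_code2class(config)  # prints the config groups, builds the data module with is_class=True, and trains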

requirements.txt

Lines changed: 2 additions & 2 deletions

@@ -7,5 +7,5 @@ torchmetrics==0.5.0
 
 tqdm==4.62.1
 wandb==0.12.0
-omegaconf==2.1.0
-commode-utils==0.3.7
+omegaconf==2.1.1
+commode-utils==0.3.8
