Commit 6547bf1

Refactor typed code2seq model
1 parent c8a01cc commit 6547bf1

5 files changed: +141 -144 lines changed

code2seq/data/typed_path_context_data_module.py

Lines changed: 14 additions & 6 deletions
@@ -1,13 +1,12 @@
+from os.path import exists, join
 from typing import List, Optional
 
+from commode_utils.vocabulary import build_from_scratch
 from omegaconf import DictConfig
 
-from code2seq.data import (
-    PathContextDataModule,
-    TypedPathContextDataset,
-    BatchedLabeledTypedPathContext,
-    LabeledTypedPathContext,
-)
+from code2seq.data.path_context import LabeledTypedPathContext, BatchedLabeledTypedPathContext
+from code2seq.data.path_context_data_module import PathContextDataModule
+from code2seq.data.typed_path_context_dataset import TypedPathContextDataset
 from code2seq.data.vocabulary import TypedVocabulary
 
 
@@ -24,6 +23,15 @@ def collate_wrapper(batch: List[Optional[LabeledTypedPathContext]]) -> BatchedLa
     def _create_dataset(self, holdout_file: str, random_context: bool) -> TypedPathContextDataset:
         return TypedPathContextDataset(holdout_file, self._config, self._vocabulary, random_context)
 
+    def setup(self, stage: Optional[str] = None):
+        if not exists(join(self._data_dir, TypedVocabulary.vocab_filename)):
+            print("Can't find vocabulary, collect it from train holdout")
+            build_from_scratch(join(self._data_dir, f"{self._train}.c2s"), TypedVocabulary)
+        vocabulary_path = join(self._data_dir, TypedVocabulary.vocab_filename)
+        self._vocabulary = TypedVocabulary(
+            vocabulary_path, self._config.max_labels, self._config.max_tokens, self._config.max_types
+        )
+
     @property
     def vocabulary(self) -> TypedVocabulary:
         if self._vocabulary is None:
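
Note on the new setup override: it builds the typed vocabulary lazily from the train holdout, so no pre-built vocabulary file is required before training. A minimal usage sketch, assuming the module is constructed from a data directory plus the data config (as suggested by self._data_dir and self._config above) and that the dataset already sits in that directory; the constructor signature is an assumption and the field values are illustrative, mirroring the YAML configs further down:

from omegaconf import OmegaConf

from code2seq.data.typed_path_context_data_module import TypedPathContextDataModule

# Illustrative data config; real runs read the "data" section of the YAML files in this commit.
data_config = OmegaConf.create(
    {
        "max_labels": 5332,
        "max_label_parts": 7,
        "max_tokens": 18240,
        "max_token_parts": 5,
        "max_types": None,
        "max_type_parts": 5,
        "path_length": 9,
        "max_context": 200,
        "random_context": True,
        "batch_size": 512,
        "test_batch_size": 768,
        "num_workers": 4,
    }
)

# Assumed constructor: (data directory, data config).
data_module = TypedPathContextDataModule("../data/code2seq/java-small", data_config)
data_module.setup()  # builds TypedVocabulary.vocab_filename from <train>.c2s if it is missing
print(data_module.vocabulary)  # TypedVocabulary with label, token and type mappings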

code2seq/model/code2seq.py

Lines changed: 14 additions & 9 deletions
@@ -1,4 +1,4 @@
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Optional
 
 import torch
 from commode_utils.losses import SequenceCrossEntropyLoss
@@ -43,9 +43,9 @@ def __init__(
         }
         self.__metrics = MetricCollection(metrics)
 
-        self.__encoder = self._get_encoder(model_config)
+        self._encoder = self._get_encoder(model_config)
         decoder_step = LSTMDecoderStep(model_config, len(vocabulary.label_to_id), self.__pad_idx)
-        self.__decoder = Decoder(
+        self._decoder = Decoder(
             decoder_step, len(vocabulary.label_to_id), vocabulary.label_to_id[vocabulary.SOS], teacher_forcing
         )
 
@@ -78,23 +78,28 @@ def forward( # type: ignore
         output_length: int,
         target_sequence: torch.Tensor = None,
     ) -> torch.Tensor:
-        encoded_paths = self.__encoder(from_token, path_nodes, to_token)
-        output_logits = self.__decoder(encoded_paths, contexts_per_label, output_length, target_sequence)
+        encoded_paths = self._encoder(from_token, path_nodes, to_token)
+        output_logits = self._decoder(encoded_paths, contexts_per_label, output_length, target_sequence)
         return output_logits
 
     # ========== Model step ==========
 
-    def _shared_step(self, batch: BatchedLabeledPathContext, step: str) -> Dict:
-        target_sequence = batch.labels if step == "train" else None
-        # [seq length; batch size; vocab size]
-        logits = self(
+    def logits_from_batch(
+        self, batch: BatchedLabeledPathContext, target_sequence: Optional[torch.Tensor]
+    ) -> torch.Tensor:
+        return self(
             batch.from_token,
             batch.path_nodes,
             batch.to_token,
             batch.contexts_per_label,
             batch.labels.shape[0],
            target_sequence,
         )
+
+    def _shared_step(self, batch: BatchedLabeledPathContext, step: str) -> Dict:
+        target_sequence = batch.labels if step == "train" else None
+        # [seq length; batch size; vocab size]
+        logits = self.logits_from_batch(batch, target_sequence)
         loss = self.__loss(logits[1:], batch.labels[1:])
 
         with torch.no_grad():
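
Why the rename from self.__encoder / self.__decoder to self._encoder / self._decoder matters: double-underscore attributes are name-mangled per defining class, so a subclass such as TypedCode2Seq (next file) could not reach them, while single-underscore attributes are inherited normally. A small illustrative sketch with toy classes, not code from this repository:

class Base:
    def __init__(self):
        self.__enc = "name-mangled: actually stored as _Base__enc"
        self._enc = "plain single-underscore attribute"


class Child(Base):
    def use(self):
        print(self._enc)                     # fine: visible to subclasses
        print(hasattr(self, "_Child__enc"))  # False: self.__enc written here would look up this name
        print(hasattr(self, "_Base__enc"))   # True: where Base actually stored its __enc


Child().use()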

code2seq/model/typed_code2seq.py

Lines changed: 33 additions & 0 deletions
@@ -1,5 +1,9 @@
+from typing import Optional
+
+import torch
 from omegaconf import DictConfig
 
+from code2seq.data.path_context import BatchedLabeledTypedPathContext
 from code2seq.data.vocabulary import TypedVocabulary
 from code2seq.model import Code2Seq
 from code2seq.model.modules import TypedPathEncoder, PathEncoder
@@ -26,3 +30,32 @@ def _get_encoder(self, config: DictConfig) -> PathEncoder:
             len(self._vocabulary.type_to_id),
             self._vocabulary.type_to_id[TypedVocabulary.PAD],
         )
+
+    def forward( # type: ignore
+        self,
+        from_type: torch.Tensor,
+        from_token: torch.Tensor,
+        path_nodes: torch.Tensor,
+        to_token: torch.Tensor,
+        to_type: torch.Tensor,
+        contexts_per_label: torch.Tensor,
+        output_length: int,
+        target_sequence: torch.Tensor = None,
+    ) -> torch.Tensor:
+        encoded_paths = self._encoder(from_type, from_token, path_nodes, to_token, to_type)
+        output_logits = self._decoder(encoded_paths, contexts_per_label, output_length, target_sequence)
+        return output_logits
+
+    def logits_from_batch(
+        self, batch: BatchedLabeledTypedPathContext, target_sequence: Optional[torch.Tensor]
+    ) -> torch.Tensor:
+        return self(
+            batch.from_type,
+            batch.from_token,
+            batch.path_nodes,
+            batch.to_token,
+            batch.to_type,
+            batch.contexts_per_label,
+            batch.labels.shape[0],
+            target_sequence,
+        )
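
Together with the logits_from_batch hook introduced in Code2Seq above, this override is all the typed model needs: the shared train/validation/test logic stays in the base class, and only the batch-to-forward mapping changes. A stripped-down sketch of that dispatch pattern with toy classes (not the actual model code):

from typing import Optional

import torch


class BaseSeqModel:
    # Plays the role of Code2Seq._shared_step: generic step logic, unaware of the batch layout.
    def shared_step(self, batch: dict, step: str) -> torch.Tensor:
        target = batch["labels"] if step == "train" else None
        return self.logits_from_batch(batch, target)

    # Plays the role of Code2Seq.logits_from_batch: maps a batch onto forward().
    def logits_from_batch(self, batch: dict, target: Optional[torch.Tensor]) -> torch.Tensor:
        return torch.zeros(3)


class TypedSeqModel(BaseSeqModel):
    # Only the mapping is overridden; shared_step is inherited untouched.
    def logits_from_batch(self, batch: dict, target: Optional[torch.Tensor]) -> torch.Tensor:
        return torch.ones(3)


print(TypedSeqModel().shared_step({"labels": torch.tensor([0, 1])}, "train"))  # tensor([1., 1., 1.])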

config/typed-code2seq-java-small.yaml

Lines changed: 42 additions & 65 deletions
@@ -1,81 +1,58 @@
-hydra:
-  run:
-    dir: .
-  output_subdir: null
-  job_logging: null
-  hydra_logging: null
+data_folder: ../data/code2seq/java-small
 
-name: typed-code2seq
+checkpoint: null
 
 seed: 7
-num_workers: 2
 log_offline: false
-
-resume_from_checkpoint: null
-
-# data keys
-data_folder: /permanent-data
-vocabulary_name: vocabulary.pkl
-train_holdout: train
-val_holdout: val
-test_holdout: test
-
-save_every_epoch: 1
-val_every_epoch: 1
-log_every_epoch: 10
+# Training in notebooks (e.g. Google Colab) may crash with too small value
 progress_bar_refresh_rate: 1
+print_config: true
+
+data:
+  url: https://s3.eu-west-1.amazonaws.com/datasets.ml.labs.aws.intellij.net/java-paths-methods/java-small.tar.gz
+  num_workers: 4
+
+  # Each token appears at least 5 times
+  max_labels: 5332
+  max_label_parts: 7
+  # Each token appears at least 5 times
+  max_tokens: 18240
+  max_token_parts: 5
+  max_typed: null
+  max_type_parts: 5
+  path_length: 9
 
-hyper_parameters:
-  n_epochs: 3000
-  patience: 10
-  batch_size: 512
-  test_batch_size: 512
-  clip_norm: 5
   max_context: 200
   random_context: true
-  shuffle_data: true
-
-  optimizer: "Momentum"
-  nesterov: true
-  learning_rate: 0.01
-  weight_decay: 0
-  decay_gamma: 0.95
 
-dataset:
-  name: java-small-psi
-  target:
-    max_parts: 7
-    is_wrapped: true
-    is_splitted: true
-    vocabulary_size: 11316
-  token:
-    max_parts: 5
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: 73904
-  path:
-    max_parts: 9
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: null
-  type:
-    max_parts: 5
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: null
+  batch_size: 512
+  test_batch_size: 768
 
-encoder:
+model:
+  # Encoder
   embedding_size: 128
-  rnn_size: 128
+  encoder_dropout: 0.25
+  encoder_rnn_size: 128
   use_bi_rnn: true
-  embedding_dropout: 0.25
   rnn_num_layers: 1
-  rnn_dropout: 0.5
 
-decoder:
+  # Decoder
   decoder_size: 320
-  embedding_size: 128
-  num_decoder_layers: 1
+  decoder_num_layers: 1
   rnn_dropout: 0.5
-  teacher_forcing: 1
-  beam_width: 0
+
+optimizer:
+  optimizer: "Momentum"
+  nesterov: true
+  lr: 0.01
+  weight_decay: 0
+  decay_gamma: 0.95
+
+train:
+  n_epochs: 10
+  patience: 10
+  clip_norm: 5
+  teacher_forcing: 1.0
+  val_every_epoch: 1
+  save_every_epoch: 1
+  log_every_n_steps: 10
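
The flat Hydra-era keys are regrouped here into data, model, optimizer and train sections. A hedged sketch of reading such a file with OmegaConf; how the sections are consumed by the training entry point is an assumption, not part of this commit:

from omegaconf import DictConfig, OmegaConf

config: DictConfig = OmegaConf.load("config/typed-code2seq-java-small.yaml")

# Top-level keys stay global; grouped keys are reached through their section.
print(config.data_folder)             # ../data/code2seq/java-small
print(config.data.max_context)        # 200
print(config.model.encoder_rnn_size)  # 128
print(config.optimizer.lr)            # 0.01
print(config.train.n_epochs)          # 10

# Each section can then be handed to the matching component, for example the data module
# sketched earlier: TypedPathContextDataModule(config.data_folder, config.data)  (assumed constructor).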

config/typed-code2seq-java-test.yaml

Lines changed: 38 additions & 64 deletions
@@ -1,81 +1,55 @@
-hydra:
-  run:
-    dir: .
-  output_subdir: null
-  job_logging: null
-  hydra_logging: null
+data_folder: ../data/code2seq/java-test-typed
 
-name: typed-code2seq
+checkpoint: null
 
 seed: 7
-num_workers: 0
 log_offline: true
+# Training in notebooks (e.g. Google Colab) may crash with too small value
+progress_bar_refresh_rate: 1
+print_config: true
 
-resume_from_checkpoint: null
-
-# data keys
-data_folder: /permanent-data
-vocabulary_name: vocabulary.pkl
-train_holdout: train
-val_holdout: val
-test_holdout: test
+data:
+  num_workers: 0
 
-save_every_epoch: 1
-val_every_epoch: 1
-log_every_epoch: 10
-progress_bar_refresh_rate: 1
+  max_labels: null
+  max_label_parts: 7
+  max_tokens: null
+  max_token_parts: 5
+  max_types: null
+  max_type_parts: 5
+  path_length: 9
 
-hyper_parameters:
-  n_epochs: 5
-  patience: 10
-  batch_size: 5
-  test_batch_size: 5
-  clip_norm: 5
   max_context: 200
   random_context: true
-  shuffle_data: true
-
-  optimizer: "Momentum"
-  nesterov: true
-  learning_rate: 0.01
-  weight_decay: 0
-  decay_gamma: 0.95
 
-dataset:
-  name: java-test-psi
-  target:
-    max_parts: 7
-    is_wrapped: true
-    is_splitted: true
-    vocabulary_size: 11316
-  token:
-    max_parts: 5
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: 73904
-  path:
-    max_parts: 9
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: null
-  type:
-    max_parts: 5
-    is_wrapped: false
-    is_splitted: true
-    vocabulary_size: null
+  batch_size: 5
+  test_batch_size: 10
 
-encoder:
+model:
+  # Encoder
   embedding_size: 10
-  rnn_size: 10
+  encoder_dropout: 0.25
+  encoder_rnn_size: 10
   use_bi_rnn: true
-  embedding_dropout: 0.25
   rnn_num_layers: 1
-  rnn_dropout: 0.5
 
-decoder:
+  # Decoder
   decoder_size: 20
-  embedding_size: 10
-  num_decoder_layers: 1
+  decoder_num_layers: 1
   rnn_dropout: 0.5
-  teacher_forcing: 1
-  beam_width: 0
+
+optimizer:
+  optimizer: "Momentum"
+  nesterov: true
+  lr: 0.01
+  weight_decay: 0
+  decay_gamma: 0.95
+
+train:
+  n_epochs: 5
+  patience: 10
+  clip_norm: 10
+  teacher_forcing: 1.0
+  val_every_epoch: 1
+  save_every_epoch: 1
+  log_every_n_steps: 10
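
The java-test variant mirrors the java-small config with tiny sizes so the pipeline can be smoke-tested quickly. Where only a handful of values differ, a similar effect can be obtained by merging overrides onto the base file with OmegaConf; this is a sketch, not something the commit itself does:

from omegaconf import OmegaConf

base = OmegaConf.load("config/typed-code2seq-java-small.yaml")
overrides = OmegaConf.create(
    {
        "data_folder": "../data/code2seq/java-test-typed",
        "data": {"batch_size": 5, "test_batch_size": 10, "num_workers": 0},
        "model": {"embedding_size": 10, "encoder_rnn_size": 10, "decoder_size": 20},
        "train": {"n_epochs": 5, "clip_norm": 10},
    }
)
smoke_config = OmegaConf.merge(base, overrides)
print(smoke_config.model.encoder_rnn_size)  # 10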
