Commit 7be34a1
backup
1 parent 57f309e commit 7be34a1

9 files changed: +194 -122 lines changed

scripts/preprocess.py (+2 -2)

@@ -4,11 +4,11 @@
 
 
 samples = []
-for i, file in enumerate(parse_dir('../stdlib/json')):
+for i, file in enumerate(parse_dir('../stdlib', version='simplified')):
     anonymous = enum_references(file)
     scope, holes = tokenize_file(anonymous)
     if len(holes) != 0:
         samples.append((scope, holes))
 
-with open('../data/tokenized.p', 'wb') as f:
+with open('../data/tokenized_sim.p', 'wb') as f:
     pickle.dump(samples, f)
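
Note: the script now walks every .json file under ../stdlib (rather than ../stdlib/json), requests the 'simplified' rendering of each type from the reader, and writes to a separate pickle so both dataset variants can coexist. A minimal sketch of producing the two variants side by side; the import path for enum_references and tokenize_file is a guess, since this diff does not show where they live:

import pickle

from src.Name.data.reader import parse_dir
from src.Name.data.tokenization import enum_references, tokenize_file   # hypothetical import path


def build_dataset(version: str, out_path: str) -> None:
    samples = []
    for file in parse_dir('../stdlib', version=version):
        scope, holes = tokenize_file(enum_references(file))
        if holes:                                   # keep only files with at least one hole
            samples.append((scope, holes))
    with open(out_path, 'wb') as f:
        pickle.dump(samples, f)


build_dataset('original', '../data/tokenized.p')
build_dataset('simplified', '../data/tokenized_sim.p')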

scripts/train.py (+25 -25)

@@ -1,12 +1,13 @@
 import pickle
 
-import torch
+import sys
+sys.path.extend(['../'])
 
+import torch
 from src.Name.neural.batching import make_collator, Sampler
 from src.Name.neural.training import TrainWrapper
 from src.Name.neural.utils import make_schedule, binary_stats, macro_binary_stats
 from torch import device as _device
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import LambdaLR
 from math import ceil

@@ -15,11 +16,11 @@
     tokenized = pickle.load(f)
 
 
+
 dim = 128
 num_epochs = 100
-encoder_layers = 3
-num_iters = 4
-batch_size = 4
+num_layers = 8
+batch_size = 2
 backprop_every = 1
 num_holes = 4
 max_scope_size = 150

@@ -38,13 +39,10 @@
 
 epoch_size = train_sampler.itersize(batch_size * backprop_every, num_holes)
 
-model = TrainWrapper(num_layers=encoder_layers, num_iters=num_iters, dim=dim,
-                     max_scope_size=max_scope_size, max_db_index=max_db_index).to(device)
+model = TrainWrapper(num_layers=num_layers, dim=dim, max_db_index=max_db_index).to(device)
 
-lemma_loss_fn = BCEWithLogitsLoss(reduction='sum', pos_weight=torch.tensor(50., device=device))
-lm_loss_fn = CrossEntropyLoss(reduction='sum')
 
-opt = AdamW(model.parameters(), lr=1)
+opt = AdamW(model.parameters(), lr=1, weight_decay=1e-02)
 scheduler = LambdaLR(opt,
                      make_schedule(warmup_steps=3 * epoch_size,
                                    total_steps=100 * epoch_size,

@@ -61,22 +59,24 @@
     train_epoch = train_sampler.iter(batch_size, num_holes)
     model.train()
 
-    for batch_id, batch in enumerate(train_epoch):
-        lemma_preds, gold_labels, (lm_hits, lm_total), lemma_loss, lm_loss = model.compute_losses(collator(batch, 0.1))
-        loss = lemma_loss + lm_loss
-        loss.backward()
+    with torch.autograd.set_detect_anomaly(True):
+        for batch_id, batch in enumerate(train_epoch):
+            collated = collator(batch, 0.1, 0.5)
+            lemma_preds, gold_labels, (lm_hits, lm_total), lemma_loss, lm_loss = model.compute_losses(collated)
+            loss = lemma_loss + lm_loss
+            loss.backward()
 
-        if (batch_id + 1) % backprop_every == 0:
-            opt.step()
-            scheduler.step()
-            opt.zero_grad(set_to_none=True)
+            if (batch_id + 1) % backprop_every == 0:
+                opt.step()
+                scheduler.step()
+                opt.zero_grad(set_to_none=True)
 
-        epoch_lemma_loss += lemma_loss.item()
-        epoch_lm_loss += lm_loss.item()
-        epoch_lemma_preds += lemma_preds
-        epoch_lemma_correct += gold_labels
-        epoch_lm_hits += lm_hits
-        epoch_lm_total += lm_total
+            epoch_lemma_loss += lemma_loss.item()
+            epoch_lm_loss += lm_loss.item()
+            epoch_lemma_preds += lemma_preds
+            epoch_lemma_correct += gold_labels
+            epoch_lm_hits += lm_hits
+            epoch_lm_total += lm_total
 
     print('=' * 64)
     print(f'Epoch {epoch_id}')

@@ -100,7 +100,7 @@
 
     with torch.no_grad():
        for file in dev_sampler.filtered:
-            lemma_preds, gold_labels, _, lemma_loss, _ = model.compute_losses(collator([file], 0.0))
+            lemma_preds, gold_labels, _, lemma_loss, _ = model.compute_losses(collator([file], 0.0, 1))
            epoch_dev_loss += lemma_loss.item()
 
            epoch_lemma_preds += lemma_preds
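
Note: the training loop is now wrapped in torch.autograd.set_detect_anomaly(True), and the collator takes a third argument (the negative-sampling rate: 0.5 during training, 1 at evaluation). A minimal, self-contained sketch of the anomaly-detection wrapper (standard PyTorch API; the linear model and random data are stand-ins, not the project's TrainWrapper):

import torch

model = torch.nn.Linear(4, 1)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

# Inside the context manager, autograd raises as soon as a NaN/Inf appears during
# backward and reports the forward operation that produced it. Useful for debugging,
# but it slows training, so it is normally removed once the issue is found.
with torch.autograd.set_detect_anomaly(True):
    x = torch.randn(8, 4)
    loss = model(x).pow(2).sum()
    loss.backward()
    opt.step()
    opt.zero_grad(set_to_none=True)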

src/Name/data/reader.py (+28 -20)

@@ -154,58 +154,66 @@ def substitute(self, names: dict[Name, Other]) -> LevelType[Other]:
         return self
 
 
-def parse_dir(directory: str, must_contain: str | None = None) -> Iterator[File[str]]:
+def parse_dir(directory: str, must_contain: str | None = None, version: str = 'original') -> Iterator[File[str]]:
     for file in listdir(directory):
-        if must_contain is None or must_contain in file:
-            yield parse_file(path.join(directory, file))
+        if (must_contain is None or must_contain in file) and file.endswith('.json'):
+            print(f'Parsing {file}')
+            yield parse_file(path.join(directory, file), version)
 
 
-def parse_file(filepath: str) -> File[str]:
+def parse_file(filepath: str, version: str) -> File[str]:
     with open(filepath, 'r') as f:
-        return parse_data(load(f))
+        return parse_data(load(f), version)
 
 
-def parse_data(data_json: dict) -> File[str]:
+def parse_data(data_json: dict, version: str) -> File[str]:
     return File(name=data_json['scope']['name'],
-                scope=[parse_declaration(d) for d in data_json['scope']['item']],
-                holes=[parse_holes(s) for s in data_json['samples']])
+                scope=[parse_declaration(d, version) for d in data_json['scope']['item']],
+                holes=[parse_holes(s, version) for s in data_json['samples']])
 
 
-def parse_holes(hole_json: dict) -> Hole[str]:
+def parse_holes(hole_json: dict, version: str) -> Hole[str]:
     context_json = hole_json['ctx']['thing']
     goal_type_json = hole_json['goal']
     goal_term_json = hole_json['term']
     goal_names_used = hole_json['namesUsed']
-    context = [Declaration(name=c['name'], type=parse_type(c['item'])) for c in context_json]
+    context = [Declaration(name=c['name'], type=parse_type(c['item'], version)) for c in context_json]
 
     return Hole(
         goal_type=reduce(lambda result, argument: PiType(argument, result),
                          reversed(context),
-                         parse_type(goal_type_json['thing'])),  # type: ignore
-        goal_term=parse_type(goal_term_json['thing']),
+                         parse_type(goal_type_json['thing'], version)),  # type: ignore
+        goal_term=parse_type(goal_term_json['thing']['original'], version),
         names_used=[Reference(name) for name in goal_names_used])
 
 
-def parse_declaration(dec_json: dict) -> Declaration[str]:
-    return Declaration(name=dec_json['name'], type=parse_type(dec_json['item']['thing']))
+def parse_declaration(dec_json: dict, version: str) -> Declaration[str]:
+    return Declaration(name=dec_json['name'], type=parse_type(dec_json['item']['thing'], version))
 
 
-def parse_type(type_json: dict) -> AgdaType[str]:
+def parse_type(type_json: dict, which: str) -> AgdaType[str]:
+    def go(_type_json: dict) -> AgdaType[str]: return parse_type(_type_json, which)
+
+    if which in type_json.keys():
+        if (tmp := type_json[which]) is not None:
+            type_json = tmp
+        else:
+            type_json = type_json['original']
+
     match type_json['tag']:
         case 'Pi':
             left, right = type_json['contents']
             name, type_json = left['name'], left['item']
-            return PiType(argument=(Declaration(name=name, type=parse_type(type_json))
-                                    if name != '_' else parse_type(type_json)),
-                          result=parse_type(right))
+            return PiType(argument=(Declaration(name=name, type=go(type_json)) if name != '_' else go(type_json)),
+                          result=go(right))
         case 'App':
             head, args = type_json['contents']
             head_type = parse_head(head)
-            arg_types = [parse_type(arg) for arg in args]
+            arg_types = [go(arg) for arg in args]
             return reduce(AppType, arg_types, head_type)  # type: ignore
         case 'Lam':
            contents = type_json['contents']
-            return LamType(abstraction=contents['name'], body=parse_type(contents['item']))
+            return LamType(abstraction=contents['name'], body=go(contents['item']))
         case 'Sort':
             return SortType(type_json['contents'].replace(' ', '_'))
         case 'Lit':
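
Note: the version parameter now threads from parse_dir all the way down to parse_type, which expects type nodes wrapped in per-version renderings and falls back to 'original' when the requested one is present but None. An illustrative toy node; the exact wrapper shape ('original'/'simplified' keys around the {'tag': ..., 'contents': ...} payload) is inferred from this diff, not from the actual JSON dumps:

# Hypothetical node shape with per-version renderings of the same type.
node = {
    'original':   {'tag': 'Sort', 'contents': 'Set 1'},
    'simplified': None,   # no simplified rendering available for this node
}


def select_version(type_json: dict, which: str) -> dict:
    # Mirrors the fallback added to parse_type: prefer the requested variant,
    # use 'original' if the variant is present but None, pass through otherwise.
    if which in type_json:
        return type_json[which] if type_json[which] is not None else type_json['original']
    return type_json   # already an unwrapped node


assert select_version(node, 'simplified') == {'tag': 'Sort', 'contents': 'Set 1'}
assert select_version({'tag': 'Lit', 'contents': '42'}, 'simplified') == {'tag': 'Lit', 'contents': '42'}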

src/Name/data/tokenization.py (+2 -0)

@@ -1,3 +1,5 @@
+import pdb
+
 from .reader import File
 from .internal import AgdaTree, DontCare, DeBruijn, Reference, OpNames, agda_to_tree
 from .tree import enumerate_nodes, flatten

src/Name/neural/batching.py (+36 -11)

@@ -1,5 +1,3 @@
-import pdb
-
 import torch
 from torch import Tensor, device
 from ..data.tokenization import TokenizedSample, TokenizedFile, TokenizedTree

@@ -8,12 +6,37 @@
 from typing import Iterator, Callable
 from itertools import groupby
 from torch.nn.functional import pad as _pad
+from random import random
+from itertools import takewhile
+
+NineTensors = tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]
+
+
+def filter_unreferenced(file: TokenizedFile, negative_sampling: float) -> TokenizedFile:
+    scope, goals = file
 
-EightTensors = tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]
+    def refers_to(tree: TokenizedTree, excluding: set[int]) -> set[int]:
+        direct = {tv for tt, tv, _, _ in tree if tt == 3 and tv not in excluding}
+        excluding |= direct
+        return {indirect
+                for reference in direct
+                for indirect in refers_to(scope[reference], excluding)} | direct
+
+    def rename(tree: TokenizedTree, using: dict[int, int]) -> TokenizedTree:
+        return [(tt, using[tv] if tt == 3 else tv, np, using[tp]) for tt, tv, np, tp in tree]
+
+    all_references = set.union(*[refers_to(tree, set()) for tree in [*scope, *[goal_type for goal_type, _ in goals]]])
+    all_references |= {ref for _, names_used in goals for ref in names_used}
+    removed = [idx for idx in range(len(scope)) if idx not in all_references or random() > negative_sampling]
+    renames = {kept: kept - sum(map(lambda _: 1, takewhile(lambda r: r < kept, removed))) for kept in range(len(scope))}
+    renames[-1] = -1
+    return ([rename(tree, renames) for idx, tree in enumerate(scope) if idx not in removed],
+            [(rename(goal_type, renames), [renames[ref] for ref in names_used]) for goal_type, names_used in goals])
 
 
 def make_collator(cast_to: device = device('cpu'),
-                  pad_value: int = -1,) -> Callable[[list[TokenizedSample], float], EightTensors]:
+                  pad_value: int = -1,
+                  goal_id: int = -1) -> Callable[[list[TokenizedSample], float, float], NineTensors]:
     def _longt(xs) -> Tensor:
         return torch.tensor(xs, device=cast_to, dtype=torch.long)

@@ -26,7 +49,9 @@ def pad_tree(tree: TokenizedTree, to: int) -> Tensor:
     def pad_seq(file: list[Tensor]) -> Tensor:
         return pad_sequence(file, padding_value=pad_value)
 
-    def collator(samples: list[TokenizedSample], lm_chance: float) -> EightTensors:
+    def collator(samples: list[TokenizedSample], lm_chance: float, negative_sampling: float) -> NineTensors:
+        # samples = [filter_unreferenced(sample, negative_sampling) for sample in samples]
+
         num_scopes = len(samples)
         scope_sizes, goal_sizes = zip(*[(len(scope), len(holes)) for scope, holes in samples])
         most_trees = max(x+y for x, y in zip(scope_sizes, goal_sizes))

@@ -58,15 +83,15 @@ def collator(samples: list[TokenizedSample], lm_chance: float) -> EightTensors:
         dense_batch.masked_scatter_(lm_mask.unsqueeze(-1), masked_refs)
         batch_pointers = torch.arange(0, num_scopes, device=cast_to).view(-1, 1, 1) * torch.ones_like(token_padding_mask)
         batch_pointers = batch_pointers[lm_mask]
-
-        # is_goal = (dense_batch[:, :, :, -1] == goal_id).all(dim=-1) & tree_padding_mask
-        # scope_attention_mask = (~is_goal & tree_padding_mask).unsqueeze(-2).expand(-1, most_trees, -1)
-        # diag_mask = torch.eye(most_trees, dtype=torch.bool, device=cast_to).unsqueeze(0).expand(num_scopes, -1, -1)
-        # tree_attention_mask = scope_attention_mask | (diag_mask & tree_padding_mask.unsqueeze(-1))
+
+        is_goal = (dense_batch[:, :, :, -1] == goal_id).all(dim=-1) & tree_padding_mask
+        scope_attention_mask = (~is_goal & tree_padding_mask).unsqueeze(-2).expand(-1, most_trees, -1)
+        diag_mask = torch.eye(most_trees, dtype=torch.bool, device=cast_to).unsqueeze(0).expand(num_scopes, -1, -1)
+        tree_attention_mask = scope_attention_mask | (diag_mask & tree_padding_mask.unsqueeze(-1))
         return (dense_batch.permute(-1, 0, 1, 2),
                 token_attention_mask,
                 tree_padding_mask,
-                # tree_attention_mask,
+                tree_attention_mask,
                 edge_index,
                 gold_labels,
                 lm_mask,
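
Note: filter_unreferenced (its call site is still commented out in the collator) drops a subset of scope entries and then renumbers the survivors so that references and parent pointers stay consistent. A small worked example of that renumbering step, built with takewhile exactly as above (the concrete values are made up):

from itertools import takewhile

removed = [1, 4]        # scope positions being dropped; sorted, as produced by the range() comprehension
scope_size = 6

# Each kept index shifts down by the number of removed indices that precede it.
renames = {kept: kept - sum(map(lambda _: 1, takewhile(lambda r: r < kept, removed)))
           for kept in range(scope_size)}
renames[-1] = -1        # the padding / "no parent" marker is preserved

surviving = {k: v for k, v in renames.items() if k >= 0 and k not in removed}
assert surviving == {0: 0, 2: 1, 3: 2, 5: 3}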

src/Name/neural/embedding.py (+6 -7)

@@ -21,7 +21,7 @@ def __init__(self, dim: int):
     def embed_positions(self, positions: list[int]) -> Tensor:
         # todo: this can be made much more efficient by reusing subsequence maps
         word_seq = [torch.tensor(self.node_pos_to_path(pos), device=self.primitives.device, dtype=torch.long)
-                    if pos > 0 else torch.tensor([])
+                    if pos > 0 else torch.empty(0, device=self.primitives.device, dtype=torch.long)
                     for pos in positions]
         word_ten = pad_sequence(word_seq, padding_value=2)
         maps = self.identity.repeat(len(positions), 1)

@@ -68,21 +68,20 @@ def __init__(self,
                  num_ops: int,
                  num_leaves: int,
                  dim: int,
-                 max_scope_size: int = 250,
                  max_db_index: int = 50):
         super(TokenEmbedder, self).__init__()
         self.num_leaves = num_leaves
         self.num_ops = num_ops
-        self.max_scope_size = max_scope_size
         self.max_db_size = max_db_index
+        self.dim = dim
         # ops, leaves, [sos], [ref], [oos], [mask]
-        self.fixed_embeddings = Embedding(num_embeddings=num_ops+num_leaves+4, embedding_dim=dim // 2)
+        self.fixed_embeddings = Embedding(num_embeddings=num_ops+num_leaves+3, embedding_dim=dim // 2)
         self.path_encoder = BinaryPathEncoder.orthogonal(dim // 2)
         self.db_encoder = SequentialPositionEncoder(dim // 2, freq=max_db_index)
 
     def forward(self, dense_batch: Tensor) -> tuple[Tensor, Tensor, Tensor]:
         token_types, token_values, node_positions, tree_positions = dense_batch
-        num_scopes, num_entries, _ = token_types.shape
+        num_scopes, num_entries, num_tokens = token_types.shape
 
         sos_mask = token_types == 0
         op_mask = token_types == 1

@@ -98,8 +97,8 @@ def forward(self, dense_batch: Tensor) -> tuple[Tensor, Tensor, Tensor]:
         content_embeddings[sos_mask] = self.fixed_embeddings.weight[0]
         content_embeddings[op_mask] = self.fixed_embeddings.forward(token_values[op_mask] + 1)
         content_embeddings[leaf_mask] = self.fixed_embeddings.forward(token_values[leaf_mask] + self.num_ops + 1)
-        content_embeddings[ref_mask] = self.fixed_embeddings.weight[-3]
-        content_embeddings[oos_mask] = self.fixed_embeddings.weight[-2]
+        content_embeddings[ref_mask] = self.fixed_embeddings.weight[-2]
+        content_embeddings[oos_mask] = self.fixed_embeddings.weight[-1]
         content_embeddings[lm_mask] = self.fixed_embeddings.weight[-1]
         content_embeddings[db_mask] = self.db_encoder.forward(token_values[db_mask])
