recurrent_memory_transformer_pytorch/recurrent_memory_transformer.py: 9 additions & 0 deletions
@@ -229,6 +229,7 @@ def __init__(
    memory_not_causal=True,              # flash attention behaves a bit more optimally if a causal mask is not explicitly passed in - but if the memories perform better without a causal mask, it is necessary to have this turned on
    add_write_to_next_write_mem=False,   # add the write memories of the previous step to the next write step - thanks to @IcarusWizard for pointing out this discrepancy
    next_write_mem_stop_grad=True,       # whether to stop the gradient of previous read memory -> next write memory
+   always_have_read_memories=True,      # whether to always have read memories, even on the first step, so as to make the model onnx-able
    resi_dual_scale=1.,                  # in the case of overflows in fp16 on the prenorm branch, set this to a value less than 1.
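For context, a minimal usage sketch showing where the newly added `always_have_read_memories` flag sits. The constructor values below are illustrative and not part of this diff, and the exact layout of the tuple returned by `forward` may vary between library versions, so it is left unpacked here.

```python
# Hypothetical sketch: only always_have_read_memories comes from this diff;
# other arguments and values are assumptions based on the repo's public API.
import torch
from recurrent_memory_transformer_pytorch import RecurrentMemoryTransformer

model = RecurrentMemoryTransformer(
    num_tokens=20000,                # vocabulary size (illustrative)
    dim=512,
    depth=6,
    num_memory_tokens=128,           # read/write memory tokens carried across segments
    seq_len=1024,                    # length of one segment
    always_have_read_memories=True   # feed read memories even on the first segment, keeping the graph fixed for ONNX export
)

x = torch.randint(0, 20000, (1, 1024))

out = model(x)  # returns logits along with the written memories; on later segments those memories are passed back in as read memories
```

With `always_have_read_memories=True`, the first segment receives placeholder read memories instead of skipping the read path entirely, so tracing sees the same set of operations on every step.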