Add cross-attention-based text conditioning to the fine transformer as well

lucidrains · lucidrains · commit 5ef6cbfeaa3c · 2024-05-10T16:53:21.000-07:00
diff --git a/meshgpt_pytorch/meshgpt_pytorch.py b/meshgpt_pytorch/meshgpt_pytorch.py
@@ -1139,6 +1139,9 @@ def __init__(
             attn_flash = flash_attn,
             attn_dropout = dropout,
             ff_dropout = dropout,
+            cross_attend = condition_on_text,
+            cross_attn_dim_context = cross_attn_dim_context,
+            cross_attn_num_mem_kv = cross_attn_num_mem_kv,
             **attn_kwargs
         )
 
@@ -1338,9 +1341,11 @@ def forward_on_codes(
                 cond_drop_prob = cond_drop_prob
             )
 
+            text_embed, text_mask = maybe_dropped_text_embeds
+
             attn_context_kwargs = dict(
-                context = maybe_dropped_text_embeds.embed,
-                context_mask = maybe_dropped_text_embeds.mask
+                context = text_embed,
+                context_mask = text_mask
             )
 
         # take care of codes that may be flattened
@@ -1471,8 +1476,8 @@ def forward_on_codes(
 
         if self.condition_on_text:
             pooled_text_embed = masked_mean(
-                maybe_dropped_text_embeds.embed,
-                maybe_dropped_text_embeds.mask,
+                text_embed,
+                text_mask,
                 dim = 1
             )
 
@@ -1512,15 +1517,25 @@ def forward_on_codes(
                     ck, cv = map(lambda t: t[:, -1, :, :curr_vertex_pos], (ck, cv))
                     attn_intermediate.cached_kv = (ck, cv)
 
-        one_face = fine_vertex_codes.shape[1] == 1
+        num_faces = fine_vertex_codes.shape[1]
+        one_face = num_faces == 1
 
         fine_vertex_codes = rearrange(fine_vertex_codes, 'b nf n d -> (b nf) n d')
 
         if one_face:
             fine_vertex_codes = fine_vertex_codes[:, :(curr_vertex_pos + 1)]
 
+        fine_attn_context_kwargs = dict()
+
+        if self.condition_on_text:
+            fine_attn_context_kwargs = dict(
+                context = repeat(text_embed, 'b ... -> (b nf) ...', nf = num_faces),
+                context_mask = repeat(text_mask, 'b ... -> (b nf) ...', nf = num_faces)
+            )
+
         attended_vertex_codes, fine_cache = self.fine_decoder(
             fine_vertex_codes,
+            **fine_attn_context_kwargs,
             cache = fine_cache,
             return_hiddens = True
         )
diff --git a/meshgpt_pytorch/version.py b/meshgpt_pytorch/version.py
@@ -1 +1 @@
-__version__ = '1.2.2'
+__version__ = '1.2.3'

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = '1.2.2'` |
| | `1` | `+__version__ = '1.2.3'` |