@@ -116,6 +116,20 @@ def pad_to_length(t, length, dim = -1, value = 0, right = True):
     padding = (0, remainder) if right else (remainder, 0)
     return pad_at_dim(t, padding, dim = dim, value = value)
 
+def masked_mean(tensor, mask, dim = -1, eps = 1e-5):
+    if not exists(mask):
+        return tensor.mean(dim = dim)
+
+    mask = rearrange(mask, '... -> ... 1')
+    tensor = tensor.masked_fill(~mask, 0.)
+
+    total_el = mask.sum(dim = dim)
+    num = tensor.sum(dim = dim)
+    den = total_el.float().clamp(min = eps)
+    mean = num / den
+    mean = mean.masked_fill(total_el == 0, 0.)
+    return mean
+
 # continuous embed
 
 def ContinuousEmbed(dim_cont):
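Note: a minimal, self-contained sketch of how this masked pooling is expected to behave on a padded batch. The helper is repeated verbatim only so the snippet runs on its own; the shapes and values are illustrative assumptions, not taken from the library.

import torch
from einops import rearrange

def exists(v):
    return v is not None

def masked_mean(tensor, mask, dim = -1, eps = 1e-5):
    # mean over `dim`, counting only positions where mask is True
    if not exists(mask):
        return tensor.mean(dim = dim)

    mask = rearrange(mask, '... -> ... 1')
    tensor = tensor.masked_fill(~mask, 0.)

    total_el = mask.sum(dim = dim)
    num = tensor.sum(dim = dim)
    den = total_el.float().clamp(min = eps)
    mean = num / den
    mean = mean.masked_fill(total_el == 0, 0.)
    return mean

# two text sequences, the second padded after its first token
embeds = torch.randn(2, 3, 4)                      # (batch, seq, dim)
mask = torch.tensor([[True, True, True],
                     [True, False, False]])

pooled = masked_mean(embeds, mask, dim = 1)        # (batch, dim)
assert torch.allclose(pooled[1], embeds[1, 0])     # only the unmasked token contributes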
@@ -1039,9 +1053,14 @@ def __init__(
         self.codebook_size = autoencoder.codebook_size
         self.num_quantizers = autoencoder.num_quantizers
 
-        self.sos_token = nn.Parameter(torch.randn(dim_fine))
         self.eos_token_id = self.codebook_size
 
+        # the fine transformer sos token
+        # as well as a projection of pooled text embeddings to condition it
+        # (todo) - sos token should be moved to the coarse transformer stage
+
+        self.sos_token = nn.Parameter(torch.randn(dim_fine))
+
         # they use axial positional embeddings
 
         assert divisible_by(max_seq_len, self.num_vertices_per_face * self.num_quantizers), f'max_seq_len ({max_seq_len}) must be divisible by (3 x {self.num_quantizers}) = {3 * self.num_quantizers}' # 3 or 4 vertices per face, with D codes per vertex
@@ -1067,7 +1086,11 @@ def __init__(
                 model_types = text_condition_model_types,
                 cond_drop_prob = text_condition_cond_drop_prob
             )
-            cross_attn_dim_context = self.conditioner.dim_latent
+
+            dim_text = self.conditioner.dim_latent
+            cross_attn_dim_context = dim_text
+
+            self.to_sos_text_cond = nn.Linear(dim_text, dim_fine)
 
         # for summarizing the vertices of each face
 
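Note: `dim_text` is factored out here because the conditioner's latent dimension is now used twice, as the cross-attention context size and as the input size of the new sos projection. A quick shape check, with made-up dimensions standing in for the real config values:

import torch
from torch import nn

dim_text = 512   # conditioner latent dim (illustrative)
dim_fine = 256   # fine transformer dim (illustrative)

to_sos_text_cond = nn.Linear(dim_text, dim_fine)

pooled_text_embed = torch.randn(2, dim_text)    # one pooled text embedding per batch element
sos_cond = to_sos_text_cond(pooled_text_embed)  # (2, dim_fine), ready to be added to the sos token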
@@ -1437,10 +1460,24 @@ def forward_on_codes(
 
         attended_face_codes = self.maybe_project_coarse_to_fine(attended_face_codes)
 
-        # auto prepend sos token
+        # repeat sos token across batch
 
         sos = repeat(self.sos_token, 'd -> b d', b = batch)
 
+        # condition sos token if needed
+
+        if self.condition_on_text:
+            pooled_text_embed = masked_mean(
+                maybe_dropped_text_embeds.embed,
+                maybe_dropped_text_embeds.mask,
+                dim = 1
+            )
+
+            sos_cond = self.to_sos_text_cond(pooled_text_embed)
+            sos = sos + sos_cond
+
+        # auto prepend sos token
+
         attended_face_codes_with_sos, _ = pack([sos, attended_face_codes], 'b * d')
 
         grouped_codes = pad_to_length(grouped_codes, attended_face_codes_with_sos.shape[-2], dim = 1)
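Note: putting the pieces together, a condensed sketch of the conditioning path added in this commit — pool the (possibly dropped-out) text token embeddings, project them to the fine transformer width, shift the sos token, then prepend it. The sizes and the random stand-in for the pooled text embedding are assumptions for illustration; the real values come from the conditioner and model config.

import torch
from torch import nn
from einops import repeat, pack

batch, num_faces, dim_text, dim_fine = 2, 8, 512, 256   # illustrative sizes

sos_token = nn.Parameter(torch.randn(dim_fine))
to_sos_text_cond = nn.Linear(dim_text, dim_fine)

attended_face_codes = torch.randn(batch, num_faces, dim_fine)
pooled_text_embed = torch.randn(batch, dim_text)    # stands in for masked_mean over the text embeddings

# repeat the sos token across the batch, then shift it by the projected text summary
sos = repeat(sos_token, 'd -> b d', b = batch)
sos = sos + to_sos_text_cond(pooled_text_embed)

# prepend the conditioned sos token in front of the per-face codes
face_codes_with_sos, _ = pack([sos, attended_face_codes], 'b * d')
assert face_codes_with_sos.shape == (batch, num_faces + 1, dim_fine)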