
Commit 84ab795

Text + Vision Part 2 (#23)

* Updates for ConditionalGeneration.get_image_features
* Adding a WIP draft of image_processing_gemma3p5.py
* Update src/transformers/models/gemma3p5/modular_gemma3p5.py
  Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
* Modular conversion after github suggested change
* Text + image gives good results
* Fixing image size preset
* Updating configs for the 2B variant in the conversion script
* Using final generation config in conversion script

---------

Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com>
Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
1 parent 7d14788 commit 84ab795

5 files changed: +453 additions, -148 deletions

gemma3n_forward_test.py

Lines changed: 42 additions & 7 deletions
@@ -5,17 +5,52 @@
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
     AutoTokenizer,
-    model_addition_debugger_context
+    Gemma3ImageProcessorFast,
+    Gemma3Processor,
+    model_addition_debugger_context,
 )
 
-model_id = "/usr/local/google/home/ryanmullins/nano3/checkpoints/g251_safetensors"
+model_id = "/usr/local/google/home/ryanmullins/nano3/checkpoints/g348_safetensors"
 
+image_processor = Gemma3ImageProcessorFast(size={"height": 768, "width": 768})
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
-print(type(model.config))
-print(type(model.config.audio_config))
-print(type(model.config.text_config))
-print(type(model.config.vision_config))
+processor = Gemma3Processor(
+    tokenizer=tokenizer,
+    image_processor=image_processor,
+    chat_template=tokenizer.chat_template,
+)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "/usr/local/google/home/ryanmullins/Downloads/cat.jpeg"},
+            {"type": "text", "text": "Describe this image in detail."}
+        ]
+    }
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+)
+input_len = inputs["input_ids"].shape[-1]
+
+print(inputs)
+
+model = AutoModelForImageTextToText.from_pretrained(model_id)
+inputs = inputs.to(model.device, dtype=torch.bfloat16)
+
+with torch.inference_mode():
+    generation = model.generate(**inputs, max_new_tokens=16, do_sample=False)
+    generation = generation[0][input_len:]
+
+decoded = processor.decode(generation, skip_special_tokens=True)
+print(decoded)
+
 # model.to(dtype=torch.bfloat16)
 # input_ids = tokenizer("The capitol of France is ", return_tensors="pt")
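
The updated script exercises the full text-plus-image path. To see exactly what the processor feeds the model, the same chat template can be rendered without tokenization; a minimal inspection sketch, assuming `processor` and `messages` from the script above:

```python
# Render the chat template as plain text instead of token IDs. The printed
# prompt shows the turn markers and image placeholder tokens the model sees;
# their exact form depends on the checkpoint's chat template.
prompt = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)
```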

src/transformers/models/gemma3p5/convert_gemma3p5_weights.py

Lines changed: 22 additions & 23 deletions
@@ -37,14 +37,11 @@
 import timm
 
 from transformers import (
-    AutoConfig,
     Gemma3p5Config,
-    Gemma3p5ForCausalLM,
     Gemma3p5ForConditionalGeneration,
-    Gemma3ImageProcessor,
+    Gemma3ImageProcessorFast,
     Gemma3Processor,
     Gemma3NanoAudioConfig,
-    Gemma3NanoAudioEncoder,
     Gemma3p5TextConfig,
     Gemma3p5VisionConfig,
     GemmaTokenizerFast,
@@ -153,6 +150,7 @@
         intermediate_size=2048 * 4,
         num_hidden_layers=30,
         activation_sparsity_pattern=(0.95,)*10 + (0.0,)*20,
+        num_kv_shared_layers=10,
     ),
     vision_config=Gemma3p5VisionConfig(),
     audio_config=Gemma3NanoAudioConfig(),
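
Reading the 2B text config literally: `activation_sparsity_pattern` assigns one sparsity value per hidden layer, and the new `num_kv_shared_layers=10` is, per its name, a count of layers that share KV state. A quick sanity-check sketch of what the tuple expands to:

```python
# One sparsity value per hidden layer: 95% activation sparsity on the first
# 10 layers, none on the remaining 20. The tuple length has to line up with
# num_hidden_layers for the config to be coherent.
num_hidden_layers = 30
activation_sparsity_pattern = (0.95,) * 10 + (0.0,) * 20
assert len(activation_sparsity_pattern) == num_hidden_layers
```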
@@ -182,7 +180,7 @@
 )
 
 _INCLUDE_CHAT_TEMPLATE = flags.DEFINE_bool(
-    name="include_chat_template", default=False, help="If true, will save the default chat template with the tokenizer"
+    name="include_chat_template", default=True, help="If true, will save the default chat template with the tokenizer"
 )
 
 _OUTPUT_PATH = flags.DEFINE_string(
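
With the default flipped to `True`, conversions now save the chat template unless explicitly disabled. A standalone sketch of the absl-py boolean-flag pattern the script uses (an illustration of the flag API, not the converter's actual `main`):

```python
# absl boolean flags generate paired CLI switches: pass
# --noinclude_chat_template to override the new True default.
from absl import app, flags

_INCLUDE_CHAT_TEMPLATE = flags.DEFINE_bool(
    name="include_chat_template",
    default=True,
    help="If true, will save the default chat template with the tokenizer",
)


def main(argv):
    del argv  # unused
    print("include_chat_template =", _INCLUDE_CHAT_TEMPLATE.value)


if __name__ == "__main__":
    app.run(main)
```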
@@ -641,12 +639,14 @@ def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None:
     elif param == "mm_input_embedding_extra":
         update_tree("embed_vision.embedding.weight", value, config.vision_config.torch_dtype)
     elif path.endswith("mm_hard_embedding_norm"):
-        update_tree("embed_vision.embedding_norm.weight", value, config.vision_config.torch_dtype)
+        update_tree("embed_vision.hard_embedding_norm.weight", value, config.vision_config.torch_dtype)
     elif path.endswith("mm_input_projection"):
         update_tree(
             "embed_vision.embedding_projection.weight", value.transpose(), config.vision_config.torch_dtype
         )
-    if path.startswith(_TRANSFORMER_PARAMETER):
+    elif path.endswith("mm_soft_embedding_norm"):
+        update_tree("embed_vision.soft_embedding_norm.weight", value, config.vision_config.torch_dtype)
+    elif path.startswith(_TRANSFORMER_PARAMETER):
         for path, weights in convert_transformer_weights(config.text_config, path, param, value):
             update_tree(f"language_model.{path}", weights, config.text_config.torch_dtype)
     elif _MOBILE_NET_PREFIX in path:
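
Besides routing the new `mm_soft_embedding_norm` weights, this hunk fixes a dispatch bug: the bare `if` broke the chain, so a path already handled by an earlier multimodal branch could fall through into the transformer branch as well. A toy sketch (names hypothetical) of the difference:

```python
# With a bare `if`, both branches can fire for one path; the `elif` chain in
# the converter makes the routing mutually exclusive.
def dispatch(path: str) -> list[str]:
    targets = []
    if path.endswith("mm_soft_embedding_norm"):
        targets.append("embed_vision")
    if path.startswith("transformer/"):  # pre-fix: a plain `if`, not `elif`
        targets.append("language_model")
    return targets


# Both branches match, so the weight would be handled twice:
print(dispatch("transformer/mm_soft_embedding_norm"))
# -> ['embed_vision', 'language_model']
```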
@@ -720,23 +720,22 @@ def main(*args):
     tokenizer.save_pretrained(output_path)
     logging.info("Saved GemmaTokenizer for %s to %s", variant, output_path)
 
-    # # if variant != _VARIANT_GEMMA_3_2B:
-    # #     image_processor = Gemma3ImageProcessor(
-    # #         image_seq_length=256,
-    # #         image_mean=(0.5,) * 3,
-    # #         image_std=(0.5,) * 3,
-    # #         size={"height": 896, "width": 896},
-    # #         resample=PILImageResampling.BILINEAR,
-    # #     )
-    # #     processor = Gemma3Processor(
-    # #         image_processor=image_processor,
-    # #         tokenizer=tokenizer,
-    # #         chat_template=tokenizer.chat_template,
-    # #     )
-    # #     processor.save_pretrained(output_path)
-    # #     logging.info("Saved Gemma3Processor for %s to %s", variant, output_path)
-    # #     del processor
+    image_processor = Gemma3ImageProcessorFast(
+        image_seq_length=256,
+        image_mean=(0.5,) * 3,
+        image_std=(0.5,) * 3,
+        size={"height": 768, "width": 768},
+        resample=PILImageResampling.BILINEAR,
+    )
+    processor = Gemma3Processor(
+        image_processor=image_processor,
+        tokenizer=tokenizer,
+        chat_template=tokenizer.chat_template,
+    )
+    processor.save_pretrained(output_path)
+    logging.info("Saved Gemma3Processor for %s to %s", variant, output_path)
 
+    del processor
     del tokenizer
 
     generation_config = GenerationConfig(
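
The converter now always writes a `Gemma3Processor` alongside the tokenizer, so a converted checkpoint can be reloaded end to end. A minimal sketch, assuming a hypothetical output directory:

```python
# Reload the converted artifacts with the Auto classes used in the forward
# test above. The directory path is a placeholder for the converter's
# output_path value.
from transformers import AutoModelForImageTextToText, AutoProcessor

output_path = "/tmp/gemma3p5_checkpoint"  # hypothetical converter output
processor = AutoProcessor.from_pretrained(output_path)
model = AutoModelForImageTextToText.from_pretrained(output_path)
```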
