Commit fb8e8de

RyanMullins, SindhuRaghuram97, and pculliton committed:
Audio Integration (#12)
* Initial commit of Gemma 3.5 scaffold
* Fixing param pass through on Gemm3p5RMSNorm
* Adds Einsum layer to Gemma 3.5
* Updating EinsumLayer API
* Undoing erroneous force push
* Reverting RMSNorm to with_scale by default
* Adds LAuReL to Gemma 3.5
* Adds AltUp to Gemma 3.5
* Adding Gemma3p5 overall and text config with vision and audio config placeholders (#3)
  * Adding gemma3p5 text configs
  * Adding audio config placeholders
  * Adding a placeholder for vision configs
  * Updating MobileNetVisionConfig, inheriting TimmWrapperConfig
  * Updating text configs
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Removing altup configs to accept the suggested configs
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Updating altup config
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Addressing review comments and updating text configs
  * Adding a config for activation sparsity
  * Updating configs to pass through options to super class init and adjust some name prefixes
  * Updating laurel and altup with corrected config values
  * Normalizing sub_config initializers
  Co-authored-by: Ryan Mullins <ryanmullins@google.com>
* Updating MLP with activation sparsity (#2); see the sketch after this list
* Updating DecoderBlock for Gemma 3.5 (#3)
* Initial Gemm3p5TextModel (#4). NOTE: This implementation WILL CHANGE in the coming weeks; however, changes will be strictly additive, and this will remain a suitable baseline for downstream implementations to reference.
  * Adding KV Cache Sharing
  * Adds Einsum layer to Gemma 3.5
  * Updating EinsumLayer API
  * Refactored KV cache sharing in attention
  * Adding KVStore for cache sharing
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/models/gemma3p5/modular_gemma3p5.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Update src/transformers/cache_utils.py
    Co-authored-by: Ryan Mullins <ryanmullins@google.com>
  * Undoing erroneous force push
  * Reverting RMSNorm to with_scale by default
  * Adds LAuReL to Gemma 3.5
  * Updating KV Cache Sharing implementation
  * Updating the q and k norm definitions in the attention module
  * Fixing name error for q, k, v RMS norm to use the right 3p5 module
  * Updating MLP with activation sparsity
  * Updating DecoderBlock for Gemma 3.5
  * Updating KV cache sharing implementation with the use of a cache buffer and refactoring some lines of code
  * Isolating KV cache logic to relevant components
  * Fixing logic error in Gemma3p5Attention.forward
  * Refactoring caching contributions and fixing kv_store initialization
  * Simplifying configs
  * Remove errant self from super init call
  * Bug fix in the Attention module: changing self.head_dim to config.head_dim
  * Bug fixes in the LaurelBlock and RMSNorm super init call
  * Removing redundant code from a merge
  * Adding per_layer_inputs to TextModel
  * Adding preprocess embeddings with AltUp
  * Adds per-layer-to-single output and a host of TODOs
  * Integrating AltUp predict with the model workflow and other minor bug fixes
  * Using nn.Embedding temporarily for text model
  * It goes forward
  * Minor refactor of attention sparsity and RoPE initialization
  * Fixing duplicate rope_scaling param bug when loading from pretrained
  Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com>
  Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
* Normalizing on altup_num_inputs config option
* Adding audio encoder config
* Adds high-level components for Audio Encoder
* Implement uniform reducer for Audio Encoder
* Adding placeholders for Conformer components in Audio Encoder
* Adding placeholders for SubSampleConvProjection components in Audio Encoder
* Adding SequenceLayer component placeholders
* Implementing Gemma3p5AudioEncoder with nn.Sequential
* Implementing Gemma3p5AudioSubSampleConvProjection with nn.Sequential
* Implementing Conformer model with SequenceLayers
* Use OrderedDict in nn.Sequential initializers
* Implements sl.Residual in Torch with nn.Sequential and OrderedDict
* Adopting a base SequenceLayer class with default forward() method
* Implementing sl.GatedLinearUnit in Torch
* Implementing sl.Swish in Torch
* Implementing sl.ReLU in Torch
* Implementing sl.Scale in Torch
* Removing sl.Dropout after tree-shaking
* Implementing sl.RMSNorm in Torch with fake shape
* Implementing sl.GroupNorm in Torch
* Implementing sl.Conv2d in Torch
* Implementing sl.Dense in Torch
* Removing sl.Delay layers, which act as pass-throughs
* Connecting shapes to configs in initializers
* Removing sl.Emit
* Implementing sl.ExpandDims in Torch
* Adding sl.GradientClipping to Torch
* Implementing sl.DenseShaped in Torch
* Implementing sl.LDPA in Torch
* Removing unused sl.CombinedQKVProj class
* Fixing erroneous type hint
* Implementing sl.DepthwiseConv1D in Torch
* Implementing sl.MaskInvalid in Torch
* Fixes for initialization
* Fixes for saving weights
* Removing einsums per feedback from HF staff
* Removing Sequence Layers idioms from audio encoder
* Fixes for reviewer comments
* Converting sl.Frontend to FeatureExtractor
* Updates for ConditionalGeneration.get_image_features
* Adding a WIP draft of image_processing_gemma3p5.py
* Update src/transformers/models/gemma3p5/modular_gemma3p5.py
  Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
* Modular conversion after GitHub suggested change
* Text + image gives good results
* Fixing image size preset
* Draft of audio data in chat template
* Removing image processing. Using SigLIP instead.
* Audio input going end-to-end
* Fixing dtype issues in audio encoder
* x-lib formatting consistency
* Adding example data
* Save preprocessor_config.json from conversion script
* Instrumentation for debugging
* Additional instrumentation for preprocessing debugging
* Updates to preprocessor, padding; produces correct end-to-end results on sample
* Tackling configuration TODOs
* Start of feature extractor refactor
* Adds NumPy version of USM extractor, removes Torch version and dependencies
* Fixing AltUp.correct coef permute
* Supporting batches of single audio segment inputs
* Docstring updates for config
* In-lining audio feature extraction
* Adjustments to conversion script and smoke test script

Co-authored-by: SindhuRaghuram97 <114270661+SindhuRaghuram97@users.noreply.github.com>
Co-authored-by: Sindhu Raghuram <sindhuraghuram@google.com>
Co-authored-by: pculliton <phillipculliton@gmail.com>
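For orientation, here is a minimal sketch of the activation-sparsity MLP idea referenced in the list above ("Updating MLP with activation sparsity"). This is an illustration rather than the committed implementation: the class name, default sparsity value, and the per-token Gaussian-quantile cutoff are assumptions about how such a layer is typically built.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SparseGatedMlp(nn.Module):
    """Hypothetical gated MLP with activation sparsity (illustration only)."""

    def __init__(self, hidden_size: int, intermediate_size: int, activation_sparsity: float = 0.95):
        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
        # z-score below which ~activation_sparsity of a standard normal falls.
        normal = torch.distributions.normal.Normal(0.0, 1.0)
        self.std_multiplier = normal.icdf(torch.tensor(activation_sparsity))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = self.gate_proj(x)
        # Per-token cutoff: keep roughly the top (1 - activation_sparsity)
        # fraction of gate pre-activations and zero the rest.
        mean = gate.mean(dim=-1, keepdim=True)
        std = gate.std(dim=-1, keepdim=True)
        gate = F.relu(gate - (mean + std * self.std_multiplier.to(gate.dtype)))
        return self.down_proj(F.gelu(gate) * self.up_proj(x))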
1 parent 84ab795 commit fb8e8de

13 files changed (+2070, -1715 lines)

cat.jpeg (88.5 KB)

gemma3n_forward_test.py

Lines changed: 91 additions & 69 deletions
@@ -1,33 +1,94 @@
 import numpy as np
 import torch
-from transformers import (
-    AutoModel,
-    AutoModelForCausalLM,
-    AutoModelForImageTextToText,
-    AutoTokenizer,
-    Gemma3ImageProcessorFast,
-    Gemma3Processor,
-    model_addition_debugger_context,
-)
+from transformers import AutoModelForImageTextToText, AutoProcessor
 
-model_id = "/usr/local/google/home/ryanmullins/nano3/checkpoints/g348_safetensors"
+model_id = "gg-hf-gm/gemma-3n-E4B-it"
 
-image_processor = Gemma3ImageProcessorFast(size={"height": 768, "width": 768})
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-processor = Gemma3Processor(
-    tokenizer=tokenizer,
-    image_processor=image_processor,
-    chat_template=tokenizer.chat_template,
-)
+processor = AutoProcessor.from_pretrained(model_id)
 
 messages = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image", "image": "/usr/local/google/home/ryanmullins/Downloads/cat.jpeg"},
-            {"type": "text", "text": "Describe this image in detail."}
-        ]
-    }
+    # {
+    #     "role": "user",
+    #     "content": [
+    #         {"type": "text", "text": "What is the capital of France?"}
+    #     ]
+    # }
+    # {
+    #     "role": "user",
+    #     "content": [
+    #         {"type": "image", "image": "cat.jpeg"},
+    #         {"type": "text", "text": "Describe this image in detail."}
+    #     ]
+    # }
+    # {
+    #     "role": "user",
+    #     "content": [
+    #         {"type": "text", "text": "Transcribe the following speech segment in English:"},
+    #         {"type": "audio", "audio": "speech.wav"},
+    #         # Send a text to Mike. I'll be home late tomorrow.
+    #         {"type": "audio", "audio": "speech2.wav"},
+    #     ]
+    # }
+    # {
+    #     "role": "user",
+    #     "content": [
+    #         {"type": "text", "text": "What is the capital of France?"}
+    #     ]
+    # }
+    # [
+    #     {
+    #         "role": "user",
+    #         "content": [
+    #             {"type": "text", "text": "What is the capital of France?"}
+    #         ]
+    #     }
+    # ],
+    # [
+    #     {
+    #         "role": "user",
+    #         "content": [
+    #             {"type": "text", "text": "What is the capital of France?"}
+    #         ]
+    #     }
+    # ],
+    # [
+    #     {
+    #         "role": "user",
+    #         "content": [
+    #             {"type": "image", "image": "cat.jpeg"},
+    #             {"type": "text", "text": "Describe this image in detail."}
+    #         ]
+    #     }
+    # ],
+    # [
+    #     {
+    #         "role": "user",
+    #         "content": [
+    #             {"type": "image", "image": "cat.jpeg"},
+    #             {"type": "text", "text": "Describe this image in detail."}
+    #         ]
+    #     }
+    # ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the following speech segment in English:"},
+                {"type": "audio", "audio": "speech.wav"},
+                # Send a text to Mike. I'll be home late tomorrow.
+            ]
+        },
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the following speech segment in English:"},
+                {"type": "audio", "audio": "speech2.wav"},
+                # pious means to enter through. Their mouth are very tough and even a sharp
+            ]
+        },
+    ]
 ]
 
 inputs = processor.apply_chat_template(
@@ -39,54 +100,15 @@
 )
 input_len = inputs["input_ids"].shape[-1]
 
-print(inputs)
+print(f"{inputs.input_ids.shape=}")
 
-model = AutoModelForImageTextToText.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(model_id).to(dtype=torch.bfloat16)
 inputs = inputs.to(model.device, dtype=torch.bfloat16)
 
 with torch.inference_mode():
     generation = model.generate(**inputs, max_new_tokens=16, do_sample=False)
-    generation = generation[0][input_len:]
-
-decoded = processor.decode(generation, skip_special_tokens=True)
-print(decoded)
-
-# model.to(dtype=torch.bfloat16)
-# input_ids = tokenizer("The capitol of France is ", return_tensors="pt")
-
-# with model_addition_debugger_context(
-#     model=model,
-#     debug_path="/usr/local/google/home/ryanmullins/nano3/g251_debug",
-#     do_prune_layers=False,
-#     use_repr=False,
-# ):
-#     outputs = model.forward(**input_ids)
-
-
-# model_id = "/usr/local/google/home/ryanmullins/nano3/checkpoints/g251_vision_encoder"
-# vision_encoder = AutoModel.from_pretrained(model_id)
-# print(type(vision_encoder))
-# print(vision_encoder.config)
-
-
-# model_id = "/usr/local/google/home/ryanmullins/git/gemma-3p5-audio-encoder"
-# model = Gemma3p5AudioEncoder.from_pretrained(model_id)
-# audio_config = model.config
-
-# batch_size = 1
-# seq_len = 80 # Example input sequence length (make it odd to test padding)
-# pad_len = 40
-
-# rng = np.random.default_rng(seed=42)
-# audio_mel = rng.normal(size=(batch_size, audio_config.input_feat_size, seq_len)).astype(np.float32)
-# audio_mel_mask_np = np.zeros((batch_size, seq_len), dtype=bool)
-# if seq_len >= pad_len: # Ensure pad_len is not out of bounds
-#     audio_mel_mask_np[:, -pad_len:] = True # Pad the end
+    generation = generation[:, input_len:]
+    print(f"{generation=}")
 
-# with model_addition_debugger_context(
-#     model=model,
-#     debug_path="/usr/local/google/home/ryanmullins/nano3/gemma3n_audio_encoder_debug",
-#     do_prune_layers=False,
-#     use_repr=False,
-# ):
-#     outputs = model.forward(torch.from_numpy(audio_mel), torch.from_numpy(audio_mel_mask_np))
+decoded = processor.batch_decode(generation, skip_special_tokens=True)
+print(f"{decoded=}")

speech.wav (134 KB, binary file not shown)

speech2.wav (533 KB, binary file not shown)

src/transformers/models/auto/feature_extraction_auto.py

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@
         ("dpt", "DPTFeatureExtractor"),
         ("encodec", "EncodecFeatureExtractor"),
         ("flava", "FlavaFeatureExtractor"),
+        ("gemma3p5", "Gemma3p5AudioFeatureExtractor"),
         ("glpn", "GLPNFeatureExtractor"),
         ("granite_speech", "GraniteSpeechFeatureExtractor"),
         ("groupvit", "CLIPFeatureExtractor"),

src/transformers/models/gemma3p5/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -19,7 +19,9 @@
 
 if TYPE_CHECKING:
     from .configuration_gemma3p5 import *
+    from .feature_extraction_gemm3p5 import *
     from .modeling_gemma3p5 import *
+    from .processing_gemma3p5 import *
 else:
     import sys
 
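These star-imports only execute for type checkers; at runtime the module goes through the `else` branch, which is unchanged and therefore not shown in the diff. In transformers that branch conventionally installs a lazy module, roughly as sketched below (a sketch of the library's usual pattern, not the exact committed lines):

else:
    import sys

    from ...utils import _LazyModule
    from ...utils.import_utils import define_import_structure

    # Replace this package's module object with a lazy proxy that imports
    # submodules (configuration, feature extraction, modeling, processing)
    # only when their attributes are first accessed.
    sys.modules[__name__] = _LazyModule(
        __name__, globals()["__file__"], define_import_structure(__file__), module_spec=__spec__
    )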
