[BUGFIX][just images for now]

kyegomez · Jan 9, 2024 · dc002a3
1 parent 9f8da27
commit dc002a3
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 3 deletions.
diff --git a/example.py b/example.py
@@ -24,10 +24,10 @@
 img = torch.randn(1, 3, 64, 64)  # Reduced height and width from 128
 
 # Audio shape: [batch, audio_seq_len, dim]
-audio = torch.randn(1, 32)  # Reduced audio_seq_len from 64
+# audio = torch.randn(1, 32)  # Reduced audio_seq_len from 64
 
 # Apply model to text and img
-y = model(text, img, audio)
+y = model(text, img)
 
 # Output shape: [batch, seq_len, dim]
 print(y)
diff --git a/gemini_torch/model.py b/gemini_torch/model.py
@@ -131,7 +131,11 @@ def forward(
                 img_emb = self.img_to_transformer(img)
 
                 # Concatenate text, image, and audio embeddings
-                x = torch.cat((text, img_emb, audio_emb), dim=1)
+                x = torch.cat((text, img_emb, audio_emb))
+
+            if exists(img):
+                # Process image input
+                x = self.img_to_text_embedding(img)
             else:
                 x = text