From 5324ce32a375bff79994eaf2a86e108539d9f9df Mon Sep 17 00:00:00 2001
From: Kye
Date: Thu, 7 Dec 2023 12:19:08 -0800
Subject: [PATCH] [TOKENIZER EXAMPLE]

---
 README.md                 | 24 ++++++++++++++++++++++++
 gemini_torch/tokenizer.py | 10 ----------
 tokenizer.py              | 12 ++++++++++++
 3 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 tokenizer.py

diff --git a/README.md b/README.md
index 8ece16c..da940f5 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,7 @@ To implement this model effectively, I intend to initially focus on the image em
 - qk norm
 - no pos embeds
 - kv cache
+
 ```python
 import torch
 from gemini_torch import Gemini
@@ -108,6 +109,29 @@ print(y.shape)
 ```
 
 ------
+
+
+## Tokenizer
+- We're using the same tokenizer as LLaMA, with special tokens denoting the beginning and end of the multimodal tokens.
+- It does not fully process images, audio, or video yet; help with that is welcome.
+
+```python
+from gemini_torch.tokenizer import MultimodalSentencePieceTokenizer
+
+# Example usage
+tokenizer_name = "hf-internal-testing/llama-tokenizer"
+tokenizer = MultimodalSentencePieceTokenizer(tokenizer_name=tokenizer_name)
+
+# Encoding and decoding examples
+encoded_audio = tokenizer.encode("Audio description", modality="audio")
+decoded_audio = tokenizer.decode(encoded_audio)
+
+print("Encoded audio:", encoded_audio)
+print("Decoded audio:", decoded_audio)
+
+
+```
+
 ### `ImgToTransformer`
 - takes in img -> patches -> reshapes to [B, SEQLEN, Dim] to align with transformer
 ```python
diff --git a/gemini_torch/tokenizer.py b/gemini_torch/tokenizer.py
index a3ac1ed..a523f1a 100644
--- a/gemini_torch/tokenizer.py
+++ b/gemini_torch/tokenizer.py
@@ -150,13 +150,3 @@ def decode(self, tokens: List[int]) -> str:
         return self.sp_model.decode(tokens)
 
 
-# Example usage
-tokenizer_name = "hf-internal-testing/llama-tokenizer"
-tokenizer = MultimodalSentencePieceTokenizer(tokenizer_name=tokenizer_name)
-
-# Encoding and decoding examples
-encoded_audio = tokenizer.encode("Audio description", modality="audio")
-decoded_audio = tokenizer.decode(encoded_audio)
-
-print("Encoded audio:", encoded_audio)
-print("Decoded audio:", decoded_audio)
diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..9d04293
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,12 @@
+from gemini_torch.tokenizer import MultimodalSentencePieceTokenizer
+
+# Example usage
+tokenizer_name = "hf-internal-testing/llama-tokenizer"
+tokenizer = MultimodalSentencePieceTokenizer(tokenizer_name=tokenizer_name)
+
+# Encoding and decoding examples
+encoded_audio = tokenizer.encode("Audio description", modality="audio")
+decoded_audio = tokenizer.decode(encoded_audio)
+
+print("Encoded audio:", encoded_audio)
+print("Decoded audio:", decoded_audio)