Bug - test adjustment, hub cleanup #13

Merged: 26 commits, Jul 9, 2024
Commits
8120f70
adjusting tests
rhysdg Jul 8, 2024
678a1ed
adjusting class reference
rhysdg Jul 8, 2024
049054d
adjusting readme example
rhysdg Jul 8, 2024
8f30807
expanding dims on the fly
rhysdg Jul 9, 2024
06e0e83
int64 tpe qucik fix - before tracking back through siglip tokenizer
rhysdg Jul 9, 2024
5ccd221
reinstating collections
rhysdg Jul 9, 2024
2b2ab91
reinstating collections
rhysdg Jul 9, 2024
0cc4dce
cleaning up hub artifacts, reinstating itertools repeat
rhysdg Jul 9, 2024
ee7eb7d
further cleanup
rhysdg Jul 9, 2024
1d2abdc
further stripping out unused functionality
rhysdg Jul 9, 2024
2ef9a66
dropping chat templates
rhysdg Jul 9, 2024
a96e24f
dropping perf utils and reinstating vocab files empty dict
rhysdg Jul 9, 2024
6643a59
dropping env error raise at resolving hub nased gguf
rhysdg Jul 9, 2024
08ac29c
adding missing sentencepiece dependency
rhysdg Jul 9, 2024
ee121a9
moving sentencepiece module to utils
rhysdg Jul 9, 2024
f583c2f
updating to torch 2.10
rhysdg Jul 9, 2024
25d45ba
bumping torch
rhysdg Jul 9, 2024
d2f8144
dropping cuda execution provider form cpu mode
rhysdg Jul 9, 2024
fb465b0
dropping get_available_providers
rhysdg Jul 9, 2024
17a06aa
falling back to python 3.10 workflows until Added Token resolved
rhysdg Jul 9, 2024
5c57af9
dropping setter interaction and casting to string
rhysdg Jul 9, 2024
ff55839
Merge branch 'bug-test-adjustment' of https://github.com/rhysdg/sam-a…
rhysdg Jul 9, 2024
1cfb9a2
bumping back to 3.11
rhysdg Jul 9, 2024
78ebfa7
adding resolution at assertion
rhysdg Jul 9, 2024
3ccca8f
Merge branch 'bug-test-adjustment' of https://github.com/rhysdg/sam-a…
rhysdg Jul 9, 2024
8f24138
setting type to clip at insttantiation test
rhysdg Jul 9, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -74,7 +74,7 @@ Last of all the aim here is to keep up with the latest optimised foundation mode
- SigLIP is available and recommended by default given the innovation made with its loss function, leading to better inference. Model types, however, can be changed at instantiation with:

```python
onnx_model = OnnxClip(batch_size=16, type='siglip_full')
onnx_model = OnnxLip(batch_size=16, type='siglip_full')
```

- Notice also that cosine similarity at `get_similarity_scores` is adjusted to handle multiple contexts - in other words, a handful of text embeddings can be sent as 'contexts' and evaluated against a single image or a batch of images.
@@ -143,7 +143,7 @@ Last of all the aim here is to keep up with the latest optimised foundation mode

for k,v in contexts.items():
print(f'\ncontext: {k}\n')
for text, p in zip(texts[k], probs[k][0]):
for text, p in zip(texts[k], probs[k]):
print(f"Probability that the image is '{text}': {p:.3f}")
```
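For orientation, a hypothetical end-to-end sketch of the multi-context flow this bullet describes. The names `OnnxLip`, `get_image_embeddings`, `get_text_embeddings`, and `get_probabilities` come from this diff, but how they compose below (in particular how `probs` is produced) is an assumption rather than the package's documented API:

```python
import numpy as np
from PIL import Image
# Names below come from this PR's diff; the exact call pattern is assumed.
from clip.model import OnnxLip, get_probabilities

onnx_model = OnnxLip(batch_size=16, type='siglip_full')

# One image embedding, several named groups ("contexts") of text embeddings.
image_embedding = onnx_model.get_image_embeddings([Image.open('example.jpg')])
texts = {
    'animal': ['a photo of a cat', 'a photo of a dog'],
    'weather': ['a sunny day', 'a rainy day'],
}
contexts = {k: onnx_model.get_text_embeddings(v) for k, v in texts.items()}

# Assumed: probabilities come back keyed by context name.
probs = get_probabilities(image_embedding, contexts)

for k, v in contexts.items():
    print(f'\ncontext: {k}\n')
    for text, p in zip(texts[k], probs[k]):
        print(f"Probability that the image is '{text}': {p:.3f}")
```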

32 changes: 21 additions & 11 deletions clip/model.py
@@ -88,14 +88,14 @@ def get_probabilities(image_embedding: list,
if image_embedding.ndim == 1:
# Convert to 2-D array using x[np.newaxis, :]
# and remove the extra dimension at the end.
res_dict[key] = softmax(get_similarity_scores(
res_dict[key] = softmax(get_probabilities(
image_embedding[np.newaxis, :], query
)[0])

if query.ndim == 1:
# Convert to 2-D array using x[np.newaxis, :]
# and remove the extra dimension at the end.
res_dict[key] = softmax(get_similarity_scores(
res_dict[key] = softmax(get_probabilities(
image_embedding, query[np.newaxis, :]
)[:, 0])
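For context, a minimal standalone sketch (assumed code, not this repo's) of the pattern the hunk above relies on: promote 1-D embeddings to 2-D with `np.newaxis` so a single similarity-plus-softmax path covers both single vectors and batches.

```python
import numpy as np

def softmax(x: np.ndarray) -> np.ndarray:
    # Numerically stable softmax over the last axis.
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def scores_to_probs(image_embedding: np.ndarray, text_embeddings: np.ndarray) -> np.ndarray:
    # Promote single vectors to 2-D so one matrix path handles both cases.
    if image_embedding.ndim == 1:
        image_embedding = image_embedding[np.newaxis, :]
    if text_embeddings.ndim == 1:
        text_embeddings = text_embeddings[np.newaxis, :]
    # Cosine similarity: L2-normalise rows, then take the dot product.
    img = image_embedding / np.linalg.norm(image_embedding, axis=-1, keepdims=True)
    txt = text_embeddings / np.linalg.norm(text_embeddings, axis=-1, keepdims=True)
    return softmax(img @ txt.T)
```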

@@ -136,11 +136,15 @@ def __init__(
passing large amounts of data (perhaps ~100 or more).

"""
assert device in ['cpu', 'cuda'], 'please use either cuda or cpu!'

self.providers = [
'CUDAExecutionProvider',
'CPUExecutionProvider'
]

if device == 'cuda':
self.providers.insert(0, 'CUDAExecutionProvider')

if trt:
self.providers.insert(0, 'TensorrtExecutionProvider')

@@ -167,6 +171,7 @@

self.image_model, self.text_model = self._load_models(model)


if 'siglip' in type:
#currently only supporting 384
assert size in [384, 224], 'please choose either a 384, or 224 input size for SigLIP!'
@@ -246,7 +251,7 @@ def _load_model(self, path: str):

# `providers` need to be set explicitly since ORT 1.9
return ort.InferenceSession(
path, providers=ort.get_available_providers()
path, providers=self.providers
)
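Read together with the `__init__` hunk above, the intent (inferred from the "dropping cuda execution provider form cpu mode" commit; this is a simplified sketch, not the file's exact code) is a device-aware provider list passed explicitly to `ort.InferenceSession` rather than `ort.get_available_providers()`:

```python
import onnxruntime as ort

def build_session(path: str, device: str = 'cpu', trt: bool = False) -> ort.InferenceSession:
    # Start from the CPU provider and only prepend accelerators that were
    # explicitly requested, so CPU mode never attempts to load CUDA.
    providers = ['CPUExecutionProvider']
    if device == 'cuda':
        providers.insert(0, 'CUDAExecutionProvider')
    if trt:
        providers.insert(0, 'TensorrtExecutionProvider')
    # `providers` needs to be set explicitly since ORT 1.9.
    return ort.InferenceSession(path, providers=providers)
```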

def get_image_embeddings(
@@ -268,9 +273,15 @@
"""
if not with_batching or self._batch_size is None:
# Preprocess images
images = [
self._preprocessor.encode_image(image) for image in images
]
if 'siglip' in self.type:
images = [
np.expand_dims(self._siglip_preprocessor(image).numpy(), 0) for image in images
]
else:
images = [
self._preprocessor.encode_image(image) for image in images
]



if not images:
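A small illustrative helper (assumed, not part of the diff) showing the kind of on-the-fly dimension expansion the SigLIP branch above performs, so per-image CHW outputs can be stacked into an NCHW batch for the ONNX image encoder:

```python
import numpy as np

def to_onnx_batch(preprocessed_images) -> np.ndarray:
    # Each preprocessed image is a CHW array (or tensor convertible via
    # np.asarray); add a leading batch axis, then concatenate into NCHW.
    arrays = [
        np.expand_dims(np.asarray(img, dtype=np.float32), 0)
        for img in preprocessed_images
    ]
    return np.concatenate(arrays, axis=0)
```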
@@ -320,17 +331,16 @@ def get_text_embeddings(

if self.type == 'siglip':

text = self._siglip_tokenizer(incoming,
text = self._siglip_tokenizer(texts,
return_tensors='np',
padding="max_length",
truncation=True
)
if len(text) == 0:
return self._get_empty_embedding()

incoming = {"input_ids": text}

hidden, pooled = self.text_model.run(None, incoming)
#text already carries an input_ids key/value pair here
hidden, pooled = self.text_model.run(None, {'input_ids': text['input_ids'].astype(np.int64)})

#needs adjusting to a list followed by np.concatenate
self.hidden_text = hidden
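For reference, a condensed sketch of the tokenize-and-cast step the hunk above settles on. The free function below is hypothetical, but the tokenizer call and the int64 cast mirror the diff, and the two-output unpacking assumes the exported SigLIP text encoder returns hidden states plus a pooled embedding, as above:

```python
import numpy as np

def run_text_model(session, tokenizer, texts):
    # Tokenize to fixed-length numpy arrays, then cast input_ids to int64,
    # the integer type the exported ONNX text encoder expects.
    encoded = tokenizer(texts, return_tensors='np', padding='max_length', truncation=True)
    input_ids = encoded['input_ids'].astype(np.int64)
    hidden, pooled = session.run(None, {'input_ids': input_ids})
    return hidden, pooled
```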
4 changes: 3 additions & 1 deletion clip/siglip_image_processor.py
@@ -1,6 +1,8 @@
import numbers
import random
import warnings
import collections
from itertools import repeat
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

@@ -415,4 +417,4 @@ def image_transform(
ToTensor(),
normalize,
])
return Compose(transforms)
return Compose(transforms)
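The reinstated `collections` and `itertools.repeat` imports are the kind typically consumed by an n-tuple helper in torchvision-style image processors; the sketch below is an assumption about their role here, not code quoted from this file:

```python
import collections.abc
from itertools import repeat

def _ntuple(n):
    # Turn a scalar into an n-tuple (e.g. 384 -> (384, 384)) and pass
    # through values that are already iterable (but not strings).
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))
    return parse

to_2tuple = _ntuple(2)  # to_2tuple(384) == (384, 384)
```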
4 changes: 1 addition & 3 deletions clip/siglip_tokenizer.py
@@ -25,7 +25,7 @@

from .utils.tokenization_utils import PreTrainedTokenizer
from .utils.tokenization_utils_base import AddedToken
from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2
from .utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2

if TYPE_CHECKING:
from ...tokenization_utils_base import TextInput
@@ -40,8 +40,6 @@
SPIECE_UNDERLINE = "▁"




class SiglipTokenizer(PreTrainedTokenizer):
"""
Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
48 changes: 48 additions & 0 deletions clip/utils/sentencepiece_model_pb2_new.py

Some generated files are not rendered by default.

16 changes: 12 additions & 4 deletions clip/utils/tokenization_utils.py
@@ -433,8 +433,10 @@ def __init__(self, **kwargs):

# 4. If some of the special tokens are not part of the vocab, we add them, at the end.
# the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
#Adding str(token) to resolve AddedToken unhashable type

self._add_tokens(
[token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
[token for token in self.all_special_tokens_extended if str(token) not in self._added_tokens_encoder],
special_tokens=True,
)

@@ -552,8 +554,13 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
elif special_tokens:
# doing token.special=True changes the normalization! will fix in rust
# this is important and the only reason why the AddedTokens in each class are normalized by default
token.__setstate__({"special": True, "normalized": token.normalized})
if token in self._added_tokens_decoder:
#token.__setstate__({"special": True, "normalized": token.normalized})
#token.__setstate__({"special": True, "normalized": token.normalized})
token.special = True
token.normalized = token.normalized

#resolving unhashable type AddedToken with str(token)
if str(token) in self._added_tokens_decoder:
continue
if not token.special and token.normalized and getattr(self, "do_lower_case", False):
# Normalize if requested
@@ -576,9 +583,10 @@ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_to
self._update_trie()
return added_tokens

#Adding str(token) to resolve AddedToken unhashable type
def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
for token in self._added_tokens_decoder.values():
if token not in self.tokens_trie._tokens:
if str(token) not in self.tokens_trie._tokens:
self.tokens_trie.add(token.content)
for token in unique_no_split_tokens:
if token not in self.tokens_trie._tokens:
Expand Down
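To make the `str(token)` workaround described in the comments above concrete, here is a small self-contained illustration (with a hypothetical stand-in class, not transformers' `AddedToken`) of why an unhashable token object breaks dict membership checks and how stringifying the key avoids it:

```python
class FakeAddedToken:
    # Stand-in for an AddedToken-like object. Defining __eq__ without also
    # defining __hash__ sets __hash__ to None, so instances are unhashable
    # and cannot be used directly as dict keys or in `in` checks on dicts.
    def __init__(self, content: str):
        self.content = content

    def __eq__(self, other):
        return str(other) == str(self)

    def __str__(self):
        return self.content


added_tokens_encoder = {'</s>': 1}
token = FakeAddedToken('</s>')

# token in added_tokens_encoder       -> TypeError: unhashable type: 'FakeAddedToken'
print(str(token) in added_tokens_encoder)  # True: membership by string content works
```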