Ultralytics Code Refactor https://ultralytics.com/actions #5

Merged
merged 2 commits on Jun 20, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -60,7 +60,7 @@ The device to run the model can be optionally specified, and the default is to u

Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model

______________________________________________________________________
---

The model returned by `clip.load()` supports the following methods:

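For readers of this section, a minimal usage sketch of the API the README text refers to, assuming the standard `clip.load()` and `clip.tokenize()` entry points and the `"ViT-B/32"` checkpoint name:

```python
import torch
import clip

# Load a model and its preprocessing transform (checkpoint name assumed for illustration).
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# clip.tokenize() returns a LongTensor of tokenized sequences,
# which can be passed straight to the model's text encoder.
text = clip.tokenize(["a photo of a cat", "a photo of a dog"]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text)
print(text_features.shape)  # e.g. torch.Size([2, 512]) for ViT-B/32
```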
7 changes: 7 additions & 0 deletions clip/clip.py
@@ -42,6 +42,9 @@


def _download(url: str, root: str):
"""Downloads a file from the provided URL to the root directory, ensuring file integrity via SHA256 checksum
validation.
"""
os.makedirs(root, exist_ok=True)
filename = os.path.basename(url)

@@ -76,10 +79,14 @@ def _download(url: str, root: str):


def _convert_image_to_rgb(image):
"""Convert an image to RGB format using the PIL library."""
return image.convert("RGB")


def _transform(n_px):
"""Apply a series of image transformations including resizing, center cropping, RGB conversion, tensor conversion,
and normalization.
"""
return Compose(
[
Resize(n_px, interpolation=BICUBIC),
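To make the newly documented `_transform()` behavior concrete, here is a minimal sketch of an equivalent torchvision preprocessing chain. The 224-pixel resolution and the normalization statistics are assumptions for illustration (they match CLIP's commonly published values), not values read from this diff:

```python
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor

n_px = 224  # assumed input resolution

preprocess = Compose([
    Resize(n_px),                    # resize the shorter side to n_px
    CenterCrop(n_px),                # crop to an n_px x n_px square
    lambda img: img.convert("RGB"),  # mirrors _convert_image_to_rgb
    ToTensor(),                      # PIL image -> float tensor in [0, 1]
    Normalize((0.48145466, 0.4578275, 0.40821073),
              (0.26862954, 0.26130258, 0.27577711)),
])

image = preprocess(Image.open("example.jpg"))  # hypothetical input file
print(image.shape)  # torch.Size([3, 224, 224])
```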
39 changes: 38 additions & 1 deletion clip/model.py
@@ -11,6 +11,7 @@ class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
"""Initializes the Bottleneck module with given input planes, output planes, and stride."""
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
@@ -44,6 +45,7 @@ def __init__(self, inplanes, planes, stride=1):
)

def forward(self, x: torch.Tensor):
"""Process input tensor `x` through the defined network layers and return the output tensor."""
identity = x

out = self.relu1(self.bn1(self.conv1(x)))
@@ -61,6 +63,9 @@

class AttentionPool2d(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
"""Initializes AttentionPool2d with spatial dimension, embedding dimension, number of heads, and optional output
dimension.
"""
super().__init__()
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
@@ -70,6 +75,9 @@ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim:
self.num_heads = num_heads

def forward(self, x):
"""Executes the forward pass of the model using multi-head attention on input tensor 'x', returning the
processed data.
"""
x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
@@ -107,6 +115,9 @@ class ModifiedResNet(nn.Module):
"""

def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
"""Initialize model with customizable layers, output dimensions, attention heads, input resolution, and width
parameters.
"""
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution
@@ -134,13 +145,18 @@ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
"""Constructs a sequential layer of Bottleneck blocks with the given planes, number of blocks, and stride."""
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
layers.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
return nn.Sequential(*layers)

def forward(self, x):
"""Forward pass through the network stem, applying convolutions, batch normalization, ReLU activations, and
average pooling.
"""

def stem(x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
@@ -163,18 +179,21 @@ class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
"""Performs forward pass through the LayerNorm, converting input to float32 and back to its original type."""
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):
def forward(self, x: torch.Tensor):
"""Applies the QuickGELU activation function to an input tensor."""
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
"""Initializes the ResidualAttentionBlock with model dimension, number of heads, and optional attention mask."""
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
@@ -192,28 +211,37 @@ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
"""Compute scaled dot-product attention using query, key, and value tensors, with optional attention mask
adjustment.
"""
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
"""Performs forward pass through the network, applying attention and MLP layers sequentially."""
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
"""Initializes the Transformer model with specified width, layers, heads, and optional attention mask."""
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

def forward(self, x: torch.Tensor):
"""Process the input tensor 'x' through a sequence of residual attention blocks."""
return self.resblocks(x)


class VisionTransformer(nn.Module):
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
"""Initialize a VisionTransformer with given input resolution, patch size, width, layers, heads, and output
dimension.
"""
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
@@ -230,6 +258,7 @@ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: i
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
"""Processes input tensor through embedding, layer normalization, and transformer layers."""
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
@@ -314,6 +343,7 @@ def __init__(
self.initialize_parameters()

def initialize_parameters(self):
"""Initialize the parameters of the token and positional embeddings with normal distributions."""
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.positional_embedding, std=0.01)

@@ -343,7 +373,9 @@ def initialize_parameters(self):
nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5)

def build_attention_mask(self):
# lazily create causal attention mask, with full attention between the vision tokens
"""Create a causal attention mask with full attention between vision tokens, using an additive attention mask
filled with -inf.
"""
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float("-inf"))
@@ -352,12 +384,15 @@ def build_attention_mask(self):

@property
def dtype(self):
"""Return the data type of the weights of the first convolutional layer in the visual model."""
return self.visual.conv1.weight.dtype

def encode_image(self, image):
"""Encodes an input image using the visual model and returns the encoded representation."""
return self.visual(image.type(self.dtype))

def encode_text(self, text):
"""Encodes input text using the token embedding and converts it to the specified data type."""
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]

x = x + self.positional_embedding.type(self.dtype)
@@ -373,6 +408,7 @@ def encode_text(self, text):
return x

def forward(self, image, text):
"""Processes input image and text data through encoder modules and returns the respective features."""
image_features = self.encode_image(image)
text_features = self.encode_text(text)

@@ -414,6 +450,7 @@ def _convert_weights_to_fp16(l):


def build_model(state_dict: dict):
"""Builds and returns a CLIP model from the provided state dictionary."""
vit = "visual.proj" in state_dict

if vit:
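To illustrate what the `build_attention_mask()` docstring describes, a small standalone sketch of an additive causal mask of this kind (zeros on and below the diagonal, `-inf` strictly above it, so each text token attends only to earlier positions):

```python
import torch

def causal_attention_mask(context_length: int) -> torch.Tensor:
    """Additive causal mask: 0 on and below the diagonal, -inf strictly above it."""
    mask = torch.empty(context_length, context_length)
    mask.fill_(float("-inf"))
    mask.triu_(1)  # zero out the diagonal and the lower triangle (the allowed positions)
    return mask

print(causal_attention_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```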
11 changes: 11 additions & 0 deletions clip/simple_tokenizer.py
@@ -9,6 +9,7 @@

@lru_cache()
def default_bpe():
"""Returns the file path to the default BPE vocabulary file 'bpe_simple_vocab_16e6.txt.gz'."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@@ -49,19 +50,26 @@ def get_pairs(word):


def basic_clean(text):
"""Clean text by fixing encoding issues and unescaping HTML entities, then stripping extraneous whitespace."""
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
"""Clean text by collapsing multiple whitespace characters into a single space and trimming leading/trailing
whitespace.
"""
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text


class SimpleTokenizer(object):
def __init__(self, bpe_path: str = default_bpe()):
"""Initialize the SimpleTokenizer object with byte pair encoding (BPE) paths and set up encoders, decoders, and
patterns.
"""
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
@@ -81,6 +89,7 @@ def __init__(self, bpe_path: str = default_bpe()):
)

def bpe(self, token):
"""Apply byte pair encoding (BPE) to a given token and cache the result."""
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (f"{token[-1]}</w>",)
@@ -122,6 +131,7 @@ def bpe(self, token):
return word

def encode(self, text):
"""Converts input text to BPE tokens using byte-pair encoding and pre-defined tokenization rules."""
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
@@ -130,5 +140,6 @@ def encode(self, text):
return bpe_tokens

def decode(self, tokens):
"""Decodes a list of BPE tokens into a UTF-8 string, replacing '</w>' with a space."""
text = "".join([self.decoder[token] for token in tokens])
return bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
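A brief usage sketch of the tokenizer these docstrings describe, assuming the default BPE vocabulary file ships with the package:

```python
from clip.simple_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()  # vocabulary resolved via default_bpe()

# encode() cleans and lower-cases the text, then maps it to BPE token ids;
# decode() reverses the mapping and turns '</w>' markers back into spaces.
tokens = tokenizer.encode("Hello, world!")
print(tokens)                    # list of integer BPE ids
print(tokenizer.decode(tokens))  # roughly "hello , world !" (spacing may differ)
```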
1 change: 1 addition & 0 deletions hubconf.py
@@ -40,6 +40,7 @@ def entrypoint(**kwargs):


def tokenize():
"""Returns the _tokenize function for tokenizing input data."""
return _tokenize


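For orientation only, a hypothetical example of consuming the `tokenize` entry point through `torch.hub`; the repository path below is an assumption, not taken from this diff:

```python
import torch

# "owner/CLIP" is a placeholder; substitute the GitHub repo that hosts this hubconf.py.
tokenize = torch.hub.load("owner/CLIP", "tokenize", source="github")

tokens = tokenize(["a photo of a cat"])  # returns a LongTensor of token ids
print(tokens.shape)
```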
1 change: 1 addition & 0 deletions tests/test_consistency.py
@@ -8,6 +8,7 @@

@pytest.mark.parametrize("model_name", clip.available_models())
def test_consistency(model_name):
"""Test consistency between JIT and non-JIT model outputs using CLIP for given model names."""
device = "cpu"
jit_model, transform = clip.load(model_name, device=device, jit=True)
py_model, _ = clip.load(model_name, device=device, jit=False)