Ultralytics Refactor https://ultralytics.com/actions (#5)
Co-authored-by: UltralyticsAssistant <[email protected]>
glenn-jocher and UltralyticsAssistant authored Jun 20, 2024
1 parent a2b2205 commit 54694e4
Showing 6 changed files with 59 additions and 2 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -60,7 +60,7 @@ The device to run the model can be optionally specified, and the default is to u

Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model

- ______________________________________________________________________
+ ---

The model returned by `clip.load()` supports the following methods:

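For reference, the tokenizer described above can be exercised on its own; a minimal sketch (the prompt strings are illustrative):

    import clip

    # Each prompt is padded/truncated to the fixed 77-token context length
    tokens = clip.tokenize(["a diagram", "a dog", "a cat"])
    print(tokens.shape)  # torch.Size([3, 77])
    print(tokens.dtype)  # torch.int64, i.e. the LongTensor the README describes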
7 changes: 7 additions & 0 deletions clip/clip.py
@@ -42,6 +42,9 @@


def _download(url: str, root: str):
"""Downloads a file from the provided URL to the root directory, ensuring file integrity via SHA256 checksum
validation.
"""
os.makedirs(root, exist_ok=True)
filename = os.path.basename(url)
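# Illustration (not part of the diff): the SHA256 validation mentioned in the
# docstring above amounts to hashing the downloaded bytes and comparing the
# digest with the expected value. A minimal sketch, assuming download_target
# and expected_sha256 are defined as in the hidden remainder of _download:
import hashlib

with open(download_target, "rb") as f:
    if hashlib.sha256(f.read()).hexdigest() != expected_sha256:
        raise RuntimeError("Downloaded file failed SHA256 checksum validation")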

@@ -76,10 +79,14 @@ def _download(url: str, root: str):


def _convert_image_to_rgb(image):
"""Convert an image to RGB format using the PIL library."""
return image.convert("RGB")


def _transform(n_px):
"""Apply a series of image transformations including resizing, center cropping, RGB conversion, tensor conversion,
and normalization.
"""
return Compose(
[
Resize(n_px, interpolation=BICUBIC),
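For reference, the Compose pipeline is truncated in the diff above; in upstream CLIP it resizes, center-crops, converts to RGB, tensorizes, and normalizes. A sketch of the full chain (the 224 px size and image path are illustrative):

    from PIL import Image
    from torchvision.transforms import CenterCrop, Compose, InterpolationMode, Normalize, Resize, ToTensor

    preprocess = Compose(
        [
            Resize(224, interpolation=InterpolationMode.BICUBIC),
            CenterCrop(224),
            _convert_image_to_rgb,  # the helper documented above
            ToTensor(),
            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),  # CLIP's RGB mean/std
        ]
    )
    image_tensor = preprocess(Image.open("photo.jpg"))  # -> float tensor of shape [3, 224, 224]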
39 changes: 38 additions & 1 deletion clip/model.py
@@ -11,6 +11,7 @@ class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
"""Initializes the Bottleneck module with given input planes, output planes, and stride."""
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
@@ -44,6 +45,7 @@ def __init__(self, inplanes, planes, stride=1):
)

def forward(self, x: torch.Tensor):
"""Process input tensor `x` through the defined network layers and return the output tensor."""
identity = x

out = self.relu1(self.bn1(self.conv1(x)))
@@ -61,6 +63,9 @@

class AttentionPool2d(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
"""Initializes AttentionPool2d with spatial dimension, embedding dimension, number of heads, and optional output
dimension.
"""
super().__init__()
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
@@ -70,6 +75,9 @@ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim:
self.num_heads = num_heads

def forward(self, x):
"""Executes the forward pass of the model using multi-head attention on input tensor 'x', returning the
processed data.
"""
x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
@@ -107,6 +115,9 @@ class ModifiedResNet(nn.Module):
"""

def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
"""Initialize model with customizable layers, output dimensions, attention heads, input resolution, and width
parameters.
"""
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution
@@ -134,13 +145,18 @@ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
"""Constructs a sequential layer of Bottleneck blocks with the given planes, number of blocks, and stride."""
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
layers.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
return nn.Sequential(*layers)

def forward(self, x):
"""Forward pass through the network stem, applying convolutions, batch normalization, ReLU activations, and
average pooling.
"""

def stem(x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
@@ -163,18 +179,21 @@ class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
"""Performs forward pass through the LayerNorm, converting input to float32 and back to its original type."""
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):
def forward(self, x: torch.Tensor):
"""Applies the QuickGELU activation function to an input tensor."""
return x * torch.sigmoid(1.702 * x)
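# Illustration (not part of the diff): QuickGELU is a cheap sigmoid-based
# approximation of the exact GELU; over a typical activation range the two
# differ by only ~0.02. Assumes torch is imported as in this module.
x = torch.linspace(-3, 3, 101)
quick = x * torch.sigmoid(1.702 * x)  # QuickGELU, as defined above
exact = torch.nn.functional.gelu(x)  # exact GELU for comparison
print((quick - exact).abs().max())  # max gap on the order of 1e-2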


class ResidualAttentionBlock(nn.Module):
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
"""Initializes the ResidualAttentionBlock with model dimension, number of heads, and optional attention mask."""
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
@@ -192,28 +211,37 @@ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
"""Compute scaled dot-product attention using query, key, and value tensors, with optional attention mask
adjustment.
"""
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
"""Performs forward pass through the network, applying attention and MLP layers sequentially."""
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
"""Initializes the Transformer model with specified width, layers, heads, and optional attention mask."""
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

def forward(self, x: torch.Tensor):
"""Process the input tensor 'x' through a sequence of residual attention blocks."""
return self.resblocks(x)


class VisionTransformer(nn.Module):
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
"""Initialize a VisionTransformer with given input resolution, patch size, width, layers, heads, and output
dimension.
"""
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
@@ -230,6 +258,7 @@ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: i
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
"""Processes input tensor through embedding, layer normalization, and transformer layers."""
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
@@ -314,6 +343,7 @@ def __init__(
self.initialize_parameters()

def initialize_parameters(self):
"""Initialize the parameters of the token and positional embeddings with normal distributions."""
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.positional_embedding, std=0.01)

@@ -343,7 +373,9 @@ def initialize_parameters(self):
nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5)

def build_attention_mask(self):
- # lazily create causal attention mask, with full attention between the vision tokens
+ """Create a causal attention mask with full attention between vision tokens, using an additive attention mask
+ filled with -inf.
+ """
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float("-inf"))
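# Illustration (not part of the diff): for a context length of 4, the finished
# additive mask is zero on and below the diagonal and -inf above it, so each
# position attends only to itself and earlier tokens:
#     torch.full((4, 4), float("-inf")).triu_(1)
#     tensor([[0., -inf, -inf, -inf],
#             [0., 0., -inf, -inf],
#             [0., 0., 0., -inf],
#             [0., 0., 0., 0.]])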
@@ -352,12 +384,15 @@ def build_attention_mask(self):

@property
def dtype(self):
"""Return the data type of the weights of the first convolutional layer in the visual model."""
return self.visual.conv1.weight.dtype

def encode_image(self, image):
"""Encodes an input image using the visual model and returns the encoded representation."""
return self.visual(image.type(self.dtype))

def encode_text(self, text):
"""Encodes input text using the token embedding and converts it to the specified data type."""
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]

x = x + self.positional_embedding.type(self.dtype)
@@ -373,6 +408,7 @@ def encode_text(self, text):
return x

def forward(self, image, text):
"""Processes input image and text data through encoder modules and returns the respective features."""
image_features = self.encode_image(image)
text_features = self.encode_text(text)

@@ -414,6 +450,7 @@ def _convert_weights_to_fp16(l):


def build_model(state_dict: dict):
"""Builds and returns a CLIP model from the provided state dictionary."""
vit = "visual.proj" in state_dict

if vit:
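Taken together, encode_image, encode_text, and forward support the standard CLIP inference loop. A minimal sketch (the checkpoint name, image file, and prompts are illustrative, not part of this commit):

    import clip
    import torch
    from PIL import Image

    model, preprocess = clip.load("ViT-B/32", device="cpu")
    image = preprocess(Image.open("CLIP.png")).unsqueeze(0)  # add a batch dimension
    text = clip.tokenize(["a diagram", "a dog", "a cat"])

    with torch.no_grad():
        image_features = model.encode_image(image)  # [1, 512] for ViT-B/32
        text_features = model.encode_text(text)  # [3, 512]
        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1)  # [1, 3] image-to-text similarities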
11 changes: 11 additions & 0 deletions clip/simple_tokenizer.py
@@ -9,6 +9,7 @@

@lru_cache()
def default_bpe():
"""Returns the file path to the default BPE vocabulary file 'bpe_simple_vocab_16e6.txt.gz'."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@@ -49,19 +50,26 @@ def get_pairs(word):


def basic_clean(text):
"""Clean text by fixing encoding issues and unescaping HTML entities, then stripping extraneous whitespace."""
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
"""Clean text by collapsing multiple whitespace characters into a single space and trimming leading/trailing
whitespace.
"""
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text


class SimpleTokenizer(object):
def __init__(self, bpe_path: str = default_bpe()):
"""Initialize the SimpleTokenizer object with byte pair encoding (BPE) paths and set up encoders, decoders, and
patterns.
"""
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
@@ -81,6 +89,7 @@ def __init__(self, bpe_path: str = default_bpe()):
)

def bpe(self, token):
"""Apply byte pair encoding (BPE) to a given token and cache the result."""
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (f"{token[-1]}</w>",)
@@ -122,6 +131,7 @@ def bpe(self, token):
return word

def encode(self, text):
"""Converts input text to BPE tokens using byte-pair encoding and pre-defined tokenization rules."""
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
@@ -130,5 +140,6 @@ def bpe(self, token):
return bpe_tokens

def decode(self, tokens):
"""Decodes a list of BPE tokens into a UTF-8 string, replacing '</w>' with a space."""
text = "".join([self.decoder[token] for token in tokens])
return bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
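The encode/decode pair above round-trips text through byte-pair encoding. A small sketch (the sample string is illustrative):

    from clip.simple_tokenizer import SimpleTokenizer

    tokenizer = SimpleTokenizer()  # uses the bundled bpe_simple_vocab_16e6.txt.gz by default
    ids = tokenizer.encode("Hello, world!")  # text is cleaned and lowercased first
    print(ids)  # a list of integer BPE token ids
    print(tokenizer.decode(ids))  # roughly "hello , world !" -- end-of-word markers become spaces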
1 change: 1 addition & 0 deletions hubconf.py
@@ -40,6 +40,7 @@ def entrypoint(**kwargs):


def tokenize():
"""Returns the _tokenize function for tokenizing input data."""
return _tokenize


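With this entrypoint in hubconf.py, the tokenizer can also be fetched through torch.hub. A sketch, assuming the repository is published under a GitHub path like ultralytics/CLIP (the path is an assumption, not stated in the diff):

    import torch

    # repo path is hypothetical; substitute the actual owner/name
    tokenize = torch.hub.load("ultralytics/CLIP", "tokenize")
    tokens = tokenize(["a photo of a cat"])  # LongTensor of shape [1, 77]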
1 change: 1 addition & 0 deletions tests/test_consistency.py
@@ -8,6 +8,7 @@

@pytest.mark.parametrize("model_name", clip.available_models())
def test_consistency(model_name):
"""Test consistency between JIT and non-JIT model outputs using CLIP for given model names."""
device = "cpu"
jit_model, transform = clip.load(model_name, device=device, jit=True)
py_model, _ = clip.load(model_name, device=device, jit=False)
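The assertion half of the test is truncated above; presumably it runs both models on identical inputs and requires their outputs to agree. A sketch of such a check (inputs and tolerances are illustrative, assuming image and text were prepared with transform and clip.tokenize):

    import numpy as np
    import torch

    with torch.no_grad():
        logits_jit, _ = jit_model(image, text)
        logits_py, _ = py_model(image, text)

    probs_jit = logits_jit.softmax(dim=-1).cpu().numpy()
    probs_py = logits_py.softmax(dim=-1).cpu().numpy()
    assert np.allclose(probs_jit, probs_py, atol=0.01, rtol=0.1)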
