Ultralytics Code Refactor https://ultralytics.com/actions #5

Merged
merged 2 commits on Jun 20, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -60,7 +60,7 @@ The device to run the model can be optionally specified, and the default is to u

Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model

______________________________________________________________________
---

The model returned by `clip.load()` supports the following methods:

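For readers of this section, a minimal usage sketch of the API the README text refers to, assuming the standard `clip.load()` and `clip.tokenize()` entry points and the `"ViT-B/32"` checkpoint name:

```python
import torch
import clip

# Load a model and its preprocessing transform (checkpoint name assumed for illustration).
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# clip.tokenize() returns a LongTensor of tokenized sequences,
# which can be passed straight to the model's text encoder.
text = clip.tokenize(["a photo of a cat", "a photo of a dog"]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text)
print(text_features.shape)  # e.g. torch.Size([2, 512]) for ViT-B/32
```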
7 changes: 7 additions & 0 deletions clip/clip.py
@@ -42,6 +42,9 @@


def _download(url: str, root: str):
"""Downloads a file from the provided URL to the root directory, ensuring file integrity via SHA256 checksum
validation.
"""
os.makedirs(root, exist_ok=True)
filename = os.path.basename(url)

@@ -76,10 +79,14 @@ def _download(url: str, root: str):


def _convert_image_to_rgb(image):
"""Convert an image to RGB format using the PIL library."""
return image.convert("RGB")


def _transform(n_px):
"""Apply a series of image transformations including resizing, center cropping, RGB conversion, tensor conversion,
and normalization.
"""
return Compose(
[
Resize(n_px, interpolation=BICUBIC),
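To make the newly documented `_transform()` behavior concrete, here is a minimal sketch of an equivalent torchvision preprocessing chain. The 224-pixel resolution and the normalization statistics are assumptions for illustration (they match CLIP's commonly published values), not values read from this diff:

```python
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor

n_px = 224  # assumed input resolution

preprocess = Compose([
    Resize(n_px),                    # resize the shorter side to n_px
    CenterCrop(n_px),                # crop to an n_px x n_px square
    lambda img: img.convert("RGB"),  # mirrors _convert_image_to_rgb
    ToTensor(),                      # PIL image -> float tensor in [0, 1]
    Normalize((0.48145466, 0.4578275, 0.40821073),
              (0.26862954, 0.26130258, 0.27577711)),
])

image = preprocess(Image.open("example.jpg"))  # hypothetical input file
print(image.shape)  # torch.Size([3, 224, 224])
```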
39 changes: 38 additions & 1 deletion clip/model.py
@@ -11,6 +11,7 @@ class Bottleneck(nn.Module):
expansion = 4

def __init__(self, inplanes, planes, stride=1):
"""Initializes the Bottleneck module with given input planes, output planes, and stride."""
super().__init__()

# all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
@@ -44,6 +45,7 @@ def __init__(self, inplanes, planes, stride=1):
)

def forward(self, x: torch.Tensor):
"""Process input tensor `x` through the defined network layers and return the output tensor."""
identity = x

out = self.relu1(self.bn1(self.conv1(x)))
@@ -61,6 +63,9 @@

class AttentionPool2d(nn.Module):
def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
"""Initializes AttentionPool2d with spatial dimension, embedding dimension, number of heads, and optional output
dimension.
"""
super().__init__()
self.positional_embedding = nn.Parameter(torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5)
self.k_proj = nn.Linear(embed_dim, embed_dim)
@@ -70,6 +75,9 @@ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim:
self.num_heads = num_heads

def forward(self, x):
"""Executes the forward pass of the model using multi-head attention on input tensor 'x', returning the
processed data.
"""
x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
@@ -107,6 +115,9 @@ class ModifiedResNet(nn.Module):
"""

def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
"""Initialize model with customizable layers, output dimensions, attention heads, input resolution, and width
parameters.
"""
super().__init__()
self.output_dim = output_dim
self.input_resolution = input_resolution
@@ -134,13 +145,18 @@ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

def _make_layer(self, planes, blocks, stride=1):
"""Constructs a sequential layer of Bottleneck blocks with the given planes, number of blocks, and stride."""
layers = [Bottleneck(self._inplanes, planes, stride)]

self._inplanes = planes * Bottleneck.expansion
layers.extend(Bottleneck(self._inplanes, planes) for _ in range(1, blocks))
return nn.Sequential(*layers)

def forward(self, x):
"""Forward pass through the network stem, applying convolutions, batch normalization, ReLU activations, and
average pooling.
"""

def stem(x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
@@ -163,18 +179,21 @@ class LayerNorm(nn.LayerNorm):
"""Subclass torch's LayerNorm to handle fp16."""

def forward(self, x: torch.Tensor):
"""Performs forward pass through the LayerNorm, converting input to float32 and back to its original type."""
orig_type = x.dtype
ret = super().forward(x.type(torch.float32))
return ret.type(orig_type)


class QuickGELU(nn.Module):
def forward(self, x: torch.Tensor):
"""Applies the QuickGELU activation function to an input tensor."""
return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
"""Initializes the ResidualAttentionBlock with model dimension, number of heads, and optional attention mask."""
super().__init__()

self.attn = nn.MultiheadAttention(d_model, n_head)
@@ -192,28 +211,37 @@ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
self.attn_mask = attn_mask

def attention(self, x: torch.Tensor):
"""Compute scaled dot-product attention using query, key, and value tensors, with optional attention mask
adjustment.
"""
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

def forward(self, x: torch.Tensor):
"""Performs forward pass through the network, applying attention and MLP layers sequentially."""
x = x + self.attention(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x


class Transformer(nn.Module):
def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
"""Initializes the Transformer model with specified width, layers, heads, and optional attention mask."""
super().__init__()
self.width = width
self.layers = layers
self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

def forward(self, x: torch.Tensor):
"""Process the input tensor 'x' through a sequence of residual attention blocks."""
return self.resblocks(x)


class VisionTransformer(nn.Module):
def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
"""Initialize a VisionTransformer with given input resolution, patch size, width, layers, heads, and output
dimension.
"""
super().__init__()
self.input_resolution = input_resolution
self.output_dim = output_dim
@@ -230,6 +258,7 @@ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: i
self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

def forward(self, x: torch.Tensor):
"""Processes input tensor through embedding, layer normalization, and transformer layers."""
x = self.conv1(x) # shape = [*, width, grid, grid]
x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
@@ -314,6 +343,7 @@ def __init__(
self.initialize_parameters()

def initialize_parameters(self):
"""Initialize the parameters of the token and positional embeddings with normal distributions."""
nn.init.normal_(self.token_embedding.weight, std=0.02)
nn.init.normal_(self.positional_embedding, std=0.01)

@@ -343,7 +373,9 @@ def initialize_parameters(self):
nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5)

def build_attention_mask(self):
# lazily create causal attention mask, with full attention between the vision tokens
"""Create a causal attention mask with full attention between vision tokens, using an additive attention mask
filled with -inf.
"""
# pytorch uses additive attention mask; fill with -inf
mask = torch.empty(self.context_length, self.context_length)
mask.fill_(float("-inf"))
@@ -352,12 +384,15 @@ def build_attention_mask(self):

@property
def dtype(self):
"""Return the data type of the weights of the first convolutional layer in the visual model."""
return self.visual.conv1.weight.dtype

def encode_image(self, image):
"""Encodes an input image using the visual model and returns the encoded representation."""
return self.visual(image.type(self.dtype))

def encode_text(self, text):
"""Encodes input text using the token embedding and converts it to the specified data type."""
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]

x = x + self.positional_embedding.type(self.dtype)
@@ -373,6 +408,7 @@ def encode_text(self, text):
return x

def forward(self, image, text):
"""Processes input image and text data through encoder modules and returns the respective features."""
image_features = self.encode_image(image)
text_features = self.encode_text(text)

@@ -414,6 +450,7 @@ def _convert_weights_to_fp16(l):


def build_model(state_dict: dict):
"""Builds and returns a CLIP model from the provided state dictionary."""
vit = "visual.proj" in state_dict

if vit:
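To illustrate what the `build_attention_mask()` docstring describes, a small standalone sketch of an additive causal mask of this kind (zeros on and below the diagonal, `-inf` strictly above it, so each text token attends only to earlier positions):

```python
import torch

def causal_attention_mask(context_length: int) -> torch.Tensor:
    """Additive causal mask: 0 on and below the diagonal, -inf strictly above it."""
    mask = torch.empty(context_length, context_length)
    mask.fill_(float("-inf"))
    mask.triu_(1)  # zero out the diagonal and the lower triangle (the allowed positions)
    return mask

print(causal_attention_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
```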
11 changes: 11 additions & 0 deletions clip/simple_tokenizer.py
@@ -9,6 +9,7 @@

@lru_cache()
def default_bpe():
"""Returns the file path to the default BPE vocabulary file 'bpe_simple_vocab_16e6.txt.gz'."""
return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")


@@ -49,19 +50,26 @@ def get_pairs(word):


def basic_clean(text):
"""Clean text by fixing encoding issues and unescaping HTML entities, then stripping extraneous whitespace."""
text = ftfy.fix_text(text)
text = html.unescape(html.unescape(text))
return text.strip()


def whitespace_clean(text):
"""Clean text by collapsing multiple whitespace characters into a single space and trimming leading/trailing
whitespace.
"""
text = re.sub(r"\s+", " ", text)
text = text.strip()
return text


class SimpleTokenizer(object):
def __init__(self, bpe_path: str = default_bpe()):
"""Initialize the SimpleTokenizer object with byte pair encoding (BPE) paths and set up encoders, decoders, and
patterns.
"""
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
@@ -81,6 +89,7 @@ def __init__(self, bpe_path: str = default_bpe()):
)

def bpe(self, token):
"""Apply byte pair encoding (BPE) to a given token and cache the result."""
if token in self.cache:
return self.cache[token]
word = tuple(token[:-1]) + (f"{token[-1]}</w>",)
@@ -122,6 +131,7 @@ def bpe(self, token):
return word

def encode(self, text):
"""Converts input text to BPE tokens using byte-pair encoding and pre-defined tokenization rules."""
bpe_tokens = []
text = whitespace_clean(basic_clean(text)).lower()
for token in re.findall(self.pat, text):
@@ -130,5 +140,6 @@ def encode(self, text):
return bpe_tokens

def decode(self, tokens):
"""Decodes a list of BPE tokens into a UTF-8 string, replacing '</w>' with a space."""
text = "".join([self.decoder[token] for token in tokens])
return bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("</w>", " ")
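A brief usage sketch of the tokenizer these docstrings describe, assuming the default BPE vocabulary file ships with the package:

```python
from clip.simple_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()  # vocabulary resolved via default_bpe()

# encode() cleans and lower-cases the text, then maps it to BPE token ids;
# decode() reverses the mapping and turns '</w>' markers back into spaces.
tokens = tokenizer.encode("Hello, world!")
print(tokens)                    # list of integer BPE ids
print(tokenizer.decode(tokens))  # roughly "hello , world !" (spacing may differ)
```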
1 change: 1 addition & 0 deletions hubconf.py
@@ -40,6 +40,7 @@ def entrypoint(**kwargs):


def tokenize():
"""Returns the _tokenize function for tokenizing input data."""
return _tokenize


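For orientation only, a hypothetical example of consuming the `tokenize` entry point through `torch.hub`; the repository path below is an assumption, not taken from this diff:

```python
import torch

# "owner/CLIP" is a placeholder; substitute the GitHub repo that hosts this hubconf.py.
tokenize = torch.hub.load("owner/CLIP", "tokenize", source="github")

tokens = tokenize(["a photo of a cat"])  # returns a LongTensor of token ids
print(tokens.shape)
```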
1 change: 1 addition & 0 deletions tests/test_consistency.py
@@ -8,6 +8,7 @@

@pytest.mark.parametrize("model_name", clip.available_models())
def test_consistency(model_name):
"""Test consistency between JIT and non-JIT model outputs using CLIP for given model names."""
device = "cpu"
jit_model, transform = clip.load(model_name, device=device, jit=True)
py_model, _ = clip.load(model_name, device=device, jit=False)