diff --git a/keras_hub/src/models/video_swin/video_swin_backbone.py b/keras_hub/src/models/video_swin/video_swin_backbone.py
index 3a4fc66879..44fc9fea07 100644
--- a/keras_hub/src/models/video_swin/video_swin_backbone.py
+++ b/keras_hub/src/models/video_swin/video_swin_backbone.py
@@ -45,36 +45,36 @@ class VideoSwinBackbone(Backbone):
     """A Video Swin Transformer backbone model.

     Args:
-        input_shape (tuple[int], optional): The size of the input video in
+        input_shape : The size of the input video in
             `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
-        include_rescaling (bool, optional): Whether to rescale the inputs. If
+        include_rescaling : Whether to rescale the inputs. If
             set to `True`, inputs will be passed through a `Rescaling(1/255.0)`
             layer and normalize with mean=[0.485, 0.456, 0.406] and
             std=[0.229, 0.224, 0.225]. Defaults to `False`.
-        patch_size (int | tuple(int)): The patch size for depth, height, and width
+        patch_size : The patch size for depth, height, and width
             dimensions respectively. Default: (2,4,4).
-        embed_dim (int): Number of linear projection output channels.
+        embed_dim : Number of linear projection output channels.
             Default to 96.
-        depth (tuple[int]): Depth of each Swin Transformer stage.
+        depth : Depth of each Swin Transformer stage.
             Default to [2, 2, 6, 2]
-        num_heads (tuple[int]): Number of attention head of each stage.
+        num_heads : Number of attention head of each stage.
            Default to [3, 6, 12, 24]
-        window_size (int): The window size for depth, height, and width
+        window_size : The window size for depth, height, and width
            dimensions respectively. Default to [8, 7, 7].
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim.
            Default to 4.
-        qkv_bias (bool): If True, add a learnable bias to query, key, value.
+        qkv_bias : If True, add a learnable bias to query, key, value.
            Default to True.
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
            Default to None.
-        dropout_rate (float): Float between 0 and 1. Fraction of the input units to drop.
+        dropout_rate : Float between 0 and 1. Fraction of the input units to drop.
            Default: 0.
-        attn_dropout_rate (float): Float between 0 and 1. Attention dropout rate.
+        attn_dropout_rate : Float between 0 and 1. Attention dropout rate.
            Default: 0.
-        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
+        drop_path_rate : Float between 0 and 1. Stochastic depth rate.
            Default: 0.2.
-        patch_norm (bool): If True, add layer normalization after patch embedding.
+        patch_norm : If True, add layer normalization after patch embedding.
            Default to False.

     Example:
diff --git a/keras_hub/src/models/video_swin/video_swin_layers.py b/keras_hub/src/models/video_swin/video_swin_layers.py
index 47fb471a39..1a9c34102c 100644
--- a/keras_hub/src/models/video_swin/video_swin_layers.py
+++ b/keras_hub/src/models/video_swin/video_swin_layers.py
@@ -159,9 +159,9 @@ def compute_mask(depth, height, width, window_size, shift_size):
     shifted windows based on the provided window size and shift size.

     Args:
-        depth (int): Depth (number of frames) of the input video.
-        height (int): Height of the video frames.
-        width (int): Width of the video frames.
+        depth : Depth (number of frames) of the input video.
+        height : Height of the video frames.
+        width : Width of the video frames.
         window_size (tuple[int]): Size of the sliding window in each
            dimension (depth, height, width).
        shift_size (tuple[int]): Size of the shifting step in each dimension
@@ -213,11 +213,11 @@ class MLP(keras.layers.Layer):
     """A Multilayer perceptron(MLP) layer.

     Args:
-        hidden_dim (int): The number of units in the hidden layer.
-        output_dim (int): The number of units in the output layer.
-        drop_rate (float): Float between 0 and 1. Fraction of the
+        hidden_dim : The number of units in the hidden layer.
+        output_dim : The number of units in the output layer.
+        dropout_rate : Float between 0 and 1. Fraction of the
            input units to drop.
-        activation (str): Activation to use in the hidden layers.
+        activation : Activation to use in the hidden layers.
            Default is `"gelu"`.

    References:
@@ -272,9 +272,9 @@ class VideoSwinPatchingAndEmbedding(keras.Model):
     dimensional space.

     Args:
-        patch_size (int): Size of the patch along each dimension
+        patch_size : Size of the patch along each dimension
            (depth, height, width). Default: (2,4,4).
-        embedding_dim (int): Number of linear projection output channels. Default: 96.
+        embedding_dim : Number of linear projection output channels. Default: 96.
        norm_layer (keras.layers, optional): Normalization layer. Default: None

    References:
@@ -346,8 +346,8 @@ class VideoSwinPatchMerging(keras.layers.Layer):
     by concatenation and linear projection.

     Args:
-        input_dim (int): Number of input channels in the feature maps.
-        norm_layer (keras.layers, optional): Normalization layer.
+        input_dim : Number of input channels in the feature maps.
+        norm_layer : Normalization layer.
            Default: LayerNormalization

    References:
@@ -418,13 +418,13 @@ class VideoSwinWindowAttention(keras.Model):
     It supports both of shifted and non-shifted window.

     Args:
-        input_dim (int): The number of input channels in the feature maps.
-        window_size (tuple[int]): The temporal length, height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.0
+        input_dim : The number of input channels in the feature maps.
+        window_size : The temporal length, height and width of the window.
+        num_heads : Number of attention heads.
+        qkv_bias : If True, add a learnable bias to query, key, value. Default: True
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set
+        attn_drop_rate : Dropout ratio of attention weight. Default: 0.0
+        proj_drop_rate : Dropout ratio of output. Default: 0.0

    References:
        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
@@ -583,18 +583,18 @@ class VideoSwinBasicLayer(keras.Model):
     """A basic Video Swin Transformer layer for one stage.

     Args:
-        input_dim (int): Number of feature channels
-        depth (int): Depths of this stage.
-        num_heads (int): Number of attention head.
-        window_size (tuple[int]): Local window size. Default: (1,7,7).
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
-        downsample (keras.layers | None, optional): Downsample layer at the end of the layer. Default: None
+        input_dim : Number of feature channels
+        depth : Depth of this stage.
+        num_heads : Number of attention head.
+        window_size : Local window size. Default: (1,7,7).
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias : If True, add a learnable bias to query, key, value. Default: True
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
+        drop : Dropout rate. Default: 0.0
+        attn_drop : Attention dropout rate. Default: 0.0
+        drop_path : Stochastic depth rate. Default: 0.0
+        norm_layer : Normalization layer. Default: LayerNormalization
+        downsample : Downsample layer at the end of the layer. Default: None

    References:
        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
@@ -743,20 +743,20 @@ class VideoSwinTransformerBlock(keras.Model):
     """Video Swin Transformer Block.

     Args:
-        input_dim (int): Number of feature channels.
-        num_heads (int): Number of attention heads.
-        window_size (tuple[int]): Local window size. Default: (2, 7, 7)
-        shift_size (tuple[int]): Shift size for SW-MSA. Default: (0, 0, 0)
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        input_dim : Number of feature channels.
+        num_heads : Number of attention heads.
+        window_size : Local window size. Default: (2, 7, 7)
+        shift_size : Shift size for SW-MSA. Default: (0, 0, 0)
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim.
            Default: 4.0
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
+        qkv_bias : If True, add a learnable bias to query, key, value.
            Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
            Default: None
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optionalc): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (keras.layers.Activation, optional): Activation layer. Default: gelu
+        drop : Dropout rate. Default: 0.0
+        attn_drop : Attention dropout rate. Default: 0.0
+        drop_path : Stochastic depth rate. Default: 0.0
+        act_layer : Activation layer. Default: gelu
        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
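
For context, a minimal usage sketch of the backbone whose docstring is edited above. This is not part of the diff: the import path is taken from the diff header, and the constructor is assumed to accept the arguments documented in the docstring.

    import numpy as np
    from keras_hub.src.models.video_swin.video_swin_backbone import VideoSwinBackbone

    # Hypothetical instantiation; argument names and values mirror the documented defaults.
    backbone = VideoSwinBackbone(
        input_shape=(32, 224, 224, 3),   # (depth, height, width, channel)
        include_rescaling=True,          # rescale by 1/255 and normalize with ImageNet mean/std
        patch_size=(2, 4, 4),
        embed_dim=96,
        depth=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=[8, 7, 7],
    )

    # One 32-frame, 224x224 RGB clip with raw pixel values in [0, 255].
    videos = np.random.uniform(0.0, 255.0, size=(1, 32, 224, 224, 3)).astype("float32")
    features = backbone(videos)  # assumed to return the final-stage spatio-temporal feature map
    print(features.shape)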