diff --git a/keras_hub/src/models/video_swin/video_swin_backbone.py b/keras_hub/src/models/video_swin/video_swin_backbone.py
index 3a4fc66879..44fc9fea07 100644
--- a/keras_hub/src/models/video_swin/video_swin_backbone.py
+++ b/keras_hub/src/models/video_swin/video_swin_backbone.py
@@ -45,36 +45,36 @@ class VideoSwinBackbone(Backbone):
     """A Video Swin Transformer backbone model.

     Args:
-        input_shape (tuple[int], optional): The size of the input video in
+        input_shape : The size of the input video in
             `(depth, height, width, channel)` format.
             Defaults to `(32, 224, 224, 3)`.
-        include_rescaling (bool, optional): Whether to rescale the inputs. If
+        include_rescaling : Whether to rescale the inputs. If
             set to `True`, inputs will be passed through a `Rescaling(1/255.0)`
             layer and normalize with mean=[0.485, 0.456, 0.406] and
             std=[0.229, 0.224, 0.225]. Defaults to `False`.
-        patch_size (int | tuple(int)): The patch size for depth, height, and width
+        patch_size : The patch size for depth, height, and width
             dimensions respectively. Default: (2,4,4).
-        embed_dim (int): Number of linear projection output channels.
+        embed_dim : Number of linear projection output channels.
             Default to 96.
-        depth (tuple[int]): Depth of each Swin Transformer stage.
+        depth : Depth of each Swin Transformer stage.
             Default to [2, 2, 6, 2]
-        num_heads (tuple[int]): Number of attention head of each stage.
+        num_heads : Number of attention head of each stage.
            Default to [3, 6, 12, 24]
-        window_size (int): The window size for depth, height, and width
+        window_size : The window size for depth, height, and width
            dimensions respectively. Default to [8, 7, 7].
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim.
            Default to 4.
-        qkv_bias (bool): If True, add a learnable bias to query, key, value.
+        qkv_bias : If True, add a learnable bias to query, key, value.
            Default to True.
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
            Default to None.
-        dropout_rate (float): Float between 0 and 1. Fraction of the input units to drop.
+        dropout_rate : Float between 0 and 1. Fraction of the input units to drop.
            Default: 0.
-        attn_dropout_rate (float): Float between 0 and 1. Attention dropout rate.
+        attn_dropout_rate : Float between 0 and 1. Attention dropout rate.
            Default: 0.
-        drop_path_rate (float): Float between 0 and 1. Stochastic depth rate.
+        drop_path_rate : Float between 0 and 1. Stochastic depth rate.
            Default: 0.2.
-        patch_norm (bool): If True, add layer normalization after patch embedding.
+        patch_norm : If True, add layer normalization after patch embedding.
            Default to False.

     Example:
diff --git a/keras_hub/src/models/video_swin/video_swin_layers.py b/keras_hub/src/models/video_swin/video_swin_layers.py
index 47fb471a39..1a9c34102c 100644
--- a/keras_hub/src/models/video_swin/video_swin_layers.py
+++ b/keras_hub/src/models/video_swin/video_swin_layers.py
@@ -159,9 +159,9 @@ def compute_mask(depth, height, width, window_size, shift_size):
     shifted windows based on the provided window size and shift size.

     Args:
-        depth (int): Depth (number of frames) of the input video.
-        height (int): Height of the video frames.
-        width (int): Width of the video frames.
+        depth : Depth (number of frames) of the input video.
+        height : Height of the video frames.
+        width : Width of the video frames.
         window_size (tuple[int]): Size of the sliding window in each
            dimension (depth, height, width).
        shift_size (tuple[int]): Size of the shifting step in each dimension
@@ -213,11 +213,11 @@ class MLP(keras.layers.Layer):
     """A Multilayer perceptron(MLP) layer.

     Args:
-        hidden_dim (int): The number of units in the hidden layer.
-        output_dim (int): The number of units in the output layer.
-        drop_rate (float): Float between 0 and 1. Fraction of the
+        hidden_dim : The number of units in the hidden layer.
+        output_dim : The number of units in the output layer.
+        dropout_rate : Float between 0 and 1. Fraction of the
            input units to drop.
-        activation (str): Activation to use in the hidden layers.
+        activation : Activation to use in the hidden layers.
            Default is `"gelu"`.

    References:
@@ -272,9 +272,9 @@ class VideoSwinPatchingAndEmbedding(keras.Model):
     dimensional space.

     Args:
-        patch_size (int): Size of the patch along each dimension
+        patch_size : Size of the patch along each dimension
            (depth, height, width). Default: (2,4,4).
-        embedding_dim (int): Number of linear projection output channels. Default: 96.
+        embedding_dim : Number of linear projection output channels. Default: 96.
        norm_layer (keras.layers, optional): Normalization layer. Default: None

    References:
@@ -346,8 +346,8 @@ class VideoSwinPatchMerging(keras.layers.Layer):
     by concatenation and linear projection.

     Args:
-        input_dim (int): Number of input channels in the feature maps.
-        norm_layer (keras.layers, optional): Normalization layer.
+        input_dim : Number of input channels in the feature maps.
+        norm_layer : Normalization layer.
            Default: LayerNormalization

    References:
@@ -418,13 +418,13 @@ class VideoSwinWindowAttention(keras.Model):
     It supports both of shifted and non-shifted window.

     Args:
-        input_dim (int): The number of input channels in the feature maps.
-        window_size (tuple[int]): The temporal length, height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.0
+        input_dim : The number of input channels in the feature maps.
+        window_size : The temporal length, height and width of the window.
+        num_heads : Number of attention heads.
+        qkv_bias : If True, add a learnable bias to query, key, value. Default: True
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set
+        attn_drop_rate : Dropout ratio of attention weight. Default: 0.0
+        proj_drop_rate : Dropout ratio of output. Default: 0.0

    References:
        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
@@ -583,18 +583,18 @@ class VideoSwinBasicLayer(keras.Model):
     """A basic Video Swin Transformer layer for one stage.

     Args:
-        input_dim (int): Number of feature channels
-        depth (int): Depths of this stage.
-        num_heads (int): Number of attention head.
-        window_size (tuple[int]): Local window size. Default: (1,7,7).
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
-        downsample (keras.layers | None, optional): Downsample layer at the end of the layer. Default: None
+        input_dim : Number of feature channels
+        depth : Depth of this stage.
+        num_heads : Number of attention head.
+        window_size : Local window size. Default: (1,7,7).
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim. Default: 4.
+        qkv_bias : If True, add a learnable bias to query, key, value. Default: True
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
+        drop : Dropout rate. Default: 0.0
+        attn_drop : Attention dropout rate. Default: 0.0
+        drop_path : Stochastic depth rate. Default: 0.0
+        norm_layer : Normalization layer. Default: LayerNormalization
+        downsample : Downsample layer at the end of the layer. Default: None

    References:
        - [Video Swin Transformer](https://arxiv.org/abs/2106.13230)
@@ -743,20 +743,20 @@ class VideoSwinTransformerBlock(keras.Model):
     """Video Swin Transformer Block.

     Args:
-        input_dim (int): Number of feature channels.
-        num_heads (int): Number of attention heads.
-        window_size (tuple[int]): Local window size. Default: (2, 7, 7)
-        shift_size (tuple[int]): Shift size for SW-MSA. Default: (0, 0, 0)
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        input_dim : Number of feature channels.
+        num_heads : Number of attention heads.
+        window_size : Local window size. Default: (2, 7, 7)
+        shift_size : Shift size for SW-MSA. Default: (0, 0, 0)
+        mlp_ratio : Ratio of mlp hidden dim to embedding dim.
            Default: 4.0
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
+        qkv_bias : If True, add a learnable bias to query, key, value.
            Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        qk_scale : Override default qk scale of head_dim ** -0.5 if set.
            Default: None
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optionalc): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (keras.layers.Activation, optional): Activation layer. Default: gelu
+        drop : Dropout rate. Default: 0.0
+        attn_drop : Attention dropout rate. Default: 0.0
+        drop_path : Stochastic depth rate. Default: 0.0
+        act_layer : Activation layer. Default: gelu
        norm_layer (keras.layers, optional): Normalization layer. Default: LayerNormalization
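
For context, a minimal usage sketch of the backbone whose docstring is edited above. This is not part of the diff: the import path is taken from the diff header, and the constructor is assumed to accept the arguments documented in the docstring.

    import numpy as np
    from keras_hub.src.models.video_swin.video_swin_backbone import VideoSwinBackbone

    # Hypothetical instantiation; argument names and values mirror the documented defaults.
    backbone = VideoSwinBackbone(
        input_shape=(32, 224, 224, 3),   # (depth, height, width, channel)
        include_rescaling=True,          # rescale by 1/255 and normalize with ImageNet mean/std
        patch_size=(2, 4, 4),
        embed_dim=96,
        depth=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=[8, 7, 7],
    )

    # One 32-frame, 224x224 RGB clip with raw pixel values in [0, 255].
    videos = np.random.uniform(0.0, 255.0, size=(1, 32, 224, 224, 3)).astype("float32")
    features = backbone(videos)  # assumed to return the final-stage spatio-temporal feature map
    print(features.shape)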