Speculator wt init fix #38

Merged · 5 commits · Sep 9, 2024
2 changes: 1 addition & 1 deletion fms_extras/models/speculator.py
@@ -108,7 +108,7 @@ def __init__(
     def reset_parameters(self):
         for m in self.modules():
             if isinstance(m, nn.Embedding) or isinstance(m, nn.Linear):
-                nn.init.trunc_normal_(m.weight, 0, 1 / math.sqrt(self.inner_dim))
+                nn.init.normal_(m.weight, 0, 1 / math.sqrt(self.inner_dim))
             elif isinstance(m, LayerNormParameterized) and hasattr(m, "weight"):
                 m.weight.data.fill_(1)
                 m.bias.data.zero_()
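
For context, here is a minimal sketch (assuming stock PyTorch; `inner_dim = 4096` is an illustrative placeholder, not a value from this repo) contrasting the two initializers swapped in this hunk. `trunc_normal_` resamples values outside its default `[-2.0, 2.0]` bounds, while `normal_` draws an unclipped Gaussian; with `std = 1/sqrt(inner_dim)` those bounds sit far out in the tail, so the two produce nearly identical distributions here.

```python
# Sketch comparing the two initializers from the diff above.
# Assumes PyTorch; inner_dim is an illustrative placeholder.
import math

import torch
import torch.nn as nn

inner_dim = 4096
std = 1 / math.sqrt(inner_dim)

w_trunc = torch.empty(2048, 2048)
nn.init.trunc_normal_(w_trunc, 0, std)  # resamples outside [-2, 2] (defaults)

w_norm = torch.empty(2048, 2048)
nn.init.normal_(w_norm, 0, std)  # plain Gaussian, no clipping

# With std ~ 0.0156, the [-2, 2] cutoff is ~128 standard deviations out,
# so both tensors end up statistically indistinguishable in this regime.
print(f"trunc_normal_: std={w_trunc.std().item():.5f}")
print(f"normal_:       std={w_norm.std().item():.5f}")
```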
4 changes: 2 additions & 2 deletions fms_extras/modules/attention.py
@@ -348,8 +348,8 @@ def forward(
         # if use_cache=True, we return the hidden_state as well as the kv cache.
         # We only reduce the output, and keep the cache thread-local
         if use_cache:
-            out = reduce_from_tensor_model_parallel_region(out_par[0])
+            out = reduce_from_tensor_model_parallel_region(out_par[0], self.world_size)
             return out, out_par[1]
         else:
-            out = reduce_from_tensor_model_parallel_region(out_par)
+            out = reduce_from_tensor_model_parallel_region(out_par, self.world_size)
             return out
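
This hunk threads `self.world_size` into the reduction helper at both call sites. As a hedged illustration only (not the actual fms_extras implementation, whose signature the diff merely suggests), a helper taking an explicit world size could look like the sketch below, built on `torch.distributed.all_reduce`. Passing the size explicitly lets a single-rank caller skip the collective without querying the process group.

```python
# Hypothetical sketch of an all-reduce helper taking an explicit world_size,
# mirroring the call sites in the diff above. Not the real fms_extras code.
import torch
import torch.distributed as dist


def reduce_from_tensor_model_parallel_region_sketch(
    inp: torch.Tensor, world_size: int
) -> torch.Tensor:
    # With a single rank there is nothing to reduce; the explicit
    # world_size argument makes that check cheap and unambiguous.
    if world_size == 1:
        return inp
    # Sum partial attention outputs across tensor-parallel ranks in place.
    dist.all_reduce(inp)
    return inp
```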
4 changes: 3 additions & 1 deletion fms_extras/utils/cache/paged.py
@@ -263,7 +263,9 @@ def create(cls, kernel, *args, mutated_inputs=[], **kwargs) -> None:
             tensor_args,
             non_tensor_args,
             unflatten_args,
-        ) = cls.process_kernel(kernel, *args, **kwargs)
+        ) = cls.process_kernel(
+            kernel, *args, **kwargs
+        )  # type: ignore
         for tensor_arg in tensor_args:
             tensor_arg.realize()
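
The paged.py change reflows the `cls.process_kernel` call so a `# type: ignore` comment can sit on the line the type checker flags. A minimal stand-in demonstrating the same pattern (the stub below is hypothetical, not the torch inductor API; only the comment placement matters):

```python
# Stand-in sketch of the "# type: ignore" pattern from the diff above.
# process_kernel_stub is hypothetical, not the real process_kernel.
from typing import Any


def process_kernel_stub(kernel: Any, *args: Any, **kwargs: Any):
    # Untyped return: mypy cannot verify the 3-tuple unpacking below.
    return [], [], None


(
    tensor_args,
    non_tensor_args,
    unflatten_args,
) = process_kernel_stub(
    "kernel", 1, 2
)  # type: ignore  # suppresses the reported error for this statement
```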
