feat: group-query-attention implementation #74

Merged · 24 commits · Mar 20, 2024
Changes from 23 commits
Commits (24)
f0ea511
feat: group-query-attention implementation
fromm-m Jan 30, 2024
ec8c807
chore: merge main into GQA
flxst Mar 11, 2024
5ae1c63
chore: align configs with new GQA keys
lhahn-iis Mar 11, 2024
8da8c1a
docs: add potential removal marker for "scaling_factor"
lhahn-iis Mar 11, 2024
415b0a6
test: add attention forward pass test for GQA
lhahn-iis Mar 11, 2024
6d4a6cf
fix: add verbose check for divisibility of K,V,Q matrix shapes
lhahn-iis Mar 11, 2024
e0e274d
refactor: remove AttentionConfig
flxst Mar 11, 2024
3725a54
debug: expanded KVs for GQA implementation
lhahn-iis Mar 11, 2024
1239381
fix: group query attention implementation
flxst Mar 11, 2024
6d9849d
refactor: test causal self-attention
flxst Mar 12, 2024
65519c8
refactor: test causal self-attention (continued)
flxst Mar 12, 2024
c3242e3
test: causal self-attention type equality
flxst Mar 12, 2024
7d27b59
feat: replace current attention mechanism with `flash-attn`
lhahn-iis Mar 18, 2024
66addf4
Merge branch 'main' into GQA_2
fromm-m Mar 18, 2024
69fd5eb
fix: fixed qkv test
fromm-m Mar 19, 2024
0431479
refactor: refactored flash_attention
fromm-m Mar 19, 2024
bc27773
refactor: simplifiy reshaping and remove unused imports
mali-git Mar 20, 2024
fb88edb
refactor: refactor test
mali-git Mar 20, 2024
7235b73
fix: bugfix
fromm-m Mar 20, 2024
85f2224
Merge branch 'GQA_2' of https://github.com/Modalities/modalities into…
fromm-m Mar 20, 2024
3518ec1
fix: fixed test
fromm-m Mar 20, 2024
a4af491
fix: fix linting issues
fromm-m Mar 20, 2024
14906e9
fix: fixed config
fromm-m Mar 20, 2024
aac9a96
fix: improved the error message
fromm-m Mar 20, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -121,6 +121,8 @@ venv.bak/
# Vscode
.vscode/
.history/
env_modalities/*
.devcontainer/*

# Created by https://www.toptal.com/developers/gitignore/api/vim
# Edit at https://www.toptal.com/developers/gitignore?templates=vim
8 changes: 2 additions & 6 deletions config_files/config.yaml
@@ -141,17 +141,13 @@ model:
prediction_key: "logits"
block_size: ${data.sequence_len}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
attention:
attention_type: pytorch_flash_attention
scaling_factor: 3
activation: gelu
epsilon: 1e-5
weight_init:
mean: 0.0
std: 0.02
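The central change across these configuration files is the split of the single `n_head` key into `n_head_q` (number of query heads) and `n_head_kv` (number of key/value heads), together with the removal of the old `attention` block and its `scaling_factor`. The snippet below is a minimal, self-contained sketch of what grouped-query attention does with those two numbers — it is not the repository's actual attention module, and the function name and shapes are illustrative only. The divisibility check mirrors the intent of commit 6d4a6cf.

```python
import torch
import torch.nn.functional as F


def grouped_query_attention(q, k, v, n_head_q: int, n_head_kv: int) -> torch.Tensor:
    """Illustrative GQA sketch.

    q: (B, T, n_head_q * head_dim); k, v: (B, T, n_head_kv * head_dim).
    """
    if n_head_q % n_head_kv != 0:
        raise ValueError(
            f"n_head_q ({n_head_q}) must be divisible by n_head_kv ({n_head_kv}) "
            "so each key/value head serves an equally sized group of query heads."
        )
    B, T, _ = q.shape
    head_dim = q.shape[-1] // n_head_q

    # reshape to (B, n_heads, T, head_dim)
    q = q.view(B, T, n_head_q, head_dim).transpose(1, 2)
    k = k.view(B, T, n_head_kv, head_dim).transpose(1, 2)
    v = v.view(B, T, n_head_kv, head_dim).transpose(1, 2)

    # expand K/V so each group of n_head_q // n_head_kv query heads shares one KV head
    repeats = n_head_q // n_head_kv
    k = k.repeat_interleave(repeats, dim=1)
    v = v.repeat_interleave(repeats, dim=1)

    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # (B, n_head_q, T, head_dim)
    return out.transpose(1, 2).reshape(B, T, n_head_q * head_dim)
```

With `n_head_q == n_head_kv` this reduces to standard multi-head attention, which is why the example configs that keep equal head counts behave as before.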
5 changes: 2 additions & 3 deletions config_files/config_example_mem_map_dataset.yaml
@@ -141,14 +141,13 @@ model:
poe_type: NOPE
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
attention_config:
attention_type: pytorch_flash_attention
scaling_factor: 3
qkv_transforms:
- type_hint: RotaryTransform
config:
7 changes: 2 additions & 5 deletions config_files/config_example_openGPTx_dataset.yaml
@@ -145,16 +145,13 @@ model:
block_size: ${data.sequence_len}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
attention:
attention_type: pytorch_flash_attention
scaling_factor: 3
activation: fused_swiglu
epsilon: 1e-5
weight_init:
mean: 0.0
std: 0.02
13 changes: 5 additions & 8 deletions config_files/config_lorem_ipsum.yaml
@@ -195,22 +195,20 @@ model:
prediction_key: ${loss_fn.config.prediction_key}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 2
n_head: 4
n_head_q: 8
n_head_kv: 8
ffn_hidden: 128
n_embd: 128
dropout: 0.0
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
attention_config:
attention_type: pytorch_flash_attention
scaling_factor: 3
qkv_transforms:
- type_hint: RotaryTransform
config:
n_embd: ${model.config.n_embd}
n_head: ${model.config.n_head}
n_head: ${model.config.n_head_q} #it has to be head_q here
seq_length_dim: -2
activation_type: gelu
epsilon: 1e-5
weight_init:
mean: 0.0
std: 0.02
@@ -285,7 +283,6 @@ evaluation_subscriber:
config:
local_rank: ${settings.cuda_env.local_rank}
project: modalities
mode: ONLINE
mode: OFFLINE
experiment_id: ${settings.experiment_id}
directory: "."

directory: "."
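The `n_head_q` reference in the RotaryTransform config (with its inline comment) matters because rotary embeddings are applied per attention head: the per-head width is `n_embd // n_head_q`, and in GQA the key heads keep that same width even though there are fewer of them. Below is a rough sketch under that assumption — this is not the repository's `RotaryTransform`, just an illustration of why the query head count determines `head_dim`.

```python
import torch


def build_rotary_cache(head_dim: int, seq_len: int, base: float = 10000.0):
    # one rotation frequency per pair of channels inside a single head
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.outer(torch.arange(seq_len).float(), inv_freq)  # (seq_len, head_dim // 2)
    return angles.cos(), angles.sin()


def apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (B, n_heads, T, head_dim) -- works for q (n_head_q heads) and k (n_head_kv heads)
    x1, x2 = x[..., 0::2], x[..., 1::2]
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out


B, T = 2, 16                                   # illustrative batch size and sequence length
n_embd, n_head_q, n_head_kv = 128, 8, 8        # as in config_lorem_ipsum.yaml
head_dim = n_embd // n_head_q                  # must use n_head_q, not n_head_kv
cos, sin = build_rotary_cache(head_dim, T)
q = torch.randn(B, n_head_q, T, head_dim)
k = torch.randn(B, n_head_kv, T, head_dim)     # fewer heads are possible, same head_dim
q, k = apply_rotary(q, cos, sin), apply_rotary(k, cos, sin)
```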
7 changes: 3 additions & 4 deletions examples/getting_started/example_config.yaml
@@ -114,14 +114,13 @@ model:
block_size: ${data.sequence_len}
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 12
n_head: 12
n_head_q: 12
n_head_kv: 12
ffn_hidden: 2048
n_embd: 768
dropout: 0.0
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
attention:
attention_type: pytorch_flash_attention
scaling_factor: 3
attention_type: pytorch_flash_attention
activation: gelu
epsilon: 1e-5
weight_init:
7 changes: 3 additions & 4 deletions examples/library_usage/config_lorem_ipsum.yaml
@@ -193,14 +193,13 @@ model:
block_size: 256 # TODO reference this (same as sequence length)
vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
n_layer: 2
n_head: 4
n_head_q: 4
n_head_kv: 4
ffn_hidden: 128
n_embd: 128
dropout: 0.0
bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
attention:
attention_type: default_attention # pytorch_flash_attention
scaling_factor: 3
attention_type: default_attention # pytorch_flash_attention
activation: gelu
epsilon: 1e-5
weight_init:
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -21,12 +21,15 @@ dependencies = [
"jq",
"xformers",
"class_resolver",
"wandb"
"wandb",
"flash-attn" # install this directly via `pip install flash-attn --no-build-isolation`

]

[project.optional-dependencies]
linting = ["pre-commit"]
tests = ["pytest", "pytest-cov"]
install_helper = ["ninja"]

[project.scripts]
modalities = "modalities.__main__:main"
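Commit 7d27b59 swaps the hand-rolled attention for the `flash-attn` kernel, which is why the dependency shows up here (installed separately via `pip install flash-attn --no-build-isolation`; the new `install_helper` extra with `ninja` presumably exists to speed up that source build). One convenient property for this PR: flash-attn's public `flash_attn_func` accepts a different head count for queries than for keys/values, so GQA needs no explicit KV expansion on the caller's side. A rough usage sketch, assuming flash-attn 2.x and a CUDA device (the kernel requires fp16/bf16 inputs):

```python
import torch
from flash_attn import flash_attn_func  # assumes flash-attn 2.x is installed

B, T, n_head_q, n_head_kv, head_dim = 2, 256, 12, 4, 64
q = torch.randn(B, T, n_head_q, head_dim, dtype=torch.bfloat16, device="cuda")
k = torch.randn(B, T, n_head_kv, head_dim, dtype=torch.bfloat16, device="cuda")
v = torch.randn(B, T, n_head_kv, head_dim, dtype=torch.bfloat16, device="cuda")

# GQA is handled inside the kernel when n_head_q is a multiple of n_head_kv;
# no repeat/expand of K and V is needed here.
out = flash_attn_func(q, k, v, causal=True)  # (B, T, n_head_q, head_dim)
```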
2 changes: 1 addition & 1 deletion src/modalities/config/look_up_enum.py
@@ -5,4 +5,4 @@ class LookupEnum(Enum):
@classmethod
def _missing_(cls, value: str) -> type:
"""constructs Enum by member name, if not constructable by value"""
return cls.__dict__[value]
return cls.__dict__[value]
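For context, `_missing_` is the hook `Enum` invokes when lookup by value fails; returning `cls.__dict__[value]` makes the enum additionally constructable from a member name, which is convenient when names come straight out of YAML configs. A small illustration with a hypothetical enum (not one taken from the codebase):

```python
from enum import Enum


class LookupEnum(Enum):
    @classmethod
    def _missing_(cls, value: str) -> type:
        """constructs Enum by member name, if not constructable by value"""
        return cls.__dict__[value]


class ActivationType(LookupEnum):  # hypothetical example enum
    gelu = "GELU"
    fused_swiglu = "FusedSwiGLU"


assert ActivationType("GELU") is ActivationType.gelu                   # regular lookup by value
assert ActivationType("fused_swiglu") is ActivationType.fused_swiglu   # falls back to lookup by name
```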
3 changes: 2 additions & 1 deletion src/modalities/config/utils.py
@@ -2,6 +2,7 @@

from pydantic import BaseModel


def convert_base_model_config_to_dict(config: BaseModel) -> Dict[Any, Any]:
""""Converts non-recursively a Pydantic BaseModel to a dictionary."""
""" "Converts non-recursively a Pydantic BaseModel to a dictionary."""
return {key: getattr(config, key) for key in config.model_dump().keys()}
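The docstring change above is cosmetic; the helper's behavior is unchanged: it maps each top-level field name to the attribute on the model itself, so nested `BaseModel` fields remain objects instead of being recursively dumped the way `model_dump()` would. A small illustration with hypothetical models (pydantic v2):

```python
from typing import Any, Dict

from pydantic import BaseModel


def convert_base_model_config_to_dict(config: BaseModel) -> Dict[Any, Any]:
    """Converts non-recursively a Pydantic BaseModel to a dictionary."""
    return {key: getattr(config, key) for key in config.model_dump().keys()}


class AttentionSettings(BaseModel):  # hypothetical example models, not from the codebase
    n_head_q: int
    n_head_kv: int


class ModelSettings(BaseModel):
    n_layer: int
    attention: AttentionSettings


cfg = ModelSettings(n_layer=2, attention=AttentionSettings(n_head_q=8, n_head_kv=4))

print(convert_base_model_config_to_dict(cfg))
# {'n_layer': 2, 'attention': AttentionSettings(n_head_q=8, n_head_kv=4)}  <- nested model kept as an object
print(cfg.model_dump())
# {'n_layer': 2, 'attention': {'n_head_q': 8, 'n_head_kv': 4}}             <- fully recursive dump
```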
@@ -2,10 +2,10 @@
from typing import Dict, Optional

import rich
import wandb
from rich.console import Group
from rich.panel import Panel

import wandb
from modalities.batch import EvaluationResultBatch
from modalities.config.config import WandbMode
from modalities.logging_broker.messages import Message