From d63f940f5586d62f33a82d352fa1233c322c646b Mon Sep 17 00:00:00 2001
From: Zhao Chen
Date: Tue, 15 Oct 2024 18:30:53 +0800
Subject: [PATCH] feat: more transformer definitions such as rope scaling

Signed-off-by: Zhao Chen
---
 docs/v2/architecture.md     | 30 ++++++++++++++++++++++
 specs-go/v2/architecture.go | 50 +++++++++++++++++++++++++++++++------
 specs-go/v2/architecture.md | 23 -----------------------
 3 files changed, 73 insertions(+), 30 deletions(-)
 create mode 100644 docs/v2/architecture.md
 delete mode 100644 specs-go/v2/architecture.md

diff --git a/docs/v2/architecture.md b/docs/v2/architecture.md
new file mode 100644
index 0000000..40b36a7
--- /dev/null
+++ b/docs/v2/architecture.md
@@ -0,0 +1,30 @@
+# Architecture
+
+## Tensor naming convention
+
+[version].[vendor].[family].[name].[arch].[modality].[block_name].[layer_name].[tensor_name].[tensor_type]
+
+Any dot within a field value (for example, in the model name) should be replaced with an underscore.
+
+### Naming Conventions
+
+- **version**: The version of the naming convention.
+- **vendor**: The vendor of the model.
+- **family**: The family of the model.
+- **name**: The name of the model.
+- **arch**: The architecture of the model.
+- **modality**: The modality of the model.
+- **block_name**: The name of the block.
+- **layer_name**: The name and 0-indexed layer number of the layer.
+- **tensor_name**: The name of the tensor.
+- **tensor_type**: Whether the tensor is a weight or a bias.
+
+### Example
+
+```plain
+v1.meta.llama-3_2-1b.transformer.text.decoder.layers_0.embedding.projection.weight
+```
+
+```plain
+v1.meta.llama-3_2-1b.transformer.text.decoder.layers_1.attention.query.weight
+```
diff --git a/specs-go/v2/architecture.go b/specs-go/v2/architecture.go
index d3d84b4..4c09bef 100644
--- a/specs-go/v2/architecture.go
+++ b/specs-go/v2/architecture.go
@@ -5,7 +5,7 @@ type Architecture struct {
 	// Transformer architecture
 	Transformer Transformer `json:"transformer"`
 
-	// TODO: Other architectures
+	// TODO: Other architectures, e.g. mamba.
 }
 
 // Transformer represents the transformer architecture.
@@ -30,6 +30,9 @@ type TransformerForCausalLM struct {
 	// The hidden size of the model
 	HiddenSize int `json:"hidden_size"`
 
+	// Embedding parameters
+	Embedding Embedding `json:"embedding"`
+
 	// Position embedding type
 	PositionEmbedding PositionEmbedding `json:"position_embedding"`
 
@@ -45,8 +48,11 @@
 
 // TransformerLayer represents the transformer layer parameters.
 type TransformerLayer struct {
+	// Attention parameters
 	Attention Attention `json:"attention"`
-	MLP       MLP       `json:"mlp"`
+
+	// MLP parameters
+	MLP MLP `json:"mlp"`
 }
 
 // MLP represents the MLP (Multi-Layer Perceptron) parameters.
@@ -97,6 +103,9 @@ type Attention struct {
 	// Number of key-value heads
 	NumKeyValueHeads int `json:"num_key_value_heads"`
 
+	// The attention head dimension. If 0, it defaults to HiddenSize / NumAttentionHeads
+	HeadDim int `json:"head_dim"`
+
 	// Whether the attention has a residual connection
 	HasResidual bool `json:"has_residual"`
 
@@ -112,17 +121,35 @@ type Attention struct {
 
 // PositionEmbedding represents the position embedding type and parameters.
 type PositionEmbedding struct {
-	// Type of position embedding, e.g. 'rope', 'sinusoidal', 'alibi', etc.
+	// Type of position embedding, e.g. 'rope', 'alibi', etc.
 	Type string `json:"type"`
 
 	// The maximum number of position embeddings
 	MaxPositionEmbeddings int `json:"max_position_embeddings"`
 
-	// The base in signifying the rotary embedding period.
-	RotaryEmbeddingBase int `json:"rotary_embedding_base,omitempty"`
+	// Only used when Type is 'rope'. The theta (base) parameter of the RoPE position embedding.
+	RotaryEmbeddingTheta float64 `json:"rope_theta,omitempty"`
+
+	// Only used when Type is 'rope'. The scaling configuration for the RoPE embeddings.
+	RotaryEmbeddingScaling *RotaryEmbeddingScaling `json:"rope_scaling,omitempty"`
+}
+
+// RotaryEmbeddingScaling represents the scaling configuration for the RoPE embeddings.
+type RotaryEmbeddingScaling struct {
+	// Type of scaling; one of 'default', 'linear', 'dynamic', or 'llama3', where 'default' is the original RoPE implementation.
+	Type string `json:"type"`
+
+	// The scaling factor
+	Factor float64 `json:"factor"`
+
+	// The original max position used during pretraining.
+	OriginalMaxPosition int `json:"original_max_position"`
 
-	// Fraction of hidden size to apply rotary embeddings to. Must be in [0,1].
-	RotaryEmbeddingFraction float64 `json:"rotary_embedding_fraction,omitempty"`
+	// Only used when Type is 'llama3'. Scaling factor applied to the low-frequency components of RoPE.
+	LowFreqFactor float64 `json:"low_freq_factor"`
+
+	// Only used when Type is 'llama3'. Scaling factor applied to the high-frequency components of RoPE.
+	HighFreqFactor float64 `json:"high_freq_factor"`
 }
 
 // Normalization represents the normalization parameters.
@@ -133,3 +160,12 @@ type Normalization struct {
 	// Epsilon for the normalization
 	Epsilon float64 `json:"epsilon"`
 }
+
+// Embedding represents the embedding parameters.
+type Embedding struct {
+	// Whether the embedding has a bias
+	HasBias bool `json:"has_bias"`
+
+	// Whether the embedding has a normalization layer
+	HasNorm bool `json:"has_norm"`
+}
diff --git a/specs-go/v2/architecture.md b/specs-go/v2/architecture.md
deleted file mode 100644
index b8ef930..0000000
--- a/specs-go/v2/architecture.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Architecture
-
-## Tensor naming convention
-
-[vendor].[family].[name].[arch].[modality].[block_name].[layer_number].[tensor_name].[tensor_type]
-
-### Naming Conventions
-
-- **vendor**: The vendor of the model.
-- **family**: The family of the model.
-- **name**: The name of the model.
-- **arch**: The architecture of the model.
-- **modality**: The modality of the model.
-- **block_name**: The name of the block.
-- **layer_number**: The layer number.
-- **tensor_name**: The name of the tensor.
-- **tensor_type**: The type of the tensor.
-
-### Example
-
-```
-meta.llama.llama3.2-1B.transformer.text.decoder.layer.0.self_attention.query.weight
-```
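As a usage sketch (not part of the patch itself), the snippet below populates the new `PositionEmbedding` and `RotaryEmbeddingScaling` fields and marshals them to JSON. The two struct definitions are copied from the patch with their doc comments trimmed, `rope_scaling` is held as a pointer so that `omitempty` can actually drop it when unset, and the numeric values are illustrative, Llama-3.2-style placeholders rather than values taken from the spec.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copies of the structs introduced in the patch above.
type RotaryEmbeddingScaling struct {
	Type                string  `json:"type"`
	Factor              float64 `json:"factor"`
	OriginalMaxPosition int     `json:"original_max_position"`
	LowFreqFactor       float64 `json:"low_freq_factor"`
	HighFreqFactor      float64 `json:"high_freq_factor"`
}

type PositionEmbedding struct {
	Type                   string                  `json:"type"`
	MaxPositionEmbeddings  int                     `json:"max_position_embeddings"`
	RotaryEmbeddingTheta   float64                 `json:"rope_theta,omitempty"`
	RotaryEmbeddingScaling *RotaryEmbeddingScaling `json:"rope_scaling,omitempty"`
}

func main() {
	// Illustrative values in the style of a Llama-3.2-class config; placeholders, not normative.
	pe := PositionEmbedding{
		Type:                  "rope",
		MaxPositionEmbeddings: 131072,
		RotaryEmbeddingTheta:  500000.0,
		RotaryEmbeddingScaling: &RotaryEmbeddingScaling{
			Type:                "llama3",
			Factor:              32.0,
			OriginalMaxPosition: 8192,
			LowFreqFactor:       1.0,
			HighFreqFactor:      4.0,
		},
	}

	// Serialize using the JSON keys introduced by the patch (rope_theta, rope_scaling, ...).
	out, err := json.MarshalIndent(pe, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}
```

With the pointer field, `rope_scaling` is omitted from the JSON entirely when it is nil, which keeps non-RoPE position embeddings free of unused keys.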