From 6e506b857f209daf1494cde1180dd9b857871d84 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 14:55:58 +0800
Subject: [PATCH 1/4] Pin omniconfig to "== 0.1.5".

- ConfigParser.parse_known_args returns 4 values rather than 3 in newer
  versions of omniconfig.
- The current implementation of LlmRunConfig.parse_args relies on the
  older version.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3cfd5be..d7825a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ lm_eval = ">= 0.4.2"
 accelerate = ">= 0.26.0"
 datasets = ">= 2.16.0"
 sentencepiece = ">= 0.1.99"
-omniconfig = ">= 0.1.5"
+omniconfig = "== 0.1.5"
 protobuf = ">= 5.26.0"
 
 [tool.poetry.group.dev.dependencies]

From 479950b775277911de77c64a1a7c3299462fdf48 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 14:57:12 +0800
Subject: [PATCH 2/4] Do not multiply gptq_config.damp_percentage by 0.1,
 which made the effective damping inconsistent with the config and thus
 confusing.

---
 lmquant/quant/functional/gptq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lmquant/quant/functional/gptq.py b/lmquant/quant/functional/gptq.py
index 8520fbf..0ceab5a 100644
--- a/lmquant/quant/functional/gptq.py
+++ b/lmquant/quant/functional/gptq.py
@@ -98,6 +98,7 @@ def gptq_quantize(  # noqa: C901
     # endregion
     # region step 5: get the inverse of the Hessian matrix
     stable_inv, num_inv_tries = False, 0
+    hessian_inv = None
     while (not stable_inv) and num_inv_tries < gptq_config.num_inv_tries:
         num_inv_tries += 1
         try:
@@ -105,7 +106,7 @@
             hessian_inv = torch.cholesky_inverse(hessian_inv)
             hessian_inv = torch.linalg.cholesky(hessian_inv, upper=True)
         except RuntimeError:
-            hessian_diag += (gptq_config.damp_percentage * 0.1) * hessian_diag_mean
+            hessian_diag += gptq_config.damp_percentage * hessian_diag_mean
             continue
         stable_inv = True
     if num_inv_tries > 1:
@@ -113,6 +114,7 @@
         logger.debug(
             " - GPTQ Hessian is not stable %s %d tries.", "until" if stable_inv else "after", num_inv_tries
         )
+    assert stable_inv and hessian_inv is not None, "GPTQ Hessian is not stable! Consider increasing damp_percentage."
     assert not hessian_inv.isinf().any(), "Inverse of Hessian matrix contains Inf."
     assert not hessian_inv.isnan().any(), "Inverse of Hessian matrix contains NaN."
     del hessian, hessian_diag, hessian_diag_mean, num_inv_tries
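For context on patch 2: GPTQ retries the Cholesky-based inversion, adding damp_percentage * mean(diag(H)) to the Hessian's diagonal on each failure until the factorization succeeds. Adding a multiple of the mean diagonal is just Tikhonov damping: it shifts all eigenvalues up until the matrix becomes positive definite. Below is a minimal self-contained sketch of that retry loop; the function name and default values are illustrative rather than lmquant's actual API, and it simplifies the real implementation.

```python
import torch


def damped_hessian_inverse(
    hessian: torch.Tensor,
    damp_percentage: float = 0.01,  # plays the role of gptq_config.damp_percentage
    num_inv_tries: int = 250,       # plays the role of gptq_config.num_inv_tries
) -> torch.Tensor:
    """Return the upper Cholesky factor of H^-1, damping H until it is positive definite."""
    hessian = hessian.clone()
    hessian_diag = hessian.diagonal()      # a view: in-place updates damp `hessian` itself
    hessian_diag_mean = hessian_diag.mean()
    for _ in range(num_inv_tries):
        try:
            factor = torch.linalg.cholesky(hessian)       # raises if H is not positive definite
            hessian_inv = torch.cholesky_inverse(factor)  # H^-1 from its Cholesky factor
            return torch.linalg.cholesky(hessian_inv, upper=True)
        except RuntimeError:
            # Not positive definite yet: add damp_percentage * mean(diag(H)) to the
            # diagonal and retry. With patch 2, the increment is exactly this value,
            # no longer a silently 10x smaller one.
            hessian_diag += damp_percentage * hessian_diag_mean
    raise RuntimeError("GPTQ Hessian is not stable! Consider increasing damp_percentage.")
```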
From 2f8c271e98a3262f7ea880c3f268ed4bb65cc070 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 15:00:11 +0800
Subject: [PATCH 3/4] Huggingface's kwargs may contain tuples such as
 position_embeddings. Refine the code to support them.

See https://github.com/huggingface/transformers/blob/9d6998c759fc08d6a87e81adf26e59d9c932396b/src/transformers/models/llama/modeling_llama.py#L645
---
 lmquant/llm/dataset.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lmquant/llm/dataset.py b/lmquant/llm/dataset.py
index a178afd..579d802 100644
--- a/lmquant/llm/dataset.py
+++ b/lmquant/llm/dataset.py
@@ -139,17 +139,25 @@ def _pre_layer_kwargs_hook(
     kwargs: dict[str, tp.Any],
     kwargs_cache: dict[str, tp.Any],
 ) -> None:
+    def _check_equality(_k, _v, _cached):
+        if isinstance(_v, DynamicCache):
+            assert _cached is None, f"kwargs_cache[{_k}] should be None"
+        elif isinstance(_v, torch.Tensor):
+            assert _v.allclose(_cached), f"kwargs_cache[{_k}] should be the same as kwargs[{_k}]"
+        elif isinstance(_v, tuple):
+            assert len(_v) == len(
+                _cached), f"kwargs_cache[{_k}] is a tuple, and should have the same length as kwargs[{_k}]"
+            for i in range(len(_v)):
+                _check_equality(_k, _v[i], _cached[i])
+        else:
+            assert _v == _cached, f"kwargs_cache[{_k}] should be the same as {_v}"
+
     if kwargs_cache:
         assert len(kwargs_cache) == len(kwargs), "kwargs_cache should have the same length as kwargs"
         for k, v in kwargs.items():
             assert k in kwargs_cache, f"kwargs_cache should have the same keys as kwargs, but missing {k}"
             cached = kwargs_cache[k]
-            if isinstance(v, DynamicCache):
-                assert cached is None, f"kwargs_cache[{k}] should be None"
-            elif isinstance(v, torch.Tensor):
-                assert v.allclose(cached), f"kwargs_cache[{k}] should be the same as kwargs[{k}]"
-            else:
-                assert v == cached, f"kwargs_cache[{k}] should be the same as kwargs[{k}]"
+            _check_equality(k, v, cached)
     else:
         for k, v in kwargs.items():
             if isinstance(v, DynamicCache):

From 9abc1d1506cfc4884b8ed7b4860373065a55bb16 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 15:29:24 +0800
Subject: [PATCH 4/4] Always use device_map. In Huggingface, if there is not
 enough GPU memory, the rest is automatically offloaded to CPU or even disk.

---
 lmquant/llm/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmquant/llm/run.py b/lmquant/llm/run.py
index 4d608c7..d84325f 100644
--- a/lmquant/llm/run.py
+++ b/lmquant/llm/run.py
@@ -75,7 +75,7 @@ def run(  # noqa: C901
     # region rotate model
     if needs_rotation:
         logger.info(f"* Building model {config.model.name} from {config.model.path}")
-        model, tokenizer = config.model.build(dtype=torch.float32, cpu=config.model.size > 30)
+        model, tokenizer = config.model.build(dtype=torch.float32)
         model = LlmModelStruct.build(model)
         config.quant.num_hidden_layers = model.config.num_hidden_layers
         if config.quant.develop_dtype is None:
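For context on patch 4: when transformers loads a model with a device_map, Accelerate places as many weights as fit on the available GPUs and automatically offloads the remainder to CPU RAM, then to disk, so a manual "big model goes to CPU" switch like cpu=config.model.size > 30 is unnecessary. A hedged sketch of the underlying transformers call follows; the checkpoint name is illustrative, and config.model.build presumably wraps something along these lines.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Llama-2-7b-hf"  # illustrative; lmquant takes this from config.model.path

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float32,  # same dtype the run() snippet passes to config.model.build
    device_map="auto",          # fill GPUs first, then spill to CPU, then to disk
    offload_folder="offload",   # scratch directory for any weights offloaded to disk
)
print(model.hf_device_map)      # shows which device each submodule landed on
```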