From 6e506b857f209daf1494cde1180dd9b857871d84 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 14:55:58 +0800
Subject: [PATCH 1/4] Pin omniconfig to "== 0.1.5".

- ConfigParser.parse_known_args returns 4 values rather than 3 in newer
  versions of omniconfig.
- The current implementation of LlmRunConfig.parse_args relies on the
  older version.
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3cfd5be..d7825a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ lm_eval = ">= 0.4.2"
 accelerate = ">= 0.26.0"
 datasets = ">= 2.16.0"
 sentencepiece = ">= 0.1.99"
-omniconfig = ">= 0.1.5"
+omniconfig = "== 0.1.5"
 protobuf = ">= 5.26.0"
 
 [tool.poetry.group.dev.dependencies]

From 479950b775277911de77c64a1a7c3299462fdf48 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 14:57:12 +0800
Subject: [PATCH 2/4] Do not multiply gptq_config.damp_percentage by 0.1,
 which made the effective damping inconsistent with the config and thus
 confusing.

---
 lmquant/quant/functional/gptq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lmquant/quant/functional/gptq.py b/lmquant/quant/functional/gptq.py
index 8520fbf..0ceab5a 100644
--- a/lmquant/quant/functional/gptq.py
+++ b/lmquant/quant/functional/gptq.py
@@ -98,6 +98,7 @@ def gptq_quantize(  # noqa: C901
     # endregion
     # region step 5: get the inverse of the Hessian matrix
     stable_inv, num_inv_tries = False, 0
+    hessian_inv = None
     while (not stable_inv) and num_inv_tries < gptq_config.num_inv_tries:
         num_inv_tries += 1
         try:
@@ -105,7 +106,7 @@
             hessian_inv = torch.cholesky_inverse(hessian_inv)
             hessian_inv = torch.linalg.cholesky(hessian_inv, upper=True)
         except RuntimeError:
-            hessian_diag += (gptq_config.damp_percentage * 0.1) * hessian_diag_mean
+            hessian_diag += gptq_config.damp_percentage * hessian_diag_mean
             continue
         stable_inv = True
     if num_inv_tries > 1:
@@ -113,6 +114,7 @@
         logger.debug(
             " - GPTQ Hessian is not stable %s %d tries.", "until" if stable_inv else "after", num_inv_tries
         )
+    assert stable_inv and hessian_inv is not None, "GPTQ Hessian is not stable! Consider increasing damp_percentage."
     assert not hessian_inv.isinf().any(), "Inverse of Hessian matrix contains Inf."
     assert not hessian_inv.isnan().any(), "Inverse of Hessian matrix contains NaN."
     del hessian, hessian_diag, hessian_diag_mean, num_inv_tries
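For context on patch 2: GPTQ retries the Cholesky-based inversion, adding damp_percentage * mean(diag(H)) to the Hessian's diagonal on each failure until the factorization succeeds. Adding a multiple of the mean diagonal is just Tikhonov damping: it shifts all eigenvalues up until the matrix becomes positive definite. Below is a minimal self-contained sketch of that retry loop; the function name and default values are illustrative rather than lmquant's actual API, and it simplifies the real implementation.

```python
import torch


def damped_hessian_inverse(
    hessian: torch.Tensor,
    damp_percentage: float = 0.01,  # plays the role of gptq_config.damp_percentage
    num_inv_tries: int = 250,       # plays the role of gptq_config.num_inv_tries
) -> torch.Tensor:
    """Return the upper Cholesky factor of H^-1, damping H until it is positive definite."""
    hessian = hessian.clone()
    hessian_diag = hessian.diagonal()      # a view: in-place updates damp `hessian` itself
    hessian_diag_mean = hessian_diag.mean()
    for _ in range(num_inv_tries):
        try:
            factor = torch.linalg.cholesky(hessian)       # raises if H is not positive definite
            hessian_inv = torch.cholesky_inverse(factor)  # H^-1 from its Cholesky factor
            return torch.linalg.cholesky(hessian_inv, upper=True)
        except RuntimeError:
            # Not positive definite yet: add damp_percentage * mean(diag(H)) to the
            # diagonal and retry. With patch 2, the increment is exactly this value,
            # no longer a silently 10x smaller one.
            hessian_diag += damp_percentage * hessian_diag_mean
    raise RuntimeError("GPTQ Hessian is not stable! Consider increasing damp_percentage.")
```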
From 2f8c271e98a3262f7ea880c3f268ed4bb65cc070 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 15:00:11 +0800
Subject: [PATCH 3/4] Huggingface's kwargs may contain tuples such as
 position_embeddings. Refine the code to support them.

See https://github.com/huggingface/transformers/blob/9d6998c759fc08d6a87e81adf26e59d9c932396b/src/transformers/models/llama/modeling_llama.py#L645
---
 lmquant/llm/dataset.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/lmquant/llm/dataset.py b/lmquant/llm/dataset.py
index a178afd..579d802 100644
--- a/lmquant/llm/dataset.py
+++ b/lmquant/llm/dataset.py
@@ -139,17 +139,25 @@ def _pre_layer_kwargs_hook(
     kwargs: dict[str, tp.Any],
     kwargs_cache: dict[str, tp.Any],
 ) -> None:
+    def _check_equality(_k, _v, _cached):
+        if isinstance(_v, DynamicCache):
+            assert _cached is None, f"kwargs_cache[{_k}] should be None"
+        elif isinstance(_v, torch.Tensor):
+            assert _v.allclose(_cached), f"kwargs_cache[{_k}] should be the same as kwargs[{_k}]"
+        elif isinstance(_v, tuple):
+            assert len(_v) == len(
+                _cached), f"kwargs_cache[{_k}] is a tuple, and should have the same length as kwargs[{_k}]"
+            for i in range(len(_v)):
+                _check_equality(_k, _v[i], _cached[i])
+        else:
+            assert _v == _cached, f"kwargs_cache[{_k}] should be the same as {_v}"
+
     if kwargs_cache:
         assert len(kwargs_cache) == len(kwargs), "kwargs_cache should have the same length as kwargs"
         for k, v in kwargs.items():
             assert k in kwargs_cache, f"kwargs_cache should have the same keys as kwargs, but missing {k}"
             cached = kwargs_cache[k]
-            if isinstance(v, DynamicCache):
-                assert cached is None, f"kwargs_cache[{k}] should be None"
-            elif isinstance(v, torch.Tensor):
-                assert v.allclose(cached), f"kwargs_cache[{k}] should be the same as kwargs[{k}]"
-            else:
-                assert v == cached, f"kwargs_cache[{k}] should be the same as kwargs[{k}]"
+            _check_equality(k, v, cached)
     else:
         for k, v in kwargs.items():
             if isinstance(v, DynamicCache):

From 9abc1d1506cfc4884b8ed7b4860373065a55bb16 Mon Sep 17 00:00:00 2001
From: bobboli
Date: Wed, 16 Oct 2024 15:29:24 +0800
Subject: [PATCH 4/4] Always use device_map. In Huggingface, if there is not
 enough GPU memory, the rest is automatically offloaded to CPU or even disk.

---
 lmquant/llm/run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmquant/llm/run.py b/lmquant/llm/run.py
index 4d608c7..d84325f 100644
--- a/lmquant/llm/run.py
+++ b/lmquant/llm/run.py
@@ -75,7 +75,7 @@ def run(  # noqa: C901
     # region rotate model
     if needs_rotation:
         logger.info(f"* Building model {config.model.name} from {config.model.path}")
-        model, tokenizer = config.model.build(dtype=torch.float32, cpu=config.model.size > 30)
+        model, tokenizer = config.model.build(dtype=torch.float32)
         model = LlmModelStruct.build(model)
         config.quant.num_hidden_layers = model.config.num_hidden_layers
         if config.quant.develop_dtype is None:
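For context on patch 4: when transformers loads a model with a device_map, Accelerate places as many weights as fit on the available GPUs and automatically offloads the remainder to CPU RAM, then to disk, so a manual "big model goes to CPU" switch like cpu=config.model.size > 30 is unnecessary. A hedged sketch of the underlying transformers call follows; the checkpoint name is illustrative, and config.model.build presumably wraps something along these lines.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "meta-llama/Llama-2-7b-hf"  # illustrative; lmquant takes this from config.model.path

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float32,  # same dtype the run() snippet passes to config.model.build
    device_map="auto",          # fill GPUs first, then spill to CPU, then to disk
    offload_folder="offload",   # scratch directory for any weights offloaded to disk
)
print(model.hf_device_map)      # shows which device each submodule landed on
```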