From 3dc4d9d715d099d6e140258e61666648a6c72e1f Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Mon, 17 Jun 2024 20:12:36 +0000
Subject: [PATCH 1/4] clarify example

---
 examples/llama7b_w8a8_quantization.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index c894613ffbb..fab55d6f76c 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -16,19 +16,18 @@
                         num_bits: 8
                         type: "int"
                         symmetric: true
-                        strategy: "channel"
                     input_activations:
                         num_bits: 8
                         type: "int"
+                        dynamic: true
                         symmetric: true
-                        dynamic: True
                         strategy: "token"
                     targets: ["Linear"]
 """
 
 # setting device_map to auto to spread the model evenly across all available GPUs
 # load the model in as bfloat16 to save on memory and compute
-model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base"
+model_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
 model = SparseAutoModelForCausalLM.from_pretrained(
     model_stub, torch_dtype=torch.bfloat16, device_map="auto"
 )
@@ -37,7 +36,7 @@
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"
+output_dir = "/network/sadkins/tinyllama-oneshot-w8a8-dynamic-token"
 
 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}

From d333c27120d064b0b7681f9f2637263d59f6074e Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Mon, 17 Jun 2024 20:15:21 +0000
Subject: [PATCH 2/4] cleanup

---
 examples/llama7b_sparse_quantized/README.md | 8 +++++---
 examples/llama7b_w8a8_quantization.py       | 7 ++++---
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md
index c96b6e7ca43..f10bb0984ab 100644
--- a/examples/llama7b_sparse_quantized/README.md
+++ b/examples/llama7b_sparse_quantized/README.md
@@ -4,7 +4,8 @@ This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and qua
 The model is calibrated and trained with the ultachat200k dataset.
 At least 75GB of GPU memory is required to run this example.
 
-Follow the steps below, or to run the example as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`
+Follow the steps below one by one in a code notebook, or run the full example script
+as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`
 
 ## Step 1: Select a model, dataset, and recipe
 In this step, we select which model to use as a baseline for sparsification, a dataset to
@@ -36,7 +37,8 @@ recipe = "2:4_w4a16_recipe.yaml"
 
 ## Step 2: Run sparsification using `apply`
 The `apply` function applies the given recipe to our model and dataset.
-The hardcoded kwargs may be altered based on each model's needs.
+The hardcoded kwargs may be altered based on each model's needs. This code snippet should
+be run in the same Python instance as step 1.
 After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`.
 
 ```python
@@ -67,7 +69,7 @@ apply(
 ### Step 3: Compression
 
 The resulting model will be uncompressed. To save a final compressed copy of the model
-run the following:
+run the following in the same Python instance as the previous steps.
 
 ```python
 import torch
diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index fab55d6f76c..c894613ffbb 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -16,18 +16,19 @@
                         num_bits: 8
                         type: "int"
                         symmetric: true
+                        strategy: "channel"
                     input_activations:
                         num_bits: 8
                         type: "int"
-                        dynamic: true
                         symmetric: true
+                        dynamic: True
                         strategy: "token"
                     targets: ["Linear"]
 """
 
 # setting device_map to auto to spread the model evenly across all available GPUs
 # load the model in as bfloat16 to save on memory and compute
-model_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base"
 model = SparseAutoModelForCausalLM.from_pretrained(
     model_stub, torch_dtype=torch.bfloat16, device_map="auto"
 )
@@ -36,7 +37,7 @@
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
-output_dir = "/network/sadkins/tinyllama-oneshot-w8a8-dynamic-token"
+output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"
 
 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}

From 83aaa178cf2e6b4e5f0c02f4be585ab3498864b5 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Mon, 17 Jun 2024 20:17:14 +0000
Subject: [PATCH 3/4] update examples

---
 examples/llama7b_w8a8_quantization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index c894613ffbb..1b4cfc73d77 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -16,12 +16,12 @@
                         num_bits: 8
                         type: "int"
                         symmetric: true
-                        strategy: "channel"
+                        strategy: "tensor"
                     input_activations:
                         num_bits: 8
                         type: "int"
                         symmetric: true
-                        dynamic: True
+                        dynamic: true
                         strategy: "token"
                     targets: ["Linear"]
 """

From 0828a6d19afb86cc98067b69240d6070fbe60257 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Mon, 17 Jun 2024 20:20:41 +0000
Subject: [PATCH 4/4] update output name

---
 examples/llama7b_w8a8_quantization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index 1b4cfc73d77..702218f7db7 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -37,7 +37,7 @@
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"
+output_dir = "./output_llama7b_w8a8_dynamic_compressed"
 
 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}