# Thone AI ToRo v0.1 31/01/24
import torch
import pandas as pd
from trl import SFTTrainer
from functions import create_prompt, generate_response, tokenize_prompts, plot_data_lengths, plot_data_lengths_2
from datasets import load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments
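# Load the local instruction dataset (train.jsonl) and the public mosaicml/dolly_hhrlhf dataset, then concatenate them.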
instruct_tune_dataset_1 = load_dataset("json", data_files="train.jsonl", split="train")
instruct_tune_dataset_2 = load_dataset("mosaicml/dolly_hhrlhf", split="train")
combined_dataset = concatenate_datasets([instruct_tune_dataset_1, instruct_tune_dataset_2])
print(create_prompt(combined_dataset[4]))
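# Load the Mixtral-8x7B base model with bitsandbytes quantization and FlashAttention-2.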
model_id = "mistralai/Mixtral-8x7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True  # quantize the base model to 8-bit so it fits in GPU memory
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    use_cache=False,
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
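# Sanity check: generate a response from the (not yet fine-tuned) base model for an example prompt.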
prompt="""[INST]Use the provided input to create an instruction that could have been used to generate the response with an LLM. \nThere are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.[\INST]"""
generate_response(prompt, model)
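# Tokenize the prompts and plot the resulting sequence-length distributions (helpers from functions.py),
# which helps in choosing a reasonable max_seq_length.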
tokenized_train_dataset = instruct_tune_dataset_1.map(tokenize_prompts)
plot_data_lengths(tokenized_train_dataset)
plot_data_lengths_2(tokenized_train_dataset)
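# LoRA configuration: rank-64 adapters on the attention and MLP projections plus the LM head.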
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    task_type="CAUSAL_LM",
)
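# Prepare the quantized model for k-bit training and attach the LoRA adapters.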
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
if torch.cuda.device_count() > 1:  # if more than one GPU is available, enable model parallelism
    print(torch.cuda.device_count())
    model.is_parallelizable = True
    model.model_parallel = True
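# Training hyperparameters.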
args = TrainingArguments(
    output_dir="Mixtral_Toro_FineTuning",
    # num_train_epochs=5,
    max_steps=1000,  # comment out this line if you want to train in epochs
    per_device_train_batch_size=32,
    warmup_ratio=0.03,  # fraction of steps used for warm-up (warmup_steps expects an integer step count)
    logging_steps=10,
    save_strategy="epoch",
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=10,  # comment out this line if you want to evaluate at the end of each epoch
    learning_rate=2.5e-5,
    bf16=True,
    # lr_scheduler_type='constant',
)
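# Supervised fine-tuning with TRL's SFTTrainer; packing concatenates short examples up to max_seq_length.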
max_seq_length = 1024
# evaluation_strategy="steps" requires evaluation data, so hold out a small split of the training set.
dataset_split = instruct_tune_dataset_1.train_test_split(test_size=0.1, seed=42)
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=create_prompt,  # applies the create_prompt formatting to every training and evaluation example
    args=args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
)
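# Run training and save the fine-tuned adapter weights to the output directory.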
trainer.train()
trainer.save_model("Mixtral_Toro_FineTuning")