dataset.py
import json
from dataclasses import dataclass

import torch
from datasets import load_from_disk

from main import log_dist


class JsonlDatasetPT(torch.utils.data.Dataset):
"""
用于加载json格式的数据集,用于预训练任务
"""
def __init__(
self,
data_path, ### 数据集路径
tokenizer, ### 分词器实例
max_length, ### 最大长度
):
## 加载数据集并进行词令化
self.dataset = []
with open(data_path, 'r', encoding="utf-8") as f:
for line in f:
text = json.loads(line)['text']
## 使用tokenizer对句子进行词令化
inputs = tokenizer.encode_plus(
text,
add_special_tokens=True,
max_length=max_length,
padding='max_length',
return_tensors='pt',
truncation=True
)
input_ids = inputs["input_ids"].squeeze() ### shape: [max_length]
## 将词令化后的样本添加到datasets中
self.dataset.append({
'input_ids': input_ids,
})
log_dist(f"Loaded {len(self.dataset)} examples from {data_path}")
    def __len__(self):
        ## Return the dataset size
        return len(self.dataset)

    def __getitem__(self, idx):
        ## Return a single sample
        return self.dataset[idx]
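

# Usage sketch for JsonlDatasetPT. Hedged: the "gpt2" tokenizer and the
# "train.jsonl" path are illustrative assumptions, not part of this repo;
# any Hugging Face tokenizer exposing encode_plus() should work here.
def _demo_jsonl_dataset_pt():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
    dataset = JsonlDatasetPT("train.jsonl", tokenizer, max_length=512)
    print(len(dataset), dataset[0]["input_ids"].shape)  # e.g. 1000 torch.Size([512])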


def get_pt_dataset(args):
    """
    Load an already-tokenized dataset for the pretraining task.
    """
    ## Load the dataset from disk; it must have been saved with save_to_disk()
    train_dataset = load_from_disk(args.data_path)
    train_dataset = train_dataset.shuffle(seed=42)
    return train_dataset
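

# Sketch of how a dataset consumable by get_pt_dataset() might be produced.
# Hedged: storing pre-tokenized ids under "input_ids" is an assumption about
# the expected on-disk format; the toy ids and path are illustrative.
def _demo_build_pt_dataset(save_path="tokenized_dataset"):
    from datasets import Dataset
    examples = {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]]}  # toy token ids
    Dataset.from_dict(examples).save_to_disk(save_path)  # load_from_disk() reads this back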


class JsonlDatasetSFT(torch.utils.data.Dataset):
    """
    Load a JSONL-format dataset for the instruction-tuning (SFT) task.
    """

    def __init__(
        self,
        data_path,   ### path to the dataset file
        tokenizer,   ### tokenizer instance
        max_length,  ### maximum sequence length
    ):
        super().__init__()
        self.dataset = []
        with open(data_path, 'r') as file:
            for line in file:
                sample = json.loads(line)
                sentence = sample['instruction'] + sample['response']
                ## Tokenize the sentence with the tokenizer
                tokenized = tokenizer(
                    sentence,
                    max_length=max_length,
                    padding="max_length",
                    truncation=True,
                    return_tensors='pt'
                )
                tokenized['input_ids'] = tokenized['input_ids'].squeeze(0)
                tokenized['attention_mask'] = tokenized["attention_mask"].squeeze(0)
                ## Append the tokenized sample to the dataset
                self.dataset.append(tokenized)
        log_dist(f"Loaded {len(self.dataset)} examples from {data_path}")
    def __len__(self):
        ## Return the dataset size
        return len(self.dataset)

    def __getitem__(self, idx):
        ## Return a single sample; labels mirror input_ids for causal-LM loss
        return {
            "input_ids": self.dataset[idx]['input_ids'],
            "labels": self.dataset[idx]["input_ids"],
            "attention_mask": self.dataset[idx]["attention_mask"]
        }
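

# Usage sketch for JsonlDatasetSFT. Hedged: the tokenizer and "sft.jsonl" file
# name are illustrative; each JSONL line is assumed to carry "instruction" and
# "response" string fields, as __init__ above requires.
def _demo_jsonl_dataset_sft():
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    dataset = JsonlDatasetSFT("sft.jsonl", tokenizer, max_length=512)
    sample = dataset[0]
    # labels mirror input_ids; attention_mask marks real (non-pad) tokens
    print(sample["input_ids"].shape, sample["attention_mask"].sum())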


@dataclass
class DataCollatorForPT(object):
    """
    Data collator for the pretraining task: stacks multiple samples into one
    batch and derives the labels used to compute the loss.
    """
    pad_token_id: int = 0
    ignore_index: int = -100
    max_length: int = -1  ### default: no max_length truncation

    def __call__(self, instances: list) -> dict:
        if self.max_length > 0:
            input_ids = torch.stack([instance['input_ids'][:self.max_length] for instance in instances], dim=0)  ### shape: [batch_size, max_length]
        else:
            input_ids = torch.stack([instance['input_ids'] for instance in instances], dim=0)  ### shape: [batch_size, max_length]
        labels = input_ids.clone()
        ## Set padded positions in labels to ignore_index so the loss skips them
        labels[labels == self.pad_token_id] = self.ignore_index
        return dict(
            input_ids=input_ids,
            labels=labels,
        )
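

# Sketch wiring DataCollatorForPT into a DataLoader. Hedged: the toy tensors
# stand in for JsonlDatasetPT items; pad_token_id=0 matches the default above.
def _demo_collator_pt():
    from torch.utils.data import DataLoader
    toy = [{"input_ids": torch.tensor([5, 6, 0, 0])},
           {"input_ids": torch.tensor([7, 8, 9, 0])}]
    loader = DataLoader(toy, batch_size=2, collate_fn=DataCollatorForPT(pad_token_id=0))
    batch = next(iter(loader))
    print(batch["input_ids"].shape)  # torch.Size([2, 4])
    print(batch["labels"])           # pad positions replaced with -100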


@dataclass
class DataCollatorForSFT(object):
    """
    Data collator for the instruction-tuning task: pads multiple samples into
    one batch and produces the labels and attention_mask used for the loss.
    Note: expects `input_ids` and `labels` as Python lists, since padding is
    done with in-place list concatenation below.
    """
    pad_token_id: int = 0

    def __call__(self, features):
        len_ids = [len(feature['input_ids']) for feature in features]  ### e.g. [14, 6, 7, 10, ...]
        longest = max(len_ids)  ### e.g. 14
        input_ids_list = []
        labels_list = []
        ## Sort samples from longest to shortest
        for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
            ids = feature['input_ids']
            labels = feature['labels']
            labels = labels[:len(ids)]  ## truncate labels to the input length
            ## Pad input_ids up to the longest sample in the batch
            ids += [self.pad_token_id] * (longest - ids_l)
            ## Set padded label positions to -100, the default ignore_index of
            ## PyTorch's CrossEntropyLoss, so they are excluded from the loss
            labels += [-100] * (longest - ids_l)
            input_ids_list.append(torch.LongTensor(ids))
            labels_list.append(torch.LongTensor(labels))
        input_ids = torch.stack(input_ids_list)  ### shape: [batch_size, longest]
        labels = torch.stack(labels_list)  ### shape: [batch_size, longest]
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": input_ids.ne(self.pad_token_id),
        }
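

# Sketch for DataCollatorForSFT with list-valued features, which is what its
# in-place `ids += ...` padding expects. Hedged: the toy ids are illustrative.
def _demo_collator_sft():
    features = [
        {"input_ids": [11, 12, 13], "labels": [11, 12, 13]},
        {"input_ids": [21, 22], "labels": [21, 22]},
    ]
    batch = DataCollatorForSFT(pad_token_id=0)(features)
    print(batch["input_ids"])       # rows sorted longest-first, padded with 0
    print(batch["labels"])          # padding positions set to -100
    print(batch["attention_mask"])  # False where input_ids == pad_token_id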