forked from rsanshierli/EasyBert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSentiment.py
169 lines (138 loc) · 7.32 KB
/
Sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
import torch
import torch.nn as nn
import numpy as np
from Sentiment.pytorch_pretrained import BertModel, BertTokenizer
class Config(object):
"""配置参数"""
def __init__(self):
self.model_name = 'bert'
self.class_list = ['中性', '积极', '消极'] # 类别名单
self.save_path = './Sentiment/Sentiment/saved_dict/bert.ckpt' # 模型训练结果
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # 设备
self.require_improvement = 1000 # 若超过1000batch效果还没提升,则提前结束训练
self.num_classes = len(self.class_list) # 类别数
self.num_epochs = 3 # epoch数
self.batch_size = 128 # mini-batch大小
self.pad_size = 32 # 每句话处理成的长度(短填长切)
self.learning_rate = 5e-5 # 学习率
self.bert_path = './Sentiment/bert_pretrain'
self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
self.hidden_size = 768
class Model(nn.Module):
def __init__(self, config):
super(Model, self).__init__()
self.bert = BertModel.from_pretrained(config.bert_path)
for param in self.bert.parameters():
param.requires_grad = True
self.fc = nn.Linear(config.hidden_size, config.num_classes)
def forward(self, x):
context = x[0] # 输入的句子
mask = x[2] # 对padding部分进行mask,和句子一个size,padding部分用0表示,如:[1, 1, 1, 1, 0, 0]
_, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
out = self.fc(pooled)
return out
PAD, CLS = '[PAD]', '[CLS]' # padding符号, bert中综合信息符号
def clean(text):
# text = re.sub(r"(回复)?(//)?\s*@\S*?\s*(:| |$)", " ", text) # 去除正文中的@和回复/转发中的用户名
# text = re.sub(r"\[\S+\]", "", text) # 去除表情符号
# text = re.sub(r"#\S+#", "", text) # 保留话题内容
URL_REGEX = re.compile(
r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))',
re.IGNORECASE)
text = re.sub(URL_REGEX, "", text) # 去除网址
text = text.replace("转发微博", "") # 去除无意义的词语
text = re.sub(r"\s+", " ", text) # 合并正文中过多的空格
return text.strip()
def load_dataset(data, config):
pad_size = config.pad_size
contents = []
for line in data:
lin = clean(line)
token = config.tokenizer.tokenize(lin) # 分词
token = [CLS] + token # 句首加入CLS
seq_len = len(token)
mask = []
token_ids = config.tokenizer.convert_tokens_to_ids(token)
if pad_size:
if len(token) < pad_size:
mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
token_ids += ([0] * (pad_size - len(token)))
else:
mask = [1] * pad_size
token_ids = token_ids[:pad_size]
seq_len = pad_size
contents.append((token_ids, int(0), seq_len, mask))
return contents
class DatasetIterater(object):
def __init__(self, batches, batch_size, device):
self.batch_size = batch_size
self.batches = batches # data
self.n_batches = len(batches) // batch_size
self.residue = False # 记录batch数量是否为整数
if len(batches) % self.n_batches != 0:
self.residue = True
self.index = 0
self.device = device
def _to_tensor(self, datas):
x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
# pad前的长度(超过pad_size的设为pad_size)
seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
return (x, seq_len, mask), y
def __next__(self): # 返回下一个迭代器对象,必须控制结束条件
if self.residue and self.index == self.n_batches:
batches = self.batches[self.index * self.batch_size: len(self.batches)]
self.index += 1
batches = self._to_tensor(batches)
return batches
elif self.index >= self.n_batches:
self.index = 0
raise StopIteration
else:
batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
self.index += 1
batches = self._to_tensor(batches)
return batches
def __iter__(self): # 返回一个特殊的迭代器对象,这个迭代器对象实现了 __next__() 方法并通过 StopIteration 异常标识迭代的完成。
return self
def __len__(self):
if self.residue:
return self.n_batches + 1
else:
return self.n_batches
def build_iterator(dataset, config):
iter = DatasetIterater(dataset, 1, config.device)
return iter
def match_label(pred, config):
label_list = config.class_list
return label_list[pred]
def final_predict(config, model, data_iter):
map_location = lambda storage, loc: storage
model.load_state_dict(torch.load(config.save_path, map_location=map_location))
model.eval()
predict_all = np.array([])
with torch.no_grad():
for texts, _ in data_iter:
outputs = model(texts)
pred = torch.max(outputs.data, 1)[1].cpu().numpy()
pred_label = [match_label(i, config) for i in pred]
predict_all = np.append(predict_all, pred_label)
return predict_all
def predict(text):
config = Config()
model = Model(config).to(config.device)
test_data = load_dataset(text, config)
test_iter = build_iterator(test_data, config)
result = final_predict(config, model, test_iter)
for i, j in enumerate(result):
print('text:{}'.format(text[i]))
print('label:{}'.format(j))
if __name__ == '__main__':
test = ['#你好2020#新年第一天元气满满的早起出门买早饭结果高估了自己抗冻能力回家成功冻发烧(大概是想告诉我2020要量力而行)然鹅这并不影响后续计划一出门立马生龙活虎新年和新??更配哦??看了误杀吃了大餐就让新的一年一直这样美滋滋下去吧??',
'大宝又感冒鼻塞咳嗽了,还有发烧。队友加班几天不回。感觉自己的情绪在家已然是随时引爆的状态。情绪一上来,容易对孩子说出自己都想不到的话来……2020年,真的要学会控制情绪,管理好家人健康。这是今年最大的目标。?',
'还要去输两天液,这天也太容易感冒发烧了,一定要多喝热水啊?',
'我太难了别人怎么发烧都没事就我一检查甲型流感?',
'果然是要病一场的喽回来第三天开始感冒今儿还发烧了喉咙眼睛都难受的一匹怎么样能不经意让我的毕设导师看到这条微博并给我放一天假呢?']
predict(test)