Commit ddf32c9 (1 parent: 0af6faa)
Showing 70 changed files with 22,176 additions and 1,924 deletions.
@@ -0,0 +1,151 @@
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec
import jieba
import pickle

jieba.setLogLevel('WARN')


class DataPreprocess():
    def __init__(self):
        self.texts_cut = None
        self.tokenizer = None
        self.tokenizer_fact = None

    def cut_texts(self, texts=None, need_cut=True, word_len=1, savepath=None):
        '''
        Use jieba to segment texts into word lists.
        :param texts: list of texts
        :param need_cut: whether the texts still need to be segmented
        :param word_len: minimum word length to keep, used to drop stop-words
        :param savepath: path for saving the word lists as a JSON file
        :return: list of segmented texts
        '''
        if need_cut:
            if word_len > 1:
                texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
            else:
                texts_cut = [jieba.lcut(one_text) for one_text in texts]
        else:
            if word_len > 1:
                texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
            else:
                texts_cut = texts

        if savepath is not None:
            with open(savepath, 'w') as f:
                json.dump(texts_cut, f)
        return texts_cut

    def text2seq(self, texts_cut=None, tokenizer=None, tokenizer_savapah=None,
                 num_words=2000, maxlen=30, batchsize=10000):
        '''
        Convert texts to integer sequences for the embedding layer of a neural network.
        Converting a very large training set all at once can run out of memory,
        so the samples are processed in batches.
        :param texts_cut: list of segmented texts
        :param tokenizer: Keras Tokenizer used as the conversion dictionary
        :param tokenizer_savapah: path for saving the fitted tokenizer
        :param num_words: number of most frequent words kept in the vocabulary
        :param maxlen: length that each sequence is padded or truncated to
        :param batchsize: number of documents processed per batch
        :return: list of padded integer sequences
        eg. data_transform.text2seq(texts_cut=train_fact_cut, num_words=2000, maxlen=500)
        '''
        texts_cut_len = len(texts_cut)

        if tokenizer is None:
            tokenizer = Tokenizer(num_words=num_words)
            n = 0
            # fit the tokenizer batch by batch
            while n < texts_cut_len:
                tokenizer.fit_on_texts(texts=texts_cut[n:n + batchsize])
                n += batchsize
                if n < texts_cut_len:
                    print('tokenizer finish fit %d samples' % n)
                else:
                    print('tokenizer finish fit %d samples' % texts_cut_len)
        self.tokenizer = tokenizer

        if tokenizer_savapah:
            with open(tokenizer_savapah, mode='wb') as f:
                pickle.dump(tokenizer, f)

        # convert all texts to integer sequences
        fact_seq = tokenizer.texts_to_sequences(texts=texts_cut)
        print('finish texts to sequences')

        # delete the local reference to free memory
        del texts_cut

        n = 0
        fact_pad_seq = []
        # run pad_sequences batch by batch
        while n < texts_cut_len:
            fact_pad_seq += list(pad_sequences(fact_seq[n:n + batchsize], maxlen=maxlen,
                                               padding='post', value=0, dtype='int'))
            n += batchsize
            if n < texts_cut_len:
                print('finish pad sequences %d/%d' % (n, texts_cut_len))
            else:
                print('finish pad sequences %d/%d' % (texts_cut_len, texts_cut_len))
        return fact_pad_seq

    def text2vec(self, texts_cut=None, model_word2vec=None,
                 word2vec_savepath=None, word2vec_loadpath=None,
                 sg=1, size=128, window=5, min_count=1):
        '''
        Convert segmented texts into sequences of word vectors, usable for
        machine learning or deep learning.
        :param texts_cut: list of segmented texts
        :param model_word2vec: a trained word2vec model
        :param word2vec_savepath: path for saving the word2vec model
        :param word2vec_loadpath: path for loading a word2vec model
        :param sg: 0 for CBOW, 1 for skip-gram
        :param size: the dimensionality of the feature vectors
        :param window: the maximum distance between the current and predicted word within a sentence
        :param min_count: ignore all words with total frequency lower than this
        :return: list of word-vector sequences
        '''
        if model_word2vec is None:
            if word2vec_loadpath:
                model_word2vec = word2vec.Word2Vec.load(word2vec_loadpath)
            else:
                model_word2vec = word2vec.Word2Vec(texts_cut, sg=sg, size=size, window=window, min_count=min_count)
        if word2vec_savepath:
            model_word2vec.save(word2vec_savepath)

        return [[model_word2vec[word] for word in text_cut if word in model_word2vec] for text_cut in texts_cut]

    def creat_label_set(self, labels):
        '''
        Collect the set of all labels, used for one-hot encoding.
        :param labels: original label lists
        :return: array of unique labels
        '''
        label_set = []
        for i in labels:
            label_set += i
        return np.array(list(set(label_set)))

    def creat_label(self, label, label_set):
        '''
        Build the one-hot encoding for a single sample's labels.
        :param label: original labels of one sample
        :param label_set: set of all labels
        :return: one-hot label array
        eg. creat_label(label=data_valid_accusations[12], label_set=accusations_set)
        '''
        label_zero = np.zeros(len(label_set))
        label_zero[np.in1d(label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None, label_set=None):
        '''
        Apply creat_label to every sample to build a 2-D one-hot array.
        :param labels: original label lists
        :param label_set: set of all labels
        :return: list of one-hot label arrays
        '''
        labels_one_hot = list(map(lambda x: self.creat_label(label=x, label_set=label_set), labels))
        return labels_one_hot
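
For orientation, a minimal usage sketch of the DataPreprocess class added above follows; the toy corpus, the import path, and the parameter values are illustrative assumptions, not part of the commit.

# Hypothetical example: preprocess a small Chinese corpus with DataPreprocess.
from DataPreprocess import DataPreprocess  # assumed standalone import path

texts = ['我喜欢机器学习', '深度学习用于文本分类']
labels = [['ML'], ['DL', 'NLP']]

process = DataPreprocess()
texts_cut = process.cut_texts(texts=texts, need_cut=True, word_len=2)      # jieba segmentation, drop 1-char words
seqs = process.text2seq(texts_cut=texts_cut, num_words=2000, maxlen=20)    # padded integer sequences
label_set = process.creat_label_set(labels)                                # array of unique labels
labels_one_hot = process.creat_labels(labels=labels, label_set=label_set)  # one-hot label arrays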
@@ -0,0 +1,105 @@
from .DataPreprocess import DataPreprocess
from .models import CNN, RNN
import numpy as np


class TextClassification():
    def __init__(self):
        pass

    def fit(self, x=None, y=None, model=None,
            method='CNN', epochs=10, batchsize=256,
            x_need_preprocess=True, y_need_preprocess=True,
            tokenizer=None, num_words=2000, maxlen=30,
            vec_size=128, output_shape=None, output_type='multiple'):
        self.tokenizer = tokenizer
        self.num_words = num_words
        self.maxlen = maxlen
        self.vec_size = vec_size

        # preprocess the texts if needed
        if x_need_preprocess:
            process = DataPreprocess()
            # cut texts
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            # use the average text length when no maxlen is given
            if maxlen is None:
                maxlen = int(np.array([len(i) for i in x_cut]).mean())
            # texts to sequences
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer, tokenizer_savapah=None,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x_seq = np.array(x_seq)
            x = x_seq
            self.num_words = num_words
            self.maxlen = maxlen
            self.tokenizer = process.tokenizer

        # build one-hot labels if needed
        if y_need_preprocess:
            process = DataPreprocess()
            label_set = process.creat_label_set(y)
            labels = process.creat_labels(labels=y, label_set=label_set)
            labels = np.array(labels)
            output_shape = labels.shape[1]
            y = labels
            self.output_shape = output_shape
            self.label_set = label_set

        if model is None:
            if method == 'CNN':
                model = CNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            elif method == 'RNN':
                model = RNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            else:
                # maybe sklearn
                pass

        model.fit(x=x, y=y, epochs=epochs, batch_size=batchsize)
        self.model = model

    def predict(self, x=None, x_need_preprocess=True,
                tokenizer=None, num_words=None, maxlen=None):
        if x_need_preprocess:
            # fall back to the attributes saved during fit
            if tokenizer is None:
                tokenizer = self.tokenizer
            if num_words is None:
                num_words = self.num_words
            if maxlen is None:
                maxlen = self.maxlen
            process = DataPreprocess()
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x = np.array(x_seq)

        model = self.model
        y = model.predict(x=x)
        return y

    def label2toptag(self, predictions, labelset):
        # keep only the label with the highest predicted probability
        labels = []
        for prediction in predictions:
            label = labelset[prediction == prediction.max()]
            labels.append(label.tolist())
        return labels

    def label2half(self, predictions, labelset):
        # keep every label whose predicted probability exceeds 0.5
        labels = []
        for prediction in predictions:
            label = labelset[prediction > 0.5]
            labels.append(label.tolist())
        return labels

    def label2tag(self, predictions, labelset):
        # prefer the labels above 0.5; fall back to the top label if none qualify
        labels1 = self.label2toptag(predictions, labelset)
        labels2 = self.label2half(predictions, labelset)
        labels = []
        for i in range(len(predictions)):
            if len(labels2[i]) == 0:
                labels.append(labels1[i])
            else:
                labels.append(labels2[i])
        return labels
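
An end-to-end sketch of the classifier added above follows; the toy data, the import path, and the choice of the 'CNN' method are assumptions for illustration, and the example relies on the CNN/RNN models defined elsewhere in this commit.

# Hypothetical example: train on a toy sentiment set and predict a new text.
from TextClassification import TextClassification  # assumed standalone import path

x = ['这部电影非常好看', '食物难吃服务又差', '风景优美值得一去']
y = [['positive'], ['negative'], ['positive']]

clf = TextClassification()
clf.fit(x=x, y=y, method='CNN', epochs=5, batchsize=2,
        x_need_preprocess=True, y_need_preprocess=True,
        num_words=2000, maxlen=20, vec_size=128, output_type='multiple')

pred = clf.predict(x=['菜品美味环境也不错'], x_need_preprocess=True)
print(clf.label2tag(predictions=pred, labelset=clf.label_set))  # e.g. [['positive']]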
@@ -0,0 +1,2 @@
from .DataPreprocess import DataPreprocess
from .TextClassification import TextClassification