Commit

Refactor
renjunxiang committed Jun 5, 2018
1 parent 0af6faa commit ddf32c9
Showing 70 changed files with 22,176 additions and 1,924 deletions.
512 changes: 51 additions & 461 deletions README.md

Large diffs are not rendered by default.

151 changes: 151 additions & 0 deletions TextClassification/DataPreprocess.py
@@ -0,0 +1,151 @@
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec
import jieba
import pickle

jieba.setLogLevel('WARN')


class DataPreprocess():
    def __init__(self):
        self.texts_cut = None
        self.tokenizer = None
        self.tokenizer_fact = None

    def cut_texts(self, texts=None, need_cut=True, word_len=1, savepath=None):
        '''
        Use jieba to segment texts.
        :param texts: list of texts
        :param need_cut: whether the texts still need to be segmented
        :param word_len: minimum word length to keep, used to drop stop-words
        :param savepath: path for saving the word lists as a json file
        :return: list of word lists
        '''
        if need_cut:
            if word_len > 1:
                texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
            else:
                texts_cut = [jieba.lcut(one_text) for one_text in texts]
        else:
            if word_len > 1:
                texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
            else:
                texts_cut = texts

        if savepath is not None:
            with open(savepath, 'w') as f:
                json.dump(texts_cut, f)
        return texts_cut

    def text2seq(self, texts_cut=None, tokenizer=None, tokenizer_savapah=None,
                 num_words=2000, maxlen=30, batchsize=10000):
        '''
        Convert segmented texts into integer sequences for a neural network embedding layer.
        Converting a very large training set in one pass can exhaust memory, so the texts
        are processed in batches of batchsize samples.
        :param texts_cut: list of segmented texts
        :param tokenizer: a fitted keras Tokenizer; if None a new one is fitted
        :param tokenizer_savapah: path for saving the tokenizer
        :param num_words: number of most frequent words kept by the tokenizer
        :param maxlen: sequences are padded/truncated to this length
        :param batchsize: number of documents processed per batch
        :return: list of padded integer sequences
        e.g. data_transform.text2seq(texts_cut=train_fact_cut, num_words=2000, maxlen=500)
        '''
        texts_cut_len = len(texts_cut)

        if tokenizer is None:
            tokenizer = Tokenizer(num_words=num_words)
            n = 0
            # fit the tokenizer in batches
            while n < texts_cut_len:
                tokenizer.fit_on_texts(texts=texts_cut[n:n + batchsize])
                n += batchsize
                if n < texts_cut_len:
                    print('tokenizer finish fit %d samples' % n)
                else:
                    print('tokenizer finish fit %d samples' % texts_cut_len)
        self.tokenizer = tokenizer

        if tokenizer_savapah:
            with open(tokenizer_savapah, mode='wb') as f:
                pickle.dump(tokenizer, f)

        # convert all texts to integer sequences
        fact_seq = tokenizer.texts_to_sequences(texts=texts_cut)
        print('finish texts to sequences')

        # free memory by dropping the word lists
        del texts_cut

        n = 0
        fact_pad_seq = []
        # pad the sequences in batches
        while n < texts_cut_len:
            fact_pad_seq += list(pad_sequences(fact_seq[n:n + batchsize], maxlen=maxlen,
                                               padding='post', value=0, dtype='int'))
            n += batchsize
            if n < texts_cut_len:
                print('finish pad sequences %d/%d' % (n, texts_cut_len))
            else:
                print('finish pad sequences %d/%d' % (texts_cut_len, texts_cut_len))
        return fact_pad_seq

    def text2vec(self, texts_cut=None, model_word2vec=None,
                 word2vec_savepath=None, word2vec_loadpath=None,
                 sg=1, size=128, window=5, min_count=1):
        '''
        Convert word sequences into word-vector sequences, usable for machine learning
        or deep learning.
        :param texts_cut: list of word sequences
        :param model_word2vec: a trained word2vec model; if None one is loaded or trained
        :param word2vec_savepath: path for saving the word2vec model
        :param word2vec_loadpath: path for loading a word2vec model
        :param sg: 0 for CBOW, 1 for skip-gram
        :param size: the dimensionality of the feature vectors
        :param window: the maximum distance between the current and predicted word within a sentence
        :param min_count: ignore all words with total frequency lower than this
        :return: list of word-vector sequences
        '''
        if model_word2vec is None:
            if word2vec_loadpath:
                model_word2vec = word2vec.Word2Vec.load(word2vec_loadpath)
            else:
                model_word2vec = word2vec.Word2Vec(texts_cut, sg=sg, size=size, window=window, min_count=min_count)
        if word2vec_savepath:
            model_word2vec.save(word2vec_savepath)

        return [[model_word2vec[word] for word in text_cut if word in model_word2vec] for text_cut in texts_cut]

    def creat_label_set(self, labels):
        '''
        Build the set of all labels, used for one-hot encoding.
        :param labels: list of raw label lists
        :return: array of unique labels
        '''
        label_set = []
        for i in labels:
            label_set += i
        return np.array(list(set(label_set)))

    def creat_label(self, label, label_set):
        '''
        Build the one-hot encoding for a single sample's labels.
        :param label: raw labels of one sample
        :param label_set: array of all labels
        :return: one-hot label array
        e.g. creat_label(label=data_valid_accusations[12], label_set=accusations_set)
        '''
        label_zero = np.zeros(len(label_set))
        label_zero[np.in1d(label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None, label_set=None):
        '''
        Call creat_label on every sample to build a 2-D one-hot array.
        :param labels: list of raw label lists
        :param label_set: array of all labels
        :return: list of one-hot label arrays
        '''
        labels_one_hot = list(map(lambda x: self.creat_label(label=x, label_set=label_set), labels))
        return labels_one_hot
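
A minimal usage sketch of DataPreprocess (illustrative only, not part of the commit; the sample texts and labels below are invented):

from TextClassification import DataPreprocess

texts = ['我喜欢看电影', '今天天气不错', '这部电影很精彩']   # hypothetical sample texts
labels = [['film'], ['weather'], ['film']]                     # hypothetical multi-label tags

process = DataPreprocess()
texts_cut = process.cut_texts(texts=texts, need_cut=True, word_len=2)   # jieba word lists
x = process.text2seq(texts_cut=texts_cut, num_words=2000, maxlen=30)    # padded int sequences
label_set = process.creat_label_set(labels)                             # array of unique labels
y = process.creat_labels(labels=labels, label_set=label_set)            # one-hot label rows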
105 changes: 105 additions & 0 deletions TextClassification/TextClassification.py
@@ -0,0 +1,105 @@
from .DataPreprocess import DataPreprocess
from .models import CNN, RNN
import numpy as np


class TextClassification():
    def __init__(self):
        pass

    def fit(self, x=None, y=None, model=None,
            method='CNN', epochs=10, batchsize=256,
            x_need_preprocess=True, y_need_preprocess=True,
            tokenizer=None, num_words=2000, maxlen=30,
            vec_size=128, output_shape=None, output_type='multiple'):
        self.tokenizer = tokenizer
        self.num_words = num_words
        self.maxlen = maxlen
        self.vec_size = vec_size

        # preprocess the texts if needed
        if x_need_preprocess:
            process = DataPreprocess()
            # cut texts
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            # use the average text length when maxlen is not given
            if maxlen is None:
                maxlen = int(np.array([len(i) for i in x_cut]).mean())
            # texts to sequences
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer, tokenizer_savapah=None,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x_seq = np.array(x_seq)
            x = x_seq
            self.num_words = num_words
            self.maxlen = maxlen
            self.tokenizer = process.tokenizer

        if y_need_preprocess:
            process = DataPreprocess()
            label_set = process.creat_label_set(y)
            labels = process.creat_labels(labels=y, label_set=label_set)
            labels = np.array(labels)
            output_shape = labels.shape[1]
            y = labels
            self.output_shape = output_shape
            self.label_set = label_set

        if model is None:
            if method == 'CNN':
                model = CNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            elif method == 'RNN':
                model = RNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            else:
                # maybe sklearn
                pass

        model.fit(x=x, y=y, epochs=epochs, batch_size=batchsize)
        self.model = model

    def predict(self, x=None, x_need_preprocess=True,
                tokenizer=None, num_words=None, maxlen=None):
        if x_need_preprocess:
            if tokenizer is None:
                tokenizer = self.tokenizer
            if num_words is None:
                num_words = self.num_words
            if maxlen is None:
                maxlen = self.maxlen
            process = DataPreprocess()
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x = np.array(x_seq)

        model = self.model
        y = model.predict(x=x)
        return y

    def label2toptag(self, predictions, labelset):
        labels = []
        for prediction in predictions:
            label = labelset[prediction == prediction.max()]
            labels.append(label.tolist())
        return labels

    def label2half(self, predictions, labelset):
        labels = []
        for prediction in predictions:
            label = labelset[prediction > 0.5]
            labels.append(label.tolist())
        return labels

    def label2tag(self, predictions, labelset):
        labels1 = self.label2toptag(predictions, labelset)
        labels2 = self.label2half(predictions, labelset)
        labels = []
        for i in range(len(predictions)):
            if len(labels2[i]) == 0:
                labels.append(labels1[i])
            else:
                labels.append(labels2[i])
        return labels
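
A minimal end-to-end sketch of TextClassification (again illustrative only and not part of the commit; the data is invented and far too small to train anything meaningful, so epochs and batchsize are set to trivial values):

from TextClassification import TextClassification

x = ['我喜欢看电影', '这部电影很精彩', '今天天气不错', '明天可能下雨']  # hypothetical texts
y = [['film'], ['film'], ['weather'], ['weather']]                      # hypothetical labels

clf = TextClassification()
clf.fit(x=x, y=y, method='CNN', epochs=1, batchsize=2,
        x_need_preprocess=True, y_need_preprocess=True,
        num_words=2000, maxlen=30, output_type='multiple')

predictions = clf.predict(x=['下周去看电影'], x_need_preprocess=True)
print(clf.label2tag(predictions, clf.label_set))   # map probabilities back to tags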
2 changes: 2 additions & 0 deletions TextClassification/__init__.py
@@ -0,0 +1,2 @@
from .DataPreprocess import DataPreprocess
from .TextClassification import TextClassification
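
TextClassification/models.py (the CNN and RNN builders imported above) is among the files not rendered on this page. As a rough idea only, a Keras model with the same call signature could look like the following hypothetical sketch; the actual file in the commit may differ in layers and hyperparameters:

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

def CNN(input_dim=2000, input_length=30, vec_size=128,
        output_shape=None, output_type='multiple'):
    # hypothetical sketch, not the file from this commit
    model = Sequential()
    model.add(Embedding(input_dim=input_dim + 1, output_dim=vec_size,
                        input_length=input_length))
    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    if output_type == 'multiple':
        # multi-label: independent sigmoid outputs
        model.add(Dense(output_shape, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    else:
        # single-label: softmax over classes
        model.add(Dense(output_shape, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model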
