From 8771c29a969b0bfd66cbdcfe7b9866d56b8f468d Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Thu, 10 Oct 2024 12:49:16 +0800
Subject: [PATCH] feat(test_lda): add keywords (#392)

---
 tests/test_lda/step0_preprocess.py | 160 +++++++++++++++++++++++++++++
 tests/test_lda/step1_countvec.py   | 134 ++++++++++++++++++++++++
 2 files changed, 294 insertions(+)
 create mode 100644 tests/test_lda/step0_preprocess.py
 create mode 100644 tests/test_lda/step1_countvec.py

diff --git a/tests/test_lda/step0_preprocess.py b/tests/test_lda/step0_preprocess.py
new file mode 100644
index 0000000..99540c1
--- /dev/null
+++ b/tests/test_lda/step0_preprocess.py
@@ -0,0 +1,160 @@
import jieba
import jieba.posseg as jp
import json
import os
import re
from multiprocessing import Process, cpu_count
# https://blog.csdn.net/xyisv/article/details/104482818
import hashlib
import time

image_name = re.compile(r'[0-9a-f]{18,64}')
chapter2 = re.compile(r'[0-9]{1}\.[0-9]{1}')
chapter3 = re.compile(r'[0-9]{1}\.[0-9]{1}\.[0-9]{1}')

def load_stopwords():
    sw = []
    with open('cn_en_stopwords.txt') as f:
        for line in f:
            if len(line.strip()) > 0:
                sw.append(line.strip())
    return sw

def load_documents(n: int = 1):
    basedir = '/home/data/khj/workspace/huixiangdou/repodir.lda'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                # skip images, only text documents are preprocessed
                continue
            docs.append((file, os.path.join(root, file)))

    length = len(docs)
    step = length // n
    remainder = length % n

    result = []
    start = 0
    for i in range(n):
        end = start + step + (1 if i < remainder else 0)
        result.append(docs[start:end])
        start = end

    return result

def load_newwords():
    words = []
    basename = './newwords'
    files = os.listdir(basename)
    for filename in files:
        filepath = os.path.join(basename, filename)
        with open(filepath, encoding='utf8') as f:
            words += json.load(f)
            print('load {}'.format(filepath))
    return words

def content_hash(input_str: str):
    # create a new sha256 hash object
    hash_object = hashlib.sha256()
    # update the hash object with the encoded input string (bytes)
    hash_object.update(input_str.encode())
    # get the hexadecimal digest
    hex_dig = hash_object.hexdigest()
    # return the first 6 characters
    return hex_dig[:6]

def process_data(documents: list, pid: int):
    # register new words with jieba before cutting
    t0 = time.time()
    new_words = load_newwords()
    for w in new_words:
        jieba.add_word(w, tag='n')

    stop_words = load_stopwords()
    print('{} start..'.format(pid))
    bad_patterns = [image_name, chapter2, chapter3]

    for filename, filepath in documents:
        d = ''
        with open(filepath) as f:
            d = f.read()
        # keep only the leading 80% of the content
        head_length = int(len(d) * 0.8)
        d = d[0:head_length]

        cuts = [w.word for w in jp.cut(d)]

        filtered = []
        for c in cuts:
            c = c.strip()
            if c in stop_words:
                continue

            if 'images' == c:
                continue

            skip = False
            for bad_pattern in bad_patterns:
                if bad_pattern.match(c):
                    skip = True
                    break
            if skip:
                continue

            filtered.append(c)

        if len(filtered) < 1:
            continue
        new_content = ' '.join(filtered)

        if len(new_content) < 300:
            continue
        dirname = os.path.join('preprocess', str(pid))
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        hashname = content_hash(new_content)
        outfilepath = os.path.join(dirname, hashname + '.md')

        with open('name_map.txt', 'a') as f:
            f.write('{}\t {}'.format(hashname, filepath))
            f.write('\n')

        with open(outfilepath, 'w') as f:
            f.write(new_content)
            f.flush()
    print('{} finish, timecost {}'.format(pid, time.time() - t0))

def _get_num_processes():
    num_processes = cpu_count() - 1  # leave one core free for the main process
    return num_processes

def main():
    debug_mode = False

    processes = []
    split_documents = load_documents(n=_get_num_processes())
    for process_id, documents in enumerate(split_documents):
        print(f'Distributing to process[{process_id}]...')

        if debug_mode:
            process_data(documents, process_id)
        else:
            # run each shard of documents in its own worker process
            process = Process(
                target=process_data,
                args=(
                    documents,
                    process_id,
                ),
            )
            process.start()
            print(f'Distributed to process[{process_id}].')
            processes.append(process)
    for process in processes:
        process.join()

if __name__ == '__main__':
    main()
diff --git a/tests/test_lda/step1_countvec.py b/tests/test_lda/step1_countvec.py
new file mode 100644
index 0000000..48d10c8
--- /dev/null
+++ b/tests/test_lda/step1_countvec.py
@@ -0,0 +1,134 @@
# Author: Olivier Grisel
#         Lars Buitinck
#         Chyi-Kwei Yau
# License: BSD 3 clause

from time import time
import shutil
import matplotlib.pyplot as plt
import os
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import json
# https://blog.csdn.net/xyisv/article/details/104482818
import pickle as pkl

n_features = 2048
n_components = 100
n_top_words = 100
batch_size = 128

def files():
    basedir = '/home/data/khj/workspace/huixiangdou/lda/preprocess'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                # skip images, only the preprocessed text files are vectorized
                continue
            docs.append((file, os.path.join(root, file)))
    return docs

def filecontents(dirname: str):
    filepaths = files()
    for _, filepath in filepaths:
        with open(filepath) as f:
            content = f.read()
            if len(content) > 0:
                yield content

def load_namemap():
    namemap = dict()
    with open('name_map.txt') as f:
        for line in f:
            parts = line.split('\t')
            namemap[parts[0].strip()] = parts[1].strip()
    return namemap

# reference https://blog.csdn.net/xyisv/article/details/104482818
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    # only plot as many topics as the 2 x 5 grid has axes
    for topic_idx, topic in enumerate(model.components_[:len(axes)]):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig('topic_centers.jpg')

def build_topic(dirname: str = 'preprocess'):
    namemap = load_namemap()

    tf_vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
    )

    t0 = time()
    tf = tf_vectorizer.fit_transform(filecontents(dirname))
    print("BoW in %0.3fs." % (time() - t0))

    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    t0 = time()
    doc_types = lda.fit_transform(tf)
    print("lda train in %0.3fs." % (time() - t0))

    feature_names = tf_vectorizer.get_feature_names_out()

    models = {'CountVectorizer': tf_vectorizer, 'LatentDirichletAllocation': lda}
    with open('lda_models.pkl', 'wb') as model_file:
        pkl.dump(models, model_file)

    top_features_list = []
    for _, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]
        top_features_list.append(top_features.tolist())

    os.makedirs('cluster', exist_ok=True)
    with open(os.path.join('cluster', 'desc.json'), 'w') as f:
        json_str = json.dumps(top_features_list, ensure_ascii=False)
        f.write(json_str)

    filepaths = files()

    for file_id, doc_score in enumerate(doc_types):
        basename, input_filepath = filepaths[file_id]
        hashname = basename.split('.')[0]
        source_filepath = namemap[hashname]
        indices_np = np.where(doc_score > 0.1)[0]
        for topic_id in indices_np:
            target_dir = os.path.join('cluster', str(topic_id))
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            shutil.copy(source_filepath, target_dir)

if __name__ == '__main__':
    build_topic()
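For reference, the two scripts are run in order: step0_preprocess.py writes jieba-segmented, stop-word-filtered documents to preprocess/<pid>/<hash>.md and records the hash-to-source mapping in name_map.txt; step1_countvec.py then fits a bag-of-words CountVectorizer plus an online LDA on those files, pickles both models into lda_models.pkl, dumps the per-topic keyword lists to cluster/desc.json, and copies each source file into cluster/<topic_id>/ for every topic whose weight exceeds 0.1. The sketch below is not part of this patch; it only illustrates how the pickled models could be reused to look up topics for one new, already-preprocessed (jieba-cut, space-joined) document. The function name query_topics, the top_n default, and the example input path are assumptions made for illustration.

# Hypothetical follow-up sketch (not part of this patch): reuse the models that
# step1_countvec.py pickles into lda_models.pkl.
import pickle as pkl

import numpy as np


def query_topics(preprocessed_text, model_path='lda_models.pkl', top_n=3):
    # load the CountVectorizer and LatentDirichletAllocation saved by build_topic()
    with open(model_path, 'rb') as f:
        models = pkl.load(f)
    tf_vectorizer = models['CountVectorizer']
    lda = models['LatentDirichletAllocation']

    # transform() expects an iterable of documents, so wrap the single text in a list
    tf = tf_vectorizer.transform([preprocessed_text])
    # doc_topic has shape (1, n_components); each entry is a topic weight for this document
    doc_topic = lda.transform(tf)[0]
    # return the ids of the highest-weight topics, strongest first
    return np.argsort(doc_topic)[::-1][:top_n].tolist()


if __name__ == '__main__':
    # 'preprocess/0/xxxxxx.md' stands for any file written by step0_preprocess.py
    with open('preprocess/0/xxxxxx.md') as f:
        print(query_topics(f.read()))

The topic ids returned this way line up with the cluster/<topic_id> directories and with the index order of the keyword lists in cluster/desc.json written by step1_countvec.py.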