From 8771c29a969b0bfd66cbdcfe7b9866d56b8f468d Mon Sep 17 00:00:00 2001
From: tpoisonooo
Date: Thu, 10 Oct 2024 12:49:16 +0800
Subject: [PATCH] feat(test_lda): add keywords (#392)

---
 tests/test_lda/step0_preprocess.py | 160 +++++++++++++++++++++++++++++
 tests/test_lda/step1_countvec.py   | 134 ++++++++++++++++++++++++
 2 files changed, 294 insertions(+)
 create mode 100644 tests/test_lda/step0_preprocess.py
 create mode 100644 tests/test_lda/step1_countvec.py

diff --git a/tests/test_lda/step0_preprocess.py b/tests/test_lda/step0_preprocess.py
new file mode 100644
index 0000000..99540c1
--- /dev/null
+++ b/tests/test_lda/step0_preprocess.py
@@ -0,0 +1,160 @@
import jieba
import jieba.posseg as jp
import json
import os
import re
from multiprocessing import Process, cpu_count
# https://blog.csdn.net/xyisv/article/details/104482818
import hashlib
import time

image_name = re.compile(r'[0-9a-f]{18,64}')
chapter2 = re.compile(r'[0-9]{1}\.[0-9]{1}')
chapter3 = re.compile(r'[0-9]{1}\.[0-9]{1}\.[0-9]{1}')

def load_stopwords():
    sw = []
    with open('cn_en_stopwords.txt') as f:
        for line in f:
            if len(line.strip()) > 0:
                sw.append(line.strip())
    return sw

def load_documents(n: int = 1):
    basedir = '/home/data/khj/workspace/huixiangdou/repodir.lda'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                # skip images, only text documents are preprocessed
                continue
            docs.append((file, os.path.join(root, file)))

    length = len(docs)
    step = length // n
    remainder = length % n

    result = []
    start = 0
    for i in range(n):
        end = start + step + (1 if i < remainder else 0)
        result.append(docs[start:end])
        start = end

    return result

def load_newwords():
    words = []
    basename = './newwords'
    files = os.listdir(basename)
    for filename in files:
        filepath = os.path.join(basename, filename)
        with open(filepath, encoding='utf8') as f:
            words += json.load(f)
            print('load {}'.format(filepath))
    return words

def content_hash(input_str: str):
    # create a new sha256 hash object
    hash_object = hashlib.sha256()
    # update the hash object with the encoded input string (bytes)
    hash_object.update(input_str.encode())
    # get the hexadecimal digest
    hex_dig = hash_object.hexdigest()
    # return the first 6 characters
    return hex_dig[:6]

def process_data(documents: list, pid: int):
    # register new words with jieba before cutting
    t0 = time.time()
    new_words = load_newwords()
    for w in new_words:
        jieba.add_word(w, tag='n')

    stop_words = load_stopwords()
    print('{} start..'.format(pid))
    bad_patterns = [image_name, chapter2, chapter3]

    for filename, filepath in documents:
        d = ''
        with open(filepath) as f:
            d = f.read()
        # keep only the leading 80% of the content
        head_length = int(len(d) * 0.8)
        d = d[0:head_length]

        cuts = [w.word for w in jp.cut(d)]

        filtered = []
        for c in cuts:
            c = c.strip()
            if c in stop_words:
                continue

            if 'images' == c:
                continue

            skip = False
            for bad_pattern in bad_patterns:
                if bad_pattern.match(c):
                    skip = True
                    break
            if skip:
                continue

            filtered.append(c)

        if len(filtered) < 1:
            continue
        new_content = ' '.join(filtered)

        if len(new_content) < 300:
            continue
        dirname = os.path.join('preprocess', str(pid))
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        hashname = content_hash(new_content)
        outfilepath = os.path.join(dirname, hashname + '.md')

        with open('name_map.txt', 'a') as f:
            f.write('{}\t {}'.format(hashname, filepath))
            f.write('\n')

        with open(outfilepath, 'w') as f:
            f.write(new_content)
            f.flush()
    print('{} finish, timecost {}'.format(pid, time.time() - t0))

def _get_num_processes():
    num_processes = cpu_count() - 1  # leave one core free for the main process
    return num_processes

def main():
    debug_mode = False

    processes = []
    split_documents = load_documents(n=_get_num_processes())
    for process_id, documents in enumerate(split_documents):
        print(f'Distributing to process[{process_id}]...')

        if debug_mode:
            process_data(documents, process_id)
        else:
            # run each shard of documents in its own worker process
            process = Process(
                target=process_data,
                args=(
                    documents,
                    process_id,
                ),
            )
            process.start()
            print(f'Distributed to process[{process_id}].')
            processes.append(process)
    for process in processes:
        process.join()

if __name__ == '__main__':
    main()
diff --git a/tests/test_lda/step1_countvec.py b/tests/test_lda/step1_countvec.py
new file mode 100644
index 0000000..48d10c8
--- /dev/null
+++ b/tests/test_lda/step1_countvec.py
@@ -0,0 +1,134 @@
# Author: Olivier Grisel
#         Lars Buitinck
#         Chyi-Kwei Yau
# License: BSD 3 clause

from time import time
import shutil
import matplotlib.pyplot as plt
import os
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import json
# https://blog.csdn.net/xyisv/article/details/104482818
import pickle as pkl

n_features = 2048
n_components = 100
n_top_words = 100
batch_size = 128

def files():
    basedir = '/home/data/khj/workspace/huixiangdou/lda/preprocess'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                # skip images, only the preprocessed text files are vectorized
                continue
            docs.append((file, os.path.join(root, file)))
    return docs

def filecontents(dirname: str):
    filepaths = files()
    for _, filepath in filepaths:
        with open(filepath) as f:
            content = f.read()
            if len(content) > 0:
                yield content

def load_namemap():
    namemap = dict()
    with open('name_map.txt') as f:
        for line in f:
            parts = line.split('\t')
            namemap[parts[0].strip()] = parts[1].strip()
    return namemap

# reference https://blog.csdn.net/xyisv/article/details/104482818
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    # only plot as many topics as the 2 x 5 grid has axes
    for topic_idx, topic in enumerate(model.components_[:len(axes)]):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig('topic_centers.jpg')

def build_topic(dirname: str = 'preprocess'):
    namemap = load_namemap()

    tf_vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
    )

    t0 = time()
    tf = tf_vectorizer.fit_transform(filecontents(dirname))
    print("BoW in %0.3fs." % (time() - t0))

    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    t0 = time()
    doc_types = lda.fit_transform(tf)
    print("lda train in %0.3fs." % (time() - t0))

    feature_names = tf_vectorizer.get_feature_names_out()

    models = {'CountVectorizer': tf_vectorizer, 'LatentDirichletAllocation': lda}
    with open('lda_models.pkl', 'wb') as model_file:
        pkl.dump(models, model_file)

    top_features_list = []
    for _, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]
        top_features_list.append(top_features.tolist())

    os.makedirs('cluster', exist_ok=True)
    with open(os.path.join('cluster', 'desc.json'), 'w') as f:
        json_str = json.dumps(top_features_list, ensure_ascii=False)
        f.write(json_str)

    filepaths = files()

    for file_id, doc_score in enumerate(doc_types):
        basename, input_filepath = filepaths[file_id]
        hashname = basename.split('.')[0]
        source_filepath = namemap[hashname]
        indices_np = np.where(doc_score > 0.1)[0]
        for topic_id in indices_np:
            target_dir = os.path.join('cluster', str(topic_id))
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            shutil.copy(source_filepath, target_dir)

if __name__ == '__main__':
    build_topic()
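For reference, the two scripts are run in order: step0_preprocess.py writes jieba-segmented, stop-word-filtered documents to preprocess/<pid>/<hash>.md and records the hash-to-source mapping in name_map.txt; step1_countvec.py then fits a bag-of-words CountVectorizer plus an online LDA on those files, pickles both models into lda_models.pkl, dumps the per-topic keyword lists to cluster/desc.json, and copies each source file into cluster/<topic_id>/ for every topic whose weight exceeds 0.1. The sketch below is not part of this patch; it only illustrates how the pickled models could be reused to look up topics for one new, already-preprocessed (jieba-cut, space-joined) document. The function name query_topics, the top_n default, and the example input path are assumptions made for illustration.

# Hypothetical follow-up sketch (not part of this patch): reuse the models that
# step1_countvec.py pickles into lda_models.pkl.
import pickle as pkl

import numpy as np


def query_topics(preprocessed_text, model_path='lda_models.pkl', top_n=3):
    # load the CountVectorizer and LatentDirichletAllocation saved by build_topic()
    with open(model_path, 'rb') as f:
        models = pkl.load(f)
    tf_vectorizer = models['CountVectorizer']
    lda = models['LatentDirichletAllocation']

    # transform() expects an iterable of documents, so wrap the single text in a list
    tf = tf_vectorizer.transform([preprocessed_text])
    # doc_topic has shape (1, n_components); each entry is a topic weight for this document
    doc_topic = lda.transform(tf)[0]
    # return the ids of the highest-weight topics, strongest first
    return np.argsort(doc_topic)[::-1][:top_n].tolist()


if __name__ == '__main__':
    # 'preprocess/0/xxxxxx.md' stands for any file written by step0_preprocess.py
    with open('preprocess/0/xxxxxx.md') as f:
        print(query_topics(f.read()))

The topic ids returned this way line up with the cluster/<topic_id> directories and with the index order of the keyword lists in cluster/desc.json written by step1_countvec.py.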