feat(test_lda): add keywords (#392)
tpoisonooo authored Oct 10, 2024
1 parent ce0d486 commit 8771c29
Showing 2 changed files with 294 additions and 0 deletions.
160 changes: 160 additions & 0 deletions tests/test_lda/step0_preprocess.py
@@ -0,0 +1,160 @@
import jieba
import jieba.posseg as jp
import pdb
import json
import os
import re
from multiprocessing import Process, cpu_count
# https://blog.csdn.net/xyisv/article/details/104482818
import hashlib
import time
image_name = re.compile(r'[0-9a-f]{18,64}')
chapter2 = re.compile(r'[0-9]{1}\.[0-9]{1}')
chapter3 = re.compile(r'[0-9]{1}\.[0-9]{1}\.[0-9]{1}')

def load_stopwords():
    sw = []
    with open('cn_en_stopwords.txt') as f:
        for line in f:
            if len(line.strip()) > 0:
                sw.append(line.strip())
    return sw

def load_documents(n: int = 1):
    basedir = '/home/data/khj/workspace/huixiangdou/repodir.lda'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                pdb.set_trace()
            else:
                docs.append((file, os.path.join(root, file)))

    length = len(docs)
    step = length // n
    remainder = length % n

    result = []
    start = 0
    for i in range(n):
        end = start + step + (1 if i < remainder else 0)
        result.append(docs[start:end])
        start = end

    return result

def load_newwords():
    words = []
    basename = './newwords'
    files = os.listdir(basename)
    for filename in files:
        filepath = os.path.join(basename, filename)
        with open(filepath, encoding='utf8') as f:
            words += json.load(f)
        print('load {}'.format(filepath))
    return words

def content_hash(input_str: str):
    # create a new sha256 hash object
    hash_object = hashlib.sha256()
    # update the hash object with the encoded (bytes) input string
    hash_object.update(input_str.encode())
    # get the hexadecimal digest
    hex_dig = hash_object.hexdigest()
    # return the first 6 characters
    return hex_dig[:6]

def process_data(documents: list, pid: int):
    # register new words so jieba keeps them as single tokens
    t0 = time.time()
    new_words = load_newwords()
    for w in new_words:
        jieba.add_word(w, tag='n')

    stop_words = load_stopwords()
    print('{} start..'.format(pid))
    bad_patterns = [image_name, chapter2, chapter3]

    for filename, filepath in documents:
        d = ''
        with open(filepath) as f:
            d = f.read()
        # keep only the leading 80% of the content
        head_length = int(len(d) * 0.8)
        d = d[0:head_length]

        cuts = [w.word for w in jp.cut(d)]

        filtered = []
        for c in cuts:
            c = c.strip()
            if c in stop_words:
                continue

            if 'images' == c:
                continue

            skip = False
            for bad_pattern in bad_patterns:
                if bad_pattern.match(c):
                    skip = True
                    break
            if skip:
                continue

            filtered.append(c)

        if len(filtered) < 1:
            continue
        new_content = ' '.join(filtered)

        if len(new_content) < 300:
            continue
        dirname = os.path.join('preprocess', str(pid))
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        hashname = content_hash(new_content)
        outfilepath = os.path.join(dirname, hashname + '.md')

        with open('name_map.txt', 'a') as f:
            f.write('{}\t {}'.format(hashname, filepath))
            f.write('\n')

        with open(outfilepath, 'w') as f:
            f.write(new_content)
            f.flush()
    print('{} finish, timecost {}'.format(pid, time.time() - t0))

def _get_num_processes():
    num_processes = cpu_count() - 1  # Good habit to leave 1 core.
    return num_processes

def main():
    debug_mode = False

    processes = []
    split_documents = load_documents(n=_get_num_processes())
    for process_id, documents in enumerate(split_documents):
        print(f'Distributing to process[{process_id}]...')

        if debug_mode:
            process_data(documents, process_id)
        else:
            # spawn one worker process per shard of documents
            process = Process(
                target=process_data,
                args=(
                    documents,
                    process_id,
                ),
            )
            process.start()
            print(f'Distributed to process[{process_id}].')
            processes.append(process)
    for process in processes:
        process.join()


if __name__ == '__main__':
    main()
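For context on the keyword registration above: jieba.add_word() forces a term from ./newwords to be emitted as a single token when jieba.posseg cuts the text. A minimal sketch of that effect, using a made-up term and sentence (neither is taken from the actual newwords JSON files):

import jieba
import jieba.posseg as jp

sentence = '如何配置向量召回的阈值'  # hypothetical sample text

# with only the built-in dictionary, the compound term may be split apart
print([w.word for w in jp.cut(sentence)])

# register it as a noun, mirroring process_data() above
jieba.add_word('向量召回', tag='n')

# now '向量召回' comes out as one token (tagged 'n')
print([w.word for w in jp.cut(sentence)])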
134 changes: 134 additions & 0 deletions tests/test_lda/step1_countvec.py
@@ -0,0 +1,134 @@
# Author: Olivier Grisel <[email protected]>
# Lars Buitinck
# Chyi-Kwei Yau <[email protected]>
# License: BSD 3 clause

from time import time
import shutil
import matplotlib.pyplot as plt
import pdb
import os
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import jieba
import jieba.posseg as jp
import json
import re
from multiprocessing import Process, cpu_count
# https://blog.csdn.net/xyisv/article/details/104482818
import pickle as pkl

n_features = 2048
n_components = 100
n_top_words = 100
batch_size = 128

def files():
    basedir = '/home/data/khj/workspace/huixiangdou/lda/preprocess'

    docs = []
    for root, _, files in os.walk(basedir):
        for file in files:
            if file.endswith('.jpg') or file.endswith('.png') or file.endswith('.jpeg'):
                pdb.set_trace()
            else:
                docs.append((file, os.path.join(root, file)))
    return docs

def filecontents(dirname: str):
    filepaths = files()
    for _, filepath in filepaths:
        with open(filepath) as f:
            content = f.read()
            if len(content) > 0:
                yield content

def load_namemap():
    namemap = dict()
    with open('name_map.txt') as f:
        for line in f:
            parts = line.split('\t')
            namemap[parts[0].strip()] = parts[1].strip()
    return namemap

# reference step https://blog.csdn.net/xyisv/article/details/104482818
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
    fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.savefig('topic_centers.jpg')

def build_topic(dirname: str = 'preprocess'):
    namemap = load_namemap()
    pdb.set_trace()

    tf_vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=n_features, stop_words="english"
    )

    t0 = time()
    tf = tf_vectorizer.fit_transform(filecontents(dirname))
    print("BoW in %0.3fs." % (time() - t0))

    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=5,
        learning_method="online",
        learning_offset=50.0,
        random_state=0,
    )
    t0 = time()
    doc_types = lda.fit_transform(tf)

    pdb.set_trace()
    print("lda train in %0.3fs." % (time() - t0))
    feature_names = tf_vectorizer.get_feature_names_out()

    models = {'CountVectorizer': tf_vectorizer, 'LatentDirichletAllocation': lda}
    with open('lda_models.pkl', 'wb') as model_file:
        pkl.dump(models, model_file)

    top_features_list = []
    for _, topic in enumerate(lda.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]
        top_features_list.append(top_features.tolist())

    # make sure the output folder exists before writing the topic descriptions
    if not os.path.exists('cluster'):
        os.makedirs('cluster')
    with open(os.path.join('cluster', 'desc.json'), 'w') as f:
        json_str = json.dumps(top_features_list, ensure_ascii=False)
        f.write(json_str)

    filepaths = files()

    pdb.set_trace()
    for file_id, doc_score in enumerate(doc_types):
        basename, input_filepath = filepaths[file_id]
        hashname = basename.split('.')[0]
        source_filepath = namemap[hashname]
        indices_np = np.where(doc_score > 0.1)[0]
        for topic_id in indices_np:
            target_dir = os.path.join('cluster', str(topic_id))
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            shutil.copy(source_filepath, target_dir)


if __name__ == '__main__':
    build_topic()
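The two steps leave reusable artifacts behind (preprocess/<pid>/*.md, name_map.txt, lda_models.pkl, cluster/desc.json). A minimal sketch of scoring a new, already-preprocessed document against the trained topics, assuming the pickle produced by build_topic() above; new_doc.md is a placeholder file name and the 0.1 threshold mirrors build_topic():

import pickle as pkl
import numpy as np

with open('lda_models.pkl', 'rb') as f:
    models = pkl.load(f)
tf_vectorizer = models['CountVectorizer']
lda = models['LatentDirichletAllocation']

# placeholder: a space-joined, jieba-segmented document like those in preprocess/
with open('new_doc.md') as f:
    text = f.read()

tf = tf_vectorizer.transform([text])
scores = lda.transform(tf)[0]          # topic distribution for this document
topic_ids = np.where(scores > 0.1)[0]  # same cut-off as build_topic()
print(topic_ids, scores[topic_ids])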
