update docs and examples

bab2min · Aug 4, 2020 · e3174dd · e3174dd
1 parent c850b42
commit e3174dd
Show file tree

Hide file tree

Showing 14 changed files with 199 additions and 156 deletions.
diff --git a/README.kr.rst b/README.kr.rst
@@ -78,6 +78,8 @@ tomotopy의 가장 최신버전은 0.9.0 입니다.
     for k in range(mdl.k):
         print('Top 10 words of topic #{}'.format(k))
         print(mdl.get_topic_words(k, top_n=10))
+    
+    mdl.summary()
 
 tomotopy의 성능
 -----------------------
@@ -243,7 +245,7 @@ LDA모델로 1000회 iteration을 수행시 걸리는 시간을 초 단위로
 
 예제 코드
 ---------
-tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/example.py 를 확인하시길 바랍니다.
+tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/master/examples/ 를 확인하시길 바랍니다.
 
 예제 코드에서 사용했던 데이터 파일은 https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view 에서 다운받을 수 있습니다.
 

diff --git a/README.rst b/README.rst
@@ -79,6 +79,8 @@ Here is a sample code for simple LDA training of texts from 'sample.txt' file.
     for k in range(mdl.k):
         print('Top 10 words of topic #{}'.format(k))
         print(mdl.get_topic_words(k, top_n=10))
+    
+    mdl.summary()
 
 Performance of tomotopy
 -----------------------
@@ -248,7 +250,7 @@ See `word_prior_example` in `example.py` for more details.
 
 Examples
 --------
-You can find an example python code of tomotopy at https://github.com/bab2min/tomotopy/blob/master/example.py .
+You can find example Python code for tomotopy at https://github.com/bab2min/tomotopy/blob/master/examples/ .
 
 You can also get the data file used in the example code at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view .
 

diff --git a/example.py b/example.py
diff --git a/examples/corpus_and_labeling.py b/examples/corpus_and_labeling.py
@@ -0,0 +1,36 @@
+import sys
+import tomotopy as tp
+
+def corpus_and_labeling_example(input_file):
+    """Train an LDA model from a ``tp.utils.Corpus`` and print topic labels.
+
+    The file at ``input_file`` is read as UTF-8 and fed to the corpus, which
+    tokenizes it with ``SimpleTokenizer`` and drops the '.' token. A 20-topic
+    LDA model is trained for 1000 iterations (100 rounds of 10), its summary
+    is printed, and then for every topic the top 5 automatically extracted
+    labels and the top 10 words are printed.
+
+    Args:
+        input_file: path of the text file to read.
+    """
+    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(), stopwords=['.'])
+    # process() takes an iterable that yields either a raw string or a
+    # (raw string, user data) tuple; iterating the file feeds it line by line.
+    # The `with` block makes sure the file handle is closed afterwards.
+    with open(input_file, encoding='utf-8') as fin:
+        corpus.process(fin)
+
+    # make LDA model and train
+    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
+    # train(0) runs no iterations; it is called before the statistics below are read
+    mdl.train(0)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
+    print('Removed top words:', mdl.removed_top_words)
+    for i in range(0, 1000, 10):
+        mdl.train(10)
+        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
+
+    mdl.summary()
+
+    # extract candidates for auto topic labeling
+    extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
+    cands = extractor.extract(mdl)
+
+    labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
+    for k in range(mdl.k):
+        print("== Topic #{} ==".format(k))
+        # get_topic_labels yields (label, score) pairs; only the label is shown
+        print("Labels:", ', '.join(label for label, _ in labeler.get_topic_labels(k, top_n=5)))
+        for word, prob in mdl.get_topic_words(k, top_n=10):
+            print(word, prob, sep='\t')
+        print()
+
+# You can get the sample data file 'enwiki-stemmed-1000.txt'
+# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
+
+# Script entry: runs the example at module level; the data file name is a
+# relative path, so it is looked up in the current working directory.
+print('Running LDA and Labeling')
+corpus_and_labeling_example('enwiki-stemmed-1000.txt')
diff --git a/examples/ctm_network.py b/examples/ctm_network.py
@@ -47,6 +47,8 @@
     mdl.train(20)
 print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))
 
+mdl.summary()
+
 # Let's visualize the result
 g = Network(width=800, height=800, font_color="#333")
 correl = mdl.get_correlations().reshape([-1])

diff --git a/examples/dtm_plot.py → examples/dtm.py b/examples/dtm_plot.py → examples/dtm.py
@@ -28,6 +28,8 @@ def data_feeder(input_file):
     mdl.train(20)
 print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))
 
+mdl.summary()
+
 topic_dist_by_time = np.zeros(shape=[mdl.num_timepoints, mdl.k], dtype=np.float)
 for doc in mdl.docs:
     topic_dist_by_time[doc.timepoint] += doc.get_topic_dist()

diff --git a/examples/gdmr_plot.py b/examples/gdmr_plot.py
@@ -76,6 +76,8 @@ def __call__(self, value, clip=None):
     mdl.train(20)
 print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))
 
+mdl.summary()
+
 # Let's visualize the result
 topic_counts = mdl.get_count_by_topics()
 lambdas = mdl.lambdas

diff --git a/examples/hdp_basic.py b/examples/hdp_basic.py
@@ -0,0 +1,32 @@
+import sys
+import tomotopy as tp
+
+def hdp_example(input_file, save_path):
+    """Train an HDP model on a whitespace-tokenized file, save it and print topics.
+
+    Each line of ``input_file`` (read as UTF-8) is split on whitespace and
+    added as one document. The model is trained for 1000 iterations (100
+    rounds of 10), its summary is printed, it is saved to ``save_path``, and
+    the words of every live topic are printed in descending order of the
+    counts returned by ``get_count_by_topics()``.
+
+    Args:
+        input_file: path of the text file, one document per line.
+        save_path: path the trained model is written to.
+    """
+    mdl = tp.HDPModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5)
+    # `with` closes the input file as soon as all documents are added
+    with open(input_file, encoding='utf-8') as fin:
+        for line in fin:
+            mdl.add_doc(line.strip().split())
+    mdl.burn_in = 100
+    # train(0) runs no iterations; it is called before the statistics below are read
+    mdl.train(0)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
+    print('Removed top words:', mdl.removed_top_words)
+    print('Training...', file=sys.stderr, flush=True)
+    for i in range(0, 1000, 10):
+        mdl.train(10)
+        print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, mdl.ll_per_word, mdl.live_k))
+
+    mdl.summary()
+    print('Saving...', file=sys.stderr, flush=True)
+    # NOTE(review): the second positional argument is presumably the `full` flag - confirm
+    mdl.save(save_path, True)
+
+    # topic ids ordered from the largest to the smallest topic count
+    topic_counts = mdl.get_count_by_topics()
+    important_topics = [k for k, _ in sorted(enumerate(topic_counts), key=lambda kv: kv[1], reverse=True)]
+    for k in important_topics:
+        # skip topics the model reports as not live
+        if not mdl.is_live_topic(k):
+            continue
+        print('Topic #{}'.format(k))
+        for word, prob in mdl.get_topic_words(k):
+            print('\t', word, prob, sep='\t')
+# You can get the sample data file 'enwiki-stemmed-1000.txt'
+# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
+
+# Script entry: runs the example at module level; both file names are
+# relative paths, so they resolve against the current working directory.
+print('Running HDP')
+hdp_example('enwiki-stemmed-1000.txt', 'test.hdp.bin')
diff --git a/examples/lda_basic.py b/examples/lda_basic.py
@@ -0,0 +1,31 @@
+import sys
+import tomotopy as tp
+
+def lda_example(input_file, save_path):
+    """Train a 20-topic LDA model on a whitespace-tokenized file, save it and print topics.
+
+    Each line of ``input_file`` (read as UTF-8) is split on whitespace and
+    added as one document. The model is trained for 1000 iterations (100
+    rounds of 10), its summary is printed, it is saved to ``save_path``, and
+    the words of every topic are printed.
+
+    Args:
+        input_file: path of the text file, one document per line.
+        save_path: path the trained model is written to.
+    """
+    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20)
+    # `with` closes the input file as soon as all documents are added
+    with open(input_file, encoding='utf-8') as fin:
+        for line in fin:
+            mdl.add_doc(line.strip().split())
+    mdl.burn_in = 100
+    # train(0) runs no iterations; it is called before the statistics below are read
+    mdl.train(0)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
+    print('Removed top words:', mdl.removed_top_words)
+    print('Training...', file=sys.stderr, flush=True)
+    for i in range(0, 1000, 10):
+        mdl.train(10)
+        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
+
+    mdl.summary()
+    print('Saving...', file=sys.stderr, flush=True)
+    # NOTE(review): the second positional argument is presumably the `full` flag - confirm
+    mdl.save(save_path, True)
+
+    for k in range(mdl.k):
+        print('Topic #{}'.format(k))
+        for word, prob in mdl.get_topic_words(k):
+            print('\t', word, prob, sep='\t')
+
+# You can get the sample data file 'enwiki-stemmed-1000.txt'
+# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
+
+# Script entry: runs the example at module level; both file names are
+# relative paths, so they resolve against the current working directory.
+print('Running LDA')
+lda_example('enwiki-stemmed-1000.txt', 'test.lda.bin')
diff --git a/examples/lda_visualization.py b/examples/lda_visualization.py
@@ -45,6 +45,8 @@
     mdl.train(20)
 print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))
 
+mdl.summary()
+
 topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
 doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
 doc_lengths = np.array([len(doc.words) for doc in mdl.docs])

diff --git a/examples/raw_corpus_and_labeling.py b/examples/raw_corpus_and_labeling.py
@@ -0,0 +1,42 @@
+import sys
+import tomotopy as tp
+
+def raw_corpus_and_labeling_example(input_file):
+    """Train an LDA model from raw text with NLTK preprocessing and print topic labels.
+
+    The file at ``input_file`` is read as UTF-8 and fed to a
+    ``tp.utils.Corpus`` whose tokenizer stems tokens with NLTK's Porter
+    stemmer; tokens of length <= 2 or found in NLTK's English stopword list
+    are dropped. A 20-topic LDA model is trained for 1000 iterations (100
+    rounds of 10), its summary is printed, and then for every topic the top 5
+    automatically extracted labels and the top 10 words are printed.
+
+    Args:
+        input_file: path of the raw text file to read.
+    """
+    # imported here so that nltk is only required when this example runs
+    from nltk.stem.porter import PorterStemmer
+    from nltk.corpus import stopwords
+    stemmer = PorterStemmer()
+    stops = set(stopwords.words('english'))
+    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
+        stopwords=lambda x: len(x) <= 2 or x in stops)
+    # process() takes an iterable that yields either a raw string or a
+    # (raw string, user data) tuple; iterating the file feeds it line by line.
+    # The `with` block makes sure the file handle is closed afterwards.
+    with open(input_file, encoding='utf-8') as fin:
+        corpus.process(fin)
+
+    # make LDA model and train
+    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
+    # train(0) runs no iterations; it is called before the statistics below are read
+    mdl.train(0)
+    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
+    print('Removed top words:', mdl.removed_top_words)
+    for i in range(0, 1000, 10):
+        mdl.train(10)
+        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
+
+    mdl.summary()
+
+    # extract candidates for auto topic labeling
+    extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
+    cands = extractor.extract(mdl)
+
+    labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
+    for k in range(mdl.k):
+        print("== Topic #{} ==".format(k))
+        # get_topic_labels yields (label, score) pairs; only the label is shown
+        print("Labels:", ', '.join(label for label, _ in labeler.get_topic_labels(k, top_n=5)))
+        for word, prob in mdl.get_topic_words(k, top_n=10):
+            print(word, prob, sep='\t')
+        print()
+
+
+# You can get the sample data file 'enwiki-stemmed-1000.txt'
+# at https://drive.google.com/file/d/18OpNijd4iwPyYZ2O7pQoPyeTAKEXa71J/view?usp=sharing
+# NOTE(review): the call below reads 'enwiki-1000.txt', not the stemmed file
+# named above - presumably the unstemmed text, since this example stems the
+# tokens itself; confirm that the download actually provides that file.
+
+# Script entry: runs the example at module level; the data file name is a
+# relative path, so it is looked up in the current working directory.
+print('Running LDA from raw corpus and Labeling')
+raw_corpus_and_labeling_example('enwiki-1000.txt')