Commit a800a22

Merge pull request #104 from bab2min/dev

prepare 0.11.0

bab2min authored Mar 25, 2021
2 parents ccae9aa + 463ab7c commit a800a22
Showing 86 changed files with 3,002 additions and 1,055 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/generate_documentation.yml
@@ -61,6 +61,8 @@ jobs:
popd
cp html/tomotopy/* ../g/bab2min.github.io/tomotopy/v${TOMOTOPY_VER}/${{ matrix.language }}
pushd ../g/bab2min.github.io
echo "<meta http-equiv='refresh' content='0;url=/tomotopy/${TOMOTOPY_VER}/en/'' >" > index.html
echo "<meta http-equiv='refresh' content='0;url=/tomotopy/${TOMOTOPY_VER}/kr/'' >" > index.kr.html
git config user.email "[email protected]"
git config user.name "bab2min"
git add .
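Each `echo` above writes a one-line HTML redirect page into the Pages checkout. As a sketch, with `TOMOTOPY_VER` expanded to `0.11.0`, the generated index.html contains:

<meta http-equiv='refresh' content='0;url=/tomotopy/0.11.0/en/' >

so the site root immediately forwards visitors to the versioned English documentation (index.kr.html does the same for the Korean pages).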
1 change: 1 addition & 0 deletions .gitignore
@@ -17,3 +17,4 @@ enwiki-1000.txt
examples/*
!examples/*.py
enwiki-16000.txt
*.cps
5 changes: 4 additions & 1 deletion README.kr.rst
@@ -35,7 +35,7 @@ What is tomotopy?

Please visit https://bab2min.github.io/tomotopy/index.kr.html for more information.

The most recent version of tomotopy is 0.10.2.
The most recent version of tomotopy is 0.11.0.

Getting Started
---------------
@@ -255,6 +255,9 @@ The Python3 example code of tomotopy can be found at https://github.com/bab2min/tomotopy/blob/ma

History
-------
* 0.10.3 (2021-03-01)
    * An issue was fixed where `tomotopy.HDPModel.infer` sometimes causes a segmentation fault.

* 0.10.2 (2021-02-16)
    * An issue was fixed where `tomotopy.CTModel.train` fails with large K values.
    * An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
7 changes: 5 additions & 2 deletions README.rst
@@ -36,7 +36,7 @@ The current version of `tomoto` supports several major topic models including

Please visit https://bab2min.github.io/tomotopy to see more information.

The most recent version of tomotopy is 0.10.2.
The most recent version of tomotopy is 0.11.0.

Getting Started
---------------
@@ -261,6 +261,9 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh

History
-------
* 0.10.3 (2021-03-01)
    * An issue was fixed where `tomotopy.HDPModel.infer` sometimes causes a segmentation fault.

* 0.10.2 (2021-02-16)
    * An issue was fixed where `tomotopy.CTModel.train` fails with large K.
    * An issue was fixed where `tomotopy.utils.Corpus` loses its `uid` values.
@@ -273,7 +276,7 @@ History

* 0.10.0 (2020-12-19)
    * The interfaces of `tomotopy.utils.Corpus` and `tomotopy.LDAModel.docs` were unified. Now you can access documents in a corpus in the same manner.
    * __getitem__ of `tomotopy.utils.Corpus` was improved. Indexing not only by int but also by Iterable[int] and by slice is supported, as is indexing by uid.
    * `__getitem__` of `tomotopy.utils.Corpus` was improved. Indexing not only by int but also by Iterable[int] and by slice is supported, as is indexing by uid (see the sketch after this list).
    * New methods `tomotopy.utils.Corpus.extract_ngrams` and `tomotopy.utils.Corpus.concat_ngrams` were added. They extract n-gram collocations using PMI and concatenate them into single words.
    * A new method `tomotopy.LDAModel.add_corpus` was added, and `tomotopy.LDAModel.infer` can receive a corpus as input.
    * A new module `tomotopy.coherence` was added. It provides a way to calculate the coherence of a model.
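A minimal usage sketch of the unified `Corpus` indexing and the n-gram helpers described in the 0.10.0 notes above. The input file name is hypothetical, and the keyword arguments to `extract_ngrams` are illustrative, not confirmed defaults:

import tomotopy as tp

corpus = tp.utils.Corpus()
for line in open('docs.txt', encoding='utf-8'):  # hypothetical input file
    corpus.add_doc(line.strip().split())

first = corpus[0]           # indexing by int
window = corpus[2:5]        # slicing
subset = corpus[[0, 2, 4]]  # indexing by Iterable[int]

# extract n-gram collocations by PMI, then merge them into single words
cands = corpus.extract_ngrams(min_cf=10, min_df=5)
corpus.concat_ngrams(cands)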
73 changes: 73 additions & 0 deletions examples/gdmr_both_categorical_and_numerical.py
@@ -0,0 +1,73 @@
'''
This example shows how to train a g-DMR topic model
on a mixture of categorical and numerical metadata using tomotopy,
and how to visualize topic distributions.
Required packages:
    matplotlib
'''

import tomotopy as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as clr
import re

corpus = tp.utils.Corpus()
for line in open('text_mining_year_journal.txt', encoding='utf-8'):
    fd = line.strip().split('\t', maxsplit=2)
    corpus.add_doc(fd[2].split(), numeric_metadata=[float(fd[0])], metadata=fd[1])
# Use the argument `numeric_metadata` for continuous numerical metadata (list of float type),
# and the argument `metadata` for categorical metadata (str type)

# We set the range of the numeric metadata to [2000, 2017].
# `decay=1.0` penalizes higher-order terms of lambdas to prevent overfitting.
mdl = tp.GDMRModel(tw=tp.TermWeight.ONE, k=30, degrees=[6],
    alpha=1e-2, sigma=0.25, sigma0=3.0, decay=1.0,
    metadata_range=[(2000, 2017)], corpus=corpus
)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))

mdl.summary()

# Let's visualize the result
topic_counts = mdl.get_count_by_topics()
lambdas = mdl.lambdas
lambdas = lambdas.reshape(lambdas.shape[:1] + (len(mdl.metadata_dict), -1))
# lambdas shape: [num_topics, num_categorical_metadata, degrees + 1]

md_range = mdl.metadata_range
r = np.stack([mdl.tdf_linspace(
    [md_range[0][0]],
    [md_range[0][1]],
    [50],  # interpolation size
    cat
) for cat in mdl.metadata_dict])
# r shape: [num_categorical_metadata, 50, num_topics]

xs = np.linspace(*md_range[0], 50)
for k in (-topic_counts).argsort():
    print('Topic #{} ({})'.format(k, topic_counts[k]))
    print(*(w for w, _ in mdl.get_topic_words(k)))
    print('Lambda:', lambdas[k].reshape((len(mdl.metadata_dict), -1)))

    for label, ys in zip(mdl.metadata_dict, r[:, :, k]):
        label = re.sub(r'^(Proceedings|Journal)( of)?( the)?( -)?|International Conference on', '', label).strip()
        if len(label) >= 35: label = label[:33] + '...'
        plt.plot(xs, ys, linewidth=2, label=label)
    plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
    plt.legend()
    plt.show()
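One idiom in the loop above that is worth spelling out: `(-topic_counts).argsort()` visits topics in descending order of size, because negating the counts turns NumPy's ascending `argsort` into a descending ordering. A self-contained sketch:

import numpy as np

counts = np.array([5, 20, 1])
order = (-counts).argsort()  # negate so the ascending sort yields descending counts
print(order)                 # [1 0 2]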
2 changes: 1 addition & 1 deletion examples/gdmr_plot.py
@@ -53,7 +53,7 @@ def __call__(self, value, clip=None):
corpus = tp.utils.Corpus()
for line in open('dataset2.txt', encoding='utf-8'):
    fd = line.strip().split()
    corpus.add_doc(fd[2:], metadata=list(map(float, fd[:2])))
    corpus.add_doc(fd[2:], numeric_metadata=list(map(float, fd[:2])))

# We set the range of the first metadata to [2000, 2017]
# and that of the second metadata to [0, 1].
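This one-line change tracks the new split between metadata kinds in 0.11.0: continuous values now go to `numeric_metadata`, while `metadata` is reserved for a categorical str (as the comment in the previous example notes). Schematically:

# before (0.10.x):  corpus.add_doc(words, metadata=[2000.0, 0.5])
# after  (0.11.0):  corpus.add_doc(words, numeric_metadata=[2000.0, 0.5])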
1 change: 1 addition & 0 deletions examples/lda_visualization.py
@@ -49,6 +49,7 @@

topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
vocab = list(mdl.used_vocabs)
term_frequency = mdl.used_vocab_freq
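The newly added division renormalizes each document's topic distribution so every row sums to 1; accumulated floating-point error in the stacked distributions can otherwise trip a downstream visualization library that validates row sums (the variable names here match pyLDAvis's `prepare` API, though the library itself is not shown in this hunk). A minimal sketch of the idea:

import numpy as np

dists = np.array([[0.30, 0.70000004], [0.25, 0.74999996]])
dists /= dists.sum(axis=1, keepdims=True)  # rows now sum to 1 (up to float precision)
print(dists.sum(axis=1))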
2 changes: 1 addition & 1 deletion setup.py
@@ -62,7 +62,7 @@

    version=__version__,

    description='Tomoto, The Topic Modeling Tool for Python',
    description='Tomoto, Topic Modeling Tool for Python',
    long_description=long_description,

    url='https://github.com/bab2min/tomotopy',
54 changes: 33 additions & 21 deletions src/Labeling/Phraser.hpp
@@ -1,14 +1,37 @@
#pragma once

#include <vector>
#include <map>
#include <unordered_map>
#include "Labeler.h"
#include "../Utils/Trie.hpp"

#ifdef TMT_USE_BTREE
#include "btree/map.h"
#else
#endif

namespace tomoto
{
    namespace phraser
    {
#ifdef TMT_USE_BTREE
        template<typename K, typename V> using map = btree::map<K, V>;
#else
        template<typename K, typename V> using map = std::map<K, V>;
#endif

        namespace detail
        {
            struct vvhash
            {
                size_t operator()(const std::pair<Vid, Vid>& k) const
                {
                    return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
                }
            };
        }

        template<typename _DocIter>
        void countUnigrams(std::vector<size_t>& unigramCf, std::vector<size_t>& unigramDf,
            _DocIter docBegin, _DocIter docEnd
@@ -30,17 +53,17 @@ namespace tomoto
}
}

template<typename _DocIter, typename _VvHash, typename _Freqs>
void countBigrams(std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramCf,
std::unordered_map<std::pair<Vid, Vid>, size_t, _VvHash>& bigramDf,
template<typename _DocIter, typename _Freqs>
void countBigrams(map<std::pair<Vid, Vid>, size_t>& bigramCf,
map<std::pair<Vid, Vid>, size_t>& bigramDf,
_DocIter docBegin, _DocIter docEnd,
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
size_t candMinCnt, size_t candMinDf
)
{
for (auto docIt = docBegin; docIt != docEnd; ++docIt)
{
std::unordered_set<std::pair<Vid, Vid>, _VvHash> uniqBigram;
std::unordered_set<std::pair<Vid, Vid>, detail::vvhash> uniqBigram;
auto doc = *docIt;
if (!doc.size()) continue;
Vid prevWord = doc[0];
@@ -202,17 +225,6 @@ namespace tomoto
return std::move(data[0]);
}

namespace detail
{
struct vvhash
{
size_t operator()(const std::pair<Vid, Vid>& k) const
{
return std::hash<Vid>{}(k.first) ^ std::hash<Vid>{}(k.second);
}
};
}

template<typename _DocIter, typename _Freqs>
std::vector<label::Candidate> extractPMINgrams(_DocIter docBegin, _DocIter docEnd,
_Freqs&& vocabFreqs, _Freqs&& vocabDf,
@@ -221,13 +233,13 @@
ThreadPool* pool = nullptr)
{
// counting unigrams & bigrams
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf;
map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;

if (pool && pool->getNumWorkers() > 1)
{
using LocalCfDf = std::pair<
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>,
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>
decltype(bigramCnt),
decltype(bigramDf)
>;
std::vector<LocalCfDf> localdata(pool->getNumWorkers());
std::vector<std::future<void>> futures;
@@ -363,13 +375,13 @@
ThreadPool* pool = nullptr)
{
// counting unigrams & bigrams
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash> bigramCnt, bigramDf;
map<std::pair<Vid, Vid>, size_t> bigramCnt, bigramDf;

if (pool && pool->getNumWorkers() > 1)
{
using LocalCfDf = std::pair<
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>,
std::unordered_map<std::pair<Vid, Vid>, size_t, detail::vvhash>
decltype(bigramCnt),
decltype(bigramDf)
>;
std::vector<LocalCfDf> localdata(pool->getNumWorkers());
std::vector<std::future<void>> futures;
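Why the custom hash could be dropped at the call sites above: `btree::map` and `std::map` are ordered containers keyed by comparison, and `std::pair<Vid, Vid>` already provides lexicographic `operator<`, so no `vvhash` is needed (it is still required for the remaining `std::unordered_set`). A self-contained sketch of the bigram-counting pattern with an ordered map, using `int` in place of `Vid`:

#include <cstddef>
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

int main()
{
    using Vid = int;  // stand-in for tomotopy's vocabulary id type
    std::vector<Vid> doc{1, 2, 1, 2, 3};
    std::map<std::pair<Vid, Vid>, std::size_t> bigramCf;
    // count adjacent pairs; an ordered map needs no custom pair hash
    for (std::size_t i = 1; i < doc.size(); ++i)
        ++bigramCf[{doc[i - 1], doc[i]}];
    for (const auto& p : bigramCf)
        std::printf("(%d, %d): %zu\n", p.first.first, p.first.second, p.second);
    return 0;
}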
9 changes: 6 additions & 3 deletions src/TopicModel/CT.h
@@ -15,13 +15,16 @@ namespace tomoto
        DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
    };

    struct CTArgs : public LDAArgs
    {

    };

    class ICTModel : public ILDAModel
    {
    public:
        using DefaultDocType = DocumentCTM<TermWeight::one>;
        static ICTModel* create(TermWeight _weight, size_t _K = 1,
            Float smoothingAlpha = 0.1, Float _eta = 0.01,
            size_t seed = std::random_device{}(),
        static ICTModel* create(TermWeight _weight, const CTArgs& args,
            bool scalarRng = false);

        virtual void setNumBetaSample(size_t numSample) = 0;
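The new `CTArgs` struct (empty for now, inheriting everything from `LDAArgs`) replaces `create`'s long positional parameter list with a named-field bundle. A self-contained analogue of the pattern, with illustrative field names that are assumptions rather than the actual `LDAArgs` members:

#include <cstddef>
#include <cstdio>

struct LDAArgsLike
{
    std::size_t k = 1;   // number of topics
    float alpha = 0.1f;  // document-topic prior
    float eta = 0.01f;   // topic-word prior
};

struct CTArgsLike : public LDAArgsLike
{
    // CT-specific knobs would be added here
};

static void createLike(const CTArgsLike& args)
{
    std::printf("k=%zu alpha=%g eta=%g\n", args.k, args.alpha, args.eta);
}

int main()
{
    CTArgsLike args;
    args.k = 10;  // set only the fields that differ from the defaults
    createLike(args);
    return 0;
}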
8 changes: 2 additions & 6 deletions src/TopicModel/CTModel.cpp
@@ -2,12 +2,8 @@

namespace tomoto
{
    /*template class CTModel<TermWeight::one>;
    template class CTModel<TermWeight::idf>;
    template class CTModel<TermWeight::pmi>;*/

    ICTModel* ICTModel::create(TermWeight _weight, size_t _K, Float smoothingAlpha, Float _eta, size_t seed, bool scalarRng)
    ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
    {
        TMT_SWITCH_TW(_weight, scalarRng, CTModel, _K, smoothingAlpha, _eta, seed);
        TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
    }
}
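`TMT_SWITCH_TW` turns the runtime `TermWeight` value into the matching compile-time template instantiation. Its expansion is not shown in this diff, so the following is an assumption about its shape, a self-contained sketch of the enum-to-template dispatch pattern:

#include <cstdio>

enum class TermWeight { one, idf, pmi };

template<TermWeight W>
struct CTModelLike
{
    static void announce() { std::printf("instantiated for TermWeight %d\n", static_cast<int>(W)); }
};

// runtime enum -> compile-time parameter, as the macro presumably expands to
static void create(TermWeight w)
{
    switch (w)
    {
    case TermWeight::one: CTModelLike<TermWeight::one>::announce(); break;
    case TermWeight::idf: CTModelLike<TermWeight::idf>::announce(); break;
    case TermWeight::pmi: CTModelLike<TermWeight::pmi>::announce(); break;
    }
}

int main()
{
    create(TermWeight::idf);
    return 0;
}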
Diffs for the remaining 75 changed files are not shown.
