-
-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #104 from bab2min/dev
prepare 0.11.0
- Loading branch information
Showing
86 changed files
with
3,002 additions
and
1,055 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,8 @@ jobs: | |
popd | ||
cp html/tomotopy/* ../g/bab2min.github.io/tomotopy/v${TOMOTOPY_VER}/${{ matrix.language }} | ||
pushd ../g/bab2min.github.io | ||
echo "<meta http-equiv='refresh' content='0;url=/tomotopy/${TOMOTOPY_VER}/en/' >" > index.html | ||
echo "<meta http-equiv='refresh' content='0;url=/tomotopy/${TOMOTOPY_VER}/kr/' >" > index.kr.html | ||
git config user.email "[email protected]" | ||
git config user.name "bab2min" | ||
git add . | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,3 +17,4 @@ enwiki-1000.txt | |
examples/* | ||
!examples/*.py | ||
enwiki-16000.txt | ||
*.cps |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
'''
This example shows how to fit a g-DMR topic model
on a mixture of categorical and numerical metadata using tomotopy
and visualize the resulting topic distributions.

Required Packages:
    matplotlib
'''

import tomotopy as tp
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as clr
import re

corpus = tp.utils.Corpus()
# Each input line holds three tab-separated fields:
# a numeric value, a category label and the document text.
# The context manager guarantees the file handle is closed after loading.
with open('text_mining_year_journal.txt', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # Skip blank lines instead of failing on the missing fields.
            continue
        fd = line.split('\t', maxsplit=2)
        # Use the argument `numeric_metadata` for continuous numerical metadata (list of float type),
        # and the argument `metadata` for categorical metadata (str type)
        corpus.add_doc(fd[2].split(), numeric_metadata=[float(fd[0])], metadata=fd[1])

# We set a range of the numeric metadata as [2000, 2017].
# `decay=1.0` penalizes higher-order terms of lambdas to prevent overfitting.
mdl = tp.GDMRModel(
    tw=tp.TermWeight.ONE, k=30, degrees=[6],
    alpha=1e-2, sigma=0.25, sigma0=3.0, decay=1.0,
    metadata_range=[(2000, 2017)], corpus=corpus,
)
mdl.optim_interval = 20
mdl.burn_in = 200

# Zero-iteration call before printing the corpus statistics
# (presumably this initializes the model so the counts below are populated).
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))

# Let's train the model: 1000 iterations, reporting every 20 iterations.
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04} LL per word: {:.4}'.format(1000, mdl.ll_per_word))

mdl.summary()

# Let's visualize the result
topic_counts = mdl.get_count_by_topics()
lambdas = mdl.lambdas
lambdas = lambdas.reshape(lambdas.shape[:1] + (len(mdl.metadata_dict), -1))
# lambdas shape: [num_topics, num_categorical_metadata, degrees + 1]

md_range = mdl.metadata_range
r = np.stack([mdl.tdf_linspace(
    [md_range[0][0]],
    [md_range[0][1]],
    [50],  # interpolation size
    cat
) for cat in mdl.metadata_dict])
# r shape: [num_categorical_metadata, 50, num_topics]

# Strips boilerplate from category labels so the legend stays readable.
# Compiled once here rather than looked up on every plotted line.
label_noise = re.compile(
    r'^(Proceedings|Journal)( of)?( the)?( -)?|International Conference on'
)

xs = np.linspace(*md_range[0], 50)
# Visit topics from the most to the least frequent.
for k in (-topic_counts).argsort():
    print('Topic #{} ({})'.format(k, topic_counts[k]))
    print(*(w for w, _ in mdl.get_topic_words(k)))
    # lambdas[k] already has shape [num_categorical_metadata, degrees + 1].
    print('Lambda:', lambdas[k])

    # One curve per categorical metadata value for the current topic.
    for label, ys in zip(mdl.metadata_dict, r[:, :, k]):
        label = label_noise.sub('', label).strip()
        if len(label) >= 35:
            # Truncate long labels to keep the legend compact.
            label = label[:33] + '...'
        plt.plot(xs, ys, linewidth=2, label=label)
    plt.title('#{}\n({})'.format(k, ' '.join(w for w, _ in mdl.get_topic_words(k, top_n=5))))
    plt.legend()
    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.