From 67273f1d0a49b0515bb0c9aaa51b9ea0b6cf9648 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sat, 28 Dec 2019 21:51:38 +0900 Subject: [PATCH 01/12] preparing 0.5.0 * fixed bug (#25) * added methods about subtopic at PAM (#24) * implemented partitioned multisampling --- setup.py | 2 +- src/TopicModel/CTModel.hpp | 61 ++++++-- src/TopicModel/DMRModel.hpp | 11 +- src/TopicModel/GDMRModel.hpp | 6 +- src/TopicModel/HDPModel.hpp | 10 +- src/TopicModel/HLDAModel.hpp | 31 +++- src/TopicModel/HPA.h | 2 +- src/TopicModel/HPAModel.hpp | 62 +++++++- src/TopicModel/LDA.h | 2 +- src/TopicModel/LDACVB0Model.hpp | 18 ++- src/TopicModel/LDAModel.cpp | 2 +- src/TopicModel/LDAModel.hpp | 254 ++++++++++++++++++++++---------- src/TopicModel/LLDAModel.hpp | 8 +- src/TopicModel/MGLDAModel.hpp | 33 ++++- src/TopicModel/PA.h | 5 +- src/TopicModel/PAModel.cpp | 2 +- src/TopicModel/PAModel.hpp | 140 ++++++++++++++---- src/TopicModel/PLDAModel.hpp | 10 +- src/TopicModel/SLDAModel.hpp | 16 +- src/TopicModel/TopicModel.hpp | 127 +++++++++++++--- src/Utils/ThreadPool.hpp | 91 +++++++++--- src/Utils/exception.h | 8 +- src/Utils/sample.hpp | 21 ++- src/Utils/tvector.hpp | 2 +- src/python/docs.h | 28 +++- src/python/module.h | 71 +++++++++ src/python/py_HDP.cpp | 6 +- src/python/py_HLDA.cpp | 6 +- src/python/py_LDA.cpp | 22 +-- src/python/py_MGLDA.cpp | 2 +- src/python/py_PA.cpp | 47 +++++- src/python/py_main.cpp | 10 +- test/unit_test.py | 85 +++++++---- tomotopy/__init__.py | 47 ++++++ 34 files changed, 966 insertions(+), 282 deletions(-) diff --git a/setup.py b/setup.py index 554d058..83d9bf3 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ setup( name='tomotopy', - version='0.4.2', + version='0.5.0', description='Tomoto, The Topic Modeling Tool for Python', long_description=long_description, diff --git a/src/TopicModel/CTModel.hpp b/src/TopicModel/CTModel.hpp index 0f4d97e..fa4e69b 100644 --- a/src/TopicModel/CTModel.hpp +++ b/src/TopicModel/CTModel.hpp @@ -16,7 +16,7 @@ namespace 
tomoto { }; - template, @@ -55,6 +55,8 @@ namespace tomoto Eigen::Matrix pbeta, lowerBound, upperBound; constexpr FLOAT epsilon = 1e-8; constexpr size_t burnIn = 3; + sample::FastRealGenerator frg; + pbeta = lowerBound = upperBound = Eigen::Matrix::Zero(this->K); for (size_t i = 0; i < numBetaSample + burnIn; ++i) { @@ -66,7 +68,7 @@ namespace tomoto { FLOAT N_k = doc.numByTopic[k] + this->alpha; FLOAT N_nk = doc.getSumWordWeight() + this->alpha * (this->K + 1) - N_k; - FLOAT u1 = std::generate_canonical(rg), u2 = std::generate_canonical(rg); + FLOAT u1 = frg(rg), u2 = frg(rg); FLOAT max_uk = epsilon + pow(u1, (FLOAT)1 / N_k) * (pbeta[k] - epsilon); FLOAT min_unk = (1 - pow(u2, (FLOAT)1 / N_nk)) * (1 - pbeta[k]) + pbeta[k]; @@ -104,12 +106,47 @@ namespace tomoto doc.smBeta /= doc.smBeta.array().sum(); } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { - BaseClass::sampleDocument(doc, docId, ld, rgs, iterationCnt); - if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0) + BaseClass::template sampleDocument<_ps>(doc, docId, ld, rgs, iterationCnt, partitionId); + /*if (iterationCnt >= this->burnIn && this->optimInterval && (iterationCnt + 1) % this->optimInterval == 0) { updateBeta(doc, rgs); + }*/ + } + + template + void sampleGlobalLevel(ThreadPool* pool, _ModelState* localData, RandGen* rgs, _DocIter first, _DocIter last) const + { + if (this->iterated < this->burnIn || !this->optimInterval || (this->iterated + 1) % this->optimInterval != 0) return; + + if (pool) + { + std::vector> res; + const size_t chStride = pool->getNumWorkers() * 8; + size_t dist = std::distance(first, last); + for (size_t ch = 0; ch < chStride; ++ch) + { + auto b = first, e = first; + std::advance(b, dist * ch / chStride); + 
std::advance(e, dist * (ch + 1) / chStride); + res.emplace_back(pool->enqueue([&, ch, chStride](size_t threadId, _DocIter b, _DocIter e) + { + for (auto doc = b; doc != e; ++doc) + { + updateBeta(*doc, rgs[threadId]); + } + }, b, e)); + } + for (auto& r : res) r.get(); + } + else + { + for (auto doc = first; doc != last; ++doc) + { + updateBeta(*doc, rgs[0]); + } } } @@ -130,7 +167,7 @@ namespace tomoto } }, ch)); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); return 0; } @@ -211,17 +248,17 @@ namespace tomoto return ret; } - std::vector getPriorMean() const + std::vector getPriorMean() const override { return { topicPrior.mean.data(), topicPrior.mean.data() + topicPrior.mean.size() }; } - std::vector getPriorCov() const + std::vector getPriorCov() const override { return { topicPrior.cov.data(), topicPrior.cov.data() + topicPrior.cov.size() }; } - std::vector getCorrelationTopic(TID k) const + std::vector getCorrelationTopic(TID k) const override { Eigen::Matrix ret = topicPrior.cov.col(k).array() / (topicPrior.cov.diagonal().array() * topicPrior.cov(k, k)).sqrt(); return { ret.data(), ret.data() + ret.size() }; @@ -229,21 +266,21 @@ namespace tomoto GETTER(NumBetaSample, size_t, numBetaSample); - void setNumBetaSample(size_t _numSample) + void setNumBetaSample(size_t _numSample) override { numBetaSample = _numSample; } GETTER(NumDocBetaSample, size_t, numDocBetaSample); - void setNumDocBetaSample(size_t _numSample) + void setNumDocBetaSample(size_t _numSample) override { numDocBetaSample = _numSample; } GETTER(NumTMNSample, size_t, numTMNSample); - void setNumTMNSample(size_t _numSample) + void setNumTMNSample(size_t _numSample) override { numTMNSample = _numSample; } diff --git a/src/TopicModel/DMRModel.hpp b/src/TopicModel/DMRModel.hpp index dc1d0e5..1b79ce4 100644 --- a/src/TopicModel/DMRModel.hpp +++ b/src/TopicModel/DMRModel.hpp @@ -16,7 +16,7 @@ namespace tomoto Eigen::Matrix tmpK; }; - template, @@ -69,6 +69,7 @@ namespace tomoto 
res.emplace_back(pool.enqueue([&](size_t threadId) { auto& tmpK = localData[threadId].tmpK; + if (!tmpK.size()) tmpK.resize(this->K); Eigen::Matrix val = Eigen::Matrix::Zero(K * F + 1); for (size_t docId = ch; docId < this->docs.size(); docId += chStride) { @@ -95,7 +96,7 @@ namespace tomoto return val; })); } - for (auto&& r : res) + for (auto& r : res) { auto ret = r.get(); fx += ret[K * F]; @@ -279,12 +280,12 @@ namespace tomoto GETTER(AlphaEps, FLOAT, alphaEps); GETTER(OptimRepeat, size_t, optimRepeat); - void setAlphaEps(FLOAT _alphaEps) + void setAlphaEps(FLOAT _alphaEps) override { alphaEps = _alphaEps; } - void setOptimRepeat(size_t _optimRepeat) + void setOptimRepeat(size_t _optimRepeat) override { optimRepeat = _optimRepeat; } @@ -312,7 +313,7 @@ namespace tomoto return { l.data(), l.data() + F }; } - const Dictionary& getMetadataDict() const { return metadataDict; } + const Dictionary& getMetadataDict() const override { return metadataDict; } }; /* This is for preventing 'undefined symbol' problem in compiling by clang. 
*/ diff --git a/src/TopicModel/GDMRModel.hpp b/src/TopicModel/GDMRModel.hpp index 9636311..99f526b 100644 --- a/src/TopicModel/GDMRModel.hpp +++ b/src/TopicModel/GDMRModel.hpp @@ -14,7 +14,7 @@ namespace tomoto std::vector ndimCnt; }; - template, @@ -136,7 +136,7 @@ namespace tomoto return ret; })); } - for (auto&& r : res) + for (auto& r : res) { auto ret = r.get(); fx += ret[K * F]; @@ -310,7 +310,7 @@ namespace tomoto GETTER(Fs, const std::vector&, degreeByF); GETTER(Sigma0, FLOAT, sigma0); - void setSigma0(FLOAT _sigma0) + void setSigma0(FLOAT _sigma0) override { this->sigma0 = _sigma0; } diff --git a/src/TopicModel/HDPModel.hpp b/src/TopicModel/HDPModel.hpp index 5123a16..e67b668 100644 --- a/src/TopicModel/HDPModel.hpp +++ b/src/TopicModel/HDPModel.hpp @@ -190,7 +190,8 @@ namespace tomoto } } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -277,9 +278,10 @@ namespace tomoto } }, this->docs.size() * i / pool.getNumWorkers(), this->docs.size() * (i + 1) / pool.getNumWorkers()); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); } + template void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { std::vector> res(pool.getNumWorkers()); @@ -328,7 +330,7 @@ namespace tomoto localData[i] = globalState; }); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); } /* this LL calculation is based on https://github.com/blei-lab/hdp/blob/master/hdp/state.cpp */ @@ -342,7 +344,7 @@ namespace tomoto { auto& doc = *_first; ll += doc.getNumTable() * log(alpha) - math::lgammaT(doc.getSumWordWeight() + alpha) + math::lgammaT(alpha); - for (auto&& nt : doc.numTopicByTable) + for (auto& nt : doc.numTopicByTable) { if (nt) ll += 
math::lgammaT(nt.num); } diff --git a/src/TopicModel/HLDAModel.hpp b/src/TopicModel/HLDAModel.hpp index 973918d..6d42fbe 100644 --- a/src/TopicModel/HLDAModel.hpp +++ b/src/TopicModel/HLDAModel.hpp @@ -222,13 +222,31 @@ namespace tomoto }; // we elide the likelihood for root node because its weight applied to all path and can be seen as constant. - for (size_t b = 0; b < levelBlocks.size(); ++b) + if (pool) { - if (!levelBlocks[b]) continue; - if(pool) futures.emplace_back(pool->enqueue(calc, b)); - else calc(0, b); + const size_t chStride = pool->getNumWorkers() * 8; + for (size_t ch = 0; ch < chStride; ++ch) + { + futures.emplace_back(pool->enqueue([&](size_t threadId, size_t bBegin, size_t bEnd) + { + for (size_t b = bBegin; b < bEnd; ++b) + { + if (!levelBlocks[b]) continue; + calc(threadId, b); + } + }, levelBlocks.size() * ch / chStride, levelBlocks.size() * (ch + 1) / chStride)); + } + for (auto& f : futures) f.get(); + } + else + { + for (size_t b = 0; b < levelBlocks.size(); ++b) + { + if (!levelBlocks[b]) continue; + calc(0, b); + } } - for (auto& f : futures) f.get(); + updateWordLikelihood<_TW>(eta, realV, levelDepth, doc, newTopicWeights, &nodes[0]); } @@ -431,7 +449,8 @@ namespace tomoto } } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { sampleTopics(doc, docId, ld, rgs); } diff --git a/src/TopicModel/HPA.h b/src/TopicModel/HPA.h index 6fe19ff..33f9d35 100644 --- a/src/TopicModel/HPA.h +++ b/src/TopicModel/HPA.h @@ -18,4 +18,4 @@ namespace tomoto using DefaultDocType = DocumentHPA; static IHPAModel* create(TermWeight _weight, bool _exclusive = false, size_t _K1 = 1, size_t _K2 = 1, FLOAT _alpha = 50, FLOAT _eta = 0.01, const RandGen& _rg = RandGen{ std::random_device{}() }); }; -} \ No newline at end of file +} diff --git 
a/src/TopicModel/HPAModel.hpp b/src/TopicModel/HPAModel.hpp index c87f734..bf879bd 100644 --- a/src/TopicModel/HPAModel.hpp +++ b/src/TopicModel/HPAModel.hpp @@ -173,14 +173,24 @@ namespace tomoto } } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { + size_t b = 0, e = doc.words.size(); + if (_ps == ParallelScheme::partition) + { + b = this->chunkOffsetByDoc(partitionId, docId); + e = this->chunkOffsetByDoc(partitionId + 1, docId); + } + + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? this->vChunkOffset[partitionId - 1] : 0; + const auto K = this->K; - for (size_t w = 0; w < doc.words.size(); ++w) + for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; - addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w], doc.Z2s[w]); - auto dist = getZLikelihoods(ld, doc, docId, doc.words[w]); + addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); + auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); if (_Exclusive) { auto z = sample::sampleFromDiscreteAcc(dist, dist + K2 + K + 1, rgs); @@ -219,10 +229,15 @@ namespace tomoto doc.Z2s[w] = 0; } } - addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w], doc.Z2s[w]); + addWordTo<1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); } } + void distributePartition(ThreadPool& pool, _ModelState* localData) + { + } + + template void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { std::vector> res(pool.getNumWorkers()); @@ -259,12 +274,32 @@ namespace tomoto localData[i] = globalState; }); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); } std::vector _getTopicsCount() const { - return { }; + std::vector cnt(1 + this->K + K2); + for (auto& doc : this->docs) + { + for 
(size_t i = 0; i < doc.Zs.size(); ++i) + { + if (doc.words[i] >= this->realV) continue; + if (doc.Zs[i] == 0 && doc.Z2s[i] == 0) + { + ++cnt[0]; + } + else if (doc.Zs[i] && doc.Z2s[i] == 0) + { + ++cnt[doc.Zs[i]]; + } + else + { + ++cnt[this->K + doc.Z2s[i]]; + } + } + } + return cnt; } template @@ -484,6 +519,17 @@ namespace tomoto } return ret; } + + std::vector getSubTopicsByDoc(const DocumentBase* doc) const override + { + throw std::runtime_error{ "not applicable" }; + } + + std::vector> getSubTopicsByDocSorted(const DocumentBase* doc, size_t topN) const override + { + throw std::runtime_error{ "not applicable" }; + } + }; template @@ -501,4 +547,4 @@ namespace tomoto } template using HPAModelExclusive = HPAModel<_TW, true>; -} \ No newline at end of file +} diff --git a/src/TopicModel/LDA.h b/src/TopicModel/LDA.h index 074a4ab..3652103 100644 --- a/src/TopicModel/LDA.h +++ b/src/TopicModel/LDA.h @@ -117,4 +117,4 @@ namespace tomoto virtual FLOAT getAlpha(TID k1) const = 0; virtual FLOAT getEta() const = 0; }; -} \ No newline at end of file +} diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp index 316006c..013dfc3 100644 --- a/src/TopicModel/LDACVB0Model.hpp +++ b/src/TopicModel/LDACVB0Model.hpp @@ -138,7 +138,8 @@ namespace tomoto if (DEC) ld.numByTopicWord.col(vid) = ld.numByTopicWord.col(vid).cwiseMax(0); } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { for (size_t w = 0; w < doc.words.size(); ++w) { @@ -149,6 +150,11 @@ namespace tomoto } } + void updatePartition(ThreadPool& pool, _ModelState* localData) + { + } + + template void trainOne(ThreadPool& pool, _ModelState* localData, RandGen* rgs) { std::vector> res; @@ -159,13 +165,13 @@ namespace tomoto { forRandom((this->docs.size() - 1 - ch) / chStride + 1, 
rgs[threadId](), [&, this](size_t id) { - static_cast(this)->sampleDocument( + static_cast(this)->template sampleDocument( this->docs[id * chStride + ch], id * chStride + ch, localData[threadId], rgs[threadId], this->iterated); }); })); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); static_cast(this)->updateGlobalInfo(pool, localData); static_cast(this)->mergeState(pool, this->globalState, this->tState, localData); if (this->iterated >= 250 && optimInterval && (this->iterated + 1) % optimInterval == 0) @@ -197,7 +203,7 @@ namespace tomoto localData[i] = this->globalState; }); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); } void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData) const @@ -346,7 +352,7 @@ namespace tomoto } } - void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) + void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) override { if (initDocs) this->removeStopwords(minWordCnt, removeTopN); static_cast(this)->updateWeakArray(); @@ -397,7 +403,7 @@ namespace tomoto return ret; } - template + template std::vector _infer(_Iter docFirst, _Iter docLast, size_t maxIter, FLOAT tolerance, size_t numWorkers) const { return {}; diff --git a/src/TopicModel/LDAModel.cpp b/src/TopicModel/LDAModel.cpp index ef66bd6..104989c 100644 --- a/src/TopicModel/LDAModel.cpp +++ b/src/TopicModel/LDAModel.cpp @@ -10,4 +10,4 @@ namespace tomoto { SWITCH_TW(_weight, LDAModel, _K, _alpha, _eta, _rg); } -} \ No newline at end of file +} diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index 4e50ab4..9f7a120 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -44,7 +44,6 @@ namespace tomoto Eigen::Matrix zLikelihood; Eigen::Matrix numByTopic; // Dim: (Topic, 1) Eigen::Matrix numByTopicWord; // Dim: (Topic, Vocabs) - DEFINE_SERIALIZER(numByTopic, numByTopicWord); }; @@ -57,7 +56,7 @@ namespace 
tomoto }; } - template, @@ -75,16 +74,20 @@ namespace tomoto static constexpr const char* TMID = "LDA"; using WeightType = typename std::conditional<_TW == TermWeight::one, int32_t, float>::type; + enum { m_flags = _Flags }; + std::vector vocabWeights; std::vector sharedZs; std::vector sharedWordWeights; TID K; - FLOAT alpha; + FLOAT alpha, eta; Eigen::Matrix alphas; - FLOAT eta; size_t optimInterval = 10, burnIn = 0; Eigen::Matrix numByTopicDoc; + std::vector vChunkOffset; + Eigen::Matrix chunkOffsetByDoc; + template static FLOAT calcDigammaSum(_List list, size_t len, FLOAT alpha) { @@ -144,73 +147,145 @@ namespace tomoto /* main sampling procedure */ - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { - for (size_t w = 0; w < doc.words.size(); ++w) + size_t b = 0, e = doc.words.size(); + if (_ps == ParallelScheme::partition) + { + b = chunkOffsetByDoc(partitionId, docId); + e = chunkOffsetByDoc(partitionId + 1, docId); + } + + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? 
this->vChunkOffset[partitionId - 1] : 0; + + for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; - addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w]); - auto dist = static_cast(this)->getZLikelihoods(ld, doc, docId, doc.words[w]); + addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w]); + auto dist = static_cast(this)->getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); doc.Zs[w] = sample::sampleFromDiscreteAcc(dist, dist + K, rgs); - addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w]); + addWordTo<1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w]); } } - /* - reserved for model which needs second sampling - */ - void sampleDocument_2(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const - { - } - - template + template void performSampling(ThreadPool& pool, _ModelState* localData, RandGen* rgs, std::vector>& res, - _DocIter docFirst, _DocIter docLast, _SamplingFunc func) const + _DocIter docFirst, _DocIter docLast) const { - if ((_Flags & flags::shared_state)) + // single-threaded sampling + if (_ps == ParallelScheme::none) { size_t docId = 0; for (auto doc = docFirst; doc != docLast; ++doc) { - (static_cast(this)->*func)( + static_cast(this)->template sampleDocument<_ps>( *doc, docId++, - *localData, *rgs, this->iterated); + *localData, *rgs, this->iterated, 0); } } - else + // multi-threaded sampling on partition ad update into global + else if (_ps == ParallelScheme::partition) + { + const size_t chStride = pool.getNumWorkers(); + for (size_t i = 0; i < chStride; ++i) + { + res = pool.enqueueToAll([&, i, chStride](size_t partitionId) + { + size_t didx = (i + partitionId) % chStride; + forRandom(((size_t)std::distance(docFirst, docLast) + (chStride - 1) - didx) / chStride, rgs[partitionId](), [&](size_t id) + { + static_cast(this)->template sampleDocument<_ps>( + docFirst[id * chStride + didx], id * chStride + didx, + localData[partitionId], rgs[partitionId], this->iterated, partitionId); + }); + 
}); + for (auto& r : res) r.get(); + res.clear(); + } + } + // multi-threaded sampling on copy and merge into global + else if(_ps == ParallelScheme::copy_merge) { const size_t chStride = std::min(pool.getNumWorkers() * 8, (size_t)std::distance(docFirst, docLast)); for (size_t ch = 0; ch < chStride; ++ch) { - res.emplace_back(pool.enqueue([&, this, ch, chStride](size_t threadId) + res.emplace_back(pool.enqueue([&, ch, chStride](size_t threadId) { - forRandom(((size_t)std::distance(docFirst, docLast) - 1 - ch) / chStride + 1, rgs[threadId](), [&, this](size_t id) + forRandom(((size_t)std::distance(docFirst, docLast) + (chStride - 1) - ch) / chStride, rgs[threadId](), [&](size_t id) { - (static_cast(this)->*func)( + static_cast(this)->template sampleDocument<_ps>( docFirst[id * chStride + ch], id * chStride + ch, - localData[threadId], rgs[threadId], this->iterated); + localData[threadId], rgs[threadId], this->iterated, 0); }); })); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); res.clear(); } } + void updatePartition(ThreadPool& pool, _ModelState* localData) + { + size_t numPools = pool.getNumWorkers(); + if (vChunkOffset.size() != numPools) + { + vChunkOffset.clear(); + size_t totCnt = std::accumulate(this->vocabFrequencies.begin(), this->vocabFrequencies.begin() + this->realV, 0); + size_t cumCnt = 0; + for (size_t i = 0; i < this->realV; ++i) + { + cumCnt += this->vocabFrequencies[i]; + if (cumCnt * numPools >= totCnt * (vChunkOffset.size() + 1)) vChunkOffset.emplace_back(i + 1); + } + + chunkOffsetByDoc.resize(numPools + 1, this->docs.size()); + for (size_t i = 0; i < this->docs.size(); ++i) + { + auto& doc = this->docs[i]; + chunkOffsetByDoc(0, i) = 0; + size_t g = 0; + for (size_t j = 0; j < doc.words.size(); ++j) + { + for (; g < numPools && doc.words[j] >= vChunkOffset[g]; ++g) + { + chunkOffsetByDoc(g + 1, i) = j; + } + } + for (; g < numPools; ++g) + { + chunkOffsetByDoc(g + 1, i) = doc.words.size(); + } + } + } + 
static_cast(this)->distributePartition(pool, localData); + } + + void distributePartition(ThreadPool& pool, _ModelState* localData) + { + std::vector> res = pool.enqueueToAll([&](size_t partitionId) + { + size_t b = partitionId ? vChunkOffset[partitionId - 1] : 0, + e = vChunkOffset[partitionId]; + + localData[partitionId].numByTopicWord = this->globalState.numByTopicWord.block(0, b, this->globalState.numByTopicWord.rows(), e - b); + localData[partitionId].numByTopic = this->globalState.numByTopic; + if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = this->globalState.zLikelihood; + }); + + for (auto& r : res) r.get(); + } + + template void trainOne(ThreadPool& pool, _ModelState* localData, RandGen* rgs) { std::vector> res; try { - performSampling(pool, localData, rgs, res, - this->docs.begin(), this->docs.end(), &DerivedClass::sampleDocument); - if(&DerivedClass::sampleDocument_2 != &LDAModel::sampleDocument_2) performSampling(pool, localData, rgs, res, - this->docs.begin(), this->docs.end(), &DerivedClass::sampleDocument_2); + performSampling<_ps>(pool, localData, rgs, res, + this->docs.begin(), this->docs.end()); static_cast(this)->updateGlobalInfo(pool, localData); - if (!(_Flags & flags::shared_state)) - { - static_cast(this)->mergeState(pool, this->globalState, this->tState, localData, rgs); - } + static_cast(this)->template mergeState<_ps>(pool, this->globalState, this->tState, localData, rgs); static_cast(this)->template sampleGlobalLevel<>(&pool, localData, rgs, this->docs.begin(), this->docs.end()); if (this->iterated >= this->burnIn && optimInterval && (this->iterated + 1) % optimInterval == 0) { @@ -219,7 +294,7 @@ namespace tomoto } catch (const exception::TrainingError& e) { - for (auto&& r : res) if(r.valid()) r.get(); + for (auto& r : res) if(r.valid()) r.get(); throw; } } @@ -235,33 +310,59 @@ namespace tomoto /* merges multithreaded document sampling result */ + template void mergeState(ThreadPool& pool, 
_ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { - std::vector> res(pool.getNumWorkers()); + std::vector> res; - tState = globalState; - globalState = localData[0]; - for (size_t i = 1; i < pool.getNumWorkers(); ++i) + if (_ps == ParallelScheme::copy_merge) { - globalState.numByTopic += localData[i].numByTopic - tState.numByTopic; - globalState.numByTopicWord += localData[i].numByTopicWord - tState.numByTopicWord; - } + tState = globalState; + globalState = localData[0]; + for (size_t i = 1; i < pool.getNumWorkers(); ++i) + { + globalState.numByTopicWord += localData[i].numByTopicWord - tState.numByTopicWord; + } - // make all count being positive - if (_TW != TermWeight::one) - { - globalState.numByTopic = globalState.numByTopic.cwiseMax(0); - globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); - } + // make all count being positive + if (_TW != TermWeight::one) + { + globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); + } + globalState.numByTopic = globalState.numByTopicWord.rowwise().sum(); - for (size_t i = 0; i < pool.getNumWorkers(); ++i) + for (size_t i = 0; i < pool.getNumWorkers(); ++i) + { + res.emplace_back(pool.enqueue([&, i](size_t) + { + localData[i] = globalState; + })); + } + } + else if (_ps == ParallelScheme::partition) { - res[i] = pool.enqueue([&, i](size_t threadId) + res = pool.enqueueToAll([&](size_t partitionId) + { + size_t b = partitionId ? 
vChunkOffset[partitionId - 1] : 0, + e = vChunkOffset[partitionId]; + globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b) = localData[partitionId].numByTopicWord; + }); + for (auto& r : res) r.get(); + res.clear(); + + // make all count being positive + if (_TW != TermWeight::one) { - localData[i] = globalState; + globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); + } + globalState.numByTopic = globalState.numByTopicWord.rowwise().sum(); + + res = pool.enqueueToAll([&](size_t threadId) + { + localData[threadId].numByTopic = globalState.numByTopic; }); } - for (auto&& r : res) r.get(); + for (auto& r : res) r.get(); } /* @@ -338,7 +439,8 @@ namespace tomoto void prepareDoc(_DocType& doc, WeightType* topicDocPtr, size_t wordSize) const { - doc.numByTopic.init((_Flags & flags::continuous_doc_data) ? topicDocPtr : nullptr, K); + sortAndWriteOrder(doc.words, doc.wOrder); + doc.numByTopic.init((m_flags & flags::continuous_doc_data) ? topicDocPtr : nullptr, K); doc.Zs = tvector(wordSize); if(_TW != TermWeight::one) doc.wordWeights.resize(wordSize, 1); } @@ -352,7 +454,7 @@ namespace tomoto this->globalState.numByTopic = Eigen::Matrix::Zero(K); this->globalState.numByTopicWord = Eigen::Matrix::Zero(K, V); } - if(_Flags & flags::continuous_doc_data) numByTopicDoc = Eigen::Matrix::Zero(K, this->docs.size()); + if(m_flags & flags::continuous_doc_data) numByTopicDoc = Eigen::Matrix::Zero(K, this->docs.size()); } struct Generator @@ -381,7 +483,7 @@ namespace tomoto static_cast(this)->prepareDoc(doc, topicDocPtr, doc.words.size()); _Generator g2; _Generator* selectedG = &g; - if (_Flags & flags::generator_by_doc) + if (m_flags & flags::generator_by_doc) { g2 = static_cast(this)->makeGeneratorForInit(&doc); selectedG = &g2; @@ -427,7 +529,7 @@ namespace tomoto std::vector _getWidsByTopic(TID tid) const { - assert(tid < K); + assert(tid < this->globalState.numByTopic.rows()); const size_t V = this->realV; std::vector ret(V); FLOAT 
sum = this->globalState.numByTopic[tid] + V * eta; @@ -439,15 +541,14 @@ namespace tomoto return ret; } - template + template std::vector _infer(_Iter docFirst, _Iter docLast, size_t maxIter, FLOAT tolerance, size_t numWorkers) const { decltype(static_cast(this)->makeGeneratorForInit(nullptr)) generator; - if (!(_Flags & flags::generator_by_doc)) + if (!(m_flags & flags::generator_by_doc)) { generator = static_cast(this)->makeGeneratorForInit(nullptr); } - if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); ThreadPool pool(numWorkers, numWorkers * 8); if (_Together) { @@ -459,28 +560,25 @@ namespace tomoto initializeDocState(*d, nullptr, generator, tmpState, rgc); } - std::vector localData((_Flags & flags::shared_state) ? 0 : pool.getNumWorkers(), tmpState); + std::vector localData((m_flags & flags::shared_state) ? 0 : pool.getNumWorkers(), tmpState); std::vector rgs; for (size_t i = 0; i < pool.getNumWorkers(); ++i) rgs.emplace_back(rgc()); for (size_t i = 0; i < maxIter; ++i) { std::vector> res; - performSampling(pool, - (_Flags & flags::shared_state) ? &tmpState : localData.data(), rgs.data(), res, - docFirst, docLast, &DerivedClass::sampleDocument); - if (&DerivedClass::sampleDocument_2 != &LDAModel::sampleDocument_2) performSampling(pool, - (_Flags & flags::shared_state) ? &tmpState : localData.data(), rgs.data(), res, - docFirst, docLast, &DerivedClass::sampleDocument_2); - if(!(_Flags & flags::shared_state)) static_cast(this)->mergeState(pool, tmpState, tState, localData.data(), rgs.data()); + performSampling<_ps>(pool, + (m_flags & flags::shared_state) ? &tmpState : localData.data(), rgs.data(), res, + docFirst, docLast); + static_cast(this)->template mergeState<_ps>(pool, tmpState, tState, localData.data(), rgs.data()); static_cast(this)->template sampleGlobalLevel<>( - &pool, (_Flags & flags::shared_state) ? &tmpState : localData.data(), rgs.data(), docFirst, docLast); + &pool, (m_flags & flags::shared_state) ? 
&tmpState : localData.data(), rgs.data(), docFirst, docLast); } double ll = static_cast(this)->getLLRest(tmpState) - static_cast(this)->getLLRest(this->globalState); ll += static_cast(this)->template getLLDocs<>(docFirst, docLast); return { ll }; } - else if (_Flags & flags::shared_state) + else if (m_flags & flags::shared_state) { std::vector ret; const double gllRest = static_cast(this)->getLLRest(this->globalState); @@ -491,8 +589,7 @@ namespace tomoto initializeDocState(*d, nullptr, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) { - static_cast(this)->sampleDocument(*d, -1, tmpState, rgc, i); - static_cast(this)->sampleDocument_2(*d, -1, tmpState, rgc, i); + static_cast(this)->template sampleDocument(*d, -1, tmpState, rgc, i); static_cast(this)->template sampleGlobalLevel<>( &pool, &tmpState, &rgc, &*d, &*d + 1); } @@ -515,8 +612,7 @@ namespace tomoto initializeDocState(*d, nullptr, generator, tmpState, rgc); for (size_t i = 0; i < maxIter; ++i) { - static_cast(this)->sampleDocument(*d, -1, tmpState, rgc, i); - static_cast(this)->sampleDocument_2(*d, -1, tmpState, rgc, i); + static_cast(this)->template sampleDocument(*d, -1, tmpState, rgc, i); static_cast(this)->template sampleGlobalLevel<>( nullptr, &tmpState, &rgc, &*d, &*d + 1); } @@ -526,7 +622,7 @@ namespace tomoto })); } std::vector ret; - for (auto&& r : res) ret.emplace_back(r.get()); + for (auto& r : res) ret.emplace_back(r.get()); return ret; } } @@ -581,11 +677,11 @@ namespace tomoto size_t docId = 0; for (auto& doc : this->docs) { - doc.template update<>((_Flags & flags::continuous_doc_data) ? numByTopicDoc.col(docId++).data() : nullptr, *static_cast(this)); + doc.template update<>((m_flags & flags::continuous_doc_data) ? 
numByTopicDoc.col(docId++).data() : nullptr, *static_cast(this)); } } - void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) + void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) override { if (initDocs) this->removeStopwords(minWordCnt, removeTopN); static_cast(this)->updateWeakArray(); @@ -631,10 +727,10 @@ namespace tomoto } decltype(static_cast(this)->makeGeneratorForInit(nullptr)) generator; - if(!(_Flags & flags::generator_by_doc)) generator = static_cast(this)->makeGeneratorForInit(nullptr); + if(!(m_flags & flags::generator_by_doc)) generator = static_cast(this)->makeGeneratorForInit(nullptr); for (auto& doc : this->docs) { - initializeDocState(doc, (_Flags & flags::continuous_doc_data) ? numByTopicDoc.col(&doc - &this->docs[0]).data() : nullptr, generator, this->globalState, this->rg); + initializeDocState(doc, (m_flags & flags::continuous_doc_data) ? numByTopicDoc.col(&doc - &this->docs[0]).data() : nullptr, generator, this->globalState, this->rg); } } else diff --git a/src/TopicModel/LLDAModel.hpp b/src/TopicModel/LLDAModel.hpp index 28ab50e..e6d54e5 100644 --- a/src/TopicModel/LLDAModel.hpp +++ b/src/TopicModel/LLDAModel.hpp @@ -15,14 +15,14 @@ namespace tomoto typename _Derived = void, typename _DocType = DocumentLLDA<_TW>, typename _ModelState = ModelStateLDA<_TW>> - class LLDAModel : public LDAModel<_TW, flags::generator_by_doc, _Interface, + class LLDAModel : public LDAModel<_TW, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, typename std::conditional::value, LLDAModel<_TW>, _Derived>::type, _DocType, _ModelState> { static constexpr const char* TMID = "LLDA"; protected: using DerivedClass = typename std::conditional::value, LLDAModel<_TW>, _Derived>::type; - using BaseClass = LDAModel<_TW, flags::generator_by_doc, _Interface, DerivedClass, _DocType, _ModelState>; + using BaseClass = LDAModel<_TW, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, 
DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -145,8 +145,8 @@ namespace tomoto return ret; } - const Dictionary& getTopicLabelDict() const { return topicLabelDict; } + const Dictionary& getTopicLabelDict() const override { return topicLabelDict; } - size_t getNumTopicsPerLabel() const { return 1; } + size_t getNumTopicsPerLabel() const override { return 1; } }; } diff --git a/src/TopicModel/MGLDAModel.hpp b/src/TopicModel/MGLDAModel.hpp index df8262d..7f6fd19 100644 --- a/src/TopicModel/MGLDAModel.hpp +++ b/src/TopicModel/MGLDAModel.hpp @@ -16,13 +16,13 @@ namespace tomoto typename _Derived = void, typename _DocType = DocumentMGLDA<_TW>, typename _ModelState = ModelStateLDA<_TW>> - class MGLDAModel : public LDAModel<_TW, 0, _Interface, + class MGLDAModel : public LDAModel<_TW, flags::partitioned_multisampling, _Interface, typename std::conditional::value, MGLDAModel<_TW>, _Derived>::type, _DocType, _ModelState> { protected: using DerivedClass = typename std::conditional::value, MGLDAModel<_TW>, _Derived>::type; - using BaseClass = LDAModel<_TW, 0, _Interface, DerivedClass, _DocType, _ModelState>; + using BaseClass = LDAModel<_TW, flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -97,18 +97,28 @@ namespace tomoto } } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { + size_t b = 0, e = doc.words.size(); + if (_ps == ParallelScheme::partition) + { + b = this->chunkOffsetByDoc(partitionId, docId); + e = this->chunkOffsetByDoc(partitionId + 1, docId); + } + + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) 
? this->vChunkOffset[partitionId - 1] : 0; + const auto K = this->K; - for (size_t w = 0; w < doc.words.size(); ++w) + for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; - addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w] - (doc.Zs[w] < K ? 0 : K), doc.sents[w], doc.Vs[w], doc.Zs[w] < K ? 0 : 1); - auto dist = getVZLikelihoods(ld, doc, doc.words[w], doc.sents[w]); + addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w] - (doc.Zs[w] < K ? 0 : K), doc.sents[w], doc.Vs[w], doc.Zs[w] < K ? 0 : 1); + auto dist = getVZLikelihoods(ld, doc, doc.words[w] - vOffset, doc.sents[w]); auto vz = sample::sampleFromDiscreteAcc(dist, dist + T * (K + KL), rgs); doc.Vs[w] = vz / (K + KL); doc.Zs[w] = vz % (K + KL); - addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w] - (doc.Zs[w] < K ? 0 : K), doc.sents[w], doc.Vs[w], doc.Zs[w] < K ? 0 : 1); + addWordTo<1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w] - (doc.Zs[w] < K ? 0 : K), doc.sents[w], doc.Vs[w], doc.Zs[w] < K ? 0 : 1); } } @@ -206,7 +216,7 @@ namespace tomoto if(K) ll += (math::lgammaT(K*alpha) - math::lgammaT(alpha)*K) * this->docs.size(); for (size_t i = 0; i < this->docs.size(); ++i) { - auto&& doc = this->docs[i]; + auto& doc = this->docs[i]; const size_t S = doc.numBySent.size(); if (K) { @@ -272,6 +282,13 @@ namespace tomoto void prepareDoc(_DocType& doc, WeightType* topicDocPtr, size_t wordSize) const { + sortAndWriteOrder(doc.words, doc.wOrder); + auto tmp = doc.sents; + for (size_t i = 0; i < doc.wOrder.size(); ++i) + { + doc.sents[doc.wOrder[i]] = tmp[i]; + } + const size_t S = doc.numBySent.size(); std::fill(doc.numBySent.begin(), doc.numBySent.end(), 0); doc.Zs = tvector(wordSize); diff --git a/src/TopicModel/PA.h b/src/TopicModel/PA.h index 520e7e9..8a96e18 100644 --- a/src/TopicModel/PA.h +++ b/src/TopicModel/PA.h @@ -29,5 +29,8 @@ namespace tomoto virtual FLOAT getSubAlpha(TID k1, TID k2) const = 0; virtual std::vector getSubTopicBySuperTopic(TID k) const = 0; virtual std::vector> 
getSubTopicBySuperTopicSorted(TID k, size_t topN) const = 0; + + virtual std::vector getSubTopicsByDoc(const DocumentBase* doc) const = 0; + virtual std::vector> getSubTopicsByDocSorted(const DocumentBase* doc, size_t topN) const = 0; }; -} \ No newline at end of file +} diff --git a/src/TopicModel/PAModel.cpp b/src/TopicModel/PAModel.cpp index cff5828..ad1a9c1 100644 --- a/src/TopicModel/PAModel.cpp +++ b/src/TopicModel/PAModel.cpp @@ -10,4 +10,4 @@ namespace tomoto { SWITCH_TW(_weight, PAModel, _K, _K2, _alpha, _eta, _rg); } -} \ No newline at end of file +} diff --git a/src/TopicModel/PAModel.hpp b/src/TopicModel/PAModel.hpp index de49d82..888be39 100644 --- a/src/TopicModel/PAModel.hpp +++ b/src/TopicModel/PAModel.hpp @@ -37,7 +37,7 @@ namespace tomoto friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; - size_t K2; + TID K2; FLOAT epsilon = 1e-5; size_t iteration = 5; @@ -102,51 +102,115 @@ namespace tomoto updateCnt(ld.numByTopicWord(z2, vid), INC * weight); } - void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt) const + template + void sampleDocument(_DocType& doc, size_t docId, _ModelState& ld, RandGen& rgs, size_t iterationCnt, size_t partitionId = 0) const { - for (size_t w = 0; w < doc.words.size(); ++w) + size_t b = 0, e = doc.words.size(); + if (_ps == ParallelScheme::partition) + { + b = this->chunkOffsetByDoc(partitionId, docId); + e = this->chunkOffsetByDoc(partitionId + 1, docId); + } + + size_t vOffset = (_ps == ParallelScheme::partition && partitionId) ? 
this->vChunkOffset[partitionId - 1] : 0; + for (size_t w = b; w < e; ++w) { if (doc.words[w] >= this->realV) continue; - addWordTo<-1>(ld, doc, w, doc.words[w], doc.Zs[w], doc.Z2s[w]); - auto dist = getZLikelihoods(ld, doc, docId, doc.words[w]); + addWordTo<-1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); + auto dist = getZLikelihoods(ld, doc, docId, doc.words[w] - vOffset); auto z = sample::sampleFromDiscreteAcc(dist, dist + this->K * K2, rgs); doc.Zs[w] = z / K2; doc.Z2s[w] = z % K2; - addWordTo<1>(ld, doc, w, doc.words[w], doc.Zs[w], doc.Z2s[w]); + addWordTo<1>(ld, doc, w, doc.words[w] - vOffset, doc.Zs[w], doc.Z2s[w]); } } + void distributePartition(ThreadPool& pool, _ModelState* localData) + { + std::vector> res = pool.enqueueToAll([&](size_t partitionId) + { + size_t b = partitionId ? this->vChunkOffset[partitionId - 1] : 0, + e = this->vChunkOffset[partitionId]; + + localData[partitionId].numByTopicWord = this->globalState.numByTopicWord.block(0, b, this->globalState.numByTopicWord.rows(), e - b); + localData[partitionId].numByTopic = this->globalState.numByTopic; + localData[partitionId].numByTopic1_2 = this->globalState.numByTopic1_2; + localData[partitionId].numByTopic2 = this->globalState.numByTopic2; + if (!localData[partitionId].zLikelihood.size()) localData[partitionId].zLikelihood = this->globalState.zLikelihood; + }); + + for (auto& r : res) r.get(); + } + + template void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { - std::vector> res(pool.getNumWorkers()); + std::vector> res; - tState = globalState; - globalState = localData[0]; - for (size_t i = 1; i < pool.getNumWorkers(); ++i) + if (_ps == ParallelScheme::copy_merge) { - globalState.numByTopic += localData[i].numByTopic - tState.numByTopic; - globalState.numByTopic1_2 += localData[i].numByTopic1_2 - tState.numByTopic1_2; - globalState.numByTopic2 += localData[i].numByTopic2 - tState.numByTopic2; - 
globalState.numByTopicWord += localData[i].numByTopicWord - tState.numByTopicWord; - } + tState = globalState; + globalState = localData[0]; + for (size_t i = 1; i < pool.getNumWorkers(); ++i) + { + globalState.numByTopic += localData[i].numByTopic - tState.numByTopic; + globalState.numByTopic1_2 += localData[i].numByTopic1_2 - tState.numByTopic1_2; + globalState.numByTopic2 += localData[i].numByTopic2 - tState.numByTopic2; + globalState.numByTopicWord += localData[i].numByTopicWord - tState.numByTopicWord; + } - // make all count being positive - if (_TW != TermWeight::one) - { - globalState.numByTopic = globalState.numByTopic.cwiseMax(0); - globalState.numByTopic1_2 = globalState.numByTopic1_2.cwiseMax(0); - globalState.numByTopic2 = globalState.numByTopic2.cwiseMax(0); - globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); - } + // make all count being positive + if (_TW != TermWeight::one) + { + globalState.numByTopic = globalState.numByTopic.cwiseMax(0); + globalState.numByTopic1_2 = globalState.numByTopic1_2.cwiseMax(0); + globalState.numByTopic2 = globalState.numByTopic2.cwiseMax(0); + globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); + } - for (size_t i = 0; i < pool.getNumWorkers(); ++i) + for (size_t i = 0; i < pool.getNumWorkers(); ++i) + { + res.emplace_back(pool.enqueue([&, this, i](size_t threadId) + { + localData[i] = globalState; + })); + } + } + else if (_ps == ParallelScheme::partition) { - res[i] = pool.enqueue([&, this, i](size_t threadId) + res = pool.enqueueToAll([&](size_t partitionId) { - localData[i] = globalState; + size_t b = partitionId ? 
this->vChunkOffset[partitionId - 1] : 0, + e = this->vChunkOffset[partitionId]; + globalState.numByTopicWord.block(0, b, globalState.numByTopicWord.rows(), e - b) = localData[partitionId].numByTopicWord; + }); + for (auto& r : res) r.get(); + res.clear(); + + tState.numByTopic1_2 = globalState.numByTopic1_2; + globalState.numByTopic1_2 = localData[0].numByTopic1_2; + for (size_t i = 1; i < pool.getNumWorkers(); ++i) + { + globalState.numByTopic1_2 += localData[i].numByTopic1_2 - tState.numByTopic1_2; + } + + // make all count being positive + if (_TW != TermWeight::one) + { + globalState.numByTopicWord = globalState.numByTopicWord.cwiseMax(0); + } + globalState.numByTopic = globalState.numByTopic1_2.rowwise().sum(); + globalState.numByTopic2 = globalState.numByTopicWord.rowwise().sum(); + + res = pool.enqueueToAll([&](size_t threadId) + { + localData[threadId].numByTopic = globalState.numByTopic; + localData[threadId].numByTopic1_2 = globalState.numByTopic1_2; + localData[threadId].numByTopic2 = globalState.numByTopic2; }); } - for (auto&& r : res) r.get(); + + for (auto& r : res) r.get(); } template @@ -262,7 +326,7 @@ namespace tomoto FLOAT getSubAlpha(TID k1, TID k2) const override { return subAlphas(k1, k2); } - std::vector getSubTopicBySuperTopic(TID k) const + std::vector getSubTopicBySuperTopic(TID k) const override { assert(k < this->K); FLOAT sum = this->globalState.numByTopic[k] + subAlphaSum[k]; @@ -270,11 +334,29 @@ namespace tomoto return { ret.data(), ret.data() + K2 }; } - std::vector> getSubTopicBySuperTopicSorted(TID k, size_t topN) const + std::vector> getSubTopicBySuperTopicSorted(TID k, size_t topN) const override { return extractTopN(getSubTopicBySuperTopic(k), topN); } + std::vector getSubTopicsByDoc(const _DocType& doc) const + { + std::vector ret(K2); + Eigen::Map> { ret.data(), K2 }.array() = + ((doc.numByTopic1_2.array().template cast() + subAlphas.array()).colwise().sum()) / (doc.getSumWordWeight() + subAlphas.sum()); + return ret; + } + 
+ std::vector getSubTopicsByDoc(const DocumentBase* doc) const override + { + return static_cast(this)->getSubTopicsByDoc(*static_cast(doc)); + } + + std::vector> getSubTopicsByDocSorted(const DocumentBase* doc, size_t topN) const override + { + return extractTopN(getSubTopicsByDoc(doc), topN); + } + std::vector _getWidsByTopic(TID k2) const { assert(k2 < K2); diff --git a/src/TopicModel/PLDAModel.hpp b/src/TopicModel/PLDAModel.hpp index a08eb93..9373a88 100644 --- a/src/TopicModel/PLDAModel.hpp +++ b/src/TopicModel/PLDAModel.hpp @@ -15,14 +15,14 @@ namespace tomoto typename _Derived = void, typename _DocType = DocumentLLDA<_TW>, typename _ModelState = ModelStateLDA<_TW>> - class PLDAModel : public LDAModel<_TW, flags::generator_by_doc, _Interface, + class PLDAModel : public LDAModel<_TW, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, typename std::conditional::value, PLDAModel<_TW>, _Derived>::type, _DocType, _ModelState> { static constexpr const char* TMID = "PLDA"; protected: using DerivedClass = typename std::conditional::value, PLDAModel<_TW>, _Derived>::type; - using BaseClass = LDAModel<_TW, flags::generator_by_doc, _Interface, DerivedClass, _DocType, _ModelState>; + using BaseClass = LDAModel<_TW, flags::generator_by_doc | flags::partitioned_multisampling, _Interface, DerivedClass, _DocType, _ModelState>; friend BaseClass; friend typename BaseClass::BaseClass; using WeightType = typename BaseClass::WeightType; @@ -152,10 +152,10 @@ namespace tomoto return ret; } - const Dictionary& getTopicLabelDict() const { return topicLabelDict; } + const Dictionary& getTopicLabelDict() const override { return topicLabelDict; } - size_t getNumLatentTopics() const { return numLatentTopics; } + size_t getNumLatentTopics() const override { return numLatentTopics; } - size_t getNumTopicsPerLabel() const { return numTopicsPerLabel; } + size_t getNumTopicsPerLabel() const override { return numTopicsPerLabel; } }; } diff --git 
a/src/TopicModel/SLDAModel.hpp b/src/TopicModel/SLDAModel.hpp index d1a0768..897579d 100644 --- a/src/TopicModel/SLDAModel.hpp +++ b/src/TopicModel/SLDAModel.hpp @@ -68,7 +68,7 @@ namespace tomoto { } - ISLDAModel::GLM getType() const { return ISLDAModel::GLM::linear; } + ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::linear; } void updateZLL( Eigen::Matrix& zLikelihood, @@ -100,7 +100,7 @@ namespace tomoto } FLOAT estimate(const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, - FLOAT docSize) const + FLOAT docSize) const override { return (this->regressionCoef.array() * numByTopic.array().template cast()).sum() / std::max(docSize, 0.01f); @@ -120,7 +120,7 @@ namespace tomoto { } - ISLDAModel::GLM getType() const { return ISLDAModel::GLM::binary_logistic; } + ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::binary_logistic; } void updateZLL( Eigen::Matrix& zLikelihood, @@ -161,7 +161,7 @@ namespace tomoto } FLOAT estimate(const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, - FLOAT docSize) const + FLOAT docSize) const override { FLOAT z = (this->regressionCoef.array() * numByTopic.array().template cast()).sum() / std::max(docSize, 0.01f); @@ -172,7 +172,7 @@ namespace tomoto }; } - template, @@ -332,14 +332,14 @@ namespace tomoto std::copy(_nuSq.begin(), _nuSq.end(), nuSq.data()); } - std::vector getRegressionCoef(size_t f) const + std::vector getRegressionCoef(size_t f) const override { return { responseVars[f]->regressionCoef.data(), responseVars[f]->regressionCoef.data() + this->K }; } GETTER(F, size_t, F); - ISLDAModel::GLM getTypeOfVar(size_t f) const + ISLDAModel::GLM getTypeOfVar(size_t f) const override { return responseVars[f]->getType(); } @@ -361,7 +361,7 @@ namespace tomoto return make_unique<_DocType>(doc); } - std::vector estimateVars(const DocumentBase* doc) const + std::vector estimateVars(const DocumentBase* doc) const override { std::vector ret; auto pdoc = dynamic_cast(doc); diff --git 
a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index ad85e33..edce651 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -21,6 +21,20 @@ namespace tomoto DEFINE_SERIALIZER(serializer::MagicConstant("Document"), weight, words, wOrder); }; + enum class ParallelScheme { default_, none, copy_merge, partition, size }; + + inline const char* toString(ParallelScheme ps) + { + switch (ps) + { + case ParallelScheme::default_: return "default"; + case ParallelScheme::none: return "none"; + case ParallelScheme::copy_merge: return "copy_merge"; + case ParallelScheme::partition: return "partition"; + default: return "unknown"; + } + } + class ITopicModel { public: @@ -36,7 +50,7 @@ namespace tomoto virtual const Dictionary& getVocabDict() const = 0; virtual const std::vector& getVocabFrequencies() const = 0; - virtual int train(size_t iteration, size_t numWorkers) = 0; + virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_) = 0; virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) = 0; virtual std::vector getWidsByTopic(TID tid) const = 0; virtual std::vector> getWordsByTopicSorted(TID tid, size_t topN) const = 0; @@ -45,7 +59,7 @@ namespace tomoto virtual std::vector getTopicsByDoc(const DocumentBase* doc) const = 0; virtual std::vector> getTopicsByDocSorted(const DocumentBase* doc, size_t topN) const = 0; - virtual std::vector infer(const std::vector& docs, size_t maxIter, FLOAT tolerance, size_t numWorkers, bool together) const = 0; + virtual std::vector infer(const std::vector& docs, size_t maxIter, FLOAT tolerance, size_t numWorkers, ParallelScheme ps, bool together) const = 0; virtual ~ITopicModel() {} }; @@ -55,7 +69,7 @@ namespace tomoto typedef std::pair<_TyKey, _TyValue> pair_t; std::vector ret; _TyKey k = 0; - for (auto&& t : vec) + for (auto& t : vec) { ret.emplace_back(std::make_pair(k++, t)); } @@ -73,7 +87,8 @@ namespace tomoto { 
continuous_doc_data = 1 << 0, shared_state = 1 << 1, - end_flag_of_TopicModel = 1 << 2, + partitioned_multisampling = 1 << 2, + end_flag_of_TopicModel = 1 << 3, }; } @@ -96,6 +111,8 @@ namespace tomoto size_t realV = 0; // vocab size after removing stopwords size_t realN = 0; // total word size after removing stopwords + std::unique_ptr cachedPool; + void _saveModel(std::ostream& writer, bool fullModel) const { serializer::writeMany(writer, @@ -206,7 +223,7 @@ namespace tomoto if (minWordCnt <= 1 && removeTopN == 0) realV = dict.size(); std::vector order; sortAndWriteOrder(vocabFrequencies, order, removeTopN, std::greater()); - realV = std::find_if(vocabFrequencies.begin(), vocabFrequencies.end(), [minWordCnt](size_t a) + realV = std::find_if(vocabFrequencies.begin(), vocabFrequencies.end() - std::min(removeTopN, vocabFrequencies.size()), [minWordCnt](size_t a) { return a < minWordCnt; }) - vocabFrequencies.begin(); @@ -251,35 +268,77 @@ namespace tomoto { } - int train(size_t iteration, size_t numWorkers) override + static ParallelScheme getRealScheme(ParallelScheme ps) + { + switch (ps) + { + case ParallelScheme::default_: + if ((_Flags & flags::partitioned_multisampling)) return ParallelScheme::partition; + if ((_Flags & flags::shared_state)) return ParallelScheme::none; + return ParallelScheme::copy_merge; + case ParallelScheme::copy_merge: + if ((_Flags & flags::shared_state)) THROW_ERROR_WITH_INFO(exception::InvalidArgument, + std::string{ "This model doesn't provide ParallelScheme::" } + toString(ps)); + case ParallelScheme::partition: + if (!(_Flags & flags::partitioned_multisampling)) THROW_ERROR_WITH_INFO(exception::InvalidArgument, + std::string{ "This model doesn't provide ParallelScheme::" } +toString(ps)); + } + return ps; + } + + int train(size_t iteration, size_t numWorkers, ParallelScheme ps) override { if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); - ThreadPool pool(numWorkers); + ps = getRealScheme(ps); + if (numWorkers == 1 
|| (_Flags & flags::shared_state)) ps = ParallelScheme::none; + if (!cachedPool || cachedPool->getNumWorkers() != numWorkers) + { + cachedPool = make_unique(numWorkers); + } + std::vector<_ModelState> localData; std::vector localRG; for (size_t i = 0; i < numWorkers; ++i) { localRG.emplace_back(RandGen{rg()}); - if(!(_Flags & flags::shared_state)) localData.emplace_back(static_cast<_Derived*>(this)->globalState); + if(ps == ParallelScheme::copy_merge) localData.emplace_back(static_cast<_Derived*>(this)->globalState); } + if (ps == ParallelScheme::partition) + { + localData.resize(numWorkers); + static_cast<_Derived*>(this)->updatePartition(*cachedPool, localData.data()); + } + + auto state = ps == ParallelScheme::none ? &globalState : localData.data(); for (size_t i = 0; i < iteration; ++i) { while (1) { try { - static_cast<_Derived*>(this)->trainOne(pool, - _Flags & flags::shared_state ? &globalState : localData.data(), - localRG.data()); + switch (ps) + { + case ParallelScheme::none: + static_cast<_Derived*>(this)->template trainOne( + *cachedPool, state, localRG.data()); + break; + case ParallelScheme::copy_merge: + static_cast<_Derived*>(this)->template trainOne( + *cachedPool, state, localRG.data()); + break; + case ParallelScheme::partition: + static_cast<_Derived*>(this)->template trainOne( + *cachedPool, state, localRG.data()); + break; + } break; } catch (const exception::TrainingError& e) { std::cerr << e.what() << std::endl; - int ret = static_cast<_Derived*>(this)->restoreFromTrainingError(e, pool, - _Flags & flags::shared_state ? 
&globalState : localData.data(), - localRG.data()); + int ret = static_cast<_Derived*>(this)->restoreFromTrainingError( + e, *cachedPool, state, localRG.data()); if(ret < 0) return ret; } } @@ -298,7 +357,7 @@ namespace tomoto return exp(-getLLPerWord()); } - std::vector getWidsByTopic(TID tid) const + std::vector getWidsByTopic(TID tid) const override { return static_cast(this)->_getWidsByTopic(tid); } @@ -336,15 +395,39 @@ namespace tomoto return vid2String(getWidsByDocSorted(doc, topN)); } - std::vector infer(const std::vector& docs, size_t maxIter, FLOAT tolerance, size_t numWorkers, bool together) const override + std::vector infer(const std::vector& docs, size_t maxIter, FLOAT tolerance, size_t numWorkers, ParallelScheme ps, bool together) const override { + if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); + ps = getRealScheme(ps); + if (numWorkers == 1) ps = ParallelScheme::none; auto tx = [](DocumentBase* p)->DocType& { return *static_cast(p); }; - if(together) return static_cast(this)->template _infer( - makeTransformIter(docs.begin(), tx), makeTransformIter(docs.end(), tx), - maxIter, tolerance, numWorkers); - else return static_cast(this)->template _infer( - makeTransformIter(docs.begin(), tx), makeTransformIter(docs.end(), tx), - maxIter, tolerance, numWorkers); + auto b = makeTransformIter(docs.begin(), tx), e = makeTransformIter(docs.end(), tx); + + if (together) + { + switch (ps) + { + case ParallelScheme::none: + return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + case ParallelScheme::copy_merge: + return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + case ParallelScheme::partition: + return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + } + } + else + { + switch (ps) + { + case ParallelScheme::none: + return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + case ParallelScheme::copy_merge: + return 
static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + case ParallelScheme::partition: + return static_cast(this)->template _infer(b, e, maxIter, tolerance, numWorkers); + } + } + throw std::invalid_argument{ "invalid ParallelScheme" }; } std::vector getTopicsByDoc(const DocumentBase* doc) const override diff --git a/src/Utils/ThreadPool.hpp b/src/Utils/ThreadPool.hpp index 097ad82..a2b8639 100644 --- a/src/Utils/ThreadPool.hpp +++ b/src/Utils/ThreadPool.hpp @@ -19,19 +19,25 @@ namespace tomoto { class ThreadPool { public: - ThreadPool(size_t, size_t maxQueued = 0); + ThreadPool(size_t threads = 0, size_t maxQueued = 0); template auto enqueue(F&& f, Args&&... args) ->std::future::type>; + + template + auto enqueueToAll(F&& f, Args&&... args) + ->std::vector::type>>; + ~ThreadPool(); + size_t getNumWorkers() const { return workers.size(); } size_t getNumEnqued() const { return tasks.size(); } private: // need to keep track of threads so we can join them std::vector< std::thread > workers; // the task queue - std::queue< std::function > tasks; - + std::queue< std::function > shared_task; + std::vector< std::queue< std::function > > tasks; // synchronization std::mutex queue_mutex; std::condition_variable condition, inputCnd; @@ -39,30 +45,44 @@ namespace tomoto bool stop; }; + // the constructor just launches some amount of workers inline ThreadPool::ThreadPool(size_t threads, size_t _maxQueued) - : maxQueued(_maxQueued), stop(false) + : maxQueued(_maxQueued), stop(false), tasks(threads) { for (size_t i = 0; i < threads; ++i) - workers.emplace_back([this, i] { - for (;;) + workers.emplace_back([this, i] { - std::function task; + while (1) { - std::unique_lock lock(this->queue_mutex); - this->condition.wait(lock, - [this] { return this->stop || !this->tasks.empty(); }); - if (this->stop && this->tasks.empty()) return; - task = std::move(this->tasks.front()); - this->tasks.pop(); - if (this->maxQueued) this->inputCnd.notify_all(); + std::function 
task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this, i] { return this->stop || !this->shared_task.empty() || !this->tasks[i].empty(); }); + if (this->stop && this->shared_task.empty() && this->tasks[i].empty()) return; + if (this->tasks[i].empty()) + { + task = std::move(this->shared_task.front()); + this->shared_task.pop(); + } + else + { + task = std::move(this->tasks[i].front()); + this->tasks[i].pop(); + } + + if (this->maxQueued) this->inputCnd.notify_all(); + } + + //std::cout << "Start #" << i << std::endl; + task(i); + //std::cout << "End #" << i << std::endl; } - //std::cout << "Start #" << i << std::endl; - task(i); - //std::cout << "End #" << i << std::endl; - } - }); + }); + } } // add new work item to the pool @@ -81,16 +101,41 @@ namespace tomoto // don't allow enqueueing after stopping the pool if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); - if (maxQueued && tasks.size() >= maxQueued) + if (maxQueued && shared_task.size() >= maxQueued) { - inputCnd.wait(lock, [&]() { return tasks.size() < maxQueued; }); + inputCnd.wait(lock, [&]() { return shared_task.size() < maxQueued; }); } - tasks.emplace([task](size_t id) { (*task)(id); }); + shared_task.emplace([task](size_t id) { (*task)(id); }); } condition.notify_one(); return res; } + template + auto ThreadPool::enqueueToAll(F&& f, Args&&... 
args) + ->std::vector::type> > + { + using return_type = typename std::result_of::type; + + std::vector > ret; + std::unique_lock lock(queue_mutex); + for (size_t i = 0; i < workers.size(); ++i) + { + auto task = std::make_shared< std::packaged_task >( + std::bind(f, std::placeholders::_1, args...)); + + ret.emplace_back(task->get_future()); + + { + // don't allow enqueueing after stopping the pool + if (stop) throw std::runtime_error("enqueue on stopped ThreadPool"); + tasks[i].emplace([task](size_t id) { (*task)(id); }); + } + } + condition.notify_all(); + return ret; + } + // the destructor joins all threads inline ThreadPool::~ThreadPool() { @@ -102,4 +147,4 @@ namespace tomoto for (std::thread &worker : workers) worker.join(); } -} \ No newline at end of file +} diff --git a/src/Utils/exception.h b/src/Utils/exception.h index 4ab71d3..816e336 100644 --- a/src/Utils/exception.h +++ b/src/Utils/exception.h @@ -10,7 +10,13 @@ namespace tomoto public: using std::runtime_error::runtime_error; }; + + class InvalidArgument : public std::invalid_argument + { + public: + using std::invalid_argument::invalid_argument; + }; } } -#define THROW_ERROR_WITH_INFO(exec, msg) do {throw exec(tomoto::text::format("%s (%d): ", __FILE__, __LINE__) + msg); } while(0) \ No newline at end of file +#define THROW_ERROR_WITH_INFO(exec, msg) do {throw exec(tomoto::text::format("%s (%d): ", __FILE__, __LINE__) + msg); } while(0) diff --git a/src/Utils/sample.hpp b/src/Utils/sample.hpp index 7e012fc..aeb186b 100644 --- a/src/Utils/sample.hpp +++ b/src/Utils/sample.hpp @@ -127,10 +127,29 @@ namespace tomoto return z; } + struct FastRealGenerator + { + template + float operator()(Random& rg) + { + union + { + float f; + uint32_t u; + }; + + u = rg(); + u = (127 << 23) | (u & 0x7FFFFF); + return f - 1; + } + }; + template inline size_t sampleFromDiscreteAcc(RealIt begin, RealIt end, Random& rg) { - auto r = std::generate_canonical(rg) * *(end - 1); + //auto r = std::generate_canonical(rg) * 
*(end - 1); + FastRealGenerator dist; + auto r = dist(rg) * *(end - 1); size_t K = std::distance(begin, end); size_t z = 0; #ifdef __AVX__ diff --git a/src/Utils/tvector.hpp b/src/Utils/tvector.hpp index 2ef6177..2e59b50 100644 --- a/src/Utils/tvector.hpp +++ b/src/Utils/tvector.hpp @@ -492,7 +492,7 @@ namespace tomoto T* dp = dest.data() + dend; for (auto it = srcBegin; it != srcEnd; ++it) { - auto&& tv = **it; + auto& tv = **it; std::copy(tv.begin(), tv.end(), dp); tv = tvector{ dp, tv.size() }; dp += tv.size(); diff --git a/src/python/docs.h b/src/python/docs.h index f80e3f6..d426b96 100644 --- a/src/python/docs.h +++ b/src/python/docs.h @@ -31,6 +31,16 @@ DOC_SIGNATURE_EN_KO(Document_get_topic_dist__doc__, u8R""(Return a distribution of the topics in the document.)"", u8R""(현재 문헌의 토픽 확률 분포를 `list` 형태로 반환합니다.)""); +DOC_SIGNATURE_EN_KO(Document_get_sub_topics__doc__, + "get_topics(self, top_n=10)", + u8R""(Return the `top_n` sub topics with its probability of the document. (for only `tomotopy.PAModel`))"", + u8R""(현재 문헌의 상위 `top_n`개의 하위 토픽과 그 확률을 `tuple`의 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))""); + +DOC_SIGNATURE_EN_KO(Document_get_sub_topic_dist__doc__, + "get_topic_dist(self)", + u8R""(Return a distribution of the sub topics in the document. (for only `tomotopy.PAModel`))"", + u8R""(현재 문헌의 하위 토픽 확률 분포를 `list` 형태로 반환합니다. (`tomotopy.PAModel` 전용))""); + DOC_SIGNATURE_EN_KO(Document_get_words__doc__, "get_words(self, top_n=10)", u8R""(.. versionadded:: 0.4.2 @@ -189,7 +199,7 @@ words : iterable of str )""); DOC_SIGNATURE_EN_KO(LDA_train__doc__, - "train(self, iter=10, workers=0)", + "train(self, iter=10, workers=0, parallel=0)", u8R""(Train the model using Gibbs-sampling with `iter` iterations. Return `None`. After calling this method, you cannot `tomotopy.LDAModel.add_doc` more. @@ -200,6 +210,10 @@ iter : int workers : int an integer indicating the number of workers to perform samplings. If `workers` is 0, the number of cores in the system will be used. 
+parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + the parallelism scheme for training. the default value is ParallelScheme.DEFAULT which means that tomotopy selects the best scheme by model. )"", u8R""(깁스 샘플링을 `iter` 회 반복하여 현재 모델을 학습시킵니다. 반환값은 `None`입니다. 이 메소드가 호출된 이후에는 더 이상 `tomotopy.LDAModel.add_doc`로 현재 모델에 새로운 학습 문헌을 추가시킬 수 없습니다. @@ -211,6 +225,10 @@ iter : int workers : int 깁스 샘플링을 수행하는 데에 사용할 스레드의 개수입니다. 만약 이 값을 0으로 설정할 경우 시스템 내의 가용한 모든 코어가 사용됩니다. +parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + 학습에 사용할 병렬화 방법. 기본값은 ParallelScheme.DEFAULT로 이는 모델에 따라 최적의 방법을 tomotopy가 알아서 선택하도록 합니다. )""); DOC_SIGNATURE_EN_KO(LDA_get_topic_words__doc__, @@ -274,6 +292,10 @@ tolerance : float workers : int an integer indicating the number of workers to perform samplings. If `workers` is 0, the number of cores in the system will be used. +parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + the parallelism scheme for inference. the default value is ParallelScheme.DEFAULT which means that tomotopy selects the best scheme by model. together : bool all `doc`s are infered together in one process if True, otherwise each `doc` is infered independently. Its default value is `False`. )"", @@ -293,6 +315,10 @@ tolerance : float workers : int 깁스 샘플링을 수행하는 데에 사용할 스레드의 개수입니다. 만약 이 값을 0으로 설정할 경우 시스템 내의 가용한 모든 코어가 사용됩니다. +parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + 추론에 사용할 병렬화 방법. 기본값은 ParallelScheme.DEFAULT로 이는 모델에 따라 최적의 방법을 tomotopy가 알아서 선택하도록 합니다. together : bool 이 값이 True인 경우 입력한 `doc` 문헌들을 한 번에 모델에 넣고 추론을 진행합니다. False인 경우 각각의 문헌들을 별도로 모델에 넣어 추론합니다. 기본값은 `False`입니다. 
diff --git a/src/python/module.h b/src/python/module.h index b8889cf..51c6eec 100644 --- a/src/python/module.h +++ b/src/python/module.h @@ -141,6 +141,40 @@ PyObject* Document_##NAME(DocumentObject* self, void* closure)\ }\ } +#define DEFINE_DOCUMENT_GETTER_REORDER(DOCTYPE, NAME, FIELD) \ +PyObject* Document_##NAME(DocumentObject* self, void* closure)\ +{\ + try\ + {\ + if (!self->doc) throw runtime_error{ "doc is null!" };\ + do\ + {\ + auto* doc = dynamic_cast*>(self->doc);\ + if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ + } while (0);\ + do\ + {\ + auto* doc = dynamic_cast*>(self->doc);\ + if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ + } while (0);\ + do\ + {\ + auto* doc = dynamic_cast*>(self->doc);\ + if (doc) return buildPyValueReorder(doc->FIELD, doc->wOrder);\ + } while (0);\ + throw runtime_error{ "doc doesn't has '" #FIELD "' field!" };\ + }\ + catch (const bad_exception&)\ + {\ + return nullptr;\ + }\ + catch (const exception& e)\ + {\ + PyErr_SetString(PyExc_Exception, e.what());\ + return nullptr;\ + }\ +} + namespace py { template @@ -242,3 +276,40 @@ DEFINE_DOCUMENT_GETTER_PROTOTYPE(beta); DEFINE_DOCUMENT_GETTER_PROTOTYPE(y); DEFINE_DOCUMENT_GETTER_PROTOTYPE(labels); + +PyObject* Document_getSubTopics(DocumentObject* self, PyObject* args, PyObject *kwargs); +PyObject* Document_getSubTopicDist(DocumentObject* self); + +template +PyObject* buildPyValueReorder(const _Target& target, const _Order& order) +{ + if (order.empty()) + { + return py::buildPyValue(target); + } + else + { + using _OType = decltype(order[0]); + return py::buildPyValueTransform(order.begin(), order.end(), [&](_OType idx) + { + return target[idx]; + }); + } +} + +template +PyObject* buildPyValueReorder(const _Target& target, const _Order& order, _Tx&& transformer) +{ + if (order.empty()) + { + return py::buildPyValueTransform(target.begin(), target.end(), transformer); + } + else + { + using _OType = decltype(order[0]); + return 
py::buildPyValueTransform(order.begin(), order.end(), [&](_OType idx) + { + return transformer(target[idx]); + }); + } +} diff --git a/src/python/py_HDP.cpp b/src/python/py_HDP.cpp index 885e6e3..67fbafe 100644 --- a/src/python/py_HDP.cpp +++ b/src/python/py_HDP.cpp @@ -63,17 +63,17 @@ PyObject* Document_HDP_Z(DocumentObject* self, void* closure) do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->numTopicByTable[x].topic; }); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->numTopicByTable[x].topic; }); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->numTopicByTable[x].topic; }); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->numTopicByTable[x].topic; }); } while (0); return nullptr; } diff --git a/src/python/py_HLDA.cpp b/src/python/py_HLDA.cpp index 77f68ad..b4b826c 100644 --- a/src/python/py_HLDA.cpp +++ b/src/python/py_HLDA.cpp @@ -61,17 +61,17 @@ PyObject* Document_HLDA_Z(DocumentObject* self, void* closure) do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->path[x]; }); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->path[x]; }); + if (doc) return 
buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValueTransform(doc->Zs.begin(), doc->Zs.end(), [doc](size_t x) { return doc->path[x]; }); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder, [doc](size_t x) { return doc->path[x]; }); } while (0); return nullptr; } diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 596babe..8aeeed5 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -94,8 +94,9 @@ static PyObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwargs) { size_t iteration = 10, workers = 0; - static const char* kwlist[] = { "iter", "workers", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nn", (char**)kwlist, &iteration, &workers)) return nullptr; + tomoto::ParallelScheme ps = tomoto::ParallelScheme::default_; + static const char* kwlist[] = { "iter", "workers", "parallel", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnn", (char**)kwlist, &iteration, &workers, &ps)) return nullptr; try { if (!self->inst) throw runtime_error{ "inst is null" }; @@ -105,7 +106,7 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwa inst->prepare(true, self->minWordCnt, self->removeTopWord); self->isPrepared = true; } - inst->train(iteration, workers); + inst->train(iteration, workers, ps); Py_INCREF(Py_None); return Py_None; } @@ -180,9 +181,10 @@ static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa { PyObject *argDoc, *iter = nullptr, *item; size_t iteration = 100, workers = 0, together = 0; + tomoto::ParallelScheme ps = tomoto::ParallelScheme::default_; float tolerance = -1; - static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "together", nullptr }; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnp", 
(char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &together)) return nullptr; + static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnnp", (char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &ps, &together)) return nullptr; DEBUG_LOG("infer " << self->ob_base.ob_type << ", " << self->ob_base.ob_refcnt); try { @@ -205,7 +207,7 @@ static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa self->inst->prepare(true, self->minWordCnt, self->removeTopWord); self->isPrepared = true; } - auto ll = self->inst->infer(docs, iteration, tolerance, workers, !!together); + auto ll = self->inst->infer(docs, iteration, tolerance, workers, ps, !!together); PyObject* ret = PyList_New(docs.size()); size_t i = 0; for (auto d : docs) @@ -236,7 +238,7 @@ static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa { std::vector docs; docs.emplace_back((tomoto::DocumentBase*)doc->doc); - float ll = self->inst->infer(docs, iteration, tolerance, workers, !!together)[0]; + float ll = self->inst->infer(docs, iteration, tolerance, workers, ps, !!together)[0]; return Py_BuildValue("(Nf)", py::buildPyValue(self->inst->getTopicsByDoc(doc->doc)), ll); } else @@ -418,17 +420,17 @@ PyObject* Document_LDA_Z(DocumentObject* self, void* closure) do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValue(doc->Zs); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValue(doc->Zs); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); } while (0); do { auto* doc = dynamic_cast*>(self->doc); - if (doc) return py::buildPyValue(doc->Zs); + if (doc) return buildPyValueReorder(doc->Zs, doc->wOrder); } while (0); return nullptr; } diff --git a/src/python/py_MGLDA.cpp b/src/python/py_MGLDA.cpp index f67f119..05bd0d9 100644 --- 
a/src/python/py_MGLDA.cpp +++ b/src/python/py_MGLDA.cpp @@ -158,7 +158,7 @@ DEFINE_GETTER(tomoto::IMGLDAModel, MGLDA, getAlphaML); DEFINE_GETTER(tomoto::IMGLDAModel, MGLDA, getEtaL); DEFINE_GETTER(tomoto::IMGLDAModel, MGLDA, getT); -DEFINE_DOCUMENT_GETTER(tomoto::DocumentMGLDA, windows, Vs); +DEFINE_DOCUMENT_GETTER_REORDER(tomoto::DocumentMGLDA, windows, Vs); DEFINE_LOADER(MGLDA, MGLDA_type); diff --git a/src/python/py_PA.cpp b/src/python/py_PA.cpp index bdb3110..7ca7adc 100644 --- a/src/python/py_PA.cpp +++ b/src/python/py_PA.cpp @@ -144,8 +144,53 @@ static PyObject* PA_getTopicWordDist(TopicModelObject* self, PyObject* args, PyO } } + +PyObject* Document_getSubTopics(DocumentObject* self, PyObject* args, PyObject *kwargs) +{ + size_t topN = 10; + static const char* kwlist[] = { "top_n", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|n", (char**)kwlist, &topN)) return nullptr; + try + { + if (!self->parentModel->inst) throw runtime_error{ "inst is null" }; + auto* inst = static_cast(self->parentModel->inst); + if (!self->parentModel->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + return py::buildPyValue(inst->getSubTopicsByDocSorted(self->doc, topN)); + } + catch (const bad_exception&) + { + return nullptr; + } + catch (const exception& e) + { + PyErr_SetString(PyExc_Exception, e.what()); + return nullptr; + } +} + +PyObject* Document_getSubTopicDist(DocumentObject* self) +{ + try + { + if (!self->parentModel->inst) throw runtime_error{ "inst is null" }; + auto* inst = static_cast(self->parentModel->inst); + if (!self->parentModel->isPrepared) throw runtime_error{ "train() should be called first for calculating the topic distribution" }; + return py::buildPyValue(inst->getSubTopicsByDoc(self->doc)); + } + catch (const bad_exception&) + { + return nullptr; + } + catch (const exception& e) + { + PyErr_SetString(PyExc_Exception, e.what()); + return nullptr; + } +} + + 
DEFINE_GETTER(tomoto::IPAModel, PA, getK2); -DEFINE_DOCUMENT_GETTER(tomoto::DocumentPA, Z2, Z2s); +DEFINE_DOCUMENT_GETTER_REORDER(tomoto::DocumentPA, Z2, Z2s); DEFINE_LOADER(PA, PA_type); static PyMethodDef PA_methods[] = diff --git a/src/python/py_main.cpp b/src/python/py_main.cpp index 9ca2d85..9274bcb 100644 --- a/src/python/py_main.cpp +++ b/src/python/py_main.cpp @@ -35,8 +35,9 @@ void CorpusObject::dealloc(CorpusObject* self) PyObject * DocumentObject::repr(DocumentObject * self) { string ret = "doc->words) + for (size_t i = 0; i < self->doc->words.size(); ++i) { + auto w = self->doc->wOrder.empty() ? self->doc->words[i] : self->doc->words[self->doc->wOrder[i]]; ret += self->parentModel->inst->getVocabDict().toWord(w); ret.push_back(' '); } @@ -220,7 +221,6 @@ static PyObject* Document_getTopicDist(DocumentObject* self) } } - static PyObject* Document_getWords(DocumentObject* self, PyObject* args, PyObject *kwargs) { size_t topN = 10; @@ -247,6 +247,10 @@ static PyMethodDef Document_methods[] = { { "get_topics", (PyCFunction)Document_getTopics, METH_VARARGS | METH_KEYWORDS, Document_get_topics__doc__ }, { "get_topic_dist", (PyCFunction)Document_getTopicDist, METH_NOARGS, Document_get_topic_dist__doc__ }, +#ifdef TM_PA + { "get_sub_topics", (PyCFunction)Document_getSubTopics, METH_VARARGS | METH_KEYWORDS, Document_get_sub_topics__doc__ }, + { "get_sub_topic_dist", (PyCFunction)Document_getSubTopicDist, METH_NOARGS, Document_get_sub_topic_dist__doc__ }, +#endif { "get_words", (PyCFunction)Document_getWords, METH_VARARGS | METH_KEYWORDS, Document_get_words__doc__ }, { nullptr } }; @@ -285,7 +289,7 @@ static PyObject* Document_words(DocumentObject* self, void* closure) try { if (!self->doc) throw runtime_error{ "doc is null!" 
}; - return py::buildPyValue(self->doc->words); + return buildPyValueReorder(self->doc->words, self->doc->wOrder); } catch (const bad_exception&) { diff --git a/test/unit_test.py b/test/unit_test.py index 53ffd2c..0a15eca 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -1,7 +1,6 @@ -import sys import tomotopy as tp -test_case = [ +model_cases = [ (tp.LDAModel, 'test/sample.txt', 0, None, {'k':10}), (tp.LLDAModel, 'test/sample_with_md.txt', 0, None, {'k':5}), (tp.PLDAModel, 'test/sample_with_md.txt', 0, None, {'latent_topics':2, 'topics_per_label':2}), @@ -16,33 +15,61 @@ (tp.SLDAModel, 'test/sample_with_md.txt', 1, lambda x:list(map(float, x)), {'k':10, 'vars':'b'}), ] -def test_train(): +def train(cls, inputFile, mdFields, f, kargs): + print('Test train') tw = 0 - for cls, inputFile, mdFields, f, kargs in test_case: - print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw]), file=sys.stderr, flush=True) - mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) - print('Adding docs...', file=sys.stderr, flush=True) - unseen_docs = [] - for n, line in enumerate(open(inputFile, encoding='utf-8')): - ch = line.strip().split() - if len(ch) < mdFields + 1: continue - if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) - else: mdl.add_doc(ch) - mdl.train(200) + print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(200) -def test_save_and_load(): +def save_and_load(cls, inputFile, mdFields, f, kargs): + print('Test save & load') tw = 0 - for cls, inputFile, mdFields, f, kargs in test_case: - print('Initialize model %s with TW=%s ...' 
% (str(cls), ['one', 'idf', 'pmi'][tw]), file=sys.stderr, flush=True) - mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) - print('Adding docs...', file=sys.stderr, flush=True) - unseen_docs = [] - for n, line in enumerate(open(inputFile, encoding='utf-8')): - ch = line.strip().split() - if len(ch) < mdFields + 1: continue - if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) - else: mdl.add_doc(ch) - mdl.train(20) - mdl.save('test.model.{}.bin'.format(cls.__name__)) - mdl = cls.load('test.model.{}.bin'.format(cls.__name__)) - mdl.train(20) + print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(20) + mdl.save('test.model.{}.bin'.format(cls.__name__)) + mdl = cls.load('test.model.{}.bin'.format(cls.__name__)) + mdl.train(20) + +def infer(cls, inputFile, mdFields, f, kargs): + print('Test infer') + tw = 0 + print('Initialize model %s with TW=%s ...' 
% (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) + print('Adding docs...') + unseen_docs = [] + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if n < 20: unseen_docs.append(line) + else: + if mdFields: + mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: + mdl.add_doc(ch) + mdl.train(20) + for n, line in enumerate(unseen_docs): + if mdFields: + unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields])) + else: + unseen_docs[n] = mdl.make_doc(ch) + + mdl.infer(unseen_docs) + + +for model_case in model_cases: + for func in [train, save_and_load, infer]: + locals()['test_{}_{}'.format(model_case[0].__name__, func.__name__)] = (lambda f, mc: lambda: f(*mc))(func, model_case) diff --git a/tomotopy/__init__.py b/tomotopy/__init__.py index 6cf3693..d364951 100644 --- a/tomotopy/__init__.py +++ b/tomotopy/__init__.py @@ -31,6 +31,37 @@ class TermWeight(IntEnum): Use Pointwise Mutual Information term weighting. """ +class ParallelScheme(IntEnum): + """ + This enumeration is for Parallelizing Scheme: + There are three options for parallelizing and the basic one is DEFAULT. Not all models supports all options. + """ + + DEFAULT = 0 + """tomotopy chooses the best available parallelism scheme for your model""" + + NONE = 1 + """ + Turn off multi-threading for Gibbs sampling at training or inference. Operations other than Gibbs sampling may use multithreading. + """ + + COPY_MERGE = 2 + """ + Use Copy and Merge algorithm from AD-LDA. It consumes RAM in proportion to the number of workers. + This has advantages when you have a small number of workers and a small number of topics and vocabulary sizes in the model. + Prior to version 0.5, all models used this algorithm by default. + + > * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed algorithms for topic models. Journal of Machine Learning Research, 10(Aug), 1801-1828. 
+ """ + + PARTITION = 3 + """ + Use Partitioning algorithm from PCGS. It consumes only twice as much RAM as a single-threaded algorithm, regardless of the number of workers. + This has advantages when you have a large number of workers or a large number of topics and vocabulary sizes in the model. + + > * Yan, F., Xu, N., & Qi, Y. (2009). Parallel inference for latent dirichlet allocation on graphics processing units. In Advances in neural information processing systems (pp. 2134-2142). + """ + isa = '' """ Indicate which SIMD instruction set is used for acceleration. @@ -80,4 +111,20 @@ def _load(): 따라서 모든 문헌에 거의 골고루 등장하는 용어의 경우 낮은 가중치를 가지게 되며, 소수의 특정 문헌에만 집중적으로 등장하는 용어의 경우 높은 가중치를 가지게 됩니다.""" __pdoc__['TermWeight.PMI'] = """점별 상호정보량(PMI)을 가중치로 사용합니다.""" + __pdoc__['ParallelScheme'] = """병렬화 기법을 선택하는 데에 사용되는 열거형입니다. 총 3가지 기법을 사용할 수 있으나, 모든 모델이 아래의 기법을 전부 지원하지는 않습니다.""" + __pdoc__['ParallelScheme.DEFAULT'] = """tomotopy가 모델에 따라 적합한 병럴화 기법을 선택하도록 합니다. 이 값이 기본값입니다.""" + __pdoc__['ParallelScheme.NONE'] = """깁스 샘플링에 병렬화 기법을 사용하지 않습니다. 깁스 샘플링을 제외한 다른 연산들은 여전히 병렬로 처리될 수 있습니다.""" + __pdoc__['ParallelScheme.COPY_MERGE'] = """ +AD-LDA에서 제안된 복사 후 합치기 알고리즘을 사용합니다. 이는 작업자 수에 비례해 메모리를 소모합니다. +작업자 수가 적거나, 토픽 개수 혹은 어휘 집합의 크기가 작을 때 유리합니다. +0.5버전 이전까지는 모든 모델은 이 알고리즘을 기본으로 사용했습니다. + +> * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed algorithms for topic models. Journal of Machine Learning Research, 10(Aug), 1801-1828. +""" + __pdoc__['ParallelScheme.PARTITION'] = """ +PCGS에서 제안된 분할 샘플링 알고리즈을 사용합니다. 작업자 수에 관계없이 단일 스레드 알고리즘에 2배의 메모리만 소모합니다. +작업자 수가 많거나, 토픽 개수 혹은 어휘 집합의 크기가 클 때 유리합니다. + +> * Yan, F., Xu, N., & Qi, Y. (2009). Parallel inference for latent dirichlet allocation on graphics processing units. In Advances in neural information processing systems (pp. 2134-2142). 
+""" del _load, IntEnum, os From 61dd99969c1c36f79524ff6fce1c52ee13a616df Mon Sep 17 00:00:00 2001 From: bab2min Date: Sat, 28 Dec 2019 22:11:30 +0900 Subject: [PATCH 02/12] Update pull_request_test.yml --- .github/workflows/pull_request_test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index 328bc55..ac24ed2 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -29,7 +29,7 @@ jobs: run: | /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest /opt/python/${{ matrix.cp }}/bin/python setup.py build install - /opt/python/${{ matrix.cp }}/bin/python -m pytest test/unit_test.py + /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/unit_test.py build_macos: name: Build for macOS @@ -57,7 +57,7 @@ jobs: - name: Test run: | python -m pip install pytest - python -m pytest test/unit_test.py + python -m pytest --verbose test/unit_test.py build_windows: name: Build for Windows @@ -88,4 +88,4 @@ jobs: - name: Test run: | python -m pip install pytest - python -m pytest test/unit_test.py + python -m pytest --verbose test/unit_test.py From 325bdc353382512d020c608c2a32d0365af9095f Mon Sep 17 00:00:00 2001 From: bab2min Date: Sat, 28 Dec 2019 22:39:34 +0900 Subject: [PATCH 03/12] Update pull_request_test.yml --- .github/workflows/pull_request_test.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request_test.yml b/.github/workflows/pull_request_test.yml index ac24ed2..7ada04c 100644 --- a/.github/workflows/pull_request_test.yml +++ b/.github/workflows/pull_request_test.yml @@ -25,10 +25,12 @@ jobs: git checkout tags/3.3.7 cd .. 
mv eigen-git-mirror include - - name: Build & Test + - name: Build run: | - /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest /opt/python/${{ matrix.cp }}/bin/python setup.py build install + - name: Test + run: | + /opt/python/${{ matrix.cp }}/bin/python -m pip install pytest /opt/python/${{ matrix.cp }}/bin/python -m pytest --verbose test/unit_test.py build_macos: From 0226cb122039b9b555c102eae23b979ba8c25b53 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sat, 28 Dec 2019 23:21:33 +0900 Subject: [PATCH 04/12] limit for max number of threads --- src/TopicModel/LDAModel.hpp | 21 ++++++++++++++++++++- src/TopicModel/TopicModel.hpp | 4 ++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/TopicModel/LDAModel.hpp b/src/TopicModel/LDAModel.hpp index 9f7a120..9519aa4 100644 --- a/src/TopicModel/LDAModel.hpp +++ b/src/TopicModel/LDAModel.hpp @@ -276,6 +276,20 @@ namespace tomoto for (auto& r : res) r.get(); } + template + size_t estimateMaxThreads() const + { + if (_ps == ParallelScheme::partition) + { + return this->realV / 4; + } + if (_ps == ParallelScheme::copy_merge) + { + return this->docs.size() / 2; + } + return (size_t)-1; + } + template void trainOne(ThreadPool& pool, _ModelState* localData, RandGen* rgs) { @@ -549,9 +563,11 @@ namespace tomoto { generator = static_cast(this)->makeGeneratorForInit(nullptr); } - ThreadPool pool(numWorkers, numWorkers * 8); + if (_Together) { + numWorkers = std::min(numWorkers, this->maxThreads[(size_t)_ps]); + ThreadPool pool{ numWorkers }; // temporary state variable RandGen rgc{}; auto tmpState = this->globalState, tState = this->globalState; @@ -580,6 +596,7 @@ namespace tomoto } else if (m_flags & flags::shared_state) { + ThreadPool pool{ numWorkers }; std::vector ret; const double gllRest = static_cast(this)->getLLRest(this->globalState); for (auto d = docFirst; d != docLast; ++d) @@ -601,6 +618,7 @@ namespace tomoto } else { + ThreadPool pool{ numWorkers, numWorkers * 8 }; std::vector> res; 
const double gllRest = static_cast(this)->getLLRest(this->globalState); for (auto d = docFirst; d != docLast; ++d) @@ -739,6 +757,7 @@ namespace tomoto for (auto& doc : this->docs) doc.updateSumWordWeight(this->realV); } static_cast(this)->prepareShared(); + BaseClass::prepare(initDocs, minWordCnt, removeTopN); } std::vector getCountByTopic() const override diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index edce651..7631880 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -110,6 +110,7 @@ namespace tomoto Dictionary dict; size_t realV = 0; // vocab size after removing stopwords size_t realN = 0; // total word size after removing stopwords + size_t maxThreads[(size_t)ParallelScheme::size] = { 0, }; std::unique_ptr cachedPool; @@ -266,6 +267,8 @@ namespace tomoto void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) override { + maxThreads[(size_t)ParallelScheme::copy_merge] = static_cast<_Derived*>(this)->template estimateMaxThreads(); + maxThreads[(size_t)ParallelScheme::partition] = static_cast<_Derived*>(this)->template estimateMaxThreads(); } static ParallelScheme getRealScheme(ParallelScheme ps) @@ -290,6 +293,7 @@ namespace tomoto { if (!numWorkers) numWorkers = std::thread::hardware_concurrency(); ps = getRealScheme(ps); + numWorkers = std::min(numWorkers, maxThreads[(size_t)ps]); if (numWorkers == 1 || (_Flags & flags::shared_state)) ps = ParallelScheme::none; if (!cachedPool || cachedPool->getNumWorkers() != numWorkers) { From 8dfc180b5e7a2f72b02cc334eee0d10327b83167 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 00:48:12 +0900 Subject: [PATCH 05/12] fix HLDA bugs --- src/TopicModel/TopicModel.hpp | 2 ++ test/unit_test.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index 7631880..2175709 100644 --- a/src/TopicModel/TopicModel.hpp +++ 
b/src/TopicModel/TopicModel.hpp @@ -267,6 +267,8 @@ namespace tomoto void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t removeTopN = 0) override { + maxThreads[(size_t)ParallelScheme::default_] = -1; + maxThreads[(size_t)ParallelScheme::none] = -1; maxThreads[(size_t)ParallelScheme::copy_merge] = static_cast<_Derived*>(this)->template estimateMaxThreads(); maxThreads[(size_t)ParallelScheme::partition] = static_cast<_Derived*>(this)->template estimateMaxThreads(); } diff --git a/test/unit_test.py b/test/unit_test.py index 0a15eca..ed961d5 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -15,7 +15,33 @@ (tp.SLDAModel, 'test/sample_with_md.txt', 1, lambda x:list(map(float, x)), {'k':10, 'vars':'b'}), ] -def train(cls, inputFile, mdFields, f, kargs): +def train1(cls, inputFile, mdFields, f, kargs): + print('Test train') + tw = 0 + print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(200, workers=1) + +def train4(cls, inputFile, mdFields, f, kargs): + print('Test train') + tw = 0 + print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) + mdl = cls(tw=tw, min_cf=2, rm_top=2, **kargs) + print('Adding docs...') + for n, line in enumerate(open(inputFile, encoding='utf-8')): + ch = line.strip().split() + if len(ch) < mdFields + 1: continue + if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) + else: mdl.add_doc(ch) + mdl.train(200, workers=4) + +def train0(cls, inputFile, mdFields, f, kargs): print('Test train') tw = 0 print('Initialize model %s with TW=%s ...' 
% (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -71,5 +97,5 @@ def infer(cls, inputFile, mdFields, f, kargs): for model_case in model_cases: - for func in [train, save_and_load, infer]: + for func in [train1, train4, train0, save_and_load, infer]: locals()['test_{}_{}'.format(model_case[0].__name__, func.__name__)] = (lambda f, mc: lambda: f(*mc))(func, model_case) From a4a05318334e1d7f05197d5fc4c67b3bf144369a Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 01:27:32 +0900 Subject: [PATCH 06/12] fix bugs --- src/TopicModel/TopicModel.hpp | 2 ++ test/unit_test.py | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index 2175709..246ee83 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -284,9 +284,11 @@ namespace tomoto case ParallelScheme::copy_merge: if ((_Flags & flags::shared_state)) THROW_ERROR_WITH_INFO(exception::InvalidArgument, std::string{ "This model doesn't provide ParallelScheme::" } + toString(ps)); + break; case ParallelScheme::partition: if (!(_Flags & flags::partitioned_multisampling)) THROW_ERROR_WITH_INFO(exception::InvalidArgument, std::string{ "This model doesn't provide ParallelScheme::" } +toString(ps)); + break; } return ps; } diff --git a/test/unit_test.py b/test/unit_test.py index ed961d5..0e2c40c 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -26,7 +26,7 @@ def train1(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200, workers=1) + mdl.train(200, workers=1, parallel=tp.ParallelScheme.COPY_MERGE) def train4(cls, inputFile, mdFields, f, kargs): print('Test train') @@ -39,7 +39,7 @@ def train4(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200, workers=4) + 
mdl.train(200, workers=4, parallel=tp.ParallelScheme.COPY_MERGE) def train0(cls, inputFile, mdFields, f, kargs): print('Test train') @@ -52,7 +52,7 @@ def train0(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200) + mdl.train(200, parallel=tp.ParallelScheme.COPY_MERGE) def save_and_load(cls, inputFile, mdFields, f, kargs): print('Test save & load') @@ -65,10 +65,10 @@ def save_and_load(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(20) + mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) mdl.save('test.model.{}.bin'.format(cls.__name__)) mdl = cls.load('test.model.{}.bin'.format(cls.__name__)) - mdl.train(20) + mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) def infer(cls, inputFile, mdFields, f, kargs): print('Test infer') @@ -86,14 +86,14 @@ def infer(cls, inputFile, mdFields, f, kargs): mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(20) + mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) for n, line in enumerate(unseen_docs): if mdFields: unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields])) else: unseen_docs[n] = mdl.make_doc(ch) - mdl.infer(unseen_docs) + mdl.infer(unseen_docs, parallel=tp.ParallelScheme.COPY_MERGE) for model_case in model_cases: From c36bb12119045d34e41e127578f25821126f0c32 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 02:18:21 +0900 Subject: [PATCH 07/12] fix stack corruption bug --- src/python/py_LDA.cpp | 12 ++++------ test/unit_test.py | 55 +++++++++++++++++++++++-------------------- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 8aeeed5..1b36a2a 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -93,8 +93,7 @@ static PyObject* 
LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - size_t iteration = 10, workers = 0; - tomoto::ParallelScheme ps = tomoto::ParallelScheme::default_; + size_t iteration = 10, workers = 0, ps = 0; static const char* kwlist[] = { "iter", "workers", "parallel", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|nnn", (char**)kwlist, &iteration, &workers, &ps)) return nullptr; try @@ -106,7 +105,7 @@ static PyObject* LDA_train(TopicModelObject* self, PyObject* args, PyObject *kwa inst->prepare(true, self->minWordCnt, self->removeTopWord); self->isPrepared = true; } - inst->train(iteration, workers, ps); + inst->train(iteration, workers, (tomoto::ParallelScheme)ps); Py_INCREF(Py_None); return Py_None; } @@ -180,8 +179,7 @@ static PyObject* LDA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) { PyObject *argDoc, *iter = nullptr, *item; - size_t iteration = 100, workers = 0, together = 0; - tomoto::ParallelScheme ps = tomoto::ParallelScheme::default_; + size_t iteration = 100, workers = 0, together = 0, ps = 0; float tolerance = -1; static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnnp", (char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &ps, &together)) return nullptr; @@ -207,7 +205,7 @@ static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa self->inst->prepare(true, self->minWordCnt, self->removeTopWord); self->isPrepared = true; } - auto ll = self->inst->infer(docs, iteration, tolerance, workers, ps, !!together); + auto ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together); PyObject* ret = PyList_New(docs.size()); size_t i = 0; for (auto d : docs) @@ -238,7 +236,7 @@ static 
PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa { std::vector docs; docs.emplace_back((tomoto::DocumentBase*)doc->doc); - float ll = self->inst->infer(docs, iteration, tolerance, workers, ps, !!together)[0]; + float ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0]; return Py_BuildValue("(Nf)", py::buildPyValue(self->inst->getTopicsByDoc(doc->doc)), ll); } else diff --git a/test/unit_test.py b/test/unit_test.py index 0e2c40c..b03c822 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -1,21 +1,21 @@ import tomotopy as tp model_cases = [ - (tp.LDAModel, 'test/sample.txt', 0, None, {'k':10}), - (tp.LLDAModel, 'test/sample_with_md.txt', 0, None, {'k':5}), - (tp.PLDAModel, 'test/sample_with_md.txt', 0, None, {'latent_topics':2, 'topics_per_label':2}), - (tp.PLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'latent_topics':2, 'topics_per_label':2}), - (tp.HLDAModel, 'test/sample.txt', 0, None, {'depth':3}), - (tp.CTModel, 'test/sample.txt', 0, None, {'k':10}), - (tp.HDPModel, 'test/sample.txt', 0, None, {'initial_k':10}), - (tp.MGLDAModel, 'test/sample.txt', 0, None, {'k_g':5, 'k_l':5}), - (tp.PAModel, 'test/sample.txt', 0, None, {'k1':5, 'k2':10}), - (tp.HPAModel, 'test/sample.txt', 0, None, {'k1':5, 'k2':10}), - (tp.DMRModel, 'test/sample_with_md.txt', 1, lambda x:'_'.join(x), {'k':10}), - (tp.SLDAModel, 'test/sample_with_md.txt', 1, lambda x:list(map(float, x)), {'k':10, 'vars':'b'}), + (tp.LDAModel, 'test/sample.txt', 0, None, {'k':10}, None), + (tp.LLDAModel, 'test/sample_with_md.txt', 0, None, {'k':5}, None), + (tp.PLDAModel, 'test/sample_with_md.txt', 0, None, {'latent_topics':2, 'topics_per_label':2}, None), + (tp.PLDAModel, 'test/sample_with_md.txt', 1, lambda x:x, {'latent_topics':2, 'topics_per_label':2}, None), + (tp.HLDAModel, 'test/sample.txt', 0, None, {'depth':3}, [tp.ParallelScheme.NONE]), + (tp.CTModel, 'test/sample.txt', 0, None, {'k':10}, None), + (tp.HDPModel, 
'test/sample.txt', 0, None, {'initial_k':10}, [tp.ParallelScheme.COPY_MERGE]), + (tp.MGLDAModel, 'test/sample.txt', 0, None, {'k_g':5, 'k_l':5}, None), + (tp.PAModel, 'test/sample.txt', 0, None, {'k1':5, 'k2':10}, [tp.ParallelScheme.COPY_MERGE]), + (tp.HPAModel, 'test/sample.txt', 0, None, {'k1':5, 'k2':10}, [tp.ParallelScheme.COPY_MERGE]), + (tp.DMRModel, 'test/sample_with_md.txt', 1, lambda x:'_'.join(x), {'k':10}, None), + (tp.SLDAModel, 'test/sample_with_md.txt', 1, lambda x:list(map(float, x)), {'k':10, 'vars':'b'}, None), ] -def train1(cls, inputFile, mdFields, f, kargs): +def train1(cls, inputFile, mdFields, f, kargs, ps): print('Test train') tw = 0 print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -26,9 +26,9 @@ def train1(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200, workers=1, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(200, workers=1, parallel=ps) -def train4(cls, inputFile, mdFields, f, kargs): +def train4(cls, inputFile, mdFields, f, kargs, ps): print('Test train') tw = 0 print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -39,9 +39,9 @@ def train4(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200, workers=4, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(200, workers=4, parallel=ps) -def train0(cls, inputFile, mdFields, f, kargs): +def train0(cls, inputFile, mdFields, f, kargs, ps): print('Test train') tw = 0 print('Initialize model %s with TW=%s ...' 
% (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -52,9 +52,9 @@ def train0(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(200, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(200, parallel=ps) -def save_and_load(cls, inputFile, mdFields, f, kargs): +def save_and_load(cls, inputFile, mdFields, f, kargs, ps): print('Test save & load') tw = 0 print('Initialize model %s with TW=%s ...' % (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -65,12 +65,12 @@ def save_and_load(cls, inputFile, mdFields, f, kargs): if len(ch) < mdFields + 1: continue if mdFields: mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(20, parallel=ps) mdl.save('test.model.{}.bin'.format(cls.__name__)) mdl = cls.load('test.model.{}.bin'.format(cls.__name__)) - mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(20, parallel=ps) -def infer(cls, inputFile, mdFields, f, kargs): +def infer(cls, inputFile, mdFields, f, kargs, ps): print('Test infer') tw = 0 print('Initialize model %s with TW=%s ...' 
% (str(cls), ['one', 'idf', 'pmi'][tw])) @@ -86,16 +86,19 @@ def infer(cls, inputFile, mdFields, f, kargs): mdl.add_doc(ch[mdFields:], f(ch[:mdFields])) else: mdl.add_doc(ch) - mdl.train(20, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.train(20, parallel=ps) for n, line in enumerate(unseen_docs): if mdFields: unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields])) else: unseen_docs[n] = mdl.make_doc(ch) - mdl.infer(unseen_docs, parallel=tp.ParallelScheme.COPY_MERGE) + mdl.infer(unseen_docs, parallel=ps) for model_case in model_cases: - for func in [train1, train4, train0, save_and_load, infer]: - locals()['test_{}_{}'.format(model_case[0].__name__, func.__name__)] = (lambda f, mc: lambda: f(*mc))(func, model_case) + pss = model_case[5] + if not pss: pss = [tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION] + for ps in pss: + for func in [train1, train4, train0, save_and_load, infer]: + locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*mc, ps))(func, model_case[:-1], ps) From 7795eda73047dbd6a2631805f55f178a3ae3befb Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 02:44:30 +0900 Subject: [PATCH 08/12] fix for python 3.4 --- test/unit_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit_test.py b/test/unit_test.py index b03c822..5e664fd 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -101,4 +101,4 @@ def infer(cls, inputFile, mdFields, f, kargs, ps): if not pss: pss = [tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION] for ps in pss: for func in [train1, train4, train0, save_and_load, infer]: - locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*mc, ps))(func, model_case[:-1], ps) + locals()['test_{}_{}_{}'.format(model_case[0].__name__, func.__name__, ps.name)] = (lambda f, mc, ps: lambda: f(*(mc + (ps,))))(func, model_case[:-1], ps) From 
9514045918dc0ffa29e236e6aa37c4db37058012 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 17:43:22 +0900 Subject: [PATCH 09/12] test --- test/unit_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unit_test.py b/test/unit_test.py index 5e664fd..ef084e4 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -87,6 +87,7 @@ def infer(cls, inputFile, mdFields, f, kargs, ps): else: mdl.add_doc(ch) mdl.train(20, parallel=ps) + return for n, line in enumerate(unseen_docs): if mdFields: unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields])) From 208c2b021196aba7511e1eb2c749f0d245927241 Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 18:18:49 +0900 Subject: [PATCH 10/12] fix bugs --- src/TopicModel/HDPModel.hpp | 14 ++++++------ src/TopicModel/HLDA.h | 2 ++ src/TopicModel/HLDAModel.hpp | 12 ++++++++++ src/TopicModel/HPAModel.hpp | 10 ++++----- src/TopicModel/LDACVB0Model.hpp | 6 ++--- src/TopicModel/PAModel.hpp | 4 ++-- src/python/PyUtils.h | 40 ++++++++++++++++++++++++++++----- src/python/module.h | 6 ++--- src/python/py_DMR.cpp | 8 +++---- src/python/py_LDA.cpp | 16 ++++++------- src/python/py_LLDA.cpp | 14 ++++++------ src/python/py_MGLDA.cpp | 8 +++---- src/python/py_PLDA.cpp | 12 +++++----- src/python/py_SLDA.cpp | 29 ++++++++++++------------ 14 files changed, 111 insertions(+), 70 deletions(-) diff --git a/src/TopicModel/HDPModel.hpp b/src/TopicModel/HDPModel.hpp index e67b668..9d07415 100644 --- a/src/TopicModel/HDPModel.hpp +++ b/src/TopicModel/HDPModel.hpp @@ -140,7 +140,7 @@ namespace tomoto const size_t V = this->realV; const auto K = ld.numByTopic.size(); ld.topicLikelihood.resize(K + 1); - ld.topicLikelihood.head(K) = ld.zLikelihood.array().template cast() * ld.numTableByTopic.array().template cast(); + ld.topicLikelihood.head(K) = ld.zLikelihood.head(K).array().template cast() * ld.numTableByTopic.array().template cast(); ld.topicLikelihood[K] = ld.zLikelihood[K] * gamma; 
sample::prefixSum(ld.topicLikelihood.data(), ld.topicLikelihood.size()); return &ld.topicLikelihood[0]; @@ -255,7 +255,7 @@ namespace tomoto void updateGlobalInfo(ThreadPool& pool, _ModelState* localData) { - std::vector> res(pool.getNumWorkers()); + std::vector> res; auto& K = this->K; K = 0; for (size_t i = 0; i < pool.getNumWorkers(); ++i) @@ -266,7 +266,7 @@ namespace tomoto // synchronize topic size of all documents for (size_t i = 0; i < pool.getNumWorkers(); ++i) { - res[i] = pool.enqueue([&, this](size_t threadId, size_t b, size_t e) + res.emplace_back(pool.enqueue([&, this](size_t threadId, size_t b, size_t e) { for (size_t j = b; j < e; ++j) { @@ -276,7 +276,7 @@ namespace tomoto doc.numByTopic.conservativeResize(K); doc.numByTopic.tail(K - oldSize).setZero(); } - }, this->docs.size() * i / pool.getNumWorkers(), this->docs.size() * (i + 1) / pool.getNumWorkers()); + }, this->docs.size() * i / pool.getNumWorkers(), this->docs.size() * (i + 1) / pool.getNumWorkers())); } for (auto& r : res) r.get(); } @@ -284,7 +284,7 @@ namespace tomoto template void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { - std::vector> res(pool.getNumWorkers()); + std::vector> res; const size_t V = this->realV; auto K = this->K; @@ -325,10 +325,10 @@ namespace tomoto for (size_t i = 0; i < pool.getNumWorkers(); ++i) { - res[i] = pool.enqueue([&, this, i](size_t threadId) + res.emplace_back(pool.enqueue([&, this, i](size_t threadId) { localData[i] = globalState; - }); + })); } for (auto& r : res) r.get(); } diff --git a/src/TopicModel/HLDA.h b/src/TopicModel/HLDA.h index b061a1f..a56fd3b 100644 --- a/src/TopicModel/HLDA.h +++ b/src/TopicModel/HLDA.h @@ -13,6 +13,8 @@ namespace tomoto // Zs indicates level in HLDAModel. 
std::vector path; + template void update(WeightType* ptr, const _TopicModel& mdl); + DEFINE_SERIALIZER_AFTER_BASE(DocumentLDA<_TW>, path); }; diff --git a/src/TopicModel/HLDAModel.hpp b/src/TopicModel/HLDAModel.hpp index 6d42fbe..1536759 100644 --- a/src/TopicModel/HLDAModel.hpp +++ b/src/TopicModel/HLDAModel.hpp @@ -648,4 +648,16 @@ namespace tomoto return ret; } }; + + template + template + inline void DocumentHLDA<_TW>::update(WeightType * ptr, const _TopicModel & mdl) + { + this->numByTopic.init(ptr, mdl.getLevelDepth()); + for (size_t i = 0; i < this->Zs.size(); ++i) + { + if (this->words[i] >= mdl.getV()) continue; + this->numByTopic[this->Zs[i]] += _TW != TermWeight::one ? this->wordWeights[i] : 1; + } + } } diff --git a/src/TopicModel/HPAModel.hpp b/src/TopicModel/HPAModel.hpp index bf879bd..b57cff2 100644 --- a/src/TopicModel/HPAModel.hpp +++ b/src/TopicModel/HPAModel.hpp @@ -66,7 +66,7 @@ namespace tomoto std::vector> res; for (size_t k = 0; k < K; ++k) { - pool.enqueue([&, k](size_t) + res.emplace_back(pool.enqueue([&, k](size_t) { for (size_t i = 0; i < iteration; ++i) { @@ -78,7 +78,7 @@ namespace tomoto } subAlphaSum[k] = subAlphas.row(k).sum(); } - }); + })); } for (auto& r : res) r.get(); } @@ -240,7 +240,7 @@ namespace tomoto template void mergeState(ThreadPool& pool, _ModelState& globalState, _ModelState& tState, _ModelState* localData, RandGen*) const { - std::vector> res(pool.getNumWorkers()); + std::vector> res; tState = globalState; globalState = localData[0]; @@ -269,10 +269,10 @@ namespace tomoto for (size_t i = 0; i < pool.getNumWorkers(); ++i) { - res[i] = pool.enqueue([&, this, i](size_t threadId) + res.emplace_back(pool.enqueue([&, this, i](size_t threadId) { localData[i] = globalState; - }); + })); } for (auto& r : res) r.get(); } diff --git a/src/TopicModel/LDACVB0Model.hpp b/src/TopicModel/LDACVB0Model.hpp index 013dfc3..de841d6 100644 --- a/src/TopicModel/LDACVB0Model.hpp +++ b/src/TopicModel/LDACVB0Model.hpp @@ -182,7 +182,7 @@ 
namespace tomoto void updateGlobalInfo(ThreadPool& pool, _ModelState* localData) { - std::vector> res(pool.getNumWorkers()); + std::vector> res; this->globalState.numByTopic.setZero(); this->globalState.numByTopicWord.setZero(); @@ -198,10 +198,10 @@ namespace tomoto for (size_t i = 0; i < pool.getNumWorkers(); ++i) { - res[i] = pool.enqueue([&, i](size_t threadId) + res.emplace_back(pool.enqueue([&, i](size_t threadId) { localData[i] = this->globalState; - }); + })); } for (auto& r : res) r.get(); } diff --git a/src/TopicModel/PAModel.hpp b/src/TopicModel/PAModel.hpp index 888be39..12a56db 100644 --- a/src/TopicModel/PAModel.hpp +++ b/src/TopicModel/PAModel.hpp @@ -49,7 +49,7 @@ namespace tomoto std::vector> res; for (size_t k = 0; k < K; ++k) { - pool.enqueue([&, k](size_t) + res.emplace_back(pool.enqueue([&, k](size_t) { for (size_t i = 0; i < iteration; ++i) { @@ -61,7 +61,7 @@ namespace tomoto } subAlphaSum[k] = subAlphas.row(k).sum(); } - }); + })); } for (auto& r : res) r.get(); } diff --git a/src/python/PyUtils.h b/src/python/PyUtils.h index 82db48b..079bca4 100644 --- a/src/python/PyUtils.h +++ b/src/python/PyUtils.h @@ -14,14 +14,43 @@ namespace py { using namespace std; - struct AutoReleaser + struct UniqueObj { - PyObject*& obj; - AutoReleaser(PyObject*& _obj) : obj(_obj) {} - ~AutoReleaser() + PyObject* obj; + UniqueObj(PyObject* _obj = nullptr) : obj(_obj) {} + ~UniqueObj() { Py_XDECREF(obj); } + + UniqueObj(const UniqueObj&) = delete; + UniqueObj& operator=(const UniqueObj&) = delete; + + UniqueObj(UniqueObj&& o) + { + std::swap(obj, o.obj); + } + + UniqueObj& operator=(UniqueObj&& o) + { + std::swap(obj, o.obj); + return *this; + } + + PyObject* get() const + { + return obj; + } + + operator bool() const + { + return !!obj; + } + + operator PyObject*() const + { + return obj; + } }; template @@ -68,11 +97,10 @@ namespace py template inline vector makeIterToVector(PyObject *iter) { - PyObject* item; + UniqueObj item; vector v; while ((item = 
PyIter_Next(iter))) { - AutoReleaser ar{ item }; v.emplace_back(makeObjectToCType(item)); } if (PyErr_Occurred()) diff --git a/src/python/module.h b/src/python/module.h index 51c6eec..068b81c 100644 --- a/src/python/module.h +++ b/src/python/module.h @@ -2,10 +2,10 @@ #include #ifdef _DEBUG -#undef _DEBUG -#define DEBUG_LOG(t) do{ cerr << t << endl; }while(0); +//#undef _DEBUG +#define DEBUG_LOG(t) do{ cerr << t << endl; }while(0) #include "PyUtils.h" -#define _DEBUG +//#define _DEBUG #else #define DEBUG_LOG(t) #include "PyUtils.h" diff --git a/src/python/py_DMR.cpp b/src/python/py_DMR.cpp index 4e45597..d890a44 100644 --- a/src/python/py_DMR.cpp +++ b/src/python/py_DMR.cpp @@ -32,7 +32,7 @@ static int DMR_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* DMR_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords; const char* metadata = ""; static const char* kwlist[] = { "words", "metadata", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &metadata)) return nullptr; @@ -42,11 +42,11 @@ static PyObject* DMR_addDoc(TopicModelObject* self, PyObject* args, PyObject *kw if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." 
}; } - py::AutoReleaser arIter{ iter }; auto ret = inst->addDoc(py::makeIterToVector(iter), { string{metadata} }); return py::buildPyValue(ret); } @@ -64,7 +64,7 @@ static PyObject* DMR_addDoc(TopicModelObject* self, PyObject* args, PyObject *kw static PyObject* DMR_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords; const char* metadata = ""; static const char* kwlist[] = { "words", "metadata", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &metadata)) return nullptr; @@ -73,11 +73,11 @@ static PyObject* DMR_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." 
}; } - py::AutoReleaser arIter{ iter }; auto ret = inst->makeDoc(py::makeIterToVector(iter), { string{metadata} }); return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); } diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 1b36a2a..0678b8c 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -34,7 +34,7 @@ static int LDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* LDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords; static const char* kwlist[] = { "words", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &argWords)) return nullptr; try @@ -43,11 +43,11 @@ static PyObject* LDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kw if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." 
}; } - py::AutoReleaser arIter{ iter }; auto ret = inst->addDoc(py::makeIterToVector(iter)); return py::buildPyValue(ret); } @@ -64,7 +64,7 @@ static PyObject* LDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kw static PyObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords = nullptr; static const char* kwlist[] = { "words", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O", (char**)kwlist, &argWords)) return nullptr; try @@ -72,11 +72,11 @@ static PyObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; auto ret = inst->makeDoc(py::makeIterToVector(iter)); return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); } @@ -178,7 +178,7 @@ static PyObject* LDA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argDoc, *iter = nullptr, *item; + PyObject *argDoc; size_t iteration = 100, workers = 0, together = 0, ps = 0; float tolerance = -1; static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", nullptr }; @@ -187,15 +187,15 @@ static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwa try { if (!self->inst) throw runtime_error{ "inst is null" }; + py::UniqueObj iter; if ((iter = PyObject_GetIter(argDoc)) != nullptr) { - py::AutoReleaser arIter{ iter }; std::vector docs; + py::UniqueObj item; while ((item = PyIter_Next(iter))) { - py::AutoReleaser arItem{ item }; if (Py_TYPE(item) != 
&Document_type) throw runtime_error{ "'doc' must be tomotopy.Document type or list of tomotopy.Document" }; - auto* doc = (DocumentObject*)item; + auto* doc = (DocumentObject*)item.get(); if (doc->parentModel != self) throw runtime_error{ "'doc' was from another model, not fit to this model" }; docs.emplace_back((tomoto::DocumentBase*)doc->doc); } diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index 769d119..dd79a4b 100644 --- a/src/python/py_LLDA.cpp +++ b/src/python/py_LLDA.cpp @@ -32,7 +32,7 @@ static int LLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *argLabels = nullptr, *iter = nullptr, *iter2 = nullptr; + PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; try @@ -41,20 +41,20 @@ static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; vector labels; if(argLabels) { + py::UniqueObj iter2; if (PyUnicode_Check(argLabels)) PRINT_WARN("[warn] 'labels' should be an iterable of str."); - if (!(iter2 = PyObject_GetIter(argLabels))) + if (!(iter2 = PyObject_GetIter(argLabels))) { throw runtime_error{ "'labels' must be an iterable of str."
}; } - py::AutoReleaser arIter2{ iter2 }; labels = py::makeIterToVector(iter2); } auto ret = inst->addDoc(py::makeIterToVector(iter), labels); @@ -73,7 +73,7 @@ static PyObject* LLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k static PyObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *argLabels = nullptr, *iter = nullptr, *iter2 = nullptr; + PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; try @@ -81,20 +81,20 @@ static PyObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; vector labels; if (argLabels) { + py::UniqueObj iter2; if (PyUnicode_Check(argLabels)) PRINT_WARN("[warn] 'labels' should be an iterable of str."); if (!(iter2 = PyObject_GetIter(argLabels))) { throw runtime_error{ "'labels' must be an iterable of str." 
}; } - py::AutoReleaser arIter2{ iter2 }; labels = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(py::makeIterToVector(iter), labels); diff --git a/src/python/py_MGLDA.cpp b/src/python/py_MGLDA.cpp index 05bd0d9..ccc6e51 100644 --- a/src/python/py_MGLDA.cpp +++ b/src/python/py_MGLDA.cpp @@ -35,7 +35,7 @@ static int MGLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* MGLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords; const char* delimiter = "."; static const char* kwlist[] = { "words", "delimiter", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &delimiter)) return nullptr; @@ -45,11 +45,11 @@ static PyObject* MGLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject * if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." 
}; } - py::AutoReleaser arIter{ iter }; auto ret = inst->addDoc(py::makeIterToVector(iter), delimiter); return py::buildPyValue(ret); } @@ -66,7 +66,7 @@ static PyObject* MGLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject * static PyObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr; + PyObject *argWords; const char* delimiter = "."; static const char* kwlist[] = { "words", "delimiter", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s", (char**)kwlist, &argWords, &delimiter)) return nullptr; @@ -75,11 +75,11 @@ static PyObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; auto ret = inst->makeDoc(py::makeIterToVector(iter), delimiter); return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); } diff --git a/src/python/py_PLDA.cpp b/src/python/py_PLDA.cpp index c476636..16ce330 100644 --- a/src/python/py_PLDA.cpp +++ b/src/python/py_PLDA.cpp @@ -33,7 +33,7 @@ static int PLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* PLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *argLabels = nullptr, *iter = nullptr, *iter2 = nullptr; + PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; try @@ -42,20 +42,20 @@ static PyObject* PLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k if (self->isPrepared) throw runtime_error{ "cannot 
add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; vector labels; if(argLabels) { if (PyUnicode_Check(argLabels)) PRINT_WARN("[warn] 'labels' should be an iterable of str."); + py::UniqueObj iter2; if (!(iter2 = PyObject_GetIter(argLabels))) { throw runtime_error{ "'labels' must be an iterable of str." }; } - py::AutoReleaser arIter2{ iter2 }; labels = py::makeIterToVector(iter2); } auto ret = inst->addDoc(py::makeIterToVector(iter), labels); @@ -74,7 +74,7 @@ static PyObject* PLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k static PyObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *argLabels = nullptr, *iter = nullptr, *iter2 = nullptr; + PyObject *argWords, *argLabels = nullptr; static const char* kwlist[] = { "words", "labels", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argLabels)) return nullptr; try @@ -82,20 +82,20 @@ static PyObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; vector labels; if (argLabels) { if (PyUnicode_Check(argLabels)) PRINT_WARN("[warn] 'labels' should be an iterable of str."); + py::UniqueObj iter2; if (!(iter2 = PyObject_GetIter(argLabels))) { throw runtime_error{ "'labels' must be an iterable of str." 
}; } - py::AutoReleaser arIter2{ iter2 }; labels = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(py::makeIterToVector(iter), labels); diff --git a/src/python/py_SLDA.cpp b/src/python/py_SLDA.cpp index 9f4a300..bdde730 100644 --- a/src/python/py_SLDA.cpp +++ b/src/python/py_SLDA.cpp @@ -20,12 +20,11 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) &mu, &nuSq, &glmCoef, &seed)) return -1; try { - PyObject* iter; vector varTypes; if (vars) { + py::UniqueObj iter; if (!(iter = PyObject_GetIter(vars))) throw runtime_error{ "'vars' must be an iterable." }; - py::AutoReleaser ar{ iter }; auto vs = py::makeIterToVector(iter); for (auto& s : vs) { @@ -44,9 +43,9 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if ((fTemp = PyFloat_AsDouble(mu)) == -1 && PyErr_Occurred()) { PyErr_Clear(); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(mu))) throw runtime_error{ "'mu' must be float or iterable of float." }; - py::AutoReleaser ar{ iter }; vmu = py::makeIterToVector(iter); } else @@ -60,9 +59,9 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if ((fTemp = PyFloat_AsDouble(nuSq)) == -1 && PyErr_Occurred()) { PyErr_Clear(); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(nuSq))) throw runtime_error{ "'nu_sq' must be float or iterable of float." }; - py::AutoReleaser ar{ iter }; vnuSq = py::makeIterToVector(iter); } else @@ -76,9 +75,9 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) if ((fTemp = PyFloat_AsDouble(glmCoef)) == -1 && PyErr_Occurred()) { PyErr_Clear(); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(glmCoef))) throw runtime_error{ "'glm_param' must be float or iterable of float." 
}; - py::AutoReleaser ar{ iter }; vglmCoef = py::makeIterToVector(iter); } else @@ -107,7 +106,7 @@ static int SLDA_init(TopicModelObject *self, PyObject *args, PyObject *kwargs) static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr, *argY = nullptr; + PyObject *argWords, *argY = nullptr; static const char* kwlist[] = { "words", "y", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argY)) return nullptr; try @@ -116,18 +115,18 @@ static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k if (self->isPrepared) throw runtime_error{ "cannot add_doc() after train()" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "'words' must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; auto words = py::makeIterToVector(iter); vector ys; if (argY) { - if (!(iter = PyObject_GetIter(argY))) throw runtime_error{ "'y' must be an iterable of float." }; - py::AutoReleaser arIter{ iter }; - ys = py::makeIterToVector(iter); + py::UniqueObj iter2; + if (!(iter2 = PyObject_GetIter(argY))) throw runtime_error{ "'y' must be an iterable of float." 
}; + ys = py::makeIterToVector(iter2); } auto ret = inst->addDoc(words, ys); return py::buildPyValue(ret); @@ -145,7 +144,7 @@ static PyObject* SLDA_addDoc(TopicModelObject* self, PyObject* args, PyObject *k static PyObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *kwargs) { - PyObject *argWords, *iter = nullptr, *argY = nullptr; + PyObject *argWords, *argY = nullptr; static const char* kwlist[] = { "words", "y", nullptr }; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|O", (char**)kwlist, &argWords, &argY)) return nullptr; try @@ -153,18 +152,18 @@ static PyObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * if (!self->inst) throw runtime_error{ "inst is null" }; auto* inst = static_cast(self->inst); if (PyUnicode_Check(argWords)) PRINT_WARN("[warn] 'words' should be an iterable of str."); + py::UniqueObj iter; if (!(iter = PyObject_GetIter(argWords))) { throw runtime_error{ "words must be an iterable of str." }; } - py::AutoReleaser arIter{ iter }; auto words = py::makeIterToVector(iter); vector ys; if (argY) { - if (!(iter = PyObject_GetIter(argY))) throw runtime_error{ "'y' must be an iterable of float." }; - py::AutoReleaser arIter{ iter }; - ys = py::makeIterToVector(iter); + py::UniqueObj iter2; + if (!(iter2 = PyObject_GetIter(argY))) throw runtime_error{ "'y' must be an iterable of float." 
}; + ys = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(words, ys); return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); From 31f19d22ec0591f6d0e3fdfe23f6a648fe11293e Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 19:11:23 +0900 Subject: [PATCH 11/12] fix wrong python function call --- .gitignore | 1 + src/python/module.h | 3 ++- src/python/py_DMR.cpp | 8 +++++--- src/python/py_LDA.cpp | 9 ++++++--- src/python/py_LLDA.cpp | 8 +++++--- src/python/py_MGLDA.cpp | 3 ++- src/python/py_PLDA.cpp | 8 +++++--- src/python/py_SLDA.cpp | 3 ++- src/python/py_main.cpp | 7 ++++--- test/unit_test.py | 1 - 10 files changed, 32 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index dd6601b..6be3bde 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ /tomotopy.egg-info build_windows.bat *.bin +/venv/ \ No newline at end of file diff --git a/src/python/module.h b/src/python/module.h index 068b81c..289b5ec 100644 --- a/src/python/module.h +++ b/src/python/module.h @@ -78,7 +78,8 @@ PyObject* PREFIX##_load(PyObject*, PyObject* args, PyObject *kwargs)\ for (size_t i = 0; i < (size_t)tomoto::TermWeight::size; ++i)\ {\ str.seekg(0);\ - auto* p = PyObject_CallObject((PyObject*)&TYPE, Py_BuildValue("(n)", i));\ + py::UniqueObj args = Py_BuildValue("(n)", i);\ + auto* p = PyObject_CallObject((PyObject*)&TYPE, args);\ try\ {\ ((TopicModelObject*)p)->inst->loadModel(str);\ diff --git a/src/python/py_DMR.cpp b/src/python/py_DMR.cpp index d890a44..ee6bd18 100644 --- a/src/python/py_DMR.cpp +++ b/src/python/py_DMR.cpp @@ -79,7 +79,8 @@ static PyObject* DMR_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k throw runtime_error{ "words must be an iterable of str." 
}; } auto ret = inst->makeDoc(py::makeIterToVector(iter), { string{metadata} }); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { @@ -97,8 +98,9 @@ static PyObject* DMR_getMetadataDict(TopicModelObject* self, void* closure) try { if (!self->inst) throw runtime_error{ "inst is null" }; - return PyObject_CallObject((PyObject*)&Dictionary_type, Py_BuildValue("(Nn)", self, - &static_cast(self->inst)->getMetadataDict())); + py::UniqueObj args = Py_BuildValue("(On)", self, + &static_cast(self->inst)->getMetadataDict()); + return PyObject_CallObject((PyObject*)&Dictionary_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 0678b8c..52d8be2 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -78,7 +78,8 @@ static PyObject* LDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject *k throw runtime_error{ "words must be an iterable of str." 
}; } auto ret = inst->makeDoc(py::makeIterToVector(iter)); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { @@ -287,7 +288,8 @@ static PyObject* LDA_getDocs(TopicModelObject* self, void* closure) try { if (!self->inst) throw runtime_error{ "inst is null" }; - return PyObject_CallObject((PyObject*)&Corpus_type, Py_BuildValue("(O)", self)); + py::UniqueObj args = Py_BuildValue("(O)", self); + return PyObject_CallObject((PyObject*)&Corpus_type, args); } catch (const bad_exception&) { @@ -305,7 +307,8 @@ static PyObject* LDA_getVocabs(TopicModelObject* self, void* closure) try { if (!self->inst) throw runtime_error{ "inst is null" }; - return PyObject_CallObject((PyObject*)&Dictionary_type, Py_BuildValue("(Nn)", self, &self->inst->getVocabDict())); + py::UniqueObj args = Py_BuildValue("(On)", self, &self->inst->getVocabDict()); + return PyObject_CallObject((PyObject*)&Dictionary_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_LLDA.cpp b/src/python/py_LLDA.cpp index dd79a4b..b299cae 100644 --- a/src/python/py_LLDA.cpp +++ b/src/python/py_LLDA.cpp @@ -98,7 +98,8 @@ static PyObject* LLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * labels = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(py::makeIterToVector(iter), labels); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { @@ -116,8 +117,9 @@ static PyObject* LLDA_getTopicLabelDict(TopicModelObject* self, void* closure) try { if (!self->inst) throw runtime_error{ "inst is null" }; - return PyObject_CallObject((PyObject*)&Dictionary_type, 
Py_BuildValue("(Nn)", self, - &static_cast(self->inst)->getTopicLabelDict())); + py::UniqueObj args = Py_BuildValue("(On)", self, + &static_cast(self->inst)->getTopicLabelDict()); + return PyObject_CallObject((PyObject*)&Dictionary_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_MGLDA.cpp b/src/python/py_MGLDA.cpp index ccc6e51..b5ecd1e 100644 --- a/src/python/py_MGLDA.cpp +++ b/src/python/py_MGLDA.cpp @@ -81,7 +81,8 @@ static PyObject* MGLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject throw runtime_error{ "words must be an iterable of str." }; } auto ret = inst->makeDoc(py::makeIterToVector(iter), delimiter); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_PLDA.cpp b/src/python/py_PLDA.cpp index 16ce330..56bb8d2 100644 --- a/src/python/py_PLDA.cpp +++ b/src/python/py_PLDA.cpp @@ -99,7 +99,8 @@ static PyObject* PLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * labels = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(py::makeIterToVector(iter), labels); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { @@ -117,8 +118,9 @@ static PyObject* PLDA_getTopicLabelDict(TopicModelObject* self, void* closure) try { if (!self->inst) throw runtime_error{ "inst is null" }; - return PyObject_CallObject((PyObject*)&Dictionary_type, Py_BuildValue("(Nn)", self, - &static_cast(self->inst)->getTopicLabelDict())); + py::UniqueObj args = Py_BuildValue("(On)", self, + &static_cast(self->inst)->getTopicLabelDict()); + return 
PyObject_CallObject((PyObject*)&Dictionary_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_SLDA.cpp b/src/python/py_SLDA.cpp index bdde730..cf9454b 100644 --- a/src/python/py_SLDA.cpp +++ b/src/python/py_SLDA.cpp @@ -166,7 +166,8 @@ static PyObject* SLDA_makeDoc(TopicModelObject* self, PyObject* args, PyObject * ys = py::makeIterToVector(iter2); } auto ret = inst->makeDoc(words, ys); - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self, ret.release(), 1)); + py::UniqueObj args = Py_BuildValue("(Onn)", self, ret.release(), 1); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { diff --git a/src/python/py_main.cpp b/src/python/py_main.cpp index 9274bcb..836ce56 100644 --- a/src/python/py_main.cpp +++ b/src/python/py_main.cpp @@ -106,9 +106,9 @@ PyObject* DictionaryObject::getitem(DictionaryObject* self, Py_ssize_t key) PyObject* DictionaryObject::repr(DictionaryObject* self) { - PyObject* l = PyObject_CallObject((PyObject*)&PyList_Type, Py_BuildValue("(N)", self)); + py::UniqueObj args = Py_BuildValue("(O)", self); + py::UniqueObj l = PyObject_CallObject((PyObject*)&PyList_Type, args); PyObject* r = PyObject_Repr(l); - Py_XDECREF(l); return r; } @@ -445,7 +445,8 @@ static PyObject* Corpus_getitem(CorpusObject* self, Py_ssize_t key) PyErr_SetString(PyExc_IndexError, ""); throw bad_exception{}; } - return PyObject_CallObject((PyObject*)&Document_type, Py_BuildValue("(Nnn)", self->parentModel, key, 0)); + py::UniqueObj args = Py_BuildValue("(Onn)", self->parentModel, key, 0); + return PyObject_CallObject((PyObject*)&Document_type, args); } catch (const bad_exception&) { diff --git a/test/unit_test.py b/test/unit_test.py index ef084e4..5e664fd 100644 --- a/test/unit_test.py +++ b/test/unit_test.py @@ -87,7 +87,6 @@ def infer(cls, inputFile, mdFields, f, kargs, ps): else: mdl.add_doc(ch) mdl.train(20, parallel=ps) - return for n, line in enumerate(unseen_docs): if mdFields: 
unseen_docs[n] = mdl.make_doc(ch[mdFields:], f(ch[:mdFields])) From 03d8677dccab595d59c05b76e871bd212a3bb68d Mon Sep 17 00:00:00 2001 From: Minchul Lee Date: Sun, 29 Dec 2019 23:19:40 +0900 Subject: [PATCH 12/12] change behavior of infer at PAM & update documentations --- README.kr.rst | 9 +++- README.rst | 9 +++- src/TopicModel/TopicModel.hpp | 18 +++++++- src/Utils/Dictionary.h | 4 +- src/Utils/Utils.hpp | 2 +- src/python/docs.h | 56 ++++++++++++++++++++++- src/python/py_HPA.cpp | 2 + src/python/py_LDA.cpp | 2 +- src/python/py_PA.cpp | 86 +++++++++++++++++++++++++++++++++++ tomotopy/documentation.kr.rst | 9 +++- tomotopy/documentation.rst | 9 +++- 11 files changed, 195 insertions(+), 11 deletions(-) diff --git a/README.kr.rst b/README.kr.rst index bc4cea3..858840c 100644 --- a/README.kr.rst +++ b/README.kr.rst @@ -30,7 +30,7 @@ tomotopy 란? 더 자세한 정보는 https://bab2min.github.io/tomotopy/index.kr.html 에서 확인하시길 바랍니다. -tomotopy의 가장 최신버전은 0.4.2 입니다. +tomotopy의 가장 최신버전은 0.5.0 입니다. 시작하기 --------------- @@ -197,6 +197,13 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma 역사 ------- +* 0.5.0 (2019-12-30) + * `tomotopy.PAModel.infer`가 topic distribution과 sub-topic distribution을 동시에 반환합니다. + * `tomotopy.Document`에 get_sub_topics, get_sub_topic_dist 메소드가 추가되었습니다. (PAModel 전용) + * `tomotopy.LDAModel.train` 및 `tomotopy.LDAModel.infer` 메소드에 parallel 옵션이 추가되었습니다. 이를 통해 학습 및 추론시 사용할 병렬화 알고리즘을 선택할 수 있습니다. + * `tomotopy.ParallelScheme.PARTITION` 알고리즘이 추가되었습니다. 이 알고리즘은 작업자 수가 많거나 토픽의 개수나 어휘 크기가 클 때도 효율적으로 작동합니다. + * 모델 생성시 min_cf < 2일때 rm_top 옵션이 적용되지 않는 문제를 수정하였습니다. + * 0.4.2 (2019-11-30) * `tomotopy.LLDAModel`와 `tomotopy.PLDAModel` 모델에서 토픽 할당이 잘못 일어나던 문제를 해결했습니다. * `tomotopy.Document` 및 `tomotopy.Dictionary` 클래스에 가독성이 좋은 __repr__가 추가되었습니다. 
diff --git a/README.rst b/README.rst index bfb4cd6..3d4b0a6 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,7 @@ The current version of `tomoto` supports several major topic models including Please visit https://bab2min.github.io/tomotopy to see more information. -The most recent version of tomotopy is 0.4.2. +The most recent version of tomotopy is 0.5.0. Getting Started --------------- @@ -202,6 +202,13 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh History ------- +* 0.5.0 (2019-12-30) + * Now `tomotopy.PAModel.infer` returns both topic distribution and sub-topic distribution. + * New methods get_sub_topics and get_sub_topic_dist were added into `tomotopy.Document`. (for PAModel) + * New parameter `parallel` was added for `tomotopy.LDAModel.train` and `tomotopy.LDAModel.infer` method. You can select parallelism algorithm by changing this parameter. + * `tomotopy.ParallelScheme.PARTITION`, a new algorithm, was added. It works efficiently when the number of workers is large, the number of topics or the size of vocabulary is big. + * A bug where `rm_top` didn't work at `min_cf` < 2 was fixed. + * 0.4.2 (2019-11-30) * Wrong topic assignments of `tomotopy.LLDAModel` and `tomotopy.PLDAModel` were fixed. * Readable __repr__ of `tomotopy.Document` and `tomotopy.Dictionary` was implemented.
diff --git a/src/TopicModel/TopicModel.hpp b/src/TopicModel/TopicModel.hpp index 246ee83..4eba47a 100644 --- a/src/TopicModel/TopicModel.hpp +++ b/src/TopicModel/TopicModel.hpp @@ -9,6 +9,22 @@ namespace tomoto { +#if _WIN32 || _WIN64 +#if _WIN64 + typedef std::mt19937_64 RandGen; +#else + typedef std::mt19937 RandGen; +#endif +#endif + +#if __GNUC__ +#if __x86_64__ || __ppc64__ + typedef std::mt19937_64 RandGen; +#else + typedef std::mt19937 RandGen; +#endif +#endif + class DocumentBase { public: @@ -476,4 +492,4 @@ namespace tomoto } }; -} \ No newline at end of file +} diff --git a/src/Utils/Dictionary.h b/src/Utils/Dictionary.h index 5027d94..10b5fce 100644 --- a/src/Utils/Dictionary.h +++ b/src/Utils/Dictionary.h @@ -14,8 +14,6 @@ namespace tomoto typedef uint16_t TID; typedef float FLOAT; - typedef std::mt19937_64 RandGen; - class Dictionary { protected: @@ -79,4 +77,4 @@ namespace tomoto } }; -} \ No newline at end of file +} diff --git a/src/Utils/Utils.hpp b/src/Utils/Utils.hpp index e9e9ff9..c779954 100644 --- a/src/Utils/Utils.hpp +++ b/src/Utils/Utils.hpp @@ -218,4 +218,4 @@ namespace tomoto { return { iter, f }; } -} \ No newline at end of file +} diff --git a/src/python/docs.h b/src/python/docs.h index d426b96..a90d773 100644 --- a/src/python/docs.h +++ b/src/python/docs.h @@ -275,7 +275,7 @@ DOC_SIGNATURE_EN_KO(LDA_get_count_by_topics__doc__, u8R""(각각의 토픽에 할당된 단어의 개수를 `list`형태로 반환합니다.)""); DOC_SIGNATURE_EN_KO(LDA_infer__doc__, - "infer(self, doc, iter=100, tolerance=-1, workers=0, together=False)", + "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False)", u8R""(Return the inferred topic distribution from unseen `doc`s. 
The return type is (a topic distribution of `doc`, log likelihood) or (a `list` of topic distribution of `doc`, log likelihood) @@ -1008,6 +1008,60 @@ super_topic_id : int 상위 토픽을 가리키는 [0, `k1`) 범위의 정수 )""); +DOC_SIGNATURE_EN_KO(PA_infer__doc__, + "infer(self, doc, iter=100, tolerance=-1, workers=0, parallel=0, together=False)", + u8R""(.. versionadded:: 0.5.0 + +Return the inferred topic distribution and sub-topic distribution from unseen `doc`s. +The return type is ((a topic distribution of `doc`, a sub-topic distribution of `doc`), log likelihood) or (a `list` of (topic distribution of `doc`, sub-topic distribution of `doc`), log likelihood) + +Parameters +---------- +doc : tomotopy.Document or list of tomotopy.Document + an instance of `tomotopy.Document` or a `list` of instances of `tomotopy.Document` to be inferred by the model. + It can be acquired from `tomotopy.LDAModel.make_doc` method. +iter : int + an integer indicating the number of iterations to estimate the distribution of topics of `doc`. + The higher value will generate a more accurate result. +tolerance : float + isn't currently used. +workers : int + an integer indicating the number of workers to perform samplings. + If `workers` is 0, the number of cores in the system will be used. +parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + the parallelism scheme for inference. the default value is ParallelScheme.DEFAULT which means that tomotopy selects the best scheme by model. +together : bool + all `doc`s are inferred together in one process if True, otherwise each `doc` is inferred independently. Its default value is `False`. +)"", +u8R""(.. versionadded:: 0.5.0 + +새로운 문헌인 `doc`에 대해 각각의 주제 분포를 추론하여 반환합니다. +반환 타입은 ((`doc`의 주제 분포, `doc`의 하위 주제 분포), 로그가능도) 또는 ((`doc`의 주제 분포, `doc`의 하위 주제 분포)로 구성된 `list`, 로그가능도)입니다. + +Parameters +---------- +doc : tomotopy.Document or list of tomotopy.Document + 추론에 사용할 `tomotopy.Document`의 인스턴스이거나 이 인스턴스들의 `list`.
+ 이 인스턴스들은 `tomotopy.LDAModel.make_doc` 메소드를 통해 얻을 수 있습니다. +iter : int + `doc`의 주제 분포를 추론하기 위해 학습을 반복할 횟수입니다. + 이 값이 클 수록 더 정확한 결과를 낼 수 있습니다. +tolerance : float + 현재는 사용되지 않음 +workers : int + 깁스 샘플링을 수행하는 데에 사용할 스레드의 개수입니다. + 만약 이 값을 0으로 설정할 경우 시스템 내의 가용한 모든 코어가 사용됩니다. +parallel : int or tomotopy.ParallelScheme + .. versionadded:: 0.5.0 + + 추론에 사용할 병렬화 방법. 기본값은 ParallelScheme.DEFAULT로 이는 모델에 따라 최적의 방법을 tomotopy가 알아서 선택하도록 합니다. +together : bool + 이 값이 True인 경우 입력한 `doc` 문헌들을 한 번에 모델에 넣고 추론을 진행합니다. + False인 경우 각각의 문헌들을 별도로 모델에 넣어 추론합니다. 기본값은 `False`입니다. +)""); + DOC_VARIABLE_EN_KO(PA_k1__doc__, u8R""(k1, the number of super topics (read-only))"", u8R""(k1, 상위 토픽의 개수 (읽기전용))""); diff --git a/src/python/py_HPA.cpp b/src/python/py_HPA.cpp index 685dbf0..4087857 100644 --- a/src/python/py_HPA.cpp +++ b/src/python/py_HPA.cpp @@ -88,12 +88,14 @@ static PyObject* HPA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py } DEFINE_LOADER(HPA, HPA_type); +PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs); static PyMethodDef HPA_methods[] = { { "load", (PyCFunction)HPA_load, METH_STATIC | METH_VARARGS | METH_KEYWORDS, LDA_load__doc__ }, { "get_topic_words", (PyCFunction)HPA_getTopicWords, METH_VARARGS | METH_KEYWORDS, HPA_get_topic_words__doc__ }, { "get_topic_word_dist", (PyCFunction)HPA_getTopicWordDist, METH_VARARGS | METH_KEYWORDS, HPA_get_topic_word_dist__doc__ }, + { "infer", (PyCFunction)LDA_infer, METH_VARARGS | METH_KEYWORDS, LDA_infer__doc__ }, { nullptr } }; diff --git a/src/python/py_LDA.cpp b/src/python/py_LDA.cpp index 52d8be2..6c9d9ed 100644 --- a/src/python/py_LDA.cpp +++ b/src/python/py_LDA.cpp @@ -177,7 +177,7 @@ static PyObject* LDA_getTopicWordDist(TopicModelObject* self, PyObject* args, Py } } -static PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) +PyObject* LDA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) { PyObject *argDoc; size_t iteration = 100, workers = 0, together = 
0, ps = 0; diff --git a/src/python/py_PA.cpp b/src/python/py_PA.cpp index 7ca7adc..fb30a70 100644 --- a/src/python/py_PA.cpp +++ b/src/python/py_PA.cpp @@ -188,6 +188,91 @@ PyObject* Document_getSubTopicDist(DocumentObject* self) } } +static PyObject* PA_infer(TopicModelObject* self, PyObject* args, PyObject *kwargs) +{ + PyObject *argDoc; + size_t iteration = 100, workers = 0, together = 0, ps = 0; + float tolerance = -1; + static const char* kwlist[] = { "doc", "iter", "tolerance", "workers", "parallel", "together", nullptr }; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nfnnp", (char**)kwlist, &argDoc, &iteration, &tolerance, &workers, &ps, &together)) return nullptr; + DEBUG_LOG("infer " << self->ob_base.ob_type << ", " << self->ob_base.ob_refcnt); + try + { + if (!self->inst) throw runtime_error{ "inst is null" }; + auto inst = static_cast(self->inst); + py::UniqueObj iter; + if ((iter = PyObject_GetIter(argDoc)) != nullptr) + { + std::vector docs; + py::UniqueObj item; + while ((item = PyIter_Next(iter))) + { + if (Py_TYPE(item) != &Document_type) throw runtime_error{ "'doc' must be tomotopy.Document type or list of tomotopy.Document" }; + auto* doc = (DocumentObject*)item.get(); + if (doc->parentModel != self) throw runtime_error{ "'doc' was from another model, not fit to this model" }; + docs.emplace_back((tomoto::DocumentBase*)doc->doc); + } + if (PyErr_Occurred()) throw bad_exception{}; + if (!self->isPrepared) + { + inst->prepare(true, self->minWordCnt, self->removeTopWord); + self->isPrepared = true; + } + auto ll = inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together); + PyObject* ret = PyList_New(docs.size()); + size_t i = 0; + for (auto d : docs) + { + PyList_SetItem(ret, i++, Py_BuildValue("(NN)", + py::buildPyValue(inst->getTopicsByDoc(d)), + py::buildPyValue(inst->getSubTopicsByDoc(d)) + )); + } + if (together) + { + return Py_BuildValue("(Nf)", ret, ll[0]); + } + else + { + return Py_BuildValue("(NN)", 
ret, py::buildPyValue(ll)); + } + } + else + { + PyErr_Clear(); + if (Py_TYPE(argDoc) != &Document_type) throw runtime_error{ "'doc' must be tomotopy.Document type or list of tomotopy.Document" }; + auto* doc = (DocumentObject*)argDoc; + if (doc->parentModel != self) throw runtime_error{ "'doc' was from another model, not fit to this model" }; + if (!self->isPrepared) + { + inst->prepare(true, self->minWordCnt, self->removeTopWord); + self->isPrepared = true; + } + if (doc->owner) + { + std::vector docs; + docs.emplace_back((tomoto::DocumentBase*)doc->doc); + float ll = self->inst->infer(docs, iteration, tolerance, workers, (tomoto::ParallelScheme)ps, !!together)[0]; + return Py_BuildValue("((NN)f)", py::buildPyValue(inst->getTopicsByDoc(doc->doc)), + py::buildPyValue(inst->getSubTopicsByDoc(doc->doc)), ll); + } + else + { + return Py_BuildValue("((NN)s)", py::buildPyValue(inst->getTopicsByDoc(doc->doc)), + py::buildPyValue(inst->getSubTopicsByDoc(doc->doc)), nullptr); + } + } + } + catch (const bad_exception&) + { + return nullptr; + } + catch (const exception& e) + { + PyErr_SetString(PyExc_Exception, e.what()); + return nullptr; + } +} DEFINE_GETTER(tomoto::IPAModel, PA, getK2); DEFINE_DOCUMENT_GETTER_REORDER(tomoto::DocumentPA, Z2, Z2s); @@ -200,6 +285,7 @@ static PyMethodDef PA_methods[] = { "get_sub_topics", (PyCFunction)PA_getSubTopics, METH_VARARGS | METH_KEYWORDS, PA_get_sub_topics__doc__ }, { "get_topic_words", (PyCFunction)PA_getTopicWords, METH_VARARGS | METH_KEYWORDS, PA_get_topic_words__doc__}, { "get_topic_word_dist", (PyCFunction)PA_getTopicWordDist, METH_VARARGS | METH_KEYWORDS, PA_get_topic_word_dist__doc__ }, + { "infer", (PyCFunction)PA_infer, METH_VARARGS | METH_KEYWORDS, PA_infer__doc__ }, { nullptr } }; diff --git a/tomotopy/documentation.kr.rst b/tomotopy/documentation.kr.rst index ca0b573..5283d0c 100644 --- a/tomotopy/documentation.kr.rst +++ b/tomotopy/documentation.kr.rst @@ -16,7 +16,7 @@ tomotopy 란? 
* Hierarchical PA (`tomotopy.HPAModel`) * Correlated Topic Model (`tomotopy.CTModel`) -tomotopy의 가장 최신버전은 0.4.2 입니다. +tomotopy의 가장 최신버전은 0.5.0 입니다. .. image:: https://badge.fury.io/py/tomotopy.svg @@ -239,6 +239,13 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma 역사 ------- +* 0.5.0 (2019-12-30) + * `tomotopy.PAModel.infer`가 topic distribution과 sub-topic distribution을 동시에 반환합니다. + * `tomotopy.Document`에 get_sub_topics, get_sub_topic_dist 메소드가 추가되었습니다. (PAModel 전용) + * `tomotopy.LDAModel.train` 및 `tomotopy.LDAModel.infer` 메소드에 parallel 옵션이 추가되었습니다. 이를 통해 학습 및 추론시 사용할 병렬화 알고리즘을 선택할 수 있습니다. + * `tomotopy.ParallelScheme.PARTITION` 알고리즘이 추가되었습니다. 이 알고리즘은 작업자 수가 많거나 토픽의 개수나 어휘 크기가 클 때도 효율적으로 작동합니다. + * 모델 생성시 min_cf < 2일때 rm_top 옵션이 적용되지 않는 문제를 수정하였습니다. + * 0.4.2 (2019-11-30) * `tomotopy.LLDAModel`와 `tomotopy.PLDAModel` 모델에서 토픽 할당이 잘못 일어나던 문제를 해결했습니다. * `tomotopy.Document` 및 `tomotopy.Dictionary` 클래스에 가독성이 좋은 __repr__가 추가되었습니다. diff --git a/tomotopy/documentation.rst b/tomotopy/documentation.rst index b03440a..4947943 100644 --- a/tomotopy/documentation.rst +++ b/tomotopy/documentation.rst @@ -16,7 +16,7 @@ The current version of `tomoto` supports several major topic models including * Hierarchical PA (`tomotopy.HPAModel`) * Correlated Topic Model (`tomotopy.CTModel`). -The most recent version of tomotopy is 0.4.2. +The most recent version of tomotopy is 0.5.0. .. image:: https://badge.fury.io/py/tomotopy.svg @@ -242,6 +242,13 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh History ------- +* 0.5.0 (2019-12-30) + * Now `tomotopy.PAModel.infer` returns both topic distribution and sub-topic distribution. + * New methods get_sub_topics and get_sub_topic_dist were added into `tomotopy.Document`. (for PAModel) + * New parameter `parallel` was added for `tomotopy.LDAModel.train` and `tomotopy.LDAModel.infer` method. You can select parallelism algorithm by changing this parameter.
+ * `tomotopy.ParallelScheme.PARTITION`, a new algorithm, was added. It works efficiently when the number of workers is large, the number of topics or the size of vocabulary is big. + * A bug where `rm_top` didn't work at `min_cf` < 2 was fixed. + * 0.4.2 (2019-11-30) * Wrong topic assignments of `tomotopy.LLDAModel` and `tomotopy.PLDAModel` were fixed. * Readable __repr__ of `tomotopy.Document` and `tomotopy.Dictionary` was implemented.