From 36f754352a6e35378a516649aa307ae509ad8f91 Mon Sep 17 00:00:00 2001 From: mourisl Date: Wed, 7 Aug 2024 13:11:21 -0400 Subject: [PATCH 1/3] The SSE macro for the popcount is unnecessary, and it should be changed to GNUC. Add the functionality to promote taxonomy ID to canonical ranks. --- Classifier.hpp | 3 +++ Makefile | 2 +- Taxonomy.hpp | 47 +++++++++++++++++++++++++++++++++++++++++++++ compactds/Utils.hpp | 6 +----- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/Classifier.hpp b/Classifier.hpp index 7854ab5..f12830a 100644 --- a/Classifier.hpp +++ b/Classifier.hpp @@ -527,6 +527,9 @@ class Classifier SimpleVector taxIds ; _taxonomy.ReduceTaxIds(bestSeqTaxIds, taxIds, _param.maxResult) ; + // Centrifuge will promote to canonical tax levels here. + // Maybe we will do the same in some future version. + //_taxonomy.PromoteToCanonicalTaxRank(taxIds, /*dedup=*/true) ; size = taxIds.Size() ; for (i = 0 ; i < size ; ++i) diff --git a/Makefile b/Makefile index 54282ef..4ea751b 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CXX = g++ -CXXFLAGS= -Wall -g -O3 -msse4.2 #-pg -g #-Wall #-O3 +CXXFLAGS= -Wall -g -O3 -march=native #-pg -g #-Wall #-O3 LINKPATH= LINKFLAGS = -lpthread -lz DEBUG= diff --git a/Taxonomy.hpp b/Taxonomy.hpp index e219522..bc22676 100644 --- a/Taxonomy.hpp +++ b/Taxonomy.hpp @@ -370,6 +370,16 @@ class Taxonomy return i ; return _nodeCnt ; } + + bool IsCanonicalRankNum(uint8_t r) // The taxonomy ranks defined in the TaxonomyPathTable in centrifuge, except for subspecies + { + if (r == RANK_STRAIN //|| r == RANK_SUB_SPECIES + || r == RANK_SPECIES || r == RANK_GENUS || r == RANK_FAMILY || r == RANK_ORDER + || r == RANK_CLASS || r == RANK_PHYLUM || r == RANK_KINGDOM || r == RANK_SUPER_KINGDOM + || r == RANK_DOMAIN) + return true ; + return false ; + } public: Taxonomy() { @@ -692,6 +702,43 @@ class Taxonomy promotedTaxIds.PushBack(_rootCTaxId) ; } + // Promote the taxIds to the ranks defined in the "IsCanonicalRankNum" function + // 
dedup: if true, remove duplicate entries from taxIds + void PromoteToCanonicalTaxRank(SimpleVector &taxIds, bool dedup) + { + size_t i ; + size_t taxCnt = taxIds.Size() ; + for (i = 0 ; i < taxCnt ; ++i) + { + size_t p = taxIds[i] ; + uint8_t rank = _taxonomyTree[p].rank ; + while ( !IsCanonicalRankNum(rank) ) + { + if (p == _taxonomyTree[p].parentTid) + break ; + p = _taxonomyTree[p].parentTid ; + rank = _taxonomyTree[p].rank ; + } + taxIds[i] = p ; + } + + if (dedup) + { + std::map used ; + size_t k = 0 ; + for (i = 0 ; i < taxCnt ; ++i) + { + if (used.find(taxIds[i]) == used.end()) + { + taxIds[k] = taxIds[i] ; + ++k ; + used[taxIds[i]] = 1 ; + } + } + taxIds.Resize(k) ; + } + } + // @return: Number of children tax ids. childrenTax: compact tax ids below or equal to ctid. size_t GetChildrenTax(size_t ctid, std::map &childrenTax) { diff --git a/compactds/Utils.hpp b/compactds/Utils.hpp index e97c051..97a09e8 100644 --- a/compactds/Utils.hpp +++ b/compactds/Utils.hpp @@ -10,10 +10,6 @@ #include #include -#ifdef __SSE4_2__ -#include -#endif - namespace compactds { #define WORD_64 // comment this out if word size is 32 @@ -80,7 +76,7 @@ class Utils // Count the number of 1's in x. 
static int Popcount(WORD x) { -#ifdef __SSE4_2__ +#ifdef __GNUC__ return __builtin_popcountll(x); #else #ifdef WORD_64 From e73972263b19a4c837a90bfe7e83824ac197de47 Mon Sep 17 00:00:00 2001 From: mourisl Date: Thu, 8 Aug 2024 16:06:33 -0400 Subject: [PATCH 2/3] Bump up version number --- defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defs.h b/defs.h index f9ff27c..b002bda 100644 --- a/defs.h +++ b/defs.h @@ -5,7 +5,7 @@ //#define DEBUG -#define CENTRIFUGER_VERSION "1.0.5-r159" +#define CENTRIFUGER_VERSION "1.0.5-r161" extern char nucToNum[26] ; extern char numToNuc[26] ; From 3de22d0b4055e4f82bea59328b88012697d6ffd9 Mon Sep 17 00:00:00 2001 From: mourisl Date: Thu, 8 Aug 2024 16:12:50 -0400 Subject: [PATCH 3/3] Correct the version number --- defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defs.h b/defs.h index b002bda..362bc57 100644 --- a/defs.h +++ b/defs.h @@ -5,7 +5,7 @@ //#define DEBUG -#define CENTRIFUGER_VERSION "1.0.5-r161" +#define CENTRIFUGER_VERSION "1.0.5-r163" extern char nucToNum[26] ; extern char numToNuc[26] ;