soedinglab · martin-steinegger · Aug 16, 2024 · Aug 19, 2024 · Aug 25, 2024 · Aug 25, 2024
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
@@ -4,6 +4,7 @@
 
 extern int align(int argc, const char **argv, const Command& command);
 extern int alignall(int argc, const char **argv, const Command& command);
+extern int alignproteome(int argc, const char **argv, const Command& command);
 extern int alignbykmer(int argc, const char **argv, const Command& command);
 extern int appenddbtoindex(int argc, const char **argv, const Command& command);
 extern int apply(int argc, const char **argv, const Command& command);

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
@@ -632,6 +632,15 @@ std::vector<Command> baseCommands = {
                                                            {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"alignproteome",             alignproteome,             &par.alignproteome,             COMMAND_ALIGNMENT,
+                "Within-result all-vs-all gapped local alignment",
+                NULL,
+                "Martin Steinegger <[email protected]>",
+                "<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
                 "Within-result all-vs-all gapped local alignment",
                 NULL,
@@ -640,6 +649,7 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+
         {"transitivealign",      transitivealign,      &par.align,                COMMAND_ALIGNMENT,
                 "Transfer alignments via transitivity",
                 //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",

diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
@@ -140,6 +140,10 @@ class Matcher{
                 }
             }
         }
+
+        float getSeqId(){
+            return seqId;
+        }
     };
 
     Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
@@ -217,7 +221,7 @@ class Matcher{
 
 
     static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress  = true, bool addOrfPosition = false);
-
+    static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false);
     static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);
 
     static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,

diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt
@@ -8,6 +8,7 @@ set(commons_header_files
         commons/Command.h
         commons/CommandCaller.h
         commons/Concat.h
+        # commons/ClusterSpecies.h
         commons/DBConcat.h
         commons/DBReader.h
         commons/DBWriter.h
@@ -52,6 +53,7 @@ set(commons_source_files
         commons/BaseMatrix.cpp
         commons/Command.cpp
         commons/CommandCaller.cpp
+        # commons/ClusterSpecies.cpp
         commons/DBConcat.cpp
         commons/DBReader.cpp
         commons/DBWriter.cpp

diff --git a/src/commons/ClusterSpecies.cpp b/src/commons/ClusterSpecies.cpp
@@ -0,0 +1,32 @@
+#include "ClusterSpecies.h"
+void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) {
+   clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId);
+//    speciesClusterMemCounts[speciesId]++;
+}
+
+
+void ClusterSpecies::removeSpecies(unsigned int speciesId) {
+    clusterEntriesPerSpecies.erase(speciesId);
+    // speciesClusterMemCounts.erase(speciesId);
+}
+
+int ClusterSpecies::findRepSpecies(){
+    //find max entry in speciesClusterMemCounts
+    unsigned int maxSpeciesId = 0;
+    size_t maxSpeciesSize = 0;
+    for (auto const& species : clusterEntriesPerSpecies){
+        size_t speiciesSize = species.second.size();
+        if (speiciesSize > maxSpeciesSize){
+            maxSpeciesId = species.first;
+        }
+    }
+
+    return maxSpeciesId;
+    // for (auto const& x : speciesClusterMemCounts)
+    // {
+    //     if (x.second > maxSpeciesCount){
+    //         maxSpeciesCount = x.second;
+    //         maxSpeciesId = x.first;
+    //     }
+    // }
+}
diff --git a/src/commons/ClusterSpecies.h b/src/commons/ClusterSpecies.h
@@ -0,0 +1,42 @@
+#ifndef CLUSTER_SPECIES_H
+#define CLUSTER_SPECIES_H
+
+#include <vector>
+#include <unordered_map>
+
+
+class ClusterSpecies {
+public:
+
+    // std::unordered_map<unsigned int, std::unordered_map<unsigned int, std::vector<unsigned int>>> clusterEntriesPerSpecies;
+    // // std::unordered_map<unsigned int, size_t> speciesClusterMemCounts;
+    // void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId);
+    // void removeSpecies(unsigned int speciesId);
+    // int findRepSpecies();
+
+    // template <typename T>
+    // struct ProteomeLength{
+    //     T id;
+    //     unsigned int length;
+    // };
+
+    // std::vector<ProteomeLength<unsigned int>> proteomeAAToatalLength;
+
+    // void addProteomeLength(unsigned int id, unsigned int length){
+    //     ProteomeLength<unsigned int> pl;
+    //     pl.id = id;
+    //     pl.length = length;
+    //     proteomeAAToatalLength.push_back(pl);
+    // }
+
+    // unsigned int getProteomeLength(unsigned int id){
+    //     for (auto const& pl : proteomeAAToatalLength){
+    //         if (pl.id == id){
+    //             return pl.length;
+    //         }
+    //     }
+    //     return 0;
+    // }
+};
+
+#endif //CLUSTER_SPECIES_H
diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
@@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         totalDataSize = 0;
         dataFileCnt = dataFileNames.size();
+
         dataSizeOffset = new size_t[dataFileNames.size() + 1];
         dataFiles = new char*[dataFileNames.size()];
         for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
@@ -136,6 +137,8 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
     }
     if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) {
+        Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri
+
         std::string lookupFilename = (std::string(dataFileName) + ".lookup");
         MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
         if (lookupData.isValid() == false) {
@@ -144,7 +147,9 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         char* lookupDataChar = (char *) lookupData.getData();
         size_t lookupDataSize = lookupData.size();
+        Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri
         lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads);
+        Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri
         lookup = new(std::nothrow) LookupEntry[this->lookupSize];
         incrementMemory(sizeof(LookupEntry) * this->lookupSize);
         readLookup(lookupDataChar, lookupDataSize, lookup);
@@ -155,6 +160,22 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         lookupData.close();
     }
+    if (dataMode & USE_SOURCE){
+        std::string sourceFilename = (std::string(dataFileName) + ".source");
+        MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
+        if (sourceData.isValid() == false) {
+            Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        char* sourceDataChar = (char *) sourceData.getData();
+        size_t sourceDataSize = sourceData.size();
+        sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
+        source = new(std::nothrow) SourceEntry[this->sourceSize];
+        incrementMemory(sizeof(SourceEntry) * this->sourceSize);
+        readSource(sourceDataChar, sourceDataSize, source);
+
+    }
+
     bool isSortedById = false;
     if (externalData == false) {
         MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -707,6 +728,15 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){
     return lookup[id].fileNumber;
 }
 
+template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].fileName;
+}
+
 template<>
 void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
     buffer.append(SSTR(entry.id));
@@ -995,6 +1025,19 @@ size_t DBReader<T>::getOffset(size_t id) {
     return index[id].offset;
 }
 
+template<typename T>
+size_t DBReader<T>::getIndexLen(size_t id) {
+    if (id >= size){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n";
+        Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (local2id != NULL) {
+        id = local2id[id];
+    }
+    return index[id].length;
+}
+
 template<typename T>
 size_t DBReader<T>::findNextOffsetid(size_t id) {
     size_t idOffset = getOffset(id);
@@ -1048,6 +1091,36 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
         currPos = lookupData - (char *) data;
 
         i++;
+
+
+    }
+}
+
+template <typename T>
+void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
+    size_t i=0;
+    size_t currPos = 0;
+    char* sourceData = (char *) data;
+    const char * cols[3];
+    while (currPos < dataSize){
+        if (i >= this->sourceSize) {
+            Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        Util::getFieldsOfLine(sourceData, cols, 3);
+        source[i].id = Util::fast_atoi<size_t>(cols[0]);
+        std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
+        size_t lastDotPosition = fileName.rfind('.');
+
+        if (lastDotPosition != std::string::npos) {
+            fileName = fileName.substr(0, lastDotPosition);
+        }
+        source[i].fileName = fileName;
+        sourceData = Util::skipLine(sourceData);
+
+        currPos = sourceData - (char *) data;
+
+        i++;
     }
 }
 

diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
@@ -98,7 +98,10 @@ class DBReader : public MemoryTracker {
             return false;
         }
     };
-
+    struct SourceEntry{
+        T id;
+        std::string fileName;
+    };
     struct LookupEntry {
         T id;
         std::string entryName;
@@ -184,6 +187,8 @@ class DBReader : public MemoryTracker {
 
     size_t getSize() const;
 
+    unsigned int getProteomeTotalLen(size_t id); //gyuri
+
     unsigned int getMaxSeqLen(){ 
             return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ?
                     (std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE :
@@ -248,6 +253,7 @@ class DBReader : public MemoryTracker {
     void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry);
     LookupEntry* getLookup() { return lookup; };
 
+    std::string getSourceFileName(size_t id);
     static const int NOSORT = 0;
     static const int SORT_BY_LENGTH = 1;
     static const int LINEAR_ACCCESS = 2;
@@ -266,6 +272,7 @@ class DBReader : public MemoryTracker {
     static const unsigned int USE_FREAD      = 4;
     static const unsigned int USE_LOOKUP     = 8;
     static const unsigned int USE_LOOKUP_REV = 16;
+    static const unsigned int USE_SOURCE     = 32;
 
 
     // compressed
@@ -309,6 +316,8 @@ class DBReader : public MemoryTracker {
 
     void readLookup(char *data, size_t dataSize, LookupEntry *lookup);
 
+    void readSource(char *data, size_t dataSize, SourceEntry *source);
+
     void readIndexId(T* id, char * line, const char** cols);
 
     unsigned int indexIdToNum(T* id);
@@ -437,6 +446,8 @@ class DBReader : public MemoryTracker {
 
     size_t findNextOffsetid(size_t id);
 
+    size_t getIndexLen(size_t id);
+
     int isCompressed(){
         return isCompressed(dbtype);
     }
@@ -485,7 +496,9 @@ class DBReader : public MemoryTracker {
 
     Index * index;
     size_t lookupSize;
+    size_t sourceSize;
     LookupEntry * lookup;
+    SourceEntry * source;
     bool sortedByOffset;
 
     unsigned int * id2local;

diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp
@@ -283,6 +283,7 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx
     }
     size_t totalWriten = 0;
     if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) {
+        Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri
         state[thrIdx] = COMPRESSED;
         // zstd seems to have a hard time with elements < 60
         ZSTD_inBuffer input = { data, dataSize, 0 };