-
Notifications
You must be signed in to change notification settings - Fork 206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
alignproteome added #875
base: master
Are you sure you want to change the base?
alignproteome added #875
Changes from 1 commit
1cb2074
12c0c81
df80f07
1073ec9
3617ea7
3fc7fcd
0487a1f
3f4fc91
eeabad3
9996821
27d9f68
e2ef229
fd1f5d6
0d7db0f
1fe489e
86f6b15
8344e5b
be5a53d
8d3a38f
bfa1911
7e912d2
618e2bf
e990773
e9ffe22
d14217f
5e6ee6d
743370d
44a6fca
910bb7f
e24b9e3
ad1acdf
e026260
d9213ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -632,6 +632,15 @@ std::vector<Command> baseCommands = { | |
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | ||
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | ||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, | ||
{"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT, | ||
"Within-result all-vs-all gapped local alignment", | ||
NULL, | ||
"Martin Steinegger <[email protected]>", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add you @Gyuuul2 |
||
"<i:sequenceDB> <i:resultDB> <o:alignmentDB>", | ||
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | ||
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | ||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, | ||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, | ||
{"alignall", alignall, &par.alignall, COMMAND_ALIGNMENT, | ||
"Within-result all-vs-all gapped local alignment", | ||
NULL, | ||
|
@@ -640,6 +649,7 @@ std::vector<Command> baseCommands = { | |
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, | ||
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, | ||
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, | ||
|
||
{"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT, | ||
"Transfer alignments via transitivity", | ||
//"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,6 +140,10 @@ class Matcher{ | |
} | ||
} | ||
} | ||
|
||
float getSeqId(){ | ||
return seqId; | ||
} | ||
}; | ||
|
||
Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m, | ||
|
@@ -217,7 +221,7 @@ class Matcher{ | |
|
||
|
||
static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false); | ||
|
||
static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this needed? |
||
static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart); | ||
|
||
static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ set(commons_header_files | |
commons/Command.h | ||
commons/CommandCaller.h | ||
commons/Concat.h | ||
# commons/ClusterSpecies.h | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove it. |
||
commons/DBConcat.h | ||
commons/DBReader.h | ||
commons/DBWriter.h | ||
|
@@ -52,6 +53,7 @@ set(commons_source_files | |
commons/BaseMatrix.cpp | ||
commons/Command.cpp | ||
commons/CommandCaller.cpp | ||
# commons/ClusterSpecies.cpp | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove it. |
||
commons/DBConcat.cpp | ||
commons/DBReader.cpp | ||
commons/DBWriter.cpp | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file looks incomplete |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
#include "ClusterSpecies.h" | ||
void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) { | ||
clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId); | ||
// speciesClusterMemCounts[speciesId]++; | ||
} | ||
|
||
|
||
void ClusterSpecies::removeSpecies(unsigned int speciesId) { | ||
clusterEntriesPerSpecies.erase(speciesId); | ||
// speciesClusterMemCounts.erase(speciesId); | ||
} | ||
|
||
int ClusterSpecies::findRepSpecies(){ | ||
//find max entry in speciesClusterMemCounts | ||
unsigned int maxSpeciesId = 0; | ||
size_t maxSpeciesSize = 0; | ||
for (auto const& species : clusterEntriesPerSpecies){ | ||
size_t speiciesSize = species.second.size(); | ||
if (speiciesSize > maxSpeciesSize){ | ||
maxSpeciesId = species.first; | ||
} | ||
} | ||
|
||
return maxSpeciesId; | ||
// for (auto const& x : speciesClusterMemCounts) | ||
// { | ||
// if (x.second > maxSpeciesCount){ | ||
// maxSpeciesCount = x.second; | ||
// maxSpeciesId = x.first; | ||
// } | ||
// } | ||
} |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#ifndef CLUSTER_SPECIES_H | ||
#define CLUSTER_SPECIES_H | ||
|
||
#include <vector> | ||
#include <unordered_map> | ||
|
||
|
||
class ClusterSpecies { | ||
public: | ||
|
||
// std::unordered_map<unsigned int, std::unordered_map<unsigned int, std::vector<unsigned int>>> clusterEntriesPerSpecies; | ||
// // std::unordered_map<unsigned int, size_t> speciesClusterMemCounts; | ||
// void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId); | ||
// void removeSpecies(unsigned int speciesId); | ||
// int findRepSpecies(); | ||
|
||
// template <typename T> | ||
// struct ProteomeLength{ | ||
// T id; | ||
// unsigned int length; | ||
// }; | ||
|
||
// std::vector<ProteomeLength<unsigned int>> proteomeAAToatalLength; | ||
|
||
// void addProteomeLength(unsigned int id, unsigned int length){ | ||
// ProteomeLength<unsigned int> pl; | ||
// pl.id = id; | ||
// pl.length = length; | ||
// proteomeAAToatalLength.push_back(pl); | ||
// } | ||
|
||
// unsigned int getProteomeLength(unsigned int id){ | ||
// for (auto const& pl : proteomeAAToatalLength){ | ||
// if (pl.id == id){ | ||
// return pl.length; | ||
// } | ||
// } | ||
// return 0; | ||
// } | ||
}; | ||
|
||
#endif //CLUSTER_SPECIES_H |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |
} | ||
totalDataSize = 0; | ||
dataFileCnt = dataFileNames.size(); | ||
|
||
dataSizeOffset = new size_t[dataFileNames.size() + 1]; | ||
dataFiles = new char*[dataFileNames.size()]; | ||
for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){ | ||
|
@@ -136,6 +137,8 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |
} | ||
} | ||
if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) { | ||
Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove this print out. |
||
|
||
std::string lookupFilename = (std::string(dataFileName) + ".lookup"); | ||
MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); | ||
if (lookupData.isValid() == false) { | ||
|
@@ -144,7 +147,9 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |
} | ||
char* lookupDataChar = (char *) lookupData.getData(); | ||
size_t lookupDataSize = lookupData.size(); | ||
Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove this print out. |
||
lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads); | ||
Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove this print out. |
||
lookup = new(std::nothrow) LookupEntry[this->lookupSize]; | ||
incrementMemory(sizeof(LookupEntry) * this->lookupSize); | ||
readLookup(lookupDataChar, lookupDataSize, lookup); | ||
|
@@ -155,6 +160,22 @@ template <typename T> bool DBReader<T>::open(int accessType){ | |
} | ||
lookupData.close(); | ||
} | ||
if (dataMode & USE_SOURCE){ | ||
std::string sourceFilename = (std::string(dataFileName) + ".source"); | ||
MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); | ||
if (sourceData.isValid() == false) { | ||
Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n"; | ||
EXIT(EXIT_FAILURE); | ||
} | ||
char* sourceDataChar = (char *) sourceData.getData(); | ||
size_t sourceDataSize = sourceData.size(); | ||
sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads); | ||
source = new(std::nothrow) SourceEntry[this->sourceSize]; | ||
incrementMemory(sizeof(SourceEntry) * this->sourceSize); | ||
readSource(sourceDataChar, sourceDataSize, source); | ||
|
||
} | ||
|
||
bool isSortedById = false; | ||
if (externalData == false) { | ||
MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); | ||
|
@@ -707,6 +728,15 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){ | |
return lookup[id].fileNumber; | ||
} | ||
|
||
template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){ | ||
if (id >= sourceSize){ | ||
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n"; | ||
Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n"; | ||
EXIT(EXIT_FAILURE); | ||
} | ||
return source[id].fileName; | ||
} | ||
|
||
template<> | ||
void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) { | ||
buffer.append(SSTR(entry.id)); | ||
|
@@ -995,6 +1025,19 @@ size_t DBReader<T>::getOffset(size_t id) { | |
return index[id].offset; | ||
} | ||
|
||
template<typename T> | ||
size_t DBReader<T>::getIndexLen(size_t id) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We already have code to get the index length, called getSeqLen. |
||
if (id >= size){ | ||
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n"; | ||
Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n"; | ||
EXIT(EXIT_FAILURE); | ||
} | ||
if (local2id != NULL) { | ||
id = local2id[id]; | ||
} | ||
return index[id].length; | ||
} | ||
|
||
template<typename T> | ||
size_t DBReader<T>::findNextOffsetid(size_t id) { | ||
size_t idOffset = getOffset(id); | ||
|
@@ -1048,6 +1091,36 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry | |
currPos = lookupData - (char *) data; | ||
|
||
i++; | ||
|
||
|
||
} | ||
} | ||
|
||
template <typename T> | ||
void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) { | ||
size_t i=0; | ||
size_t currPos = 0; | ||
char* sourceData = (char *) data; | ||
const char * cols[3]; | ||
while (currPos < dataSize){ | ||
if (i >= this->sourceSize) { | ||
Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n"; | ||
EXIT(EXIT_FAILURE); | ||
} | ||
Util::getFieldsOfLine(sourceData, cols, 3); | ||
source[i].id = Util::fast_atoi<size_t>(cols[0]); | ||
std::string fileName = std::string(cols[1], (cols[2] - cols[1])); | ||
size_t lastDotPosition = fileName.rfind('.'); | ||
|
||
if (lastDotPosition != std::string::npos) { | ||
fileName = fileName.substr(0, lastDotPosition); | ||
} | ||
source[i].fileName = fileName; | ||
sourceData = Util::skipLine(sourceData); | ||
|
||
currPos = sourceData - (char *) data; | ||
|
||
i++; | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -98,7 +98,10 @@ class DBReader : public MemoryTracker { | |
return false; | ||
} | ||
}; | ||
|
||
struct SourceEntry{ | ||
T id; | ||
std::string fileName; | ||
}; | ||
struct LookupEntry { | ||
T id; | ||
std::string entryName; | ||
|
@@ -184,6 +187,8 @@ class DBReader : public MemoryTracker { | |
|
||
size_t getSize() const; | ||
|
||
unsigned int getProteomeTotalLen(size_t id); //gyuri | ||
|
||
unsigned int getMaxSeqLen(){ | ||
return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ? | ||
(std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE : | ||
|
@@ -248,6 +253,7 @@ class DBReader : public MemoryTracker { | |
void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry); | ||
LookupEntry* getLookup() { return lookup; }; | ||
|
||
std::string getSourceFileName(size_t id); | ||
static const int NOSORT = 0; | ||
static const int SORT_BY_LENGTH = 1; | ||
static const int LINEAR_ACCCESS = 2; | ||
|
@@ -266,6 +272,7 @@ class DBReader : public MemoryTracker { | |
static const unsigned int USE_FREAD = 4; | ||
static const unsigned int USE_LOOKUP = 8; | ||
static const unsigned int USE_LOOKUP_REV = 16; | ||
static const unsigned int USE_SOURCE = 32; | ||
|
||
|
||
// compressed | ||
|
@@ -309,6 +316,8 @@ class DBReader : public MemoryTracker { | |
|
||
void readLookup(char *data, size_t dataSize, LookupEntry *lookup); | ||
|
||
void readSource(char *data, size_t dataSize, SourceEntry *source); | ||
|
||
void readIndexId(T* id, char * line, const char** cols); | ||
|
||
unsigned int indexIdToNum(T* id); | ||
|
@@ -437,6 +446,8 @@ class DBReader : public MemoryTracker { | |
|
||
size_t findNextOffsetid(size_t id); | ||
|
||
size_t getIndexLen(size_t id); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use getSeqLen |
||
|
||
int isCompressed(){ | ||
return isCompressed(dbtype); | ||
} | ||
|
@@ -485,7 +496,9 @@ class DBReader : public MemoryTracker { | |
|
||
Index * index; | ||
size_t lookupSize; | ||
size_t sourceSize; | ||
LookupEntry * lookup; | ||
SourceEntry * source; | ||
bool sortedByOffset; | ||
|
||
unsigned int * id2local; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -283,6 +283,7 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx | |
} | ||
size_t totalWriten = 0; | ||
if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) { | ||
Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would remove this print out. |
||
state[thrIdx] = COMPRESSED; | ||
// zstd seems to have a hard time with elements < 60 | ||
ZSTD_inBuffer input = { data, dataSize, 0 }; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would improve this text.