Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

alignproteome added #875

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1cb2074
alignproteome added
Gyuuul2 Aug 16, 2024
12c0c81
add workflow, revise based on feedback
Gyuuul2 Aug 19, 2024
df80f07
Change createdb.cpp so that it takes in ".txt" file containing paths …
elpis51613 Aug 25, 2024
1073ec9
Merge remote-tracking branch 'refs/remotes/origin/master'
elpis51613 Aug 25, 2024
3617ea7
Change filepath format from txt to tsv. Also add explanations of its …
elpis51613 Aug 26, 2024
3fc7fcd
createtsv - handle source identifier
Gyuuul2 Aug 27, 2024
0487a1f
proteomcluster - change writer / way to select repProteome
Gyuuul2 Aug 27, 2024
3f4fc91
easy proteomecluster workflow
Gyuuul2 Aug 27, 2024
eeabad3
indexreader - handle source identifier
Gyuuul2 Aug 27, 2024
9996821
update mmseqsbase and change module name into proteomecluster
Gyuuul2 Aug 27, 2024
27d9f68
set alignment mode default
Gyuuul2 Aug 28, 2024
e2ef229
latest
Gyuuul2 Aug 28, 2024
fd1f5d6
original index
Gyuuul2 Aug 28, 2024
0d7db0f
use sync_fetch_and_add when increment memcount
Gyuuul2 Aug 28, 2024
1fe489e
change submat
Gyuuul2 Aug 28, 2024
86f6b15
add timer
Gyuuul2 Aug 28, 2024
8344e5b
Merge branch 'pr-879'
Gyuuul2 Aug 28, 2024
be5a53d
Remove __packed__ to resolve sync_fetch_and_add error
Gyuuul2 Aug 28, 2024
8d3a38f
change chmod in createdb
Gyuuul2 Aug 29, 2024
bfa1911
update proteome-similarity threshold parameter
Gyuuul2 Aug 29, 2024
7e912d2
Sort redundant proteomes by similarity score in proteome_cluster.tsv
Gyuuul2 Sep 8, 2024
618e2bf
delete redundant code - checking proteome is singleton(no redudant me…
Gyuuul2 Sep 9, 2024
e990773
stop
Gyuuul2 Sep 9, 2024
e9ffe22
jaccardindexTrial
Gyuuul2 Sep 8, 2024
d14217f
add normalized scoring
Gyuuul2 Sep 9, 2024
5e6ee6d
add proteome relative(normalized) similarity threshold parameter
Gyuuul2 Sep 9, 2024
743370d
relative similarity threshold
Gyuuul2 Sep 9, 2024
44a6fca
latest version-bugfix,relativeProteomeSimilarity added, singleton pro…
Gyuuul2 Sep 9, 2024
910bb7f
Change Relative Score formula
Gyuuul2 Sep 10, 2024
e24b9e3
Remove Debug printout
Gyuuul2 Sep 10, 2024
ad1acdf
style change
Gyuuul2 Sep 10, 2024
e026260
Latest Version
Gyuuul2 Sep 10, 2024
d9213ef
Add cluster count report
Gyuuul2 Sep 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

extern int align(int argc, const char **argv, const Command& command);
extern int alignall(int argc, const char **argv, const Command& command);
extern int alignproteome(int argc, const char **argv, const Command& command);
extern int alignbykmer(int argc, const char **argv, const Command& command);
extern int appenddbtoindex(int argc, const char **argv, const Command& command);
extern int apply(int argc, const char **argv, const Command& command);
Expand Down
10 changes: 10 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,15 @@ std::vector<Command> baseCommands = {
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"alignproteome", alignproteome, &par.alignproteome, COMMAND_ALIGNMENT,
"Within-result all-vs-all gapped local alignment",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would improve this text.

NULL,
"Martin Steinegger <[email protected]>",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add you @Gyuuul2

"<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"alignall", alignall, &par.alignall, COMMAND_ALIGNMENT,
"Within-result all-vs-all gapped local alignment",
NULL,
Expand All @@ -640,6 +649,7 @@ std::vector<Command> baseCommands = {
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},

{"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT,
"Transfer alignments via transitivity",
//"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
Expand Down
6 changes: 5 additions & 1 deletion src/alignment/Matcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ class Matcher{
}
}
}

float getSeqId(){
return seqId;
}
};

Matcher(int querySeqType, int targetSeqType, int maxSeqLen, BaseMatrix *m,
Expand Down Expand Up @@ -217,7 +221,7 @@ class Matcher{


static size_t resultToBuffer(char * buffer, const result_t &result, bool addBacktrace, bool compress = true, bool addOrfPosition = false);

static size_t resultToBuffer_str(char * buffer, const result_t &result, bool addBacktrace, bool compress, bool addOrfPosition=false);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed?

static int computeAlnLength(int anEnd, int start, int dbEnd, int dbStart);

static void updateResultByRescoringBacktrace(const char *querySeq, const char *targetSeq, const char **subMat, EvalueComputation &evaluer,
Expand Down
2 changes: 2 additions & 0 deletions src/commons/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(commons_header_files
commons/Command.h
commons/CommandCaller.h
commons/Concat.h
# commons/ClusterSpecies.h
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove it.

commons/DBConcat.h
commons/DBReader.h
commons/DBWriter.h
Expand Down Expand Up @@ -52,6 +53,7 @@ set(commons_source_files
commons/BaseMatrix.cpp
commons/Command.cpp
commons/CommandCaller.cpp
# commons/ClusterSpecies.cpp
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove it.

commons/DBConcat.cpp
commons/DBReader.cpp
commons/DBWriter.cpp
Expand Down
32 changes: 32 additions & 0 deletions src/commons/ClusterSpecies.cpp
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This file looks incomplete

Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "ClusterSpecies.h"
void ClusterSpecies::addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId) {
clusterEntriesPerSpecies[speciesId][clusterRepId].push_back(proteinId);
// speciesClusterMemCounts[speciesId]++;
}


void ClusterSpecies::removeSpecies(unsigned int speciesId) {
clusterEntriesPerSpecies.erase(speciesId);
// speciesClusterMemCounts.erase(speciesId);
}

int ClusterSpecies::findRepSpecies(){
//find max entry in speciesClusterMemCounts
unsigned int maxSpeciesId = 0;
size_t maxSpeciesSize = 0;
for (auto const& species : clusterEntriesPerSpecies){
size_t speiciesSize = species.second.size();
if (speiciesSize > maxSpeciesSize){
maxSpeciesId = species.first;
}
}

return maxSpeciesId;
// for (auto const& x : speciesClusterMemCounts)
// {
// if (x.second > maxSpeciesCount){
// maxSpeciesCount = x.second;
// maxSpeciesId = x.first;
// }
// }
}
42 changes: 42 additions & 0 deletions src/commons/ClusterSpecies.h
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here.

Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#ifndef CLUSTER_SPECIES_H
#define CLUSTER_SPECIES_H

#include <vector>
#include <unordered_map>


class ClusterSpecies {
public:

// std::unordered_map<unsigned int, std::unordered_map<unsigned int, std::vector<unsigned int>>> clusterEntriesPerSpecies;
// // std::unordered_map<unsigned int, size_t> speciesClusterMemCounts;
// void addProtein(unsigned int speciesId, unsigned int clusterRepId, unsigned int proteinId);
// void removeSpecies(unsigned int speciesId);
// int findRepSpecies();

// template <typename T>
// struct ProteomeLength{
// T id;
// unsigned int length;
// };

// std::vector<ProteomeLength<unsigned int>> proteomeAAToatalLength;

// void addProteomeLength(unsigned int id, unsigned int length){
// ProteomeLength<unsigned int> pl;
// pl.id = id;
// pl.length = length;
// proteomeAAToatalLength.push_back(pl);
// }

// unsigned int getProteomeLength(unsigned int id){
// for (auto const& pl : proteomeAAToatalLength){
// if (pl.id == id){
// return pl.length;
// }
// }
// return 0;
// }
};

#endif //CLUSTER_SPECIES_H
73 changes: 73 additions & 0 deletions src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
totalDataSize = 0;
dataFileCnt = dataFileNames.size();

dataSizeOffset = new size_t[dataFileNames.size() + 1];
dataFiles = new char*[dataFileNames.size()];
for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
Expand All @@ -136,6 +137,8 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
}
if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) {
Debug(Debug::INFO) << "ReadLookup file: " << dataFileName << "\n"; //gyuri
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove this print out.


std::string lookupFilename = (std::string(dataFileName) + ".lookup");
MemoryMapped lookupData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
if (lookupData.isValid() == false) {
Expand All @@ -144,7 +147,9 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
char* lookupDataChar = (char *) lookupData.getData();
size_t lookupDataSize = lookupData.size();
Debug(Debug::INFO) << "Lookup Data size is " << lookupDataSize << "\n"; //gyuri
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove this print out.

lookupSize = Util::ompCountLines(lookupDataChar, lookupDataSize, threads);
Debug(Debug::INFO) << "Lookup size is " << lookupSize << "\n"; //gyuri
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove this print out.

lookup = new(std::nothrow) LookupEntry[this->lookupSize];
incrementMemory(sizeof(LookupEntry) * this->lookupSize);
readLookup(lookupDataChar, lookupDataSize, lookup);
Expand All @@ -155,6 +160,22 @@ template <typename T> bool DBReader<T>::open(int accessType){
}
lookupData.close();
}
if (dataMode & USE_SOURCE){
std::string sourceFilename = (std::string(dataFileName) + ".source");
MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
if (sourceData.isValid() == false) {
Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
EXIT(EXIT_FAILURE);
}
char* sourceDataChar = (char *) sourceData.getData();
size_t sourceDataSize = sourceData.size();
sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
source = new(std::nothrow) SourceEntry[this->sourceSize];
incrementMemory(sizeof(SourceEntry) * this->sourceSize);
readSource(sourceDataChar, sourceDataSize, source);

}

bool isSortedById = false;
if (externalData == false) {
MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
Expand Down Expand Up @@ -707,6 +728,15 @@ template <typename T> unsigned int DBReader<T>::getLookupFileNumber(size_t id){
return lookup[id].fileNumber;
}

template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
if (id >= sourceSize){
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
EXIT(EXIT_FAILURE);
}
return source[id].fileName;
}

template<>
void DBReader<unsigned int>::lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry) {
buffer.append(SSTR(entry.id));
Expand Down Expand Up @@ -995,6 +1025,19 @@ size_t DBReader<T>::getOffset(size_t id) {
return index[id].offset;
}

template<typename T>
size_t DBReader<T>::getIndexLen(size_t id) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have code to get the index length, called getSeqLen.

if (id >= size){
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n";
Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n";
EXIT(EXIT_FAILURE);
}
if (local2id != NULL) {
id = local2id[id];
}
return index[id].length;
}

template<typename T>
size_t DBReader<T>::findNextOffsetid(size_t id) {
size_t idOffset = getOffset(id);
Expand Down Expand Up @@ -1048,6 +1091,36 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
currPos = lookupData - (char *) data;

i++;


}
}

template <typename T>
void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
size_t i=0;
size_t currPos = 0;
char* sourceData = (char *) data;
const char * cols[3];
while (currPos < dataSize){
if (i >= this->sourceSize) {
Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
EXIT(EXIT_FAILURE);
}
Util::getFieldsOfLine(sourceData, cols, 3);
source[i].id = Util::fast_atoi<size_t>(cols[0]);
std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
size_t lastDotPosition = fileName.rfind('.');

if (lastDotPosition != std::string::npos) {
fileName = fileName.substr(0, lastDotPosition);
}
source[i].fileName = fileName;
sourceData = Util::skipLine(sourceData);

currPos = sourceData - (char *) data;

i++;
}
}

Expand Down
15 changes: 14 additions & 1 deletion src/commons/DBReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@ class DBReader : public MemoryTracker {
return false;
}
};

struct SourceEntry{
T id;
std::string fileName;
};
struct LookupEntry {
T id;
std::string entryName;
Expand Down Expand Up @@ -184,6 +187,8 @@ class DBReader : public MemoryTracker {

size_t getSize() const;

unsigned int getProteomeTotalLen(size_t id); //gyuri

unsigned int getMaxSeqLen(){
return (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE ) ) ?
(std::max(maxSeqLen, 1u)) / Sequence::PROFILE_READIN_SIZE :
Expand Down Expand Up @@ -248,6 +253,7 @@ class DBReader : public MemoryTracker {
void lookupEntryToBuffer(std::string& buffer, const LookupEntry& entry);
LookupEntry* getLookup() { return lookup; };

std::string getSourceFileName(size_t id);
static const int NOSORT = 0;
static const int SORT_BY_LENGTH = 1;
static const int LINEAR_ACCCESS = 2;
Expand All @@ -266,6 +272,7 @@ class DBReader : public MemoryTracker {
static const unsigned int USE_FREAD = 4;
static const unsigned int USE_LOOKUP = 8;
static const unsigned int USE_LOOKUP_REV = 16;
static const unsigned int USE_SOURCE = 32;


// compressed
Expand Down Expand Up @@ -309,6 +316,8 @@ class DBReader : public MemoryTracker {

void readLookup(char *data, size_t dataSize, LookupEntry *lookup);

void readSource(char *data, size_t dataSize, SourceEntry *source);

void readIndexId(T* id, char * line, const char** cols);

unsigned int indexIdToNum(T* id);
Expand Down Expand Up @@ -437,6 +446,8 @@ class DBReader : public MemoryTracker {

size_t findNextOffsetid(size_t id);

size_t getIndexLen(size_t id);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use getSeqLen


int isCompressed(){
return isCompressed(dbtype);
}
Expand Down Expand Up @@ -485,7 +496,9 @@ class DBReader : public MemoryTracker {

Index * index;
size_t lookupSize;
size_t sourceSize;
LookupEntry * lookup;
SourceEntry * source;
bool sortedByOffset;

unsigned int * id2local;
Expand Down
1 change: 1 addition & 0 deletions src/commons/DBWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ size_t DBWriter::writeAdd(const char* data, size_t dataSize, unsigned int thrIdx
}
size_t totalWriten = 0;
if(isCompressedDB && (state[thrIdx] == INIT_STATE || state[thrIdx] == COMPRESSED) ) {
Debug(Debug::INFO) << "Compressing data in thread " << thrIdx << "\n"; //gyuri
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would remove this print out.

state[thrIdx] = COMPRESSED;
// zstd seems to have a hard time with elements < 60
ZSTD_inBuffer input = { data, dataSize, 0 };
Expand Down
Loading