Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

alignproteome added #875

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1cb2074
alignproteome added
Gyuuul2 Aug 16, 2024
12c0c81
add workflow, revise based on feedback
Gyuuul2 Aug 19, 2024
df80f07
Change createdb.cpp so that it takes in ".txt" file containing paths …
elpis51613 Aug 25, 2024
1073ec9
Merge remote-tracking branch 'refs/remotes/origin/master'
elpis51613 Aug 25, 2024
3617ea7
Change filepath format from txt to tsv. Also add explanations of its …
elpis51613 Aug 26, 2024
3fc7fcd
createtsv - handle source identifier
Gyuuul2 Aug 27, 2024
0487a1f
proteomcluster - change writer / way to select repProteome
Gyuuul2 Aug 27, 2024
3f4fc91
easy proteomecluster workflow
Gyuuul2 Aug 27, 2024
eeabad3
indexreader - handle source identifier
Gyuuul2 Aug 27, 2024
9996821
update mmseqsbase and change module name into proteomecluster
Gyuuul2 Aug 27, 2024
27d9f68
set alignment mode default
Gyuuul2 Aug 28, 2024
e2ef229
latest
Gyuuul2 Aug 28, 2024
fd1f5d6
original index
Gyuuul2 Aug 28, 2024
0d7db0f
use sync_fetch_and_add when increment memcount
Gyuuul2 Aug 28, 2024
1fe489e
change submat
Gyuuul2 Aug 28, 2024
86f6b15
add timer
Gyuuul2 Aug 28, 2024
8344e5b
Merge branch 'pr-879'
Gyuuul2 Aug 28, 2024
be5a53d
Remove __packed__ to resolve sync_fetch_and_add error
Gyuuul2 Aug 28, 2024
8d3a38f
change chmod in createdb
Gyuuul2 Aug 29, 2024
bfa1911
update proteome-similarity threshold parameter
Gyuuul2 Aug 29, 2024
7e912d2
Sort redundant proteomes by similarity score in proteome_cluster.tsv
Gyuuul2 Sep 8, 2024
618e2bf
delete redundant code - checking proteome is singleton(no redudant me…
Gyuuul2 Sep 9, 2024
e990773
stop
Gyuuul2 Sep 9, 2024
e9ffe22
jaccardindexTrial
Gyuuul2 Sep 8, 2024
d14217f
add normalized scoring
Gyuuul2 Sep 9, 2024
5e6ee6d
add proteome relative(normalized) similarity threshold parameter
Gyuuul2 Sep 9, 2024
743370d
relative similarity threshold
Gyuuul2 Sep 9, 2024
44a6fca
latest version-bugfix,relativeProteomeSimilarity added, singleton pro…
Gyuuul2 Sep 9, 2024
910bb7f
Change Relative Score formula
Gyuuul2 Sep 10, 2024
e24b9e3
Remove Debug printout
Gyuuul2 Sep 10, 2024
ad1acdf
style change
Gyuuul2 Sep 10, 2024
e026260
Latest Version
Gyuuul2 Sep 10, 2024
d9213ef
Add cluster count report
Gyuuul2 Sep 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions data/workflow/easyproteomecluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/sh -e
fail() {
echo "Error: $1"
exit 1
}

notExists() {
[ ! -f "$1" ]
}


if notExists "${TMP_PATH}/input.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
|| fail "query createdb died"
fi

if notExists "${TMP_PATH}/clu.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" linclust "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
|| fail "Search died"
fi

if notExists "${TMP_PATH}/aln.dbtype"; then
# shellcheck disable=SC2086
"$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_protein" "${TMP_PATH}/aln_proteome" ${PROTEOMECLUSTER_PAR} \
|| fail "Convert Alignments died"
fi

if notExists "${RESULTS}_protein_cluster.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
|| fail "createtsv protein cluster died"
fi

if notExists "${RESULTS}_protein_align.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
|| fail "createtsv protein align died"
fi

if notExists "${RESULTS}_proteome_cluster.tsv"; then
# shellcheck disable=SC2086
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
|| fail "createtsv proteome cluster died"
fi


if [ -n "${REMOVE_TMP}" ]; then
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
rm -rf "${TMP_PATH}/clu_tmp"
rm -f "${TMP_PATH}/easyproteomecluster.sh"
fi
5 changes: 4 additions & 1 deletion src/commons/IndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class IndexReader {
) : sequenceReader(NULL), index(NULL) {
int targetDbtype = FileUtil::parseDbType(dataName.c_str());
if (Parameters::isEqualDbtype(targetDbtype, Parameters::DBTYPE_INDEX_DB)) {
index = new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happened here?

new DBReader<unsigned int>(dataName.c_str(), (dataName + ".index").c_str(), 1, dataMode);
index->open(DBReader<unsigned int>::NOSORT);
if (PrefilteringIndexReader::checkIfIndexFile(index)) {
PrefilteringIndexReader::printSummary(index);
Expand Down Expand Up @@ -95,6 +95,8 @@ class IndexReader {
}else{
failSuffix = "_aln";
}
} else if (databaseType & SOURCE) {
failSuffix = "";
}
}
sequenceReader = new DBReader<unsigned int>(
Expand All @@ -115,6 +117,7 @@ class IndexReader {
static const unsigned int SRC_HEADERS = 4;
static const unsigned int SRC_SEQUENCES = 8;
static const unsigned int ALIGNMENTS = 16;
static const unsigned int SOURCE = 32;
static const unsigned int USER_SELECT = 1 << 31;

static unsigned int makeUserDatabaseType(unsigned int baseKey) {
Expand Down
52 changes: 36 additions & 16 deletions src/util/createtsv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,34 @@
int createtsv(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
const bool hasTargetDB = par.filenames.size() > 3;
DBReader<unsigned int> *reader;
if (hasTargetDB) {
reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
} else {
reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
}
reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);
const bool useSourceIdentifier = DBReader<unsigned int>::getExtendedDbtype(reader->getDbtype()) & Parameters::DBTYPE_EXTENDED_SRC_IDENTIFIER;

bool queryNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db1.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
bool targetNucs = Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_NUCLEOTIDES);
const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
int queryHeaderType = (queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
int queryHeaderType = (useSourceIdentifier) ? IndexReader::SOURCE :
(queryNucs) ? IndexReader::SRC_HEADERS : IndexReader::HEADERS;
queryHeaderType = (par.idxSeqSrc == 0) ? queryHeaderType : (par.idxSeqSrc == 1) ? IndexReader::HEADERS : IndexReader::SRC_HEADERS;
IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
unsigned int preloadMode = (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0;
unsigned int dataMode = (useSourceIdentifier)
? (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_SOURCE)
: (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, preloadMode, dataMode);

// IndexReader qDbrHeader(par.db1, par.threads, queryHeaderType, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
IndexReader * tDbrHeader=NULL;
DBReader<unsigned int> * queryDB = qDbrHeader.sequenceReader;
DBReader<unsigned int> * targetDB = NULL;
bool sameDB = (par.db2.compare(par.db1) == 0);
const bool hasTargetDB = par.filenames.size() > 3;

DBReader<unsigned int>::Index * qHeaderIndex = qDbrHeader.sequenceReader->getIndex();
DBReader<unsigned int>::Index * tHeaderIndex = NULL;

Expand All @@ -49,23 +65,13 @@ int createtsv(int argc, const char **argv, const Command &command) {
}
}

DBReader<unsigned int> *reader;
if (hasTargetDB) {

reader = new DBReader<unsigned int>(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
} else {

reader = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
}
reader->open(DBReader<unsigned int>::LINEAR_ACCCESS);

const std::string& dataFile = hasTargetDB ? par.db4 : par.db3;
const std::string& indexFile = hasTargetDB ? par.db4Index : par.db3Index;
const bool shouldCompress = par.dbOut == true && par.compressed == true;
const int dbType = par.dbOut == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE;
DBWriter writer(dataFile.c_str(), indexFile.c_str(), par.threads, shouldCompress, dbType);
writer.open();

const size_t targetColumn = (par.targetTsvColumn == 0) ? SIZE_T_MAX : par.targetTsvColumn - 1;
#pragma omp parallel
{
Expand All @@ -84,6 +90,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
for (size_t i = 0; i < reader->getSize(); ++i) {
unsigned int queryKey = reader->getDbKey(i);
size_t queryIndex = queryDB->getId(queryKey);
size_t querySourceIndex = SIZE_T_MAX;
if (useSourceIdentifier){
querySourceIndex = queryDB->getSourceKey(queryKey);
}

char *headerData = queryDB->getData(queryIndex, thread_idx);
if (headerData == NULL) {
Expand All @@ -92,7 +102,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
}

std::string queryHeader;
if (par.fullHeader) {
if (useSourceIdentifier){
queryHeader = queryDB->getSourceFileName(querySourceIndex);
}
else if (par.fullHeader) {
queryHeader = "\"";
queryHeader.append(headerData, qHeaderIndex[queryIndex].length - 2);
queryHeader.append("\"");
Expand All @@ -118,12 +131,19 @@ int createtsv(int argc, const char **argv, const Command &command) {
} else if (hasTargetDB) {
unsigned int targetKey = (unsigned int) strtoul(dbKey, NULL, 10);
size_t targetIndex = targetDB->getId(targetKey);
size_t targetSourceIdex = SIZE_T_MAX;
if (useSourceIdentifier){
targetSourceIdex = targetDB->getSourceKey(targetKey);
}
char *targetData = targetDB->getData(targetIndex, thread_idx);
if (targetData == NULL) {
Debug(Debug::WARNING) << "Invalid header entry in query " << queryKey << " and target " << targetKey << "!\n";
continue;
}
if (par.fullHeader) {
if (useSourceIdentifier){
targetAccession = targetDB->getSourceFileName(targetSourceIdex);
}
else if (par.fullHeader) {
targetAccession = "\"";
targetAccession.append(targetData, tHeaderIndex[targetIndex].length - 2);
targetAccession.append("\"");
Expand Down
Loading
Loading