From ebaedb7e6a7ef2e979d1234efd9048118ba99f7f Mon Sep 17 00:00:00 2001 From: matchy233 Date: Wed, 2 Mar 2022 16:18:04 +0900 Subject: [PATCH 1/5] Fix createdb support for db input --- src/util/createdb.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index dca9dfb7b..eeaffaa63 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -98,9 +98,12 @@ int createdb(int argc, const char **argv, const Command& command) { size_t fileCount = filenames.size(); DBReader* reader = NULL; + DBReader* hdrReader = nullptr; if (dbInput == true) { reader = new DBReader(par.db1.c_str(), par.db1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX | DBReader::USE_LOOKUP); reader->open(DBReader::LINEAR_ACCCESS); + hdrReader = new DBReader((par.db1 + "_h").c_str(), (par.db1 + "_h.index").c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); + hdrReader->open(DBReader::LINEAR_ACCCESS); fileCount = reader->getSize(); } @@ -126,8 +129,11 @@ int createdb(int argc, const char **argv, const Command& command) { } KSeqWrapper* kseq = NULL; + std::string seq = ">"; if (dbInput == true) { - kseq = new KSeqBuffer(reader->getData(fileIdx, 0), reader->getEntryLen(fileIdx) - 1); + seq.append(hdrReader->getData(fileIdx, 0)); + seq.append(reader->getData(fileIdx, 0)); + kseq = new KSeqBuffer(seq.c_str(), seq.length()); } else { kseq = KSeqFactory(filenames[fileIdx].c_str()); } @@ -260,6 +266,8 @@ int createdb(int argc, const char **argv, const Command& command) { if (dbInput == true) { reader->close(); delete reader; + hdrReader->close(); + delete hdrReader; } if (entries_num == 0) { From ece8e5d5a872a786dbd5549d675011e2a503a746 Mon Sep 17 00:00:00 2001 From: matchy233 Date: Thu, 17 Mar 2022 00:11:13 +0900 Subject: [PATCH 2/5] Match coding convention --- src/util/createdb.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index 
eeaffaa63..7318d0168 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -98,11 +98,11 @@ int createdb(int argc, const char **argv, const Command& command) { size_t fileCount = filenames.size(); DBReader* reader = NULL; - DBReader* hdrReader = nullptr; + DBReader* hdrReader = NULL; if (dbInput == true) { reader = new DBReader(par.db1.c_str(), par.db1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX | DBReader::USE_LOOKUP); reader->open(DBReader::LINEAR_ACCCESS); - hdrReader = new DBReader((par.db1 + "_h").c_str(), (par.db1 + "_h.index").c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); + hdrReader = new DBReader(par.hdr1.c_str(), par.hdr1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); hdrReader->open(DBReader::LINEAR_ACCCESS); fileCount = reader->getSize(); } From e34f40d2cdb1c65fa6f3947ae22cb780b6d3a92e Mon Sep 17 00:00:00 2001 From: matchy233 Date: Thu, 17 Mar 2022 00:50:17 +0900 Subject: [PATCH 3/5] Update usage text with introduction for db input --- src/MMseqsBase.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index b610a806b..2cdd12d09 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -124,11 +124,13 @@ std::vector baseCommands = { "mmseqs createdb file1.fa file2.fa.gz file3.fa sequenceDB\n\n" "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" + "# Create a seqDB from another seqDB named inputDB\n" + "mmseqs createdb inputDB sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n", "Martin Steinegger ", - " ... | ", - CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, + " ... 
|| ", + CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin|inputDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, {"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN, NULL, From cdb5c8a43da521e42f6266ae1d3facadaf7bcb1b Mon Sep 17 00:00:00 2001 From: matchy233 Date: Thu, 17 Mar 2022 02:41:17 +0900 Subject: [PATCH 4/5] Add back support for generic db --- src/util/createdb.cpp | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index 7318d0168..6b7bf799b 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -28,12 +28,17 @@ int createdb(int argc, const char **argv, const Command& command) { } bool dbInput = false; + bool isSeqDb = false; if (FileUtil::fileExists(par.db1dbtype.c_str()) == true) { if (filenames.size() > 1) { Debug(Debug::ERROR) << "Only one database can be used with database input\n"; EXIT(EXIT_FAILURE); } dbInput = true; + // if par.hdr1 is not an empty string and the file exists + if (par.hdr1 != "" && FileUtil::fileExists(par.hdr1.c_str()) == true ) { + isSeqDb = true; + } par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; } @@ -102,8 +107,10 @@ int createdb(int argc, const char **argv, const Command& command) { if (dbInput == true) { reader = new DBReader(par.db1.c_str(), par.db1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX | DBReader::USE_LOOKUP); reader->open(DBReader::LINEAR_ACCCESS); - hdrReader = new DBReader(par.hdr1.c_str(), par.hdr1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); - hdrReader->open(DBReader::LINEAR_ACCCESS); + if (isSeqDb) { + hdrReader = new DBReader(par.hdr1.c_str(), par.hdr1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); + hdrReader->open(DBReader::LINEAR_ACCCESS); + } fileCount = reader->getSize(); } @@ -131,9 
+138,13 @@ int createdb(int argc, const char **argv, const Command& command) { KSeqWrapper* kseq = NULL; std::string seq = ">"; if (dbInput == true) { - seq.append(hdrReader->getData(fileIdx, 0)); - seq.append(reader->getData(fileIdx, 0)); - kseq = new KSeqBuffer(seq.c_str(), seq.length()); + if (isSeqDb) { + seq.append(hdrReader->getData(fileIdx, 0)); /* header entry goes first: ">" + header + sequence is the record handed to KSeqBuffer */ + seq.append(reader->getData(fileIdx, 0)); + kseq = new KSeqBuffer(seq.c_str(), seq.length()); + } else { + kseq = new KSeqBuffer(reader->getData(fileIdx, 0), reader->getEntryLen(fileIdx) - 1); + } } else { kseq = KSeqFactory(filenames[fileIdx].c_str()); } @@ -266,8 +277,10 @@ int createdb(int argc, const char **argv, const Command& command) { if (dbInput == true) { reader->close(); delete reader; - hdrReader->close(); - delete hdrReader; + if (isSeqDb) { + hdrReader->close(); + delete hdrReader; + } } if (entries_num == 0) { From f9eb35dbbcd33e08bcaedb8c6a33dae78524bfae Mon Sep 17 00:00:00 2001 From: matchy233 Date: Thu, 17 Mar 2022 02:42:43 +0900 Subject: [PATCH 5/5] Update usage text for createdb with more details Add explanation about generic dbs --- src/MMseqsBase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 2cdd12d09..2378621ba 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -124,7 +124,7 @@ std::vector baseCommands = { "mmseqs createdb file1.fa file2.fa.gz file3.fa sequenceDB\n\n" "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" - "# Create a seqDB from another seqDB named inputDB\n" + "# Create a seqDB from generic DB created by tar2db or another seqDB\n" "mmseqs createdb inputDB sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",