diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index b610a806b..2378621ba 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -124,11 +124,13 @@ std::vector baseCommands = { "mmseqs createdb file1.fa file2.fa.gz file3.fa sequenceDB\n\n" "# Create a seqDB from stdin\n" "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n" + "# Create a seqDB from generic DB created by tar2db or another seqDB\n" + "mmseqs createdb inputDB sequenceDB\n\n" "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n" "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n", "Martin Steinegger ", - " ... | ", - CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, + " ... || ", + CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin|inputDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric }, {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}}, {"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN, NULL, diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index dca9dfb7b..6b7bf799b 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -28,12 +28,17 @@ int createdb(int argc, const char **argv, const Command& command) { } bool dbInput = false; + bool isSeqDb = false; if (FileUtil::fileExists(par.db1dbtype.c_str()) == true) { if (filenames.size() > 1) { Debug(Debug::ERROR) << "Only one database can be used with database input\n"; EXIT(EXIT_FAILURE); } dbInput = true; + // if par.hdr1 is not an empty string and the file exists + if (par.hdr1 != "" && FileUtil::fileExists(par.hdr1.c_str()) == true ) { + isSeqDb = true; + } par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; } @@ -98,9 +103,14 @@ int createdb(int argc, const char **argv, const Command& command) { size_t fileCount = filenames.size(); DBReader* reader = NULL; + DBReader* hdrReader = NULL; if (dbInput == true) { reader = new DBReader(par.db1.c_str(), par.db1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX | DBReader::USE_LOOKUP); reader->open(DBReader::LINEAR_ACCCESS); + if (isSeqDb) { + hdrReader = new DBReader(par.hdr1.c_str(), par.hdr1Index.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); + hdrReader->open(DBReader::LINEAR_ACCCESS); + } fileCount = reader->getSize(); } @@ -126,8 +136,15 @@ int createdb(int argc, const char **argv, const Command& command) { } KSeqWrapper* kseq = NULL; + std::string seq = ">"; if (dbInput == true) { - kseq = new KSeqBuffer(reader->getData(fileIdx, 0), reader->getEntryLen(fileIdx) - 1); + if (isSeqDb) { + seq.append(reader->getData(fileIdx, 0)); + seq.append(hdrReader->getData(fileIdx, 0)); + kseq = new KSeqBuffer(seq.c_str(), seq.length()); + } else { + kseq = new KSeqBuffer(reader->getData(fileIdx, 0), reader->getEntryLen(fileIdx) - 1); + } } else { kseq = KSeqFactory(filenames[fileIdx].c_str()); } @@ -260,6 +277,10 @@ int createdb(int argc, const char **argv, const Command& command) { if (dbInput == true) { reader->close(); delete reader; + if (isSeqDb) { + hdrReader->close(); + delete hdrReader; + } } if (entries_num == 0) {