Skip to content

Commit

Permalink
Support tax id conversion table at file level.
Browse files Browse the repository at this point in the history
  • Loading branch information
mourisl committed Jun 8, 2024
1 parent 66fe1af commit 8226c10
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 27 deletions.
33 changes: 24 additions & 9 deletions Builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,12 @@ class Builder
_fmIndex.SetSequenceExtraParameter((void *)b) ;
}

void Build(ReadFiles &refGenomeFile, char *taxonomyFile, char *nameTable, char *conversionTable, uint64_t subsetTax, size_t memoryConstraint, struct _FMBuilderParam &fmBuilderParam, const char *alphabetList)
void Build(ReadFiles &refGenomeFile, char *taxonomyFile, char *nameTable, char *conversionTable, bool conversionTableAtFileLevel, uint64_t subsetTax, size_t memoryConstraint, struct _FMBuilderParam &fmBuilderParam, const char *alphabetList)
{
size_t i ;
const int alphabetSize = strlen(alphabetList) ;

_taxonomy.Init(taxonomyFile, nameTable, conversionTable) ;
_taxonomy.Init(taxonomyFile, nameTable, conversionTable, conversionTableAtFileLevel) ;

FixedSizeElemArray genomes ;
SequenceCompactor seqCompactor ;
Expand All @@ -77,7 +77,16 @@ class Builder
std::vector<size_t> genomeLens ;
while (refGenomeFile.Next())
{
size_t seqid = _taxonomy.SeqNameToId(refGenomeFile.id) ;
size_t seqid = 0 ;
char fileNameBuffer[1024] ;
if (conversionTableAtFileLevel)
{
Utils::GetFileBaseName(refGenomeFile.GetFileName( refGenomeFile.GetCurrentFileInd() ).c_str(),
"fna|fa|fasta|faa", fileNameBuffer) ;
seqid = _taxonomy.SeqNameToId(fileNameBuffer) ;
}
else
seqid = _taxonomy.SeqNameToId(refGenomeFile.id) ;

if (subsetTax != 0)
{
Expand All @@ -88,8 +97,9 @@ class Builder

if (seqid >= _taxonomy.GetSeqCount())
{
fprintf(stderr, "WARNING: taxonomy id doesn't exist for %s!\n", refGenomeFile.id) ;
seqid = _taxonomy.AddExtraSeqName(refGenomeFile.id) ;
fprintf(stderr, "WARNING: taxonomy id doesn't exist for %s!\n",
conversionTableAtFileLevel ? fileNameBuffer : refGenomeFile.id) ;
seqid = _taxonomy.AddExtraSeqName(conversionTableAtFileLevel ? fileNameBuffer : refGenomeFile.id) ;
}

size_t len = seqCompactor.Compact(refGenomeFile.seq, genomes) ;
Expand All @@ -100,10 +110,15 @@ class Builder
genomes.SetSize(size - len) ;
continue ;
}

_seqLength[seqid] = len ;
genomeSeqIds.push_back(seqid) ;
genomeLens.push_back(len) ;

if (_seqLength.find(seqid) == _seqLength.end()) // Assume there is no duplicated seqid
{
_seqLength[seqid] = len ;
genomeSeqIds.push_back(seqid) ;
genomeLens.push_back(len) ;
}
else
_seqLength[seqid] += len ;
}

FixedSizeElemArray BWT ;
Expand Down
50 changes: 42 additions & 8 deletions CentrifugerBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ char usage[] = "./centrifuger-build [OPTIONS]:\n"
"\t-l FILE: list of reference sequence file stored in <file>, one file per row\n"
"\t--taxonomy-tree FILE: taxonomy tree, i.e., nodes.dmp file\n"
"\t--name-table FILE: name table, i.e., names.dmp file\n"
"\t--conversion-table FILE: seqID to taxID conversion file\n"
"Optional:\n"
"\t--conversion-table FILE: seqID to taxID conversion file\n"
"\t\tWhen not set, expect -l option and the -l file should have two columns as \"file taxID\"\n"
"\t-o STRING: output prefix [centrifuger]\n"
"\t-t INT: number of threads [1]\n"
"\t--build-mem STR: automatic infer bmax and dcv to match memory constraints, can use T,G,M,K to specify the memory size [not used]\n"
Expand Down Expand Up @@ -46,6 +47,7 @@ int main(int argc, char *argv[])
fprintf( stderr, "%s", usage ) ;
return 0 ;
}
int i ;
int c, option_index ;
option_index = 0 ;
char outputPrefix[1024] = "centrifuger" ;
Expand All @@ -55,6 +57,9 @@ int main(int argc, char *argv[])
uint64_t subsetTax = 0 ;
size_t buildMemoryConstraint = 0 ;
ReadFiles refGenomeFile ;
char *fileList = NULL ; // the file corresponds to "-l" option
int fileListColumnCnt = 0 ;
bool conversionTableAtFileLevel = false ;

Builder builder ;

Expand All @@ -74,12 +79,29 @@ int main(int argc, char *argv[])
}
else if (c == 'l')
{
char *fileName = (char *)malloc(sizeof(char) * 4096) ;
fileList = strdup(optarg) ;

const int bufferSize = 4096 ;
char *lineBuffer = (char *)malloc(sizeof(char) * bufferSize) ;
char *fileName = (char *)malloc(sizeof(char) * bufferSize) ;
FILE *fpList = fopen(optarg, "r") ;
while (fscanf(fpList, "%s", fileName) != EOF)
while (fgets(lineBuffer, bufferSize, fpList) != NULL)
{
sscanf(lineBuffer, "%s", fileName) ;
refGenomeFile.AddReadFile(fileName, false) ;

if (fileListColumnCnt == 0) // Find how many columns in the file
{
fileListColumnCnt = 1 ;
for (i = 0 ; lineBuffer[i] && lineBuffer[i] != '\n' ; ++i)
if (lineBuffer[i] == ' ' || lineBuffer[i] == '\t')
++fileListColumnCnt ;
}
}
fclose(fpList) ;

free(fileName) ;
free(lineBuffer) ;
}
else if (c == 'o')
{
Expand Down Expand Up @@ -144,20 +166,32 @@ int main(int argc, char *argv[])
}
if (!conversionTable)
{
fprintf(stderr, "Need to use --conversion-table to specify sequence id to taxonomy id mapping.\n") ;
return EXIT_FAILURE ;
// Check whether the "-l" is right
if (fileList == NULL || fileListColumnCnt < 2)
{
fprintf(stderr, "Should use two-column file to specify the file name to taxonomy id mapping through the \"-l\" option. Otherwise, need to use --conversion-table to specify sequence id to taxonomy id mapping.\n") ;
return EXIT_FAILURE ;
}
else
{
conversionTableAtFileLevel = true ;
}
}

const char alphabetList[] = "ACGT" ;

Utils::PrintLog("Start to read in the genome files.") ;
builder.Build(refGenomeFile, taxonomyFile, nameTable, conversionTable, subsetTax, buildMemoryConstraint, fmBuilderParam, alphabetList) ;
builder.Build(refGenomeFile, taxonomyFile, nameTable,
conversionTableAtFileLevel ? fileList : conversionTable, conversionTableAtFileLevel,
subsetTax, buildMemoryConstraint, fmBuilderParam, alphabetList) ;
builder.Save(outputPrefix) ;

free(taxonomyFile) ;
free(nameTable) ;
free(conversionTable) ;

if (conversionTable)
free(conversionTable) ;
if (fileList)
free(fileList) ;
Utils::PrintLog("Done.") ;

return 0 ;
Expand Down
20 changes: 15 additions & 5 deletions ReadFiles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,19 @@ class ReadFiles

bool opened ;

void GetFileName( char *in, char *out )
void GetFileBaseName(const char *in, char *out )
{
int i, j ;
int i, j, k ;
int len = (int)strlen( in ) ;
for ( i = len ; i >= 0 && in[i] != '.' && in[i] != '/' ; --i )
;
for ( j = len ; j >= 0 && in[j] != '/' ; --j )
;
if ( i >= 0 && in[i] == '.' )
{
in[i] = '\0' ;
strcpy( out, in + j + 1 ) ;
in[i] = '.' ;
for (k = j + 1 ; k < i ; ++k)
out[k - (j + 1)] = in[k] ;
out[k - (j + 1)] = '\0' ;
}
else
{
Expand Down Expand Up @@ -310,6 +310,16 @@ class ReadFiles
}
}

int GetCurrentFileInd()
{
return currentFpInd ;
}

std::string GetFileName(int fileInd)
{
return fileNames[fileInd] ;
}

int GetFileCount()
{
return fileCnt ;
Expand Down
16 changes: 12 additions & 4 deletions Taxonomy.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "compactds/SimpleVector.hpp"
#include "compactds/Utils.hpp"

using namespace compactds ;

enum
{
RANK_UNKNOWN = 0,
Expand Down Expand Up @@ -259,7 +261,7 @@ class Taxonomy
}
}

void ReadSeqNameFile(std::string fname)
void ReadSeqNameFile(std::string fname, bool conversionTableAtFileLevel)
{
std::ifstream seqmap_file(fname.c_str(), std::ios::in);
std::map<std::string, uint64_t> rawSeqNameMap ;
Expand All @@ -273,6 +275,12 @@ class Taxonomy
uint64_t tid;
std::string seqIdStr;
cline >> seqIdStr >> tid ;
if (conversionTableAtFileLevel)
{
char buffer[1024] ;
Utils::GetFileBaseName(seqIdStr.c_str(), "fna|fa|fasta|faa", buffer) ;
seqIdStr = buffer ;
}
_seqStrNameMap.Add(seqIdStr) ;
rawSeqNameMap[seqIdStr] = tid ;
}
Expand All @@ -291,7 +299,7 @@ class Taxonomy
}
_seqCnt = _seqStrNameMap.GetSize() ;
}

void SaveString(FILE *fp, std::string &s)
{
size_t len = s.length() ;
Expand Down Expand Up @@ -350,13 +358,13 @@ class Taxonomy
}
}

void Init(const char *nodesFile, const char *namesFile, const char *seqIdFile)
void Init(const char *nodesFile, const char *namesFile, const char *seqIdFile, bool conversionTableAtFileLevel)
{
std::map<uint64_t, int> presentTax;
ReadPresentTaxonomyLeafs(std::string(seqIdFile), presentTax) ;
ReadTaxonomyTree(std::string(nodesFile), presentTax) ;
ReadTaxonomyName(std::string(namesFile), presentTax) ;
ReadSeqNameFile(std::string(seqIdFile)) ;
ReadSeqNameFile(std::string(seqIdFile), conversionTableAtFileLevel) ;

_rootCTaxId = FindRoot() ;
}
Expand Down
63 changes: 62 additions & 1 deletion compactds/Utils.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#ifndef _MOURISL_COMPACTDS_UTILS
#define _MOURISL_COMPACTDS_UTILS

#include <fstream>

#include <stdint.h>
#include <stdlib.h>
Expand Down Expand Up @@ -280,6 +279,68 @@ class Utils
return ret ;
}

// Deprive the path and extension from the file name
// Further go one more extension if there is an extra extension
// matching the given extraExtension string
// Can use "|" to add multiple potential extra extensions, like "fa|fna|faa"
// return: length of the base name
static int GetFileBaseName(const char *s, const char *extraExtension, char *result)
{
int i, j, k ;
int len = strlen(s) ;
int start, end ;

start = 0 ;
for (i = 0 ; i < len ; ++i)
if (s[i] == '/')
start = i + 1 ;

for (j = len - 1 ; j >= start ; --j )
{
if (s[j] == '.')
break ;
}

if (j < start)
j = len - 1 ;
else
--j ;
end = j ;
if (extraExtension != NULL)
{
for (i = 0 ; i < extraExtension[i] ; )
{
for (j = i + 1 ; extraExtension[j] && extraExtension[j] != '|' ; ++j )
;
// [i..j) corresponds to one extension in the |-separated extensions
bool flag = false ;
for (k = j - 1 ; k >= i ; --k)
if (end - (j - 1 - k ) < start || extraExtension[k] != s[end - (j - 1 - k)])
{
flag = true ;
break ;
}
if (end - (j - 1 - k ) < start || s[end - (j - 1 - k)] != '.')
flag = true ;

if (!flag)
{
end = end - (j - 1 - k) - 1 ; // extra minus 1 to get rid of the ".".
break ;
}

if (extraExtension[j] == '\0')
break ;
i = j + 1 ;
}
}

for (k = start ; k <= end ; ++k)
result[k - start] = s[k] ;
result[k - start] = '\0' ;
return end - start + 1 ;
}

static void PrintLog( const char *fmt, ... )
{
va_list args ;
Expand Down

0 comments on commit 8226c10

Please sign in to comment.