Skip to content

Commit

Permalink
Add Blast format and FastA Huge comment correction
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrillet committed May 23, 2014
1 parent bd05af3 commit 473f769
Show file tree
Hide file tree
Showing 45 changed files with 3,396 additions and 73 deletions.
16 changes: 7 additions & 9 deletions .cproject
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?>

<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="cdt.managedbuild.config.gnu.lib.debug.1749414343">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.lib.debug.1749414343" moduleId="org.eclipse.cdt.core.settings" name="Debug">
Expand All @@ -13,11 +11,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
Expand All @@ -37,8 +35,8 @@
<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1544253014" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.default" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1304733242" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/PlastLibrary/src}&quot;"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/linux"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java/include"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java/include/linux"/>
</option>
<option id="gnu.cpp.compiler.option.debugging.gprof.1253556430" name="Generate gprof information (-pg)" superClass="gnu.cpp.compiler.option.debugging.gprof" value="true" valueType="boolean"/>
<option id="gnu.cpp.compiler.option.preprocessor.def.1220974519" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
Expand Down Expand Up @@ -97,11 +95,11 @@
</externalSetting>
</externalSettings>
<extensions>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.CWDLocator" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.ELF" point="org.eclipse.cdt.core.BinaryParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
Expand All @@ -116,8 +114,8 @@
<option id="gnu.cpp.compiler.lib.release.option.debugging.level.534390027" name="Debug Level" superClass="gnu.cpp.compiler.lib.release.option.debugging.level" value="gnu.cpp.compiler.debugging.level.none" valueType="enumerated"/>
<option id="gnu.cpp.compiler.option.include.paths.1267507508" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/PlastLibrary/src}&quot;"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.0.x86_64/include/linux"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java/include"/>
<listOptionValue builtIn="false" value="/usr/lib/jvm/java/include/linux"/>
</option>
<option id="gnu.cpp.compiler.option.preprocessor.def.650156252" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" valueType="definedSymbols">
<listOptionValue builtIn="false" value="__LINUX__=1"/>
Expand Down
13 changes: 12 additions & 1 deletion src/algo/core/api/IAlgoConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ class IConfiguration : public dp::SmartPointer
*/
virtual IParameters* createDefaultParameters (const std::string& algoName) = 0;

/** Create a quick reader, i.e. an object able to quickly parse the different kinds of database files.
* It can open the FASTA format (.fa) and the BLAST format (.pin for protein or .nin for nucleotide).
* It can also open the alias files (.pal for protein and .nal for nucleotide).
* The quick reader opens the file and extracts some information that is used to split the reading of the database.
* \param[in] uri : uri path of the database
* \param[in] shouldInferType : tells whether we should try to find the kind of genomic database we read (used only for the FASTA format)
* \return a new IDatabaseQuickReader instance
*/
virtual database::IDatabaseQuickReader* createDefaultQuickReader (const std::string& uri, bool shouldInferType) = 0;

/** Create a command dispatcher instance. Such an instance can be used for parallelization (hits
* iteration for instance, see IAlgorithm class).
* \return a new ICommandDispatcher instance
Expand All @@ -104,9 +114,10 @@ class IConfiguration : public dp::SmartPointer
virtual os::impl::TimeInfo* createTimeInfo () = 0;

/** Create a factory that builds ISequenceIterator objects.
* \param[in] uri : uri path used to select the sequence iterator factory depending on the file type
* \return the factory instance.
*/
virtual database::ISequenceIteratorFactory* createSequenceIteratorFactory () = 0;
virtual database::ISequenceIteratorFactory* createSequenceIteratorFactory (const std::string& uri) = 0;

/** Create a database object (with means for retrieving sequence within the database) from an uri (likely
* a local file, but it should be a location on a remote computer). A Range can be provided for using only
Expand Down
15 changes: 15 additions & 0 deletions src/algo/core/impl/AbstractAlgorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ static const char* keyIter = "iteration";
static const char* keyOutput = "output";
static const char* keyAlgorithm = "algorithm";

/*static u_int64_t checksum = 0;
static u_int64_t nbDataSeq = 0;*/

/********************************************************************************/

/** Command that launch 'iterate' method on a IHitIterator instance. Using a Command allows to
Expand Down Expand Up @@ -294,6 +297,17 @@ void AbstractAlgorithm::execute (void)
/** Shortcuts. */
ISequenceDatabase* subjectDb = subjectDbIt->currentItem();

/*ISequence seq;
for (u_int64_t nbSeq=0;nbSeq<subjectDb->getSequencesNumber();nbSeq++)
{
subjectDb->getSequenceByIndex(nbSeq,seq);
for (u_int64_t nData=0;nData<seq.data.letters.size;nData++)
{
checksum = (checksum + seq.data.letters.data[nData]) % (1<<16);
nbDataSeq++;
}
}*/

DEBUG (("AbstractAlgorithm::execute : subjectSeqNb=%ld querySeqNb=%ld\n",
subjectDb->getSequencesNumber(),
queryDb->getSequencesNumber()
Expand Down Expand Up @@ -363,6 +377,7 @@ void AbstractAlgorithm::execute (void)
} /* end of for (subjectDbIt.first(); ... */

} /* end of for (queryDbIt.first(); ... */
//printf ("checksum=%ld nbData=%ld\n", checksum, nbDataSeq);
}

/*********************************************************************
Expand Down
46 changes: 43 additions & 3 deletions src/algo/core/impl/DefaultAlgoConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@

#include <database/impl/BufferedSequenceDatabase.hpp>
#include <database/impl/FastaSequenceIterator.hpp>
#include <database/impl/FastaDatabaseQuickReader.hpp>
#include <database/impl/BlastdbSequenceIterator.hpp>
#include <database/impl/BlastdbDatabaseQuickReader.hpp>
#include <database/impl/DatabaseUtility.hpp>

#include <seed/impl/BasicSeedModel.hpp>
#include <seed/impl/SubSeedModel.hpp>
Expand Down Expand Up @@ -136,6 +140,31 @@ DefaultConfiguration::~DefaultConfiguration ()
setProperties (0);
}

/*********************************************************************
** METHOD  : DefaultConfiguration::createDefaultQuickReader
** PURPOSE : build the quick reader matching the kind of database designated by an uri
** INPUT   : uri, shouldInferType (both are forwarded to the constructor of the reader)
** OUTPUT  :
** RETURN  : a new BlastdbDatabaseQuickReader when the uri is looked up as one of the
**           BLAST_PIN / BLAST_NIN / BLAST_PAL / BLAST_NAL types,
**           a new FastaDatabaseQuickReader in every other case
** REMARKS : the type lookup is skipped for the uri "foo"
**           NOTE(review): "foo" looks like a placeholder name - confirm with the callers
*********************************************************************/
database::IDatabaseQuickReader* DefaultConfiguration::createDefaultQuickReader (const std::string& uri, bool shouldInferType)
{
    /** The uri "foo" never goes through the type lookup: it always gets the FASTA reader. */
    if (uri == "foo")
    {
        return new FastaDatabaseQuickReader (uri, shouldInferType);
    }

    /** We ask which kind of database the uri designates. */
    const DatabaseLookupType::QuickReaderType_e kind = DatabaseLookupType::quickReaderType (uri);

    /** Index files (pin/nin) and alias files (pal/nal) are all handled by the BLAST reader. */
    const bool isBlastBank =
           (kind == DatabaseLookupType::ENUM_BLAST_PIN)
        || (kind == DatabaseLookupType::ENUM_BLAST_NIN)
        || (kind == DatabaseLookupType::ENUM_BLAST_PAL)
        || (kind == DatabaseLookupType::ENUM_BLAST_NAL);

    if (isBlastBank)
    {
        return new BlastdbDatabaseQuickReader (uri, shouldInferType);
    }

    /** Any other type falls back to the FASTA reader. */
    return new FastaDatabaseQuickReader (uri, shouldInferType);
}

/*********************************************************************
** METHOD :
** PURPOSE :
Expand Down Expand Up @@ -355,9 +384,18 @@ os::impl::TimeInfo* DefaultConfiguration::createTimeInfo ()
** RETURN :
** REMARKS :
*********************************************************************/
database::ISequenceIteratorFactory* DefaultConfiguration::createSequenceIteratorFactory ()
database::ISequenceIteratorFactory* DefaultConfiguration::createSequenceIteratorFactory (const string& uri)
{
return new FastaSequenceIteratorFactory ();
DatabaseLookupType::QuickReaderType_e databaseType = DatabaseLookupType::ENUM_TYPE_UNKNOWN;

databaseType = DatabaseLookupType::quickReaderType(uri);
if ((databaseType==DatabaseLookupType::ENUM_BLAST_PIN)||(databaseType==DatabaseLookupType::ENUM_BLAST_NIN)
||(databaseType==DatabaseLookupType::ENUM_BLAST_PAL)||(databaseType==DatabaseLookupType::ENUM_BLAST_NAL))
{
return new BlastdbSequenceIteratorFactory ();
}
else
return new FastaSequenceIteratorFactory();
}

/*********************************************************************
Expand All @@ -376,11 +414,13 @@ ISequenceDatabase* DefaultConfiguration::createDatabase (
)
{
LOCAL (sequenceIteratorFactory);
ISequenceIteratorFactory* tempFactory = createSequenceIteratorFactory(uri);
LOCAL (tempFactory);

/** We create the sequence iterator. */
ISequenceIterator* seqIterator = sequenceIteratorFactory ?
sequenceIteratorFactory->createSequenceIterator (uri, range) :
new FastaSequenceIterator (uri.c_str(), 2*1024, range.begin, range.end);
tempFactory->createSequenceIterator (uri, range);

/** We create the database. */
return new BufferedSequenceDatabase (seqIterator, filtering);
Expand Down
5 changes: 4 additions & 1 deletion src/algo/core/impl/DefaultAlgoConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class DefaultConfiguration : public IConfiguration
/** Destructor. */
virtual ~DefaultConfiguration ();

/** \copydoc IConfiguration::createDefaultQuickReader */
database::IDatabaseQuickReader* createDefaultQuickReader (const std::string& uri, bool shouldInferType);

/** \copydoc IConfiguration::createDefaultParameters */
IParameters* createDefaultParameters (const std::string& algoName);

Expand All @@ -72,7 +75,7 @@ class DefaultConfiguration : public IConfiguration
os::impl::TimeInfo* createTimeInfo ();

/** \copydoc IConfiguration::createSequenceIteratorFactory */
database::ISequenceIteratorFactory* createSequenceIteratorFactory ();
database::ISequenceIteratorFactory* createSequenceIteratorFactory (const std::string& uri);

/** \copydoc IConfiguration::createDatabase */
database::ISequenceDatabase* createDatabase (
Expand Down
3 changes: 2 additions & 1 deletion src/algo/core/impl/DefaultAlgoEnvironment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,8 @@ void DefaultEnvironment::setSubjectBank (dp::IProperties* properties, u_int64_t
IProperty* subjectUriProp = properties->getProperty (STR_OPTION_SUBJECT_URI);

/** We create the quick reader instance for the subject bank. */
setQuickSubjectDbReader (new FastaDatabaseQuickReader (subjectUriProp->value, true));
setQuickSubjectDbReader (_config->createDefaultQuickReader(subjectUriProp->value, true));
subjectUriProp->value = _quickSubjectDbReader->getUri();

/** We check whether it is an 'info' file or a fasta file. */
bool isInfoFile = FastaDatabaseQuickReader::isQuickReaderFile (subjectUriProp->value);
Expand Down
4 changes: 3 additions & 1 deletion src/algo/core/impl/PlastnAlgoConfig.cpp
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,13 @@ ISequenceDatabase* PlastnConfiguration::createDatabase (
));

LOCAL (sequenceIteratorFactory);
ISequenceIteratorFactory* tempFactory = createSequenceIteratorFactory(uri);
LOCAL (tempFactory);

/** We create the sequence iterator. */
ISequenceIterator* seqIterator = sequenceIteratorFactory ?
sequenceIteratorFactory->createSequenceIterator (uri, range) :
new FastaSequenceIterator (uri.c_str(), 2*1024, range.begin, range.end);
tempFactory->createSequenceIterator (uri, range);

return new BufferedCachedSequenceDatabase (seqIterator, filtering);
}
Expand Down
3 changes: 2 additions & 1 deletion src/alignment/visitors/impl/RawOutputVisitor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ class RawOutputVisitor : public FileVisitor
/** \copydoc AbstractAlignmentResultVisitor::visitSubjectSequence
* Prints one line of the form "S <length> <comment>" through getFile()->print.
* The 'progress' argument is not used by this implementation.
*/
void visitSubjectSequence (const database::ISequence* seq, const misc::ProgressInfo& progress)
{
/* The comment text is retrieved through ISequence::getComment() rather than by printing the raw 'comment' field directly. */
getFile()->print ("S %d %s\n", seq->getLength(), seq->getComment(seq->comment).c_str());
}

/** \copydoc AbstractAlignmentResultVisitor::visitAlignment */
Expand Down
7 changes: 5 additions & 2 deletions src/alignment/visitors/impl/TabulatedOutputVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,11 @@ void TabulatedOutputVisitor::dumpLine (core::Alignment* align)
snprintf (queryName, sizeof(queryName), "%s", _currentQuery->comment);
if ( (locate = database::ISequence::searchIdSeparator (queryName)) != 0) { *locate = 0; }

char subjectName[128];
snprintf (subjectName, sizeof(subjectName), "%s", _currentSubject->comment);
char subjectName[128] = "";
//snprintf (subjectName, sizeof(subjectName), "%s", _currentSubject->comment);
//snprintf (subjectName, sizeof(subjectName), "TEST");
snprintf (subjectName, sizeof(subjectName), "%s", _currentSubject->getComment(_currentSubject->comment).c_str());

if ( (locate = database::ISequence::searchIdSeparator (subjectName)) != 0) { *locate = 0; }

char evalueStr[32];
Expand Down
2 changes: 1 addition & 1 deletion src/alignment/visitors/impl/XmlOutputVisitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ void XmlOutputVisitor::visitSubjectSequence (

printline (4, "<Hit>");
printline (5, "<Hit_num>%d</Hit_num>", _nbSubject);
printline (5, "<Hit_def>%s</Hit_def>", _currentSubject->comment);
printline (5, "<Hit_def>%s</Hit_def>", _currentSubject->getComment(_currentSubject->comment).c_str());
printline (5, "<Hit_len>%d</Hit_len>", _currentSubject->data.letters.size);
printline (5, "<Hit_hsps>");
}
Expand Down
2 changes: 2 additions & 0 deletions src/database/api/IAlphabet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ enum Encoding
SUBSEED,
ASCII,
NCBI,
NCBI_DNA_NO_AMB,
NCBI_DNA_WITH_AMB,
UNKNOWN
};

Expand Down
5 changes: 4 additions & 1 deletion src/database/api/IDatabaseQuickReader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

#include <designpattern/api/SmartPointer.hpp>
#include <designpattern/api/IProperty.hpp>
#include <designpattern/impl/TokenizerIterator.hpp>
#include <os/impl/CommonOsImpl.hpp>
#include <misc/api/types.hpp>
#include <vector>
#include <string>
Expand Down Expand Up @@ -68,7 +70,7 @@ class IDatabaseQuickReader : public dp::SmartPointer
ENUM_UNKNOWN
};

/** Read the database. Must be called before using getters.
/** Read the database. Must be called before using getters.
* A 'maxblocksize' parameter can be provided; if not null, it is used for splitting the database
* into small sequences sets of maximum size; as a result, one can retrieve a vector of offsets,
* each offset pointing to a set of sequences.
Expand Down Expand Up @@ -118,6 +120,7 @@ class IDatabaseQuickReader : public dp::SmartPointer

/** */
virtual int save (const std::string& uri) = 0;

};

/********************************************************************************/
Expand Down
Loading

0 comments on commit 473f769

Please sign in to comment.