diff --git a/q2_sourmash/_compute.py b/q2_sourmash/_compute.py
index 8b3396b..926785c 100644
--- a/q2_sourmash/_compute.py
+++ b/q2_sourmash/_compute.py
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 
 from q2_types.per_sample_sequences import SingleLanePerSampleSingleEndFastqDirFmt, FastqGzFormat
+from q2_types.feature_data import DNAIterator
 import qiime2.util
 import pandas as pd
 from q2_sourmash._format import MinHashSigJsonDirFormat
@@ -14,6 +15,75 @@
 import subprocess
 import glob
 import sys
+import tempfile
+import shutil
+
+def compute_fasta(sequence_file: DNAIterator,
+                  ksizes: int,
+                  scaled: int,
+                  track_abundance: bool=True) -> MinHashSigJsonDirFormat:
+    """Compute sourmash MinHash signatures for the sequences of a FASTA.
+
+    Every record of ``sequence_file`` is written to its own
+    ``<id>.fasta`` file inside the output directory, ``sourmash compute``
+    is run on those files with the output directory as its working
+    directory, and the intermediate FASTA files are removed afterwards.
+
+    Parameters
+    ----------
+    sequence_file : DNAIterator
+        Sequences to process; ``seq.metadata['id']`` names each file.
+    ksizes : int
+        Value passed to ``sourmash compute --ksizes``.
+    scaled : int
+        Value passed to ``sourmash compute --scaled``.
+    track_abundance : bool, optional
+        When true, ``--track-abundance`` is added to the command.
+
+    Returns
+    -------
+    MinHashSigJsonDirFormat
+        The directory format ``sourmash compute`` was run in.
+
+    Raises
+    ------
+    subprocess.CalledProcessError
+        If ``sourmash compute`` exits with a non-zero status.
+    """
+    output = MinHashSigJsonDirFormat()
+    output_dir = str(output)
+
+    # One FASTA file per record, written into the output directory. A set
+    # collapses repeated ids, which simply overwrite the same file.
+    fasta_fps = set()
+    for seq in sequence_file:
+        fasta_fp = os.path.join(output_dir, str(seq.metadata['id']) + '.fasta')
+        with open(fasta_fp, 'w') as indiv_fasta:
+            seq.write(indiv_fasta)
+        fasta_fps.add(fasta_fp)
+
+    # Pass an argv list with explicit paths instead of a shell string with
+    # a "/*" glob, so ids containing spaces or shell metacharacters cannot
+    # break (or inject into) the command line.
+    command = ['sourmash', 'compute', *sorted(fasta_fps),
+               '--ksizes', str(ksizes),
+               '--scaled', str(scaled)]
+
+    if track_abundance:
+        command.append('--track-abundance')
+
+    try:
+        # NOTE(review): presumably sourmash writes its signature files into
+        # the working directory, hence cwd=output_dir -- confirm.
+        subprocess.run(command, check=True, cwd=output_dir)
+    finally:
+        # Remove the intermediate FASTA files even when sourmash fails.
+        for fasta_fp in fasta_fps:
+            os.remove(fasta_fp)
+
+    sys.stdout.flush()
+
+    return output
 
 
 def compute(sequence_file:SingleLanePerSampleSingleEndFastqDirFmt, ksizes: int, scaled: int, track_abundance: bool=True) -> MinHashSigJsonDirFormat:
diff --git a/q2_sourmash/plugin_setup.py b/q2_sourmash/plugin_setup.py
index 2a01248..282bf25 100644
--- a/q2_sourmash/plugin_setup.py
+++ b/q2_sourmash/plugin_setup.py
@@ -13,10 +13,11 @@
 from qiime2.plugin import Plugin, Metadata, Str, List, Citations, SemanticType, TextFileFormat, ValidationError
 from qiime2.plugin import model
 import qiime2.util
-from q2_sourmash._compute import compute
+from q2_sourmash._compute import compute, compute_fasta
 from q2_sourmash._compare import compare
 from q2_types.distance_matrix import DistanceMatrix
 from q2_types.sample_data import SampleData
+from q2_types.feature_data import FeatureData, Sequence
 from q2_types.per_sample_sequences import SequencesWithQuality
 from ._format import MinHashSigJsonDirFormat, MinHashSigJson
 from ._types import MinHashSig
@@ -41,6 +42,17 @@
 plugin.register_views(MinHashSigJson, MinHashSigJsonDirFormat)
 plugin.register_semantic_types(MinHashSig)
 
+plugin.methods.register_function(
+    function=compute_fasta,
+    inputs={'sequence_file': FeatureData[Sequence]},
+    parameters={'ksizes': qiime2.plugin.Int,
+                'scaled': qiime2.plugin.Int,
+                'track_abundance': qiime2.plugin.Bool},
+    outputs=[('min_hash_signature', MinHashSig)],
+    name = 'compute sourmash signature',
+    description = 'Computes a sourmash MinHash signature from genome fasta files.'
+)
+
 plugin.methods.register_function(
     function=compute,
     inputs={'sequence_file': SampleData[SequencesWithQuality]},