# Public API of the ``q2_types.feature_data_mag`` subpackage.
import importlib

from ._format import (
    MAGSequencesDirFmt,
    OrthologAnnotationDirFmt,
    OrthologFileFmt
    )

from ._type import MAG, NOG, OG, KEGG
from ._transformer import MAGIterator

__all__ = [
    'MAG', 'MAGSequencesDirFmt', 'MAGIterator', 'NOG', 'OG', 'KEGG',
    'OrthologAnnotationDirFmt', 'OrthologFileFmt',
    ]

# Import this subpackage's own transformer module so its
# ``@plugin.register_transformer`` functions are registered. The original
# pointed at ``q2_types.feature_data._transformer`` (a different subpackage);
# that module is still pulled in transitively, because
# ``feature_data_mag._transformer`` imports ``_fastaformats_to_series`` from it.
importlib.import_module('q2_types.feature_data_mag._transformer')
class MAGSequencesDirFmt(model.DirectoryFormat):
    """Directory of DNA FASTA files, one file per MAG.

    A member file must be named ``<uuid4>.fa`` or ``<uuid4>.fasta``; the
    UUID (the file stem) doubles as the MAG / feature identifier.
    """
    pathspec = (
        r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-4[0-9a-fA-F]{3}-"
        r"[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\.(fa|fasta)$"
    )

    sequences = model.FileCollection(pathspec, format=DNAFASTAFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, mag_id):
        # Files added to the collection are always written as "<id>.fasta".
        return r'%s.fasta' % mag_id

    def feature_dict(self, relative=False):
        """Map every MAG id found in this directory to its file path.

        Only entries whose file name matches ``pathspec`` are reported;
        anything else in the directory is ignored.

        Parameters
        ----------
        relative : bool
            If True, paths are given relative to this directory.
            Absolute paths are returned by default.

        Returns
        -------
        dict
            ``{mag id: filepath}`` with keys in ascending order.
        """
        matcher = re.compile(self.pathspec)
        found = {}
        for entry in self.path.iterdir():
            if matcher.match(entry.name) is None:
                continue

            location = entry.absolute()
            if relative:
                location = location.relative_to(self.path.absolute())
            found[entry.stem] = str(location)

        return {mag_id: found[mag_id] for mag_id in sorted(found)}
import MAGSequencesDirFmt +from ..plugin_setup import plugin + +CONSTRUCTORS = { + 'DNA': skbio.DNA, + 'RNA': skbio.RNA, + 'protein': skbio.Protein +} + + +def _get_filename(full_path): + return os.path.splitext(os.path.basename(full_path))[0] + + +def _series_to_fasta(series, ff, seq_type='DNA'): + fp = os.path.join(str(ff), f'{series.name}.fasta') + with open(fp, 'w') as fh: + for id_, seq in series.iteritems(): + if seq: + sequence = CONSTRUCTORS[seq_type](seq, metadata={'id': id_}) + skbio.io.write(sequence, format='fasta', into=fh) + + +def _fastafiles_to_dataframe(ff): + data = {} + for fp in sorted(glob.glob(os.path.join(str(ff), '*.fa*'))): + fname = _get_filename(fp) + data[fname] = _fastaformats_to_series(fp, constructor=skbio.DNA) + df = pd.DataFrame.from_dict(data, orient='index') + df.index.name = 'Feature ID' + df = df.astype(str).replace({'nan': None}) + return df + + +@plugin.register_transformer +def _2(ff: MAGSequencesDirFmt) -> pd.DataFrame: + return _fastafiles_to_dataframe(ff) + + +@plugin.register_transformer +def _3(df: pd.DataFrame) -> MAGSequencesDirFmt: + result = MAGSequencesDirFmt() + df.apply(_series_to_fasta, axis=1, ff=result, seq_type='DNA') + return result + + +class MAGIterator(collections.abc.Iterable): + def __init__(self, generator): + self.generator = generator + + def __iter__(self): + yield from self.generator + + +@plugin.register_transformer +def _4(ff: MAGSequencesDirFmt) -> MAGIterator: + def _multi_generator(files): + for fp in files: + fname = _get_filename(fp) + fg = skbio.read(fp, format='fasta', constructor=skbio.DNA) + yield from zip(repeat(fname), fg) + + fps = sorted(glob.glob(os.path.join(str(ff), '*.fa*'))) + return MAGIterator(_multi_generator(fps)) + + +@plugin.register_transformer +def _5(data: MAGIterator) -> MAGSequencesDirFmt: + result = MAGSequencesDirFmt() + for fn, seq in data: + fp = os.path.join(str(result), f'{fn}.fasta') + with open(fp, 'a') as fin: + skbio.io.write(seq, format='fasta', into=fin) + 
return result diff --git a/q2_types/feature_data_mag/_type.py b/q2_types/feature_data_mag/_type.py new file mode 100644 index 00000000..bfe021bf --- /dev/null +++ b/q2_types/feature_data_mag/_type.py @@ -0,0 +1,48 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from q2_types.feature_data import FeatureData + +from q2_types.feature_data_mag._format import ( + MAGSequencesDirFmt, OrthologAnnotationDirFmt + ) +from qiime2.core.type import SemanticType + +from ..plugin_setup import plugin + + +MAG = SemanticType('MAG', variant_of=FeatureData.field['type']) + +plugin.register_semantic_types(MAG) +plugin.register_semantic_type_to_format( + FeatureData[MAG], + artifact_format=MAGSequencesDirFmt +) + +NOG = SemanticType('NOG', variant_of=FeatureData.field['type']) + +plugin.register_semantic_types(NOG) +plugin.register_artifact_class( + FeatureData[NOG], + directory_format=OrthologAnnotationDirFmt) + + +OG = SemanticType('OG', variant_of=FeatureData.field['type']) + +plugin.register_semantic_types(OG) +plugin.register_artifact_class( + FeatureData[OG], + directory_format=OrthologAnnotationDirFmt) + + +KEGG = SemanticType('KEGG', variant_of=FeatureData.field['type']) + +plugin.register_semantic_types(KEGG) +plugin.register_artifact_class( + FeatureData[KEGG], + directory_format=OrthologAnnotationDirFmt) diff --git a/q2_types/feature_data_mag/tests/__init__.py b/q2_types/feature_data_mag/tests/__init__.py new file mode 100644 index 00000000..afcc05c2 --- /dev/null +++ b/q2_types/feature_data_mag/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. 
+# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_types/feature_data_mag/tests/data/good_ortholog_annotation/test_output.emapper.annotations b/q2_types/feature_data_mag/tests/data/good_ortholog_annotation/test_output.emapper.annotations new file mode 100644 index 00000000..22aa1272 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/good_ortholog_annotation/test_output.emapper.annotations @@ -0,0 +1,2 @@ +1000565.METUNv1_03812 1000565.METUNv1_03812 4.71e-264 714.0 COG0012@1|root,COG0012@2|Bacteria,1MVM4@1224|Proteobacteria,2VJ1W@28216|Betaproteobacteria,2KUD2@206389|Rhodocyclales 206389|Rhodocyclales J ATPase that binds to both the 70S ribosome and the 50S ribosomal subunit in a nucleotide-independent manner ychF - - ko:K06942 - - - - ko00000,ko03009 - - - MMR_HSR1,YchF-GTPase_C +362663.ECP_0061 362663.ECP_0061 0.0 1624.0 COG0417@1|root,COG0417@2|Bacteria,1MVY9@1224|Proteobacteria,1RMQ1@1236|Gammaproteobacteria,3XPER@561|Escherichia 1236|Gammaproteobacteria L DNA polymerase polB GO:0003674,GO:0003824,GO:0003887,GO:0004518,GO:0004527,GO:0004529,GO:0004536,GO:0006139,GO:0006259,GO:0006260,GO:0006261,GO:0006281,GO:0006725,GO:0006807,GO:0006950,GO:0006974,GO:0007154,GO:0008150,GO:0008152,GO:0008296,GO:0008408,GO:0009058,GO:0009059,GO:0009432,GO:0009605,GO:0009987,GO:0009991,GO:0016740,GO:0016772,GO:0016779,GO:0016787,GO:0016788,GO:0016796,GO:0016895,GO:0018130,GO:0019438,GO:0031668,GO:0033554,GO:0034061,GO:0034641,GO:0034645,GO:0034654,GO:0043170,GO:0044237,GO:0044238,GO:0044249,GO:0044260,GO:0044271,GO:0045004,GO:0045005,GO:0046483,GO:0050896,GO:0051716,GO:0071496,GO:0071704,GO:0071897,GO:0090304,GO:0090305,GO:0140097,GO:1901360,GO:1901362,GO:1901576 2.7.7.7 ko:K02336 - - - - ko00000,ko01000,ko03400 - - - DNA_pol_B,DNA_pol_B_exo1 diff --git 
a/q2_types/feature_data_mag/tests/data/mags-fa/23c5b64e-3f3e-4688-9862-e9dae4fa0f5b.fa b/q2_types/feature_data_mag/tests/data/mags-fa/23c5b64e-3f3e-4688-9862-e9dae4fa0f5b.fa new file mode 100644 index 00000000..227a0625 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/mags-fa/23c5b64e-3f3e-4688-9862-e9dae4fa0f5b.fa @@ -0,0 +1,3 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAA diff --git a/q2_types/feature_data_mag/tests/data/mags-fa/70c5a728-96a6-4eed-b9f9-9a73153c1385.fa b/q2_types/feature_data_mag/tests/data/mags-fa/70c5a728-96a6-4eed-b9f9-9a73153c1385.fa new file mode 100644 index 00000000..89e7e13e --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/mags-fa/70c5a728-96a6-4eed-b9f9-9a73153c1385.fa @@ -0,0 +1,3 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTC diff --git a/q2_types/feature_data_mag/tests/data/mags-fa/7e2a749a-a19a-4b62-8195-0ee601b5fdfb.fa b/q2_types/feature_data_mag/tests/data/mags-fa/7e2a749a-a19a-4b62-8195-0ee601b5fdfb.fa new file mode 100644 index 00000000..a1bedbac --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/mags-fa/7e2a749a-a19a-4b62-8195-0ee601b5fdfb.fa @@ -0,0 +1,6 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGA +>k129_6531 +TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGT diff --git a/q2_types/feature_data_mag/tests/data/mags-fasta/3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta b/q2_types/feature_data_mag/tests/data/mags-fasta/3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta new file mode 100644 index 00000000..d604d6ba --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/mags-fasta/3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta @@ -0,0 +1,3 @@ +>k129_5401 +CCATTGTATGTCTTTAGGTAGCTCCTCATGTTTGAGGTTCATGTCTTGGATTTTGTTTTC +TCCAAAAATC diff --git a/q2_types/feature_data_mag/tests/data/mags-fasta/6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta 
b/q2_types/feature_data_mag/tests/data/mags-fasta/6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta new file mode 100644 index 00000000..cde9661b --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/mags-fasta/6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta @@ -0,0 +1,10 @@ +>k129_4684 +TGATACCGACGCGGCACTTGAGTGCGCGCTATCCTTCAAGGAAGCCACATGCGTTATTGT +TAAACA +>k129_5618 +GTGCTAATCGCACCCTCATGAGCGACACCATTATTCTTTATTTTTGAGTCTTCAGCAAAA +>k129_5631 +TCATGATGATCCAAAAGCAGTTGCGGAAGCATCTGGGATAATTACGCGGAGTGGATGTCG +CCG +>k129_2817 +GTCGCCAATTAGCAACTATGATGTCTTCTGGAGTACCTTTGGTCCAATCATTTGAAATCA diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.annotations b/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.annotations new file mode 100644 index 00000000..22aa1272 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.annotations @@ -0,0 +1,2 @@ +1000565.METUNv1_03812 1000565.METUNv1_03812 4.71e-264 714.0 COG0012@1|root,COG0012@2|Bacteria,1MVM4@1224|Proteobacteria,2VJ1W@28216|Betaproteobacteria,2KUD2@206389|Rhodocyclales 206389|Rhodocyclales J ATPase that binds to both the 70S ribosome and the 50S ribosomal subunit in a nucleotide-independent manner ychF - - ko:K06942 - - - - ko00000,ko03009 - - - MMR_HSR1,YchF-GTPase_C +362663.ECP_0061 362663.ECP_0061 0.0 1624.0 COG0417@1|root,COG0417@2|Bacteria,1MVY9@1224|Proteobacteria,1RMQ1@1236|Gammaproteobacteria,3XPER@561|Escherichia 1236|Gammaproteobacteria L DNA polymerase polB 
GO:0003674,GO:0003824,GO:0003887,GO:0004518,GO:0004527,GO:0004529,GO:0004536,GO:0006139,GO:0006259,GO:0006260,GO:0006261,GO:0006281,GO:0006725,GO:0006807,GO:0006950,GO:0006974,GO:0007154,GO:0008150,GO:0008152,GO:0008296,GO:0008408,GO:0009058,GO:0009059,GO:0009432,GO:0009605,GO:0009987,GO:0009991,GO:0016740,GO:0016772,GO:0016779,GO:0016787,GO:0016788,GO:0016796,GO:0016895,GO:0018130,GO:0019438,GO:0031668,GO:0033554,GO:0034061,GO:0034641,GO:0034645,GO:0034654,GO:0043170,GO:0044237,GO:0044238,GO:0044249,GO:0044260,GO:0044271,GO:0045004,GO:0045005,GO:0046483,GO:0050896,GO:0051716,GO:0071496,GO:0071704,GO:0071897,GO:0090304,GO:0090305,GO:0140097,GO:1901360,GO:1901362,GO:1901576 2.7.7.7 ko:K02336 - - - - ko00000,ko01000,ko03400 - - - DNA_pol_B,DNA_pol_B_exo1 diff --git a/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.orthologs b/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.orthologs new file mode 100644 index 00000000..395dea59 --- /dev/null +++ b/q2_types/feature_data_mag/tests/data/ortholog_annotation_extra/test_output.emapper.orthologs @@ -0,0 +1,2 @@ +1000565.METUNv1_03812 1000565.METUNv1_03812,1121035.AUCH01000015_gene2548,1123367.C666_07595,1123487.KB892857_gene2312,1304883.KI912532_gene1239,1348657.M622_06940,159087.Daro_3733,305700.B447_07764,497321.C664_16088,62928.azo0752,640081.Dsui_1039,748247.AZKH_0690,76114.ebA4139,85643.Tmz1t_3631 +362663.ECP_0061 
1001530.BACE01000011_gene2341,1001585.MDS_3838,1004785.AMBLS11_15500,1005395.CSV86_15315,1005994.GTGU_01680,1005995.GTPT_1156,1005999.GLGR_2221,1006000.GKAS_03113,1006004.GBAG_3457,1027273.GZ77_06950,1027273.GZ77_06975,1028307.EAE_11080,1038922.PflQ2_3424,1042209.HK44_015680,1042375.AFPL01000045_gene1053,1042375.AFPL01000051_gene192,1042377.AFPJ01000007_gene1879,1042876.PPS_1981,1045856.EcWSU1_00673,104623.Ser39006_03906,1046714.AMRX01000001_gene1551,1051646.VITU9109_20279,1056512.D515_04005,1073999.BN137_3094,1076550.LH22_19005,1080067.BAZH01000004_gene4111,1082705.JIBP01000020_gene1660,1085623.GNIT_0082,1109445.AGSX01000022_gene1619,1109445.AGSX01000023_gene1794,1111728.ATYS01000002_gene2032,1112217.PPL19_23878,1114922.CIFAM_14_00570,1114970.PSF113_3842,1115512.EH105704_02_03110,1115515.EV102420_08_02570,1116375.VEJY3_10210,1117314.PCIT_16365,1117315.AHCA01000002_gene3366,1117318.PRUB_20823,1117319.PSPO_10129,1117647.M5M_09500,1117958.PE143B_0119780,1120953.AUBH01000004_gene3192,1120963.KB894506_gene3814,1120970.AUBZ01000052_gene2919,1121374.KB891591_gene3511,1121878.AUGL01000024_gene3733,1121921.KB898710_gene523,1121935.AQXX01000142_gene2202,1121937.AUHJ01000011_gene2966,1121939.L861_00110,1121943.KB899993_gene35,1122201.AUAZ01000003_gene1977,1122207.MUS1_09280,1122599.AUGR01000012_gene874,1123020.AUIE01000047_gene1779,1123228.AUIH01000013_gene228,1123236.KB899382_gene2208,1123519.PSJM300_10125,1124983.PFLCHA0_c25450,1124991.MU9_896,1127673.GLIP_3806,1128912.GMES_0635,1129794.C427_0719,1134474.O59_003286,1136138.JH604622_gene2217,1136163.M565_ctg1P0716,1137799.GZ78_23460,1141662.OOA_13757,1141663.OOC_06427,1144325.PMI22_02075,1149133.ppKF707_3157,1151116.Q7S_19140,1151127.KB906326_gene533,1163398.AJJP01000153_gene3010,1163398.AJJP01000154_gene2728,1166016.W5S_3969,1166130.H650_19135,1166948.JPZL01000001_gene2228,1179778.PMM47T1_15041,1182590.BN5_03028,1187848.AJYQ01000080_gene1281,1188252.AJYK01000104_gene18,1190603.AJYD01000005_gene4046,1190606.AJYG01000064_gene
1318,1191299.AJYX01000010_gene534,1196835.A458_14590,1197719.A464_88,1197719.A464_89,1201293.AKXQ01000005_gene187,1202962.KB907152_gene1129,1205683.CAKR01000014_gene261,1205908.AKXW01000045_gene3571,1206777.B195_15802,1207075.PputUW4_01630,1207076.ALAT01000047_gene3451,1207076.ALAT01000076_gene1696,1208321.D104_17215,1209072.ALBT01000065_gene1128,1211112.ALJC01000025_gene3695,1211579.PP4_34220,1212548.B381_10908,1214065.BAGV01000052_gene1889,1215092.PA6_033_00130,1215114.BBIU01000009_gene1489,1216007.AOPM01000111_gene322,1216966.BAUC01000023_gene1579,1218086.BBNB01000010_gene760,1218352.B597_008110,1219065.VPR01S_08_00960,1219072.VHA01S_049_00040,1219076.N646_1029,1219077.VAZ01S_024_00390,1219080.VEZ01S_14_00060,1221522.B723_22150,1224136.AMFN01000004_gene1724,1224318.DT73_07600,1225184.ALXE01000004_gene139,1225785.CM001983_gene2995,1226994.AMZB01000058_gene5029,1229485.AMYV01000161_gene1756,1232683.ADIMK_2705,1236541.BALL01000005_gene907,1236542.BALM01000018_gene1511,1238450.VIBNISOn1_1710005,1240350.AMZE01000004_gene2404,1245471.PCA10_33960,1248232.BANQ01000035_gene3916,1249634.D781_0685,1265490.JHVY01000003_gene3179,1265503.KB905165_gene1170,1267600.JFGT01000005_gene2743,1268068.PG5_19750,1268237.G114_12253,1268239.PALB_26410,1278307.KB906974_gene1764,1278309.KB907099_gene2467,1279015.KB908456_gene1623,1280001.BAOA01000128_gene1213,1282356.H045_05955,1283284.AZUK01000001_gene515,1286170.RORB6_14920,1294143.H681_11200,1298593.TOL_2986,1298865.H978DRAFT_0156,1301098.PKB_1141,1307437.J139_01197,1316927.ATKI01000146_gene923,1328313.DS2_18293,1333507.AUTQ01000175_gene1502,1333856.L686_02350,1336233.JAEH01000026_gene973,1336237.JAEE01000001_gene1943,1344012.ATMI01000002_gene1472,1348114.OM33_12680,1348635.BBJY01000017_gene4157,1357272.AVEO02000138_gene691,1357275.AVEL02000076_gene2015,1357279.N018_15585,1388763.O165_013170,1390370.O203_23445,1395516.PMO01_16945,1395571.TMS3_0100805,1397284.AYMN01000025_gene4216,1399774.JDWH01000001_gene2409,1410619.SRDD_35890,1415630.U
771_10760,1419583.V466_14160,1437882.AZRU01000027_gene839,1437882.AZRU01000027_gene840,1439940.BAY1663_04623,1440052.EAKF1_ch1361,1441629.PCH70_24170,1441930.Z042_17410,1443113.LC20_04540,1443113.LC20_04541,1448139.AI20_07565,1453496.AT03_18325,1453501.JELR01000001_gene2440,1453503.AU05_13350,1454202.PPBDW_130199___1,1469245.JFBG01000054_gene2118,1470593.BW43_03461,1484157.PSNIH2_14570,1484158.PSNIH1_08275,1488328.JMCL01000179_gene3609,1492922.GY26_06990,1499686.BN1079_00467,1500890.JQNL01000001_gene2109,1500893.JQNB01000001_gene1060,1515746.HR45_05935,1517681.HW45_07820,1523503.JPMY01000010_gene2641,1524467.IV04_00325,1535422.ND16A_0750,1537994.JQFW01000056_gene934,155864.EDL933_0062,1565129.JSFF01000001_gene1169,156578.ATW7_09016,156578.ATW7_09021,157783.LK03_16860,1577887.JSYG01000001_gene667,158822.LH89_15865,160488.PP_2393,198214.SF0055,198628.Dda3937_01373,199310.c0071,203122.Sde_1511,205918.Psyr_2361,205922.Pfl01_3734,207954.MED92_02394,211586.SO_1820,214092.YPO0518,216142.LT40_12375,216595.PFLU_2067,218491.ECA3852,218493.SBG_0085,220341.16501382,220664.PFL_2480,223283.PSPTO_2621,223926.28806938,225849.swp_3211,234831.PSM_A2035,237609.PSAKL28_32080,243277.VC_1212,247633.GP2143_14671,247634.GPB2148_2486,264730.PSPPH_2495,273526.SMDB11_0043,28152.DJ57_2731,28229.ND2E_3600,28258.KP05_04835,283699.D172_3177,287.DR97_5961,29486.NJ56_03155,29495.EA26_06645,298386.PBPRA1531,301.JNHE01000034_gene4200,305900.GV64_17865,312309.VF_1628,314275.MADE_1016665,314282.PCNPT3_06855,314292.VAS14_03703,316275.VSAL_I2142,316407.85674307,318161.Sden_2428,318167.Sfri_2629,319224.Sputcn32_1514,32042.PstZobell_08221,321846.PS417_09565,323850.Shew_2482,326297.Sama_1250,326442.PSHAa1990,342610.Patl_3630,345073.VC395_1331,349521.HCH_01557,349965.yinte0001_42170,349965.yinte0001_42180,349966.DJ58_1292,351746.Pput_3302,35703.DQ02_16995,357804.Ping_1857,362663.ECP_0061,371042.NG99_21800,379731.PST_2786,380703.AHA_2107,382245.ASA_2190,384676.PSEEN3365,390235.PputW619_1903,392500.Swoo_3085,3
93305.YE0637,398579.Spea_2655,399739.Pmen_3572,399741.Spro_0731,399742.Ent638_0607,400668.Mmwyl1_2210,406817.XNC1_4058,406818.XBJ1_1795,425104.Ssed_1567,42565.FP66_08165,440512.C211_04698,458817.Shal_2729,465817.ETA_07310,469008.B21_00061,469595.CSAG_03356,471874.PROSTU_01143,471881.PROPEN_00979,471881.PROPEN_00980,471881.PROPEN_00981,477228.YO5_10215,481805.EcolC_3597,491952.Mar181_1745,493475.GARC_4665,498211.CJA_2886,500637.PROVRUST_06038,500640.CIT292_09418,502347.ESCAB7627_3201,511062.GU3_08445,511145.b0060,517433.PanABDRAFT_1681,520999.PROVALCAL_02276,521000.PROVRETT_06774,527002.yaldo0001_28460,529507.PMI2327,549.BW31_03194,550540.Fbal_1160,55207.KP22_11025,553385.JEMF01000001_gene570,55601.VANGNB10_cI1556,558884.JRGM01000153_gene949,561229.Dd1591_0570,561230.PC1_3628,561231.Pecwa_3819,568768.CM001975_gene3021,571.MC52_12495,573.JG24_29705,575788.VS_1957,579405.Dd703_0609,585.DR95_2445,587753.EY04_11270,589873.EP13_15875,590409.Dd586_3566,592316.Pat9b_0640,593105.S7A_15715,595494.Tola_2147,598467.BrE312_0653,61647.LG71_24775,617140.AJZE01000124_gene2547,629265.PMA4326_20105,630626.EBL_c32930,633.DJ40_1710,634499.EpC_07180,634500.EbC_06980,637905.SVI_2972,637910.ROD_00651,640513.Entas_0660,642227.HA49_19235,644801.Psest_1519,658612.MD26_00790,66269.NL54_08390,665029.EAMY_2919,667121.ET1_10_00490,672.VV93_v1c11490,674977.VMC_25520,674977.VMC_25530,675806.VII_002578,675812.VHA_002685,675813.VIB_001445,675814.VIC_004442,675815.VOA_000407,675816.VIA_002663,675817.VDA_002186,690597.JH730923_gene1694,690597.JH730923_gene1695,69328.PVLB_14735,693444.D782_3810,701176.VIBRN418_07876,701347.Entcl_3665,706191.PANA_0690,712898.Pvag_0098,715451.ambt_00830,716541.ECL_00856,717774.Marme_2143,722419.PH505_ad00150,741091.Rahaq_3758,743720.Psefu_4133,745277.GRAQ_02860,745411.B3C1_16722,754331.AEME01000001_gene2079,754436.JCM19237_4567,754436.JCM19237_4568,754436.JCM19237_4569,76869.PputGB1_1999,78398.KS43_07875,796620.VIBC2010_15254,82995.CR62_04670,82996.sch_03475,870967.VIS19
158_15604,87626.PTD2_10919,891974.E05_44490,891974.E05_44500,891974.E05_44510,90371.CY43_00480,910964.GEAM_0249,911008.GLAD_03089,911239.CF149_11719,930166.CD58_19980,932213.SPM24T3_00530,932677.PAJ_0037,94122.Shewana3_2629,945543.VIBR0546_10799,945550.VISI1226_06239,95619.PM1_0216955,983545.Glaag_3623,998088.B565_1731,998674.ATTE01000001_gene309 diff --git a/q2_types/feature_data_mag/tests/test_format.py b/q2_types/feature_data_mag/tests/test_format.py new file mode 100644 index 00000000..03d13a71 --- /dev/null +++ b/q2_types/feature_data_mag/tests/test_format.py @@ -0,0 +1,77 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from pathlib import Path +import shutil +import unittest + +from qiime2.plugin.testing import TestPluginBase +from qiime2.plugin import ValidationError + +from q2_types.feature_data_mag._format import ( + MAGSequencesDirFmt, OrthologAnnotationDirFmt, + ) + + +class TestFormats(TestPluginBase): + package = 'q2_types.feature_data_mag.tests' + + def test_mag_dirfmt_fa(self): + dirpath = self.get_data_path('mags-fa') + format = MAGSequencesDirFmt(dirpath, mode='r') + + format.validate() + + def test_mag_dirfmt_fasta(self): + dirpath = self.get_data_path('mags-fasta') + format = MAGSequencesDirFmt(dirpath, mode='r') + + format.validate() + + def test_mag_dirfmt_feature_dict(self): + dirpath = self.get_data_path('mags-fasta') + shutil.copytree(dirpath, self.temp_dir.name, dirs_exist_ok=True) + mags = MAGSequencesDirFmt(self.temp_dir.name, mode='r') + + # non-mags should not be collected + with open(Path(self.temp_dir.name) / 'not-a-mag.fasta', 'w') as fh: + fh.write('not a mag') + + obs = mags.feature_dict() + exp = { + 
'3b7d53fb-5b60-46c6-8819-aeda065b12e9': + str(mags.path / '3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta'), + '6232c7e1-8ed7-47c8-9bdb-b94706a26931': + str(mags.path / '6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta'), + } + self.assertEqual(obs, exp) + + obs = mags.feature_dict(relative=True) + exp = { + '3b7d53fb-5b60-46c6-8819-aeda065b12e9': + '3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta', + '6232c7e1-8ed7-47c8-9bdb-b94706a26931': + '6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta', + } + self.assertEqual(obs, exp) + + def test_ortholog_annotation_dir_fmt_passing(self): + dirpath = self.get_data_path('good_ortholog_annotation') + fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') + fmt_obj.validate() + + def test_ortholog_annotation_dir_fmt_fails_extra_file(self): + dirpath = self.get_data_path('ortholog_annotation_extra') + fmt_obj = OrthologAnnotationDirFmt(dirpath, mode='r') + + with self.assertRaisesRegex(ValidationError, "Unrecognized file"): + fmt_obj.validate() + + +if __name__ == '__main__': + unittest.main() diff --git a/q2_types/feature_data_mag/tests/test_transformers.py b/q2_types/feature_data_mag/tests/test_transformers.py new file mode 100644 index 00000000..810560cc --- /dev/null +++ b/q2_types/feature_data_mag/tests/test_transformers.py @@ -0,0 +1,134 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
class TestTransformers(TestPluginBase):
    """Tests for the transformers to and from ``MAGSequencesDirFmt``."""
    package = 'q2_types.feature_data_mag.tests'

    def setUp(self):
        super().setUp()
        # Expected contents of the "mags-fa" fixture directory:
        # {MAG id: {contig id: sequence}}.
        self.mags_fa = {
            '23c5b64e-3f3e-4688-9862-e9dae4fa0f5b': {
                'k129_5480': 'TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCG'
                             'CCAAGCTGCACGATCACACCTTTAA'
            },
            '70c5a728-96a6-4eed-b9f9-9a73153c1385': {
                'k129_5112': 'CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCC'
                             'CAACTCGCGGCCGATGGCGCGTACCTCGTC'
            },
            '7e2a749a-a19a-4b62-8195-0ee601b5fdfb': {
                'k129_6525': 'AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATC'
                             'AGTCAGCTCATTATTGAATCGGA',
                'k129_6531': 'TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATT'
                             'TGGCGATAGCGAGCCCGTATTTACGGT'
            }
        }
        # Expected contents of the "mags-fasta" fixture directory.
        self.mags_fasta = {
            '3b7d53fb-5b60-46c6-8819-aeda065b12e9': {
                'k129_5401': 'CCATTGTATGTCTTTAGGTAGCTCCTCATGTTTGAGGTTCAT'
                             'GTCTTGGATTTTGTTTTCTCCAAAAATC'
            },
            '6232c7e1-8ed7-47c8-9bdb-b94706a26931': {
                'k129_4684': 'TGATACCGACGCGGCACTTGAGTGCGCGCTATCCTTCAAGGA'
                             'AGCCACATGCGTTATTGTTAAACA',
                'k129_5618': 'GTGCTAATCGCACCCTCATGAGCGACACCATTATTCTTTATT'
                             'TTTGAGTCTTCAGCAAAA',
                'k129_5631': 'TCATGATGATCCAAAAGCAGTTGCGGAAGCATCTGGGATAAT'
                             'TACGCGGAGTGGATGTCGCCG',
                'k129_2817': 'GTCGCCAATTAGCAACTATGATGTCTTCTGGAGTACCTTTGG'
                             'TCCAATCATTTGAAATCA'
            },
        }

    @staticmethod
    def mags_to_df(mags):
        """Build the DataFrame form of a ``{mag: {contig: seq}}`` dict."""
        df = pd.DataFrame.from_dict(mags, orient='index')
        # Ragged rows yield NaN cells; normalise them to None.
        df = df.astype(str).replace({'nan': None})
        df.index.name = 'Feature ID'
        return df

    @staticmethod
    def create_multi_generator(seqs_dict):
        """Yield ``(mag id, skbio.DNA)`` pairs for every contig, in order."""
        for k1, v1 in seqs_dict.items():
            yield from zip(
                repeat(k1),
                (DNA(v2, metadata={'id': k2, 'description': ''})
                 for k2, v2 in v1.items())
            )

    @staticmethod
    def read_seqs_into_dict(loc):
        """Read every file under ``loc`` into ``{file stem: {id: seq}}``."""
        seqs = {}
        for f in sorted(glob.glob(f'{loc}/*')):
            seqs[_get_filename(f)] = {
                seq.metadata['id']: str(seq)
                for seq in skbio.read(f, format='fasta')
            }
        return seqs

    def test_mag_sequences_dir_fmt_to_dataframe(self):
        _, obs = self.transform_format(
            MAGSequencesDirFmt, pd.DataFrame,
            filenames=[
                'mags-fasta/3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta',
                'mags-fasta/6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta',
            ]
        )
        exp = self.mags_to_df(self.mags_fasta)
        pd.testing.assert_frame_equal(exp, obs)

    def test_dataframe_to_mag_sequences_dir_fmt(self):
        transformer = self.get_transformer(pd.DataFrame, MAGSequencesDirFmt)
        df = self.mags_to_df(self.mags_fasta)

        obs = transformer(df)
        self.assertIsInstance(obs, MAGSequencesDirFmt)

        obs_seqs = self.read_seqs_into_dict(str(obs))
        self.assertDictEqual(self.mags_fasta, obs_seqs)

    def test_mag_sequences_dir_fmt_to_mag_iterator(self):
        _, obs = self.transform_format(
            MAGSequencesDirFmt, MAGIterator,
            filenames=[
                'mags-fasta/6232c7e1-8ed7-47c8-9bdb-b94706a26931.fasta',
                'mags-fasta/3b7d53fb-5b60-46c6-8819-aeda065b12e9.fasta',
            ]
        )

        exp = list(self.create_multi_generator(self.mags_fasta))
        obs = list(obs)

        # zip() stops at the shorter input, so without this check an empty
        # or truncated iterator would make the loop below pass vacuously.
        self.assertEqual(len(exp), len(obs))
        for e, o in zip(exp, obs):
            self.assertEqual(e, o)

    def test_mag_iterator_to_mag_sequences_dir_fmt(self):
        transformer = self.get_transformer(MAGIterator, MAGSequencesDirFmt)
        seq_iter = self.create_multi_generator(self.mags_fa)

        obs = transformer(seq_iter)
        self.assertIsInstance(obs, MAGSequencesDirFmt)

        obs_seqs = self.read_seqs_into_dict(str(obs))
        self.assertDictEqual(self.mags_fa, obs_seqs)
class TestTypes(TestPluginBase):
    """Registration checks for the MAG / NOG / OG / KEGG semantic types."""
    package = 'q2_types.feature_data_mag.tests'

    def _assert_ortholog_format(self, variant):
        # NOG, OG and KEGG all share the same directory format.
        self.assertSemanticTypeRegisteredToFormat(
            FeatureData[variant],
            OrthologAnnotationDirFmt)

    def test_mag_semantic_type_registration(self):
        self.assertRegisteredSemanticType(MAG)

    def test_mags_semantic_type_to_format_registration(self):
        self.assertSemanticTypeRegisteredToFormat(
            FeatureData[MAG],
            MAGSequencesDirFmt
        )

    def test_nog_type_registration(self):
        self.assertRegisteredSemanticType(NOG)

    def test_og_type_registration(self):
        self.assertRegisteredSemanticType(OG)

    def test_kegg_type_registration(self):
        self.assertRegisteredSemanticType(KEGG)

    def test_nog_registered_to_format(self):
        self._assert_ortholog_format(NOG)

    def test_og_registered_to_format(self):
        self._assert_ortholog_format(OG)

    def test_kegg_registered_to_format(self):
        self._assert_ortholog_format(KEGG)
+# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import importlib + +from ._format import ( + GenesDirectoryFormat, ProteinsDirectoryFormat, + GFF3Format, LociDirectoryFormat, OrthologFileFmt, SeedOrthologDirFmt, +) +from ._transformer import IntervalMetadataIterator +from ._type import ( + GenomeData, Genes, Proteins, Loci, Ortholog, BLAST6 +) + +__all__ = [ + 'GenomeData', 'Genes', 'Proteins', 'Loci', 'GFF3Format', + 'GenesDirectoryFormat', 'ProteinsDirectoryFormat', 'LociDirectoryFormat', + 'IntervalMetadataIterator', 'OrthologFileFmt', 'Ortholog', + 'SeedOrthologDirFmt', 'BLAST6', + ] + +importlib.import_module('q2_types.genome_data._transformer') diff --git a/q2_types/genome_data/_format.py b/q2_types/genome_data/_format.py new file mode 100644 index 00000000..3c3a3081 --- /dev/null +++ b/q2_types/genome_data/_format.py @@ -0,0 +1,186 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
# ----------------------------------------------------------------------------

import qiime2.plugin.model as model
from q2_types.feature_data import DNAFASTAFormat, ProteinFASTAFormat
from qiime2.core.exceptions import ValidationError

from ..plugin_setup import plugin


class OrthologFileFmt(model.TextFileFormat):
    """Text file format for ortholog tables; no content validation is done."""

    def _validate_(self, level):
        # Intentionally a no-op: every text file is accepted.
        pass


class GenesDirectoryFormat(model.DirectoryFormat):
    """Directory of DNA FASTA files named ``[<prefix>_]genes[<n>].<ext>``."""

    genes = model.FileCollection(r'(.*\_)?genes[0-9]*\.(fa|fna|fasta)$',
                                 format=DNAFASTAFormat)

    @genes.set_path_maker
    def genes_path_maker(self, genome_id):
        """Return the file name used to store genes of ``genome_id``."""
        return '%s_genes.fasta' % genome_id


class ProteinsDirectoryFormat(model.DirectoryFormat):
    """Directory of protein FASTA files named
    ``[<prefix>_]proteins[<n>].<ext>``."""

    proteins = model.FileCollection(r'(.*\_)?proteins[0-9]*\.(fa|faa|fasta)$',
                                    format=ProteinFASTAFormat)

    @proteins.set_path_maker
    def proteins_path_maker(self, genome_id):
        """Return the file name used to store proteins of ``genome_id``."""
        return '%s_proteins.fasta' % genome_id


class GFF3Format(model.TextFileFormat):
    """
    Generic Feature Format Version 3 (GFF3) spec:
    https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
    NCBI modifications to the above:
    https://www.ncbi.nlm.nih.gov/datasets/docs/reference-docs/file-formats/about-ncbi-gff3/
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Directives collected during validation, keyed by directive name:
        # "##" lines go to `directives`, "#!" lines to
        # `directives_unofficial`.
        self.directives = {}
        self.directives_unofficial = {}

    def _update_directives(self, line, line_number):
        """Record the directive found on ``line``.

        Parameters
        ----------
        line : str
            A line starting with ``##`` or ``#!``.
        line_number : int
            1-based line number, used in error messages only.

        Raises
        ------
        ValidationError
            If the directive does not consist of a name and a value
            separated by whitespace.
        """
        # Drop the two-character marker, then split into "name value".
        directive = line[2:].split(maxsplit=1)
        if len(directive) <= 1:
            raise ValidationError(
                f'GFF directive entry on line {line_number} is incomplete.'
            )
        elif line.startswith('##'):
            self.directives[directive[0]] = directive[1]
        elif line.startswith('#!'):
            self.directives_unofficial[directive[0]] = directive[1]

    def _validate_directives(self) -> bool:
        """Check that a ``gff-version`` directive starting with "3" exists.

        Returns
        -------
        bool
            Always ``True`` when validation passes.

        Raises
        ------
        ValidationError
            If the directive is missing or declares another version.
        """
        if 'gff-version' not in self.directives:
            raise ValidationError(
                '"gff-version" directive is missing from the file headers.'
            )
        if not self.directives['gff-version'].startswith('3'):
            raise ValidationError(
                f'Invalid GFF format version: '
                f'{self.directives["gff-version"]}. Only version 3 '
                f'is supported.'
            )
        return True

    @staticmethod
    def _validate_feature_line(line, line_number):
        """Validate one tab-separated feature line.

        Parameters
        ----------
        line : str
            The (already stripped and decoded) feature line.
        line_number : int
            1-based line number, used in error messages only.

        Raises
        ------
        ValidationError
            If the line does not have 9 columns, contains an empty column,
            has a seqid starting with ">", has non-integer, non-positive or
            inverted coordinates, an unknown strand symbol, or (for CDS
            features) a phase other than 0, 1 or 2.
        """
        line_elements = line.split('\t')
        if len(line_elements) != 9:
            raise ValidationError(
                f'The entry on line {line_number} '
                f'has an incorrect number of elements. All '
                f'entries need to have 9 elements in a valid '
                f'GFF3 file.'
            )

        # 1: seqid, 2: source, 3: type, 4: start, 5:stop,
        # 6: score, 7:strand, 8:phase, 9:attributes
        if any(x in {"", " "} for x in line_elements):
            raise ValidationError(
                f'An empty feature found on line '
                f'{line_number}. Empty features should be '
                f'denoted with a ".".'
            )

        if line_elements[0].startswith('>'):
            raise ValidationError(
                'Landmark IDs must not start with an unescaped'
                f' ">". The ID on line {line_number} was '
                f'"{line_elements[0]}".'
            )

        # Parse the coordinates once; a non-numeric value is a format error
        # and must surface as ValidationError rather than a bare ValueError.
        try:
            start = int(line_elements[3])
            stop = int(line_elements[4])
        except ValueError as e:
            raise ValidationError(
                f'Start and stop positions on line {line_number} '
                f'must be integers.'
            ) from e

        if start > stop:
            raise ValidationError(
                f'Start position on line {line_number} '
                f'is bigger than stop position.'
            )

        if start <= 0 or stop <= 0:
            raise ValidationError(
                'Coordinates should be expressed as '
                f'positive, 1-based integers. At least '
                f'one of the positions on line {line_number} '
                f'is incorrect.'
            )

        if line_elements[6] not in ['+', '-', '?', '.']:
            raise ValidationError(
                f'Strand of the feature on line {line_number} '
                f'is not one of the allowed symbols (+-.?).'
            )

        if line_elements[2] == 'CDS' and \
                line_elements[7] not in ['0', '1', '2']:
            # The message now lists the values the check actually accepts.
            raise ValidationError(
                'Features of type CDS require the phase to '
                'be one of: 0, 1, 2. The phase on line '
                f'{line_number} was {line_elements[7]}.'
            )

    def _validate_(self, level):
        """Validate the file line by line.

        Parameters
        ----------
        level : str
            ``'min'`` stops once line 100 is reached; ``'max'`` reads the
            whole file.

        Raises
        ------
        ValidationError
            On undecodable bytes, malformed directives, a missing or
            unsupported ``gff-version``, or an invalid feature line.
        """
        level_map = {'min': 100, 'max': float('inf')}
        max_lines = level_map[level]

        directives_validated = False

        # Read bytes and decode per line so that a decoding failure can be
        # reported together with the offending line number.
        with self.path.open('rb') as fh:
            try:
                for line_number, line in enumerate(fh, 1):
                    line = line.strip()
                    if line_number >= max_lines:
                        return
                    line = line.decode('utf-8-sig')

                    # Directives are only parsed while no official ("##")
                    # directive has been recorded; later "##"/"#!" lines
                    # fall through to the comment branch below.
                    if line.startswith(("##", "#!")) and not self.directives:
                        self._update_directives(line, line_number)
                    elif line.startswith('#'):
                        continue
                    else:
                        # Check the header once, on the first feature line.
                        if not directives_validated:
                            directives_validated = self._validate_directives()
                        self._validate_feature_line(line, line_number)

            except UnicodeDecodeError as e:
                raise ValidationError(f'utf-8 cannot decode byte on line '
                                      f'{line_number}') from e


class LociDirectoryFormat(model.DirectoryFormat):
    """Directory of GFF3 files named ``[<prefix>_]loci[<n>].gff``."""

    loci = model.FileCollection(r'(.*\_)?loci[0-9]*\.gff$',
                                format=GFF3Format)

    @loci.set_path_maker
    def loci_path_maker(self, genome_id):
        """Return the file name used to store loci of ``genome_id``."""
        return '%s_loci.gff' % genome_id


plugin.register_formats(
    GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat
)


class SeedOrthologDirFmt(model.DirectoryFormat):
    """Directory holding one or more ``*.*.seed_orthologs`` files."""

    seed_orthologs = model.FileCollection(r'.*\..*\.seed_orthologs',
                                          format=OrthologFileFmt,
                                          optional=False)

    @seed_orthologs.set_path_maker
    def seed_ortholog_pathmaker(self, sample_name):
        """Return ``<text before the first '.'>.seed_orthologs``."""
        # NOTE(review): the generated name contains a single "." while the
        # collection pattern above requires two - confirm that files written
        # through this path maker still match the pattern.
        return str(sample_name.split(sep=".")[0] + ".seed_orthologs")


plugin.register_formats(OrthologFileFmt, SeedOrthologDirFmt)

# diff --git a/q2_types/genome_data/_transformer.py b/q2_types/genome_data/_transformer.py
# new file mode 100644
# index 00000000..504e8002
# --- /dev/null
# +++ b/q2_types/genome_data/_transformer.py
# @@ -0,0 +1,106 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
+# ---------------------------------------------------------------------------- + +import collections.abc +import os + +import pandas as pd +import skbio +from skbio.io import read + +from . import ( + GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format, OrthologFileFmt +) +from ..plugin_setup import plugin + +CONSTRUCTORS = { + 'DNA': skbio.DNA, + 'RNA': skbio.RNA, + 'protein': skbio.Protein +} + + +@plugin.register_transformer +def _8(ortholog_file: OrthologFileFmt) -> pd.DataFrame: + + seed_ortholog_column_names = ['qseqid', 'sseqid', 'evalue', 'bitscore', + 'qstart', 'qend', 'sstart', 'send', 'pident', + 'qcov', 'scov'] + + return pd.read_csv(ortholog_file.path, sep="\t", + names=seed_ortholog_column_names, + header='infer', + comment="#" + ) + + +def _series_to_fasta(series, ff, seq_type='DNA'): + fp = os.path.join(ff.path, f'{series.name}.fasta') + with open(fp, 'w') as fh: + for id_, seq in series.iteritems(): + if seq: + sequence = CONSTRUCTORS[seq_type](seq, metadata={'id': id_}) + skbio.io.write(sequence, format='fasta', into=fh) + + +def _multi_sequences_to_df(seq_iter_view): + data = { + os.path.splitext(fp)[0]: pds + for fp, pds in seq_iter_view + } + df = pd.DataFrame.from_dict(data, orient='index') + df.index.name = 'Genome ID' + df = df.astype(str).replace({'nan': None}) + return df + + +@plugin.register_transformer +def _1(dirfmt: GenesDirectoryFormat) -> pd.DataFrame: + return _multi_sequences_to_df(dirfmt.genes.iter_views(pd.Series)) + + +@plugin.register_transformer +def _2(df: pd.DataFrame) -> GenesDirectoryFormat: + result = GenesDirectoryFormat() + df.apply(_series_to_fasta, axis=1, ff=result, seq_type='DNA') + return result + + +@plugin.register_transformer +def _3(dirfmt: ProteinsDirectoryFormat) -> pd.DataFrame: + return _multi_sequences_to_df(dirfmt.proteins.iter_views(pd.Series)) + + +@plugin.register_transformer +def _4(df: pd.DataFrame) -> ProteinsDirectoryFormat: + result = ProteinsDirectoryFormat() + df.apply(_series_to_fasta, 
axis=1, ff=result, seq_type='protein') + return result + + +class IntervalMetadataIterator(collections.abc.Iterable): + def __init__(self, generator): + self.generator = generator + + def __iter__(self): + yield from self.generator + + +@plugin.register_transformer +def _5(fmt: GFF3Format) -> IntervalMetadataIterator: + generator = read(str(fmt), format='gff3') + return IntervalMetadataIterator(generator) + + +@plugin.register_transformer +def _7(data: IntervalMetadataIterator) -> GFF3Format: + ff = GFF3Format() + with ff.open() as fh: + for _id, im in data: + im.write(fh, format='gff3', seq_id=_id) + return ff diff --git a/q2_types/genome_data/_type.py b/q2_types/genome_data/_type.py new file mode 100644 index 00000000..ad60cfd5 --- /dev/null +++ b/q2_types/genome_data/_type.py @@ -0,0 +1,45 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +from qiime2.plugin import SemanticType +from q2_types.feature_data import BLAST6 + +from . 
import (
+    GenesDirectoryFormat, ProteinsDirectoryFormat, LociDirectoryFormat,
+    SeedOrthologDirFmt,
+)
+from ..plugin_setup import plugin
+
+GenomeData = SemanticType('GenomeData', field_names='type',
+                          field_members={'type': BLAST6})
+Genes = SemanticType('Genes', variant_of=GenomeData.field['type'])
+Proteins = SemanticType('Proteins', variant_of=GenomeData.field['type'])
+Loci = SemanticType('Loci', variant_of=GenomeData.field['type'])
+Ortholog = SemanticType('Ortholog', variant_of=GenomeData.field['type'])
+# NOTE(review): Ortholog is declared above but not registered here - confirm.
+plugin.register_semantic_types(GenomeData, Genes, Proteins, Loci)
+# Bind each concrete GenomeData[...] type to its artifact directory format.
+plugin.register_semantic_type_to_format(
+    GenomeData[Genes],
+    artifact_format=GenesDirectoryFormat
+)
+
+plugin.register_semantic_type_to_format(
+    GenomeData[Proteins],
+    artifact_format=ProteinsDirectoryFormat
+)
+
+plugin.register_semantic_type_to_format(
+    GenomeData[Loci],
+    artifact_format=LociDirectoryFormat
+)
+
+plugin.register_semantic_type_to_format(
+    GenomeData[BLAST6],
+    artifact_format=SeedOrthologDirFmt
+)
diff --git a/q2_types/genome_data/tests/__init__.py b/q2_types/genome_data/tests/__init__.py
new file mode 100644
index 00000000..afcc05c2
--- /dev/null
+++ b/q2_types/genome_data/tests/__init__.py
@@ -0,0 +1,7 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ---------------------------------------------------------------------------- diff --git a/q2_types/genome_data/tests/data/genes-with-prefix/ABC_123.1_genes.fa b/q2_types/genome_data/tests/data/genes-with-prefix/ABC_123.1_genes.fa new file mode 100644 index 00000000..e00ad6c1 --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-prefix/ABC_123.1_genes.fa @@ -0,0 +1,9 @@ +>gene1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene2 some_description2 +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGG +>gene3 some_description3 +GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA +CTGGAGG diff --git a/q2_types/genome_data/tests/data/genes-with-prefix/BCD_234.2_genes.fa b/q2_types/genome_data/tests/data/genes-with-prefix/BCD_234.2_genes.fa new file mode 100644 index 00000000..dd12c09a --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-prefix/BCD_234.2_genes.fa @@ -0,0 +1,6 @@ +>gene11 some_description11 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene12 some_description12 +AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA +CGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/genes-with-suffix/genes1.fa b/q2_types/genome_data/tests/data/genes-with-suffix/genes1.fa new file mode 100644 index 00000000..e00ad6c1 --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-suffix/genes1.fa @@ -0,0 +1,9 @@ +>gene1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene2 some_description2 +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGG +>gene3 some_description3 +GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA +CTGGAGG diff --git a/q2_types/genome_data/tests/data/genes-with-suffix/genes2.fa 
b/q2_types/genome_data/tests/data/genes-with-suffix/genes2.fa new file mode 100644 index 00000000..dd12c09a --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-suffix/genes2.fa @@ -0,0 +1,6 @@ +>gene11 some_description11 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene12 some_description12 +AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA +CGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix1genes.fa b/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix1genes.fa new file mode 100644 index 00000000..e00ad6c1 --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix1genes.fa @@ -0,0 +1,9 @@ +>gene1 some_description1 +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene2 some_description2 +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGG +>gene3 some_description3 +GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACCCGCCCGCTATAGTGAACACCTAAGAA +CTGGAGG diff --git a/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix2genes.fa b/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix2genes.fa new file mode 100644 index 00000000..dd12c09a --- /dev/null +++ b/q2_types/genome_data/tests/data/genes-with-wrong-prefix/wrongprefix2genes.fa @@ -0,0 +1,6 @@ +>gene11 some_description11 +ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAGACCCGCCCGCACCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGC +>gene12 some_description12 +AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAGA +CGGGGGGCCTTGG diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-corrupt.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-corrupt.gff new file mode 100644 index 00000000..8edc937e --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-corrupt.gff @@ -0,0 +1,2 @@ +>This data 
is corrupt +åååååååååå \ No newline at end of file diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-directive-empty.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-directive-empty.gff new file mode 100644 index 00000000..d8a2935e --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-directive-empty.gff @@ -0,0 +1,10 @@ +## +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. 
Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-empty-feature.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-empty-feature.gff new file mode 100644 index 00000000..aab83ae3 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-empty-feature.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). 
Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-char.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-char.gff new file mode 100644 index 00000000..3393b016 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-char.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +>AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. 
Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-phase.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-phase.gff new file mode 100644 index 00000000..dc131675 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-phase.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 8 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. 
Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-start.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-start.gff new file mode 100644 index 00000000..1a593cec --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-start.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1528 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. 
dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-strand.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-strand.gff new file mode 100644 index 00000000..131bdbb8 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-invalid-strand.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . 
$ 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-lines-unequal.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-lines-unequal.gff new file mode 100644 index 00000000..f4cd35c7 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-lines-unequal.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . 
+ ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-negative-start.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-negative-start.gff new file mode 100644 index 00000000..9b7a2194 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-negative-start.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region -1 4411532 . + . 
ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. 
Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-no-version.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-no-version.gff new file mode 100644 index 00000000..08e34887 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-no-version.gff @@ -0,0 +1,9 @@ +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. 
Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-invalid/loci-wrong-version.gff b/q2_types/genome_data/tests/data/loci-invalid/loci-wrong-version.gff new file mode 100644 index 00000000..b5ba9c12 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-invalid/loci-wrong-version.gff @@ -0,0 +1,10 @@ +##gff-version 2 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). 
Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-with-prefix/ABC_123.1_loci.gff b/q2_types/genome_data/tests/data/loci-with-prefix/ABC_123.1_loci.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-prefix/ABC_123.1_loci.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. 
Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-with-prefix/BCD_234.2_loci.gff b/q2_types/genome_data/tests/data/loci-with-prefix/BCD_234.2_loci.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-prefix/BCD_234.2_loci.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/loci-with-suffix/loci1.gff b/q2_types/genome_data/tests/data/loci-with-suffix/loci1.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-suffix/loci1.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 
4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. 
Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-with-suffix/loci2.gff b/q2_types/genome_data/tests/data/loci-with-suffix/loci2.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-suffix/loci2.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix1loci.gff b/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix1loci.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix1loci.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . 
ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. 
Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix2loci.gff b/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix2loci.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/loci-with-wrong-prefix/wrongprefix2loci.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/ortholog/test_sample.emapper.seed_orthologs b/q2_types/genome_data/tests/data/ortholog/test_sample.emapper.seed_orthologs new file mode 100644 index 00000000..7ab4ebd8 --- /dev/null +++ b/q2_types/genome_data/tests/data/ortholog/test_sample.emapper.seed_orthologs @@ -0,0 +1,3 @@ +0_0 316407.85674276 0.0 1597.0 1 2460 1 820 100.0 48.8 100.0 +0_2 316407.85674277 4.42e-217 629.0 1 930 1 310 100.0 18.5 100.0 +0_1 316407.21321894 2.29e-303 857.0 1 1284 1 428 100.0 25.5 100.0 diff --git a/q2_types/genome_data/tests/data/proteins-with-prefix/ABC_123.1_proteins.faa 
b/q2_types/genome_data/tests/data/proteins-with-prefix/ABC_123.1_proteins.faa new file mode 100644 index 00000000..2271b842 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-prefix/ABC_123.1_proteins.faa @@ -0,0 +1,6 @@ +>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442 +MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM +ADATYIEPITP +>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426 +MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG +RIEQVEAILSLAEIIDPAK diff --git a/q2_types/genome_data/tests/data/proteins-with-prefix/BCD_234.2_proteins.faa b/q2_types/genome_data/tests/data/proteins-with-prefix/BCD_234.2_proteins.faa new file mode 100644 index 00000000..4423cfa8 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-prefix/BCD_234.2_proteins.faa @@ -0,0 +1,6 @@ +>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669 +MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTATAEEIAAKDPIGIILSG +GPSSVYEPGAPTLDPAVFDLGVP +>k129_5112_2 # 1261 # 1797 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.650 +MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYSPLHVELFAGGSGGFLWF +APVLVG diff --git a/q2_types/genome_data/tests/data/proteins-with-suffix/proteins1.faa b/q2_types/genome_data/tests/data/proteins-with-suffix/proteins1.faa new file mode 100644 index 00000000..2271b842 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-suffix/proteins1.faa @@ -0,0 +1,6 @@ +>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442 +MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM +ADATYIEPITP +>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426 
+MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG +RIEQVEAILSLAEIIDPAK diff --git a/q2_types/genome_data/tests/data/proteins-with-suffix/proteins2.faa b/q2_types/genome_data/tests/data/proteins-with-suffix/proteins2.faa new file mode 100644 index 00000000..4423cfa8 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-suffix/proteins2.faa @@ -0,0 +1,6 @@ +>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669 +MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTATAEEIAAKDPIGIILSG +GPSSVYEPGAPTLDPAVFDLGVP +>k129_5112_2 # 1261 # 1797 # -1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.650 +MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYSPLHVELFAGGSGGFLWF +APVLVG diff --git a/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix1proteins.faa b/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix1proteins.faa new file mode 100644 index 00000000..2271b842 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix1proteins.faa @@ -0,0 +1,6 @@ +>k129_5480_1 # 3 # 1988 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442 +MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYRVVLINSNPATIMTDPNM +ADATYIEPITP +>k129_5480_2 # 2150 # 2623 # 1 # ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426 +MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLSENAEYHAAREQQSFIEG +RIEQVEAILSLAEIIDPAK diff --git a/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix2proteins.faa b/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix2proteins.faa new file mode 100644 index 00000000..4423cfa8 --- /dev/null +++ b/q2_types/genome_data/tests/data/proteins-with-wrong-prefix/wrongprefix2proteins.faa @@ -0,0 +1,6 @@ +>k129_5112_1 # 1 # 1218 # -1 # ID=1_1;partial=10;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.669 
import unittest

from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase

from .._format import (
    GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
    LociDirectoryFormat, SeedOrthologDirFmt, OrthologFileFmt,
)


class TestFormats(TestPluginBase):
    '''
    Validation tests for the genome_data file and directory formats.

    Each test points a format at a fixture below ``tests/data`` and either
    expects ``validate()`` to succeed or to raise a ``ValidationError``
    whose message matches a given regular expression.
    '''
    package = 'q2_types.genome_data.tests'

    def _assert_valid(self, fmt_cls, relpath):
        '''Validate fixture `relpath` as `fmt_cls`; any error fails the
        test.'''
        fmt_cls(self.get_data_path(relpath), mode='r').validate()

    def _assert_invalid(self, fmt_cls, relpath, pattern):
        '''Expect validating `relpath` as `fmt_cls` to raise a
        ValidationError whose message matches `pattern`.'''
        path = self.get_data_path(relpath)
        with self.assertRaisesRegex(ValidationError, pattern):
            fmt_cls(path, mode='r').validate()

    # ---- ortholog formats ------------------------------------------------

    def test_ortholog_file_fmt(self):
        self._assert_valid(
            OrthologFileFmt, 'ortholog/test_sample.emapper.seed_orthologs')

    def test_seed_ortholog_dir_fmt_collection_file_name(self):
        dirpath = self.get_data_path('ortholog')
        fmt_obj = SeedOrthologDirFmt(dirpath, mode='r')

        # Collect every basename up front: the previous loop left `obs`
        # unbound (NameError instead of a test failure) when the collection
        # was empty.
        obs = [
            str(obj).rsplit("/", 1)[-1]
            for _, obj in fmt_obj.seed_orthologs.iter_views(OrthologFileFmt)
        ]
        exp = "test_sample.emapper.seed_orthologs"

        self.assertIn(exp, obs)

    def test_seed_ortholog_dir_fmt_good_validate(self):
        self._assert_valid(SeedOrthologDirFmt, 'ortholog')

    def test_seed_ortholog_dir_fmt_collection(self):
        dirpath = self.get_data_path('ortholog/')
        fmt = SeedOrthologDirFmt(dirpath, mode='r')

        views = list(fmt.seed_orthologs.iter_views(OrthologFileFmt))
        # An empty collection would otherwise make this test pass without
        # checking anything.
        self.assertTrue(views, 'no seed ortholog files were collected')
        for _, obj in views:
            self.assertIsInstance(obj=obj, cls=OrthologFileFmt)
            obj.validate()

    # ---- genes -----------------------------------------------------------

    def test_genes_dirfmt_fa_with_suffix(self):
        self._assert_valid(GenesDirectoryFormat, 'genes-with-suffix')

    def test_genes_dirfmt_fa_with_prefix(self):
        self._assert_valid(GenesDirectoryFormat, 'genes-with-prefix')

    def test_genes_dirfmt_fa_with_wrong_prefix(self):
        self._assert_invalid(
            GenesDirectoryFormat, 'genes-with-wrong-prefix',
            'Missing one or more files for GenesDirectoryFormat')

    # ---- proteins --------------------------------------------------------

    def test_proteins_dirfmt_fa_with_suffix(self):
        self._assert_valid(ProteinsDirectoryFormat, 'proteins-with-suffix')

    def test_proteins_dirfmt_fa_with_prefix(self):
        self._assert_valid(ProteinsDirectoryFormat, 'proteins-with-prefix')

    def test_proteins_dirfmt_fa_with_wrong_prefix(self):
        self._assert_invalid(
            ProteinsDirectoryFormat, 'proteins-with-wrong-prefix',
            'Missing one or more files for ProteinsDirectoryFormat')

    # ---- loci / GFF3 -----------------------------------------------------

    def test_gff_format_positive_with_suffix(self):
        self._assert_valid(GFF3Format, 'loci-with-suffix/loci1.gff')

    def test_loci_dirfmt_with_suffix(self):
        self._assert_valid(LociDirectoryFormat, 'loci-with-suffix')

    def test_loci_dirfmt_with_prefix(self):
        self._assert_valid(LociDirectoryFormat, 'loci-with-prefix')

    def test_loci_dirfmt_with_wrong_prefix(self):
        self._assert_invalid(
            LociDirectoryFormat, 'loci-with-wrong-prefix',
            'Missing one or more files for LociDirectoryFormat')

    def test_gff_format_wrong_version(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-wrong-version.gff',
            'Invalid GFF format version: 2.')

    def test_gff_format_no_version(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-no-version.gff',
            '"gff-version" directive is missing')

    def test_gff_format_empty_directive(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-directive-empty.gff',
            'directive entry on line 1 is incomplete.')

    def test_gff_format_lines_nonequal(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-lines-unequal.gff',
            'line 9 has an incorrect number of elements')

    def test_gff_format_empty_feature(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-empty-feature.gff',
            r'empty feature found on line 9')

    def test_gff_format_invalid_char(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-invalid-char.gff',
            r'unescaped ">". The ID on line 10 '
            r'was ">AL123456.3"')

    def test_gff_format_invalid_start_stop(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-invalid-start.gff',
            'position on line 9 is bigger than stop')

    def test_gff_format_negative_position(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-negative-start.gff',
            'positions on line 8 is incorrect')

    def test_gff_format_invalid_strand(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-invalid-strand.gff',
            'feature on line 10 is not one '
            'of the allowed')

    def test_gff_format_invalid_phase(self):
        self._assert_invalid(
            GFF3Format, 'loci-invalid/loci-invalid-phase.gff',
            'The phase on line 10 was 8.')


if __name__ == '__main__':
    unittest.main()
import unittest

import pandas as pd
import skbio.io
from qiime2.plugin.testing import TestPluginBase

from q2_types.genome_data import (
    GenesDirectoryFormat, ProteinsDirectoryFormat, GFF3Format,
    IntervalMetadataIterator
)


class TestTransformers(TestPluginBase):
    '''
    Round-trip tests for the genome_data transformers (genes/proteins
    directory formats <-> DataFrame, GFF3 <-> interval metadata iterator).
    '''
    package = 'q2_types.genome_data.tests'

    def setUp(self):
        super().setUp()
        # Expected sequences, keyed by genome ID and then by sequence ID;
        # they mirror the fixture FASTA files under tests/data.
        self.genes = {
            'genes1': {
                'gene1': 'GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCC'
                         'TCATCGCTGGGCACAGCCCAGAGGGTATAAACAGTGCTGGAGGC',
                'gene2': 'CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACT'
                         'AAATACCATATAGTGAACACCTAAGACGGGGGGCCTTGG',
                'gene3': 'GCACCCGGCCAATTTTTGTGTTTTTAGTAGAGAAGATTCCCCCTAGACC'
                         'CGCCCGCTATAGTGAACACCTAAGAACTGGAGG'
            },
            'genes2': {
                'gene11': 'ATGGTCAGGCATGCCCCTCCTCATCGCTGGGCGGCAGATTCCCCCTAG'
                          'ACCCGCCCGCACCACAGCCCAGAGGGTATAAACAGTGCTGGAGGC',
                'gene12': 'AATACCATATAGTGAACACCTAACCACTGCACTCACCGCACCCGGCCA'
                          'ATTTTTGTGTTTTTAGTAGAGACTAGACGGGGGGCCTTGG'
            }
        }
        self.proteins = {
            'proteins1': {
                'k129_5480_1': 'MPKRTDISSICIIGAGPIVIGQACEFDYSGAQACKALKEEGYR'
                               'VVLINSNPATIMTDPNMADATYIEPITP',
                'k129_5480_2': 'MQKIPLTKQGHTDLEAELKDLKHRQRPAVIAAISEAREHGDLS'
                               'ENAEYHAAREQQSFIEGRIEQVEAILSLAEIIDPAK'
            },
            'proteins2': {
                'k129_5112_1': 'MTEQTETSQRPVLVVDFGAQYAQLIARRVREAGVYSELVPHTA'
                               'TAEEIAAKDPIGIILSGGPSSVYEPGAPTLDPAVFDLGVP',
                'k129_5112_2': 'MPREPKPSSFPAIRGALTFYQVASIITGVMLLLLLAEMILKYS'
                               'PLHVELFAGGSGGFLWFAPVLVG'
            }
        }

    @staticmethod
    def seqs_to_df(seqs):
        '''
        Build the expected DataFrame from a nested mapping.

        Parameters
        ----------
        seqs : dict
            Mapping of genome ID -> {sequence ID -> sequence string}.

        Returns
        -------
        pd.DataFrame
            One row per genome (index named 'Genome ID'), one column per
            sequence ID; sequences absent from a genome are None.
        '''
        df = pd.DataFrame.from_dict(seqs, orient='index')
        # astype(str) turns missing cells into the literal 'nan'; map them
        # back to None
        df = df.astype(str).replace({'nan': None})
        df.index.name = 'Genome ID'
        return df

    def _assert_same_records(self, obs, exp):
        '''Compare two fully materialised record lists pairwise, refusing
        to pass when either is empty or their lengths differ.'''
        self.assertTrue(exp, 'expected at least one GFF3 record')
        self.assertEqual(len(obs), len(exp))
        for o, e in zip(obs, exp):
            self.assertEqual(o, e)

    def test_genes_to_dataframe(self):
        _, obs = self.transform_format(GenesDirectoryFormat, pd.DataFrame,
                                       filenames=[
                                           'genes-with-suffix/genes1.fa',
                                           'genes-with-suffix/genes2.fa'
                                       ])
        exp = self.seqs_to_df(self.genes)
        pd.testing.assert_frame_equal(exp, obs)

    def test_dataframe_to_genes(self):
        transformer = self.get_transformer(pd.DataFrame, GenesDirectoryFormat)
        df = self.seqs_to_df(self.genes)

        obs = transformer(df)
        self.assertIsInstance(obs, GenesDirectoryFormat)

    def test_proteins_to_dataframe(self):
        _, obs = self.transform_format(
            ProteinsDirectoryFormat,
            pd.DataFrame,
            filenames=[
                'proteins-with-suffix/proteins1.faa',
                'proteins-with-suffix/proteins2.faa'
            ])
        exp = self.seqs_to_df(self.proteins)
        pd.testing.assert_frame_equal(exp, obs)

    def test_dataframe_to_proteins(self):
        transformer = self.get_transformer(
            pd.DataFrame, ProteinsDirectoryFormat)
        df = self.seqs_to_df(self.proteins)

        obs = transformer(df)
        self.assertIsInstance(obs, ProteinsDirectoryFormat)

    def test_gff_to_interval_metadata_iterator(self):
        gff_input, obs = self.transform_format(
            GFF3Format,
            IntervalMetadataIterator,
            filename='loci-with-suffix/loci1.gff')
        exp = list(skbio.io.read(str(gff_input), format='gff3'))

        # Materialise both sides so a short or empty result cannot be
        # hidden by zip() stopping at the shorter iterable.
        self._assert_same_records(list(obs), exp)

    def test_interval_metadata_iterator_to_gff(self):
        transformer = self.get_transformer(IntervalMetadataIterator,
                                           GFF3Format)
        filepath = self.get_data_path('loci-with-suffix/loci1.gff')

        # Read the expected records separately from the iterator handed to
        # the transformer. The test used to compare against that same
        # iterator after the transformer had consumed it; if it is
        # single-pass (it wraps a generator), the comparison loop never
        # ran. NOTE(review): confirm the GFF3 round trip is expected to be
        # lossless for this fixture.
        exp = list(skbio.io.read(filepath, format='gff3'))
        gff_input = IntervalMetadataIterator(
            skbio.io.read(filepath, format='gff3'))

        obs = transformer(gff_input)
        self.assertIsInstance(obs, GFF3Format)
        obs = list(skbio.io.read(str(obs), format='gff3'))

        self._assert_same_records(obs, exp)


if __name__ == '__main__':
    unittest.main()
import unittest

from qiime2.plugin.testing import TestPluginBase


from q2_types.genome_data import (
    GenomeData, Genes, Proteins, Loci, GenesDirectoryFormat,
    ProteinsDirectoryFormat, LociDirectoryFormat, SeedOrthologDirFmt, BLAST6
    )


class TestTypes(TestPluginBase):
    '''
    Registration tests for the genome_data semantic types: each type must
    be registered, and each GenomeData variant must map to its directory
    format.
    '''
    package = 'q2_types.genome_data.tests'

    # ---- plain semantic type registration --------------------------------

    def test_genome_data_semantic_type_registration(self):
        semantic_type = GenomeData
        self.assertRegisteredSemanticType(semantic_type)

    def test_genes_semantic_type_registration(self):
        semantic_type = Genes
        self.assertRegisteredSemanticType(semantic_type)

    def test_proteins_semantic_type_registration(self):
        semantic_type = Proteins
        self.assertRegisteredSemanticType(semantic_type)

    def test_loci_semantic_type_registration(self):
        semantic_type = Loci
        self.assertRegisteredSemanticType(semantic_type)

    # ---- semantic type -> directory format -------------------------------

    def test_blast6_registered_to_seedorthologdirfmt(self):
        semantic_type, dir_fmt = GenomeData[BLAST6], SeedOrthologDirFmt
        self.assertSemanticTypeRegisteredToFormat(semantic_type, dir_fmt)

    def test_genome_data_genes_to_genes_dir_fmt_registration(self):
        semantic_type, dir_fmt = GenomeData[Genes], GenesDirectoryFormat
        self.assertSemanticTypeRegisteredToFormat(semantic_type, dir_fmt)

    def test_genome_data_proteins_to_proteins_dir_fmt_registration(self):
        semantic_type, dir_fmt = GenomeData[Proteins], ProteinsDirectoryFormat
        self.assertSemanticTypeRegisteredToFormat(semantic_type, dir_fmt)

    def test_genome_data_loci_to_loci_dir_fmt_registration(self):
        semantic_type, dir_fmt = GenomeData[Loci], LociDirectoryFormat
        self.assertSemanticTypeRegisteredToFormat(semantic_type, dir_fmt)


if __name__ == '__main__':
    unittest.main()
from qiime2.plugin import model
from q2_types.reference_db._format import (
    NCBITaxonomyNamesFormat, NCBITaxonomyNodesFormat
)
from ..plugin_setup import plugin


class KaijuIndexFormat(model.BinaryFileFormat):
    '''
    Binary Kaiju index file. The contents are treated as opaque: validation
    performs no checks at any level.
    '''

    # No constructor is defined here: the inherited one is used as-is.

    def _validate_(self, level):
        # Intentionally a no-op
        return None


class KaijuDBDirectoryFormat(model.DirectoryFormat):
    '''
    Directory holding a Kaiju database: the NCBI taxonomy nodes and names
    dumps plus one `kaiju_db*.fmi` index file.
    '''
    # NCBI taxonomy dumps
    nodes = model.File(
        r"nodes.dmp",
        format=NCBITaxonomyNodesFormat,
    )
    names = model.File(
        r"names.dmp",
        format=NCBITaxonomyNamesFormat,
    )
    # Index file; the name between `kaiju_db` and `.fmi` is free-form
    index = model.File(
        r"kaiju_db.+\.fmi",
        format=KaijuIndexFormat,
    )


plugin.register_formats(KaijuDBDirectoryFormat)
+# ---------------------------------------------------------------------------- +from qiime2.plugin import SemanticType + +from . import KaijuDBDirectoryFormat +from ..plugin_setup import plugin + + +KaijuDB = SemanticType("KaijuDB") + +plugin.register_semantic_types(KaijuDB) + +plugin.register_semantic_type_to_format( + KaijuDB, artifact_format=KaijuDBDirectoryFormat +) diff --git a/q2_types/kaiju/tests/__init__.py b/q2_types/kaiju/tests/__init__.py new file mode 100644 index 00000000..afcc05c2 --- /dev/null +++ b/q2_types/kaiju/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_types/kaiju/tests/data/db-valid/kaiju_db_test.fmi b/q2_types/kaiju/tests/data/db-valid/kaiju_db_test.fmi new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kaiju/tests/data/db-valid/names.dmp b/q2_types/kaiju/tests/data/db-valid/names.dmp new file mode 100644 index 00000000..b89e8a2b --- /dev/null +++ b/q2_types/kaiju/tests/data/db-valid/names.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_types/kaiju/tests/data/db-valid/nodes.dmp b/q2_types/kaiju/tests/data/db-valid/nodes.dmp new file mode 100644 index 00000000..61a662a0 --- /dev/null +++ b/q2_types/kaiju/tests/data/db-valid/nodes.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 
1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/kaiju/tests/test_format.py b/q2_types/kaiju/tests/test_format.py new file mode 100644 index 00000000..7a3f4652 --- /dev/null +++ b/q2_types/kaiju/tests/test_format.py @@ -0,0 +1,24 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import unittest +from qiime2.plugin.testing import TestPluginBase + +from q2_types.kaiju._format import KaijuDBDirectoryFormat + + +class TestFormats(TestPluginBase): + package = "q2_types.kaiju.tests" + + def test_kaiju_dirfmt(self): + dirpath = self.get_data_path("db-valid") + format = KaijuDBDirectoryFormat(dirpath, mode="r") + format.validate() + + +if __name__ == "__main__": + unittest.main() diff --git a/q2_types/kaiju/tests/test_type.py b/q2_types/kaiju/tests/test_type.py new file mode 100644 index 00000000..4069c464 --- /dev/null +++ b/q2_types/kaiju/tests/test_type.py @@ -0,0 +1,29 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. 
import unittest

from qiime2.plugin.testing import TestPluginBase

from q2_types.kaiju import KaijuDB, KaijuDBDirectoryFormat


class TestTypes(TestPluginBase):
    '''
    Registration tests for the KaijuDB semantic type and its directory
    format.
    '''
    package = "q2_types.kaiju.tests"

    def test_kaijudb_semantic_type_registration(self):
        semantic_type = KaijuDB
        self.assertRegisteredSemanticType(semantic_type)

    def test_kaijudb_semantic_type_to_format_registration(self):
        semantic_type, dir_fmt = KaijuDB, KaijuDBDirectoryFormat
        self.assertSemanticTypeRegisteredToFormat(semantic_type, dir_fmt)


if __name__ == "__main__":
    unittest.main()
import importlib

from ._format import (
    Kraken2ReportFormat, Kraken2ReportDirectoryFormat,
    Kraken2OutputFormat, Kraken2OutputDirectoryFormat,
    Kraken2DBFormat, Kraken2DBReportFormat, Kraken2DBReportDirectoryFormat,
    Kraken2DBDirectoryFormat, BrackenDBFormat, BrackenDBDirectoryFormat
)
# BrackenDB is defined and registered in ._type alongside the other
# semantic types; export it as well so callers can import it from the
# package like its siblings.
from ._type import (
    Kraken2Reports, Kraken2Outputs, Kraken2DB, Kraken2DBReport, BrackenDB
)

__all__ = [
    'Kraken2ReportFormat', 'Kraken2ReportDirectoryFormat', 'Kraken2Reports',
    'Kraken2OutputFormat', 'Kraken2OutputDirectoryFormat', 'Kraken2Outputs',
    'Kraken2DBFormat', 'Kraken2DBReportFormat', 'Kraken2DBReport',
    'Kraken2DBReportDirectoryFormat', 'Kraken2DBDirectoryFormat', 'Kraken2DB',
    'BrackenDBFormat', 'BrackenDBDirectoryFormat', 'BrackenDB'
]

# Imported for its side effect: registers the transformers with the plugin.
importlib.import_module('q2_types.kraken2._transformer')
import pandas as pd
from pandas.core.dtypes.common import is_string_dtype
from qiime2.core.exceptions import ValidationError
from qiime2.plugin import model

from ..plugin_setup import plugin


class Kraken2ReportFormat(model.TextFileFormat):
    '''
    Headerless, tab-separated Kraken2 report with either 6 columns
    (NORMAL_COLUMNS) or 8 columns (ALL_COLUMNS, which adds the two
    minimizer counts).
    '''
    MEASURE_COLUMNS = {
        'perc_frags_covered': float, 'n_frags_covered': int,
        'n_frags_assigned': int
    }

    MINIMIZER_COLUMS = {
        'n_read_minimizers': int,
        'n_uniq_minimizers': int
    }
    # Correctly spelled alias; the misspelled name above is kept because
    # other code may already reference it.
    MINIMIZER_COLUMNS = MINIMIZER_COLUMS

    TAXA_COLUMNS = {
        'rank': str, 'taxon_id': int, 'name': str
    }

    NORMAL_COLUMNS = {**MEASURE_COLUMNS, **TAXA_COLUMNS}
    ALL_COLUMNS = {**MEASURE_COLUMNS, **MINIMIZER_COLUMS, **TAXA_COLUMNS}

    @staticmethod
    def _dtype_matches(series, dtype):
        '''
        Tell whether `series` holds values of the builtin type `dtype`.

        Uses the pandas dtype predicates rather than comparing the numpy
        dtype to `int`/`float` directly, because that comparison depends on
        the platform's default integer width.
        '''
        if dtype is str:
            return is_string_dtype(series)
        if dtype is int:
            return pd.api.types.is_integer_dtype(series)
        if dtype is float:
            return pd.api.types.is_float_dtype(series)
        return series.dtype == dtype

    def _to_dataframe(self):
        '''
        Read the report without assigning column names.

        Returns
        -------
        tuple of (pd.DataFrame, dict)
            The raw table and the column-name -> type mapping whose length
            matches the number of columns found.

        Raises
        ------
        ValueError
            If the table has neither of the two supported column counts.
        '''
        df = pd.read_csv(self.path, sep='\t', header=None)
        for columns in (self.NORMAL_COLUMNS, self.ALL_COLUMNS):
            if len(df.columns) == len(columns):
                return df, columns
        raise ValueError(
            f'Length mismatch: expected {len(self.NORMAL_COLUMNS)} or '
            f'{len(self.ALL_COLUMNS)} columns, '
            f'found {len(df.columns)}.'
        )

    def _validate_(self, level):
        '''
        Check the column count and the type of every column.

        Raises
        ------
        ValidationError
            If the file cannot be read, has an unexpected number of
            columns, or a column has the wrong type.
        '''
        try:
            df, COLUMNS = self._to_dataframe()
            df.columns = COLUMNS.keys()
        except ValueError as e:
            if 'Length mismatch' in str(e):
                raise ValidationError(str(e)) from e
            raise ValidationError(
                'An error occurred when reading in the '
                'Kraken2 report file'
            ) from e
        for col, dtype in COLUMNS.items():
            if self._dtype_matches(df[col], dtype):
                continue
            raise ValidationError(
                f'Expected {dtype} type in the "{col}" column, '
                f'got {df[col].dtype}'
            )


class Kraken2ReportDirectoryFormat(model.DirectoryFormat):
    '''Collection of Kraken2 reports named `*report.txt` / `*report.tsv`.'''
    reports = model.FileCollection(
        r'.+report\.(txt|tsv)$', format=Kraken2ReportFormat
    )

    @reports.set_path_maker
    def reports_path_maker(self, sample_id, mag_id=None):
        # <sample_id>/report.txt, or <sample_id>/<mag_id>_report.txt when a
        # MAG ID is given
        prefix = f'{sample_id}/{mag_id}_' if mag_id else f'{sample_id}/'
        return f'{prefix}report.txt'


class Kraken2DBReportFormat(Kraken2ReportFormat):
    '''
    Report describing a Kraken2 database: six tab-separated columns,
    optionally preceded by header lines that start with '#'.
    '''
    COLUMNS = {
        'perc_minimizers_covered': float,
        'n_minimizers_covered': int,
        'n_minimizers_assigned': int,
        'rank': str,
        'taxon_id': int,
        'name': str
    }

    def _to_dataframe(self):
        '''
        Read the report, skipping the leading '#' header lines.

        Returns
        -------
        tuple of (pd.DataFrame, dict)
            The raw table and the COLUMNS mapping.

        Raises
        ------
        ValueError
            If the number of columns differs from len(COLUMNS).
        '''
        num_headers = self._count_headers()
        df = pd.read_csv(
            self.path, sep='\t', header=None, skiprows=num_headers
        )
        if len(df.columns) != len(self.COLUMNS):
            raise ValueError(
                f'Length mismatch: expected {len(self.COLUMNS)} columns, '
                f'found {len(df.columns)}.'
            )
        return df, self.COLUMNS

    def _count_headers(self):
        '''
        kraken2-inspect adds several headers beginning with '#' which we
        wish to ignore.

        Only the leading run of such lines is counted, because the result
        is passed to `skiprows`, which skips that many lines from the top
        of the file. The file is streamed and reading stops at the first
        non-header line.
        '''
        num_headers = 0
        with open(self.path, 'r') as fh:
            for line in fh:
                if not line.startswith('#'):
                    break
                num_headers += 1
        return num_headers


Kraken2DBReportDirectoryFormat = model.SingleFileDirectoryFormat(
    'Kraken2DBReportDirectoryFormat', 'report.txt', Kraken2DBReportFormat
)


class Kraken2OutputFormat(model.TextFileFormat):
    '''
    Headerless, tab-separated Kraken2 output with five columns, the first
    holding only "C" or "U".
    '''
    COLUMNS = (
        'classification', 'sequence_id', 'taxon_id', 'sequence_length',
        'kmer_mappings'
    )

    def _to_dataframe(self):
        '''Read the table; returns (DataFrame without names, COLUMNS).'''
        df = pd.read_csv(self.path, sep='\t', header=None)
        return df, self.COLUMNS

    def _validate_(self, level):
        '''
        Check the column count and the values of the first column.

        Raises
        ------
        ValidationError
            If the number of columns differs from len(COLUMNS) or the
            first column holds anything other than "C" or "U".
        '''
        df, columns = self._to_dataframe()
        if df.shape[1] != len(columns):
            raise ValidationError(
                f'Expected {len(columns)} columns in the Kraken2 output '
                f'file but {df.shape[1]} were found.'
            )
        if not set(df.iloc[:, 0].unique()).issubset({'C', 'U'}):
            raise ValidationError(
                'Expected the first column to contain only "C" or "U" values.'
            )


class Kraken2OutputDirectoryFormat(model.DirectoryFormat):
    '''Collection of Kraken2 outputs named `*output.txt` / `*output.tsv`.'''
    # NOTE(review): the attribute is called `reports` although it holds
    # outputs; the name is part of the public interface and is kept.
    reports = model.FileCollection(
        r'.+output\.(txt|tsv)$', format=Kraken2OutputFormat
    )

    @reports.set_path_maker
    def reports_path_maker(self, sample_id, mag_id=None):
        # <sample_id>/output.txt, or <sample_id>/<mag_id>_output.txt when a
        # MAG ID is given
        prefix = f'{sample_id}/{mag_id}_' if mag_id else f'{sample_id}/'
        return f'{prefix}output.txt'


class Kraken2DBFormat(model.TextFileFormat):
    '''Kraken2 database file; its contents are not validated.'''
    # NOTE(review): declared as a text format -- confirm that is intended
    # for the *.k2d files it is bound to below.

    def _validate_(self, level):
        pass


class Kraken2DBDirectoryFormat(model.DirectoryFormat):
    '''Kraken2 database directory: hash.k2d, opts.k2d and taxo.k2d.'''
    hash = model.File(r'hash.k2d', format=Kraken2DBFormat)
    opts = model.File(r'opts.k2d', format=Kraken2DBFormat)
    taxo = model.File(r'taxo.k2d', format=Kraken2DBFormat)


class BrackenDBFormat(model.TextFileFormat):
    '''Bracken k-mer distribution file; its contents are not validated.'''

    def _validate_(self, level):
        pass


class BrackenDBDirectoryFormat(model.DirectoryFormat):
    '''Collection of `database<N>mers.kmer_distrib` files, where N has at
    least two digits.'''
    kmers = model.FileCollection(
        r'database(\d{2,})mers\.kmer_distrib$', format=BrackenDBFormat
    )

    @kmers.set_path_maker
    def kmers_path_maker(self, read_len):
        return f'database{read_len}mers.kmer_distrib'


plugin.register_formats(
    Kraken2ReportDirectoryFormat, Kraken2OutputDirectoryFormat,
    Kraken2DBDirectoryFormat, Kraken2DBReportDirectoryFormat,
    BrackenDBDirectoryFormat
)
import Kraken2ReportFormat, Kraken2OutputFormat, Kraken2DBReportFormat +from ..plugin_setup import plugin + + +@plugin.register_transformer +def _1(ff: Kraken2ReportFormat) -> pd.DataFrame: + df, cols = ff._to_dataframe() + df.columns = cols.keys() + return df + + +@plugin.register_transformer +def _2(ff: Kraken2OutputFormat) -> pd.DataFrame: + df, cols = ff._to_dataframe() + df.columns = cols + return df + + +@plugin.register_transformer +def _3(ff: Kraken2DBReportFormat) -> pd.DataFrame: + df, cols = ff._to_dataframe() + df.columns = cols.keys() + return df diff --git a/q2_types/kraken2/_type.py b/q2_types/kraken2/_type.py new file mode 100644 index 00000000..29e07e51 --- /dev/null +++ b/q2_types/kraken2/_type.py @@ -0,0 +1,63 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +from q2_types.feature_data import FeatureData +from q2_types.sample_data import SampleData +from qiime2.plugin import SemanticType + +from . 
import ( + Kraken2ReportDirectoryFormat, Kraken2OutputDirectoryFormat, + Kraken2DBDirectoryFormat, Kraken2DBReportDirectoryFormat, + BrackenDBDirectoryFormat +) +from ..plugin_setup import plugin + + +Kraken2Reports = SemanticType( + 'Kraken2Report', + variant_of=[SampleData.field['type'], FeatureData.field['type']] +) +Kraken2Outputs = SemanticType( + 'Kraken2Output', + variant_of=[SampleData.field['type'], FeatureData.field['type']] +) +Kraken2DB = SemanticType('Kraken2DB') +Kraken2DBReport = SemanticType('Kraken2DBReport') +BrackenDB = SemanticType('BrackenDB') + +plugin.register_semantic_types( + Kraken2Reports, Kraken2Outputs, Kraken2DB, Kraken2DBReport, BrackenDB +) + +plugin.register_semantic_type_to_format( + SampleData[Kraken2Reports], + artifact_format=Kraken2ReportDirectoryFormat +) +plugin.register_semantic_type_to_format( + FeatureData[Kraken2Reports], + artifact_format=Kraken2ReportDirectoryFormat +) +plugin.register_semantic_type_to_format( + SampleData[Kraken2Outputs], + artifact_format=Kraken2OutputDirectoryFormat +) +plugin.register_semantic_type_to_format( + FeatureData[Kraken2Outputs], + artifact_format=Kraken2OutputDirectoryFormat +) +plugin.register_semantic_type_to_format( + Kraken2DB, + artifact_format=Kraken2DBDirectoryFormat +) +plugin.register_semantic_type_to_format( + Kraken2DBReport, + artifact_format=Kraken2DBReportDirectoryFormat +) +plugin.register_semantic_type_to_format( + BrackenDB, + artifact_format=BrackenDBDirectoryFormat +) diff --git a/q2_types/kraken2/tests/__init__.py b/q2_types/kraken2/tests/__init__.py new file mode 100644 index 00000000..afcc05c2 --- /dev/null +++ b/q2_types/kraken2/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- diff --git a/q2_types/kraken2/tests/data/bracken-db/database100mers.kmer_distrib b/q2_types/kraken2/tests/data/bracken-db/database100mers.kmer_distrib new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kraken2/tests/data/bracken-db/database150mers.kmer_distrib b/q2_types/kraken2/tests/data/bracken-db/database150mers.kmer_distrib new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kraken2/tests/data/db-reports/report-dir/report.txt b/q2_types/kraken2/tests/data/db-reports/report-dir/report.txt new file mode 100644 index 00000000..9e58c4be --- /dev/null +++ b/q2_types/kraken2/tests/data/db-reports/report-dir/report.txt @@ -0,0 +1,52 @@ +# Database options: nucleotide db, k = 35, l = 31 +# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011 +# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101 +# Total taxonomy nodes: 46 +# Table size: 26047 +# Table capacity: 51565 +# Min clear hash value = 0 +100.00 26047 0 R 1 root +100.00 26047 0 R1 131567 cellular organisms + 75.81 19746 0 D 2 Bacteria + 75.81 19746 0 D1 1783272 Terrabacteria group + 75.81 19746 0 P 1239 Bacillota + 75.81 19746 0 C 91061 Bacilli + 75.81 19746 0 O 1385 Bacillales + 49.84 12983 0 F 90964 Staphylococcaceae + 49.84 12983 0 G 1279 Staphylococcus + 25.11 6540 6540 S 1282 Staphylococcus epidermidis + 24.74 6443 6443 S 1280 Staphylococcus aureus + 25.96 6763 0 F 186817 Bacillaceae + 25.96 6763 0 G 1386 Bacillus + 25.96 6763 0 G1 86661 Bacillus cereus group + 25.96 6763 6763 S 1392 Bacillus anthracis + 24.19 6301 0 D 2759 Eukaryota + 24.19 6301 0 D1 33154 Opisthokonta + 24.19 6301 0 K 33208 Metazoa + 24.19 6301 0 K1 6072 Eumetazoa + 24.19 6301 0 K2 33213 Bilateria + 24.19 6301 0 K3 33511 Deuterostomia + 24.19 6301 0 P 7711 Chordata + 24.19 6301 0 P1 89593 Craniata + 24.19 6301 0 P2 7742 Vertebrata + 24.19 6301 0 P3 7776 Gnathostomata + 24.19 
6301 0 P4 117570 Teleostomi + 24.19 6301 0 P5 117571 Euteleostomi + 24.19 6301 0 P6 8287 Sarcopterygii + 24.19 6301 0 P7 1338369 Dipnotetrapodomorpha + 24.19 6301 0 P8 32523 Tetrapoda + 24.19 6301 0 P9 32524 Amniota + 24.19 6301 0 C 40674 Mammalia + 24.19 6301 0 C1 32525 Theria + 24.19 6301 0 C2 9347 Eutheria + 24.19 6301 0 C3 1437010 Boreoeutheria + 24.19 6301 0 C4 314146 Euarchontoglires + 24.19 6301 0 C5 314147 Glires + 24.19 6301 0 O 9989 Rodentia + 24.19 6301 0 O1 1963758 Myomorpha + 24.19 6301 0 O2 337687 Muroidea + 24.19 6301 0 F 10066 Muridae + 24.19 6301 0 F1 39107 Murinae + 24.19 6301 0 G 10088 Mus + 24.19 6301 0 G1 862507 Mus + 24.19 6301 6301 S 10090 Mus musculus diff --git a/q2_types/kraken2/tests/data/db-reports/report-missing-column.txt b/q2_types/kraken2/tests/data/db-reports/report-missing-column.txt new file mode 100644 index 00000000..39cceeaf --- /dev/null +++ b/q2_types/kraken2/tests/data/db-reports/report-missing-column.txt @@ -0,0 +1,52 @@ +# Database options: nucleotide db, k = 35, l = 31 +# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011 +# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101 +# Total taxonomy nodes: 46 +# Table size: 26047 +# Table capacity: 51565 +# Min clear hash value = 0 +26047 0 R 1 root +26047 0 R1 131567 cellular organisms +19746 0 D 2 Bacteria +19746 0 D1 1783272 Terrabacteria group +19746 0 P 1239 Bacillota +19746 0 C 91061 Bacilli +19746 0 O 1385 Bacillales +12983 0 F 90964 Staphylococcaceae +12983 0 G 1279 Staphylococcus +6540 6540 S 1282 Staphylococcus epidermidis +6443 6443 S 1280 Staphylococcus aureus +6763 0 F 186817 Bacillaceae +6763 0 G 1386 Bacillus +6763 0 G1 86661 Bacillus cereus group +6763 6763 S 1392 Bacillus anthracis +6301 0 D 2759 Eukaryota +6301 0 D1 33154 Opisthokonta +6301 0 K 33208 Metazoa +6301 0 K1 6072 Eumetazoa +6301 0 K2 33213 Bilateria +6301 0 K3 33511 Deuterostomia +6301 0 P 7711 Chordata +6301 0 P1 89593 Craniata +6301 0 P2 7742 
Vertebrata +6301 0 P3 7776 Gnathostomata +6301 0 P4 117570 Teleostomi +6301 0 P5 117571 Euteleostomi +6301 0 P6 8287 Sarcopterygii +6301 0 P7 1338369 Dipnotetrapodomorpha +6301 0 P8 32523 Tetrapoda +6301 0 P9 32524 Amniota +6301 0 C 40674 Mammalia +6301 0 C1 32525 Theria +6301 0 C2 9347 Eutheria +6301 0 C3 1437010 Boreoeutheria +6301 0 C4 314146 Euarchontoglires +6301 0 C5 314147 Glires +6301 0 O 9989 Rodentia +6301 0 O1 1963758 Myomorpha +6301 0 O2 337687 Muroidea +6301 0 F 10066 Muridae +6301 0 F1 39107 Murinae +6301 0 G 10088 Mus +6301 0 G1 862507 Mus +6301 6301 S 10090 Mus musculus diff --git a/q2_types/kraken2/tests/data/db-reports/report-ok.csv b/q2_types/kraken2/tests/data/db-reports/report-ok.csv new file mode 100644 index 00000000..69bd7dcd --- /dev/null +++ b/q2_types/kraken2/tests/data/db-reports/report-ok.csv @@ -0,0 +1,46 @@ +perc_minimizers_covered,n_minimizers_covered,n_minimizers_assigned,rank,taxon_id,name +100.00,26047,0,R,1,root +100.00,26047,0,R1,131567, cellular organisms + 75.81,19746,0,D,2, Bacteria + 75.81,19746,0,D1,1783272, Terrabacteria group + 75.81,19746,0,P,1239, Bacillota + 75.81,19746,0,C,91061, Bacilli + 75.81,19746,0,O,1385, Bacillales + 49.84,12983,0,F,90964, Staphylococcaceae + 49.84,12983,0,G,1279, Staphylococcus + 25.11,6540,6540,S,1282, Staphylococcus epidermidis + 24.74,6443,6443,S,1280, Staphylococcus aureus + 25.96,6763,0,F,186817, Bacillaceae + 25.96,6763,0,G,1386, Bacillus + 25.96,6763,0,G1,86661, Bacillus cereus group + 25.96,6763,6763,S,1392, Bacillus anthracis + 24.19,6301,0,D,2759, Eukaryota + 24.19,6301,0,D1,33154, Opisthokonta + 24.19,6301,0,K,33208, Metazoa + 24.19,6301,0,K1,6072, Eumetazoa + 24.19,6301,0,K2,33213, Bilateria + 24.19,6301,0,K3,33511, Deuterostomia + 24.19,6301,0,P,7711, Chordata + 24.19,6301,0,P1,89593, Craniata + 24.19,6301,0,P2,7742, Vertebrata + 24.19,6301,0,P3,7776, Gnathostomata + 24.19,6301,0,P4,117570, Teleostomi + 24.19,6301,0,P5,117571, Euteleostomi + 24.19,6301,0,P6,8287, Sarcopterygii + 
24.19,6301,0,P7,1338369, Dipnotetrapodomorpha + 24.19,6301,0,P8,32523, Tetrapoda + 24.19,6301,0,P9,32524, Amniota + 24.19,6301,0,C,40674, Mammalia + 24.19,6301,0,C1,32525, Theria + 24.19,6301,0,C2,9347, Eutheria + 24.19,6301,0,C3,1437010, Boreoeutheria + 24.19,6301,0,C4,314146, Euarchontoglires + 24.19,6301,0,C5,314147, Glires + 24.19,6301,0,O,9989, Rodentia + 24.19,6301,0,O1,1963758, Myomorpha + 24.19,6301,0,O2,337687, Muroidea + 24.19,6301,0,F,10066, Muridae + 24.19,6301,0,F1,39107, Murinae + 24.19,6301,0,G,10088, Mus + 24.19,6301,0,G1,862507, Mus + 24.19,6301,6301,S,10090, Mus musculus diff --git a/q2_types/kraken2/tests/data/db-reports/report-wrong-types.txt b/q2_types/kraken2/tests/data/db-reports/report-wrong-types.txt new file mode 100644 index 00000000..4221258f --- /dev/null +++ b/q2_types/kraken2/tests/data/db-reports/report-wrong-types.txt @@ -0,0 +1,52 @@ +# Database options: nucleotide db, k = 35, l = 31 +# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011 +# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101 +# Total taxonomy nodes: 46 +# Table size: 26047 +# Table capacity: 51565 +# Min clear hash value = 0 +100 26047 0 R 1 root +100 26047 0 R1 131567 cellular organisms + 75 19746 0 D 2 Bacteria + 75 19746 0 D1 1783272 Terrabacteria group + 75 19746 0 P 1239 Bacillota + 75 19746 0 C 91061 Bacilli + 75 19746 0 O 1385 Bacillales + 49 12983 0 F 90964 Staphylococcaceae + 49 12983 0 G 1279 Staphylococcus + 25 6540 6540 S 1282 Staphylococcus epidermidis + 24 6443 6443 S 1280 Staphylococcus aureus + 25 6763 0 F 186817 Bacillaceae + 25 6763 0 G 1386 Bacillus + 25 6763 0 G1 86661 Bacillus cereus group + 25 6763 6763 S 1392 Bacillus anthracis + 24 6301 0 D 2759 Eukaryota + 24 6301 0 D1 33154 Opisthokonta + 24 6301 0 K 33208 Metazoa + 24 6301 0 K1 6072 Eumetazoa + 24 6301 0 K2 33213 Bilateria + 24 6301 0 K3 33511 Deuterostomia + 24 6301 0 P 7711 Chordata + 24 6301 0 P1 89593 Craniata + 24 6301 0 P2 7742 
Vertebrata + 24 6301 0 P3 7776 Gnathostomata + 24 6301 0 P4 117570 Teleostomi + 24 6301 0 P5 117571 Euteleostomi + 24 6301 0 P6 8287 Sarcopterygii + 24 6301 0 P7 1338369 Dipnotetrapodomorpha + 24 6301 0 P8 32523 Tetrapoda + 24 6301 0 P9 32524 Amniota + 24 6301 0 C 40674 Mammalia + 24 6301 0 C1 32525 Theria + 24 6301 0 C2 9347 Eutheria + 24 6301 0 C3 1437010 Boreoeutheria + 24 6301 0 C4 314146 Euarchontoglires + 24 6301 0 C5 314147 Glires + 24 6301 0 O 9989 Rodentia + 24 6301 0 O1 1963758 Myomorpha + 24 6301 0 O2 337687 Muroidea + 24 6301 0 F 10066 Muridae + 24 6301 0 F1 39107 Murinae + 24 6301 0 G 10088 Mus + 24 6301 0 G1 862507 Mus + 24 6301 6301 S 10090 Mus musculus diff --git a/q2_types/kraken2/tests/data/db-reports/report.txt b/q2_types/kraken2/tests/data/db-reports/report.txt new file mode 100644 index 00000000..9e58c4be --- /dev/null +++ b/q2_types/kraken2/tests/data/db-reports/report.txt @@ -0,0 +1,52 @@ +# Database options: nucleotide db, k = 35, l = 31 +# Spaced mask = 11111111111111111111111111111111110011001100110011001100110011 +# Toggle mask = 1110001101111110001010001100010000100111000110110101101000101101 +# Total taxonomy nodes: 46 +# Table size: 26047 +# Table capacity: 51565 +# Min clear hash value = 0 +100.00 26047 0 R 1 root +100.00 26047 0 R1 131567 cellular organisms + 75.81 19746 0 D 2 Bacteria + 75.81 19746 0 D1 1783272 Terrabacteria group + 75.81 19746 0 P 1239 Bacillota + 75.81 19746 0 C 91061 Bacilli + 75.81 19746 0 O 1385 Bacillales + 49.84 12983 0 F 90964 Staphylococcaceae + 49.84 12983 0 G 1279 Staphylococcus + 25.11 6540 6540 S 1282 Staphylococcus epidermidis + 24.74 6443 6443 S 1280 Staphylococcus aureus + 25.96 6763 0 F 186817 Bacillaceae + 25.96 6763 0 G 1386 Bacillus + 25.96 6763 0 G1 86661 Bacillus cereus group + 25.96 6763 6763 S 1392 Bacillus anthracis + 24.19 6301 0 D 2759 Eukaryota + 24.19 6301 0 D1 33154 Opisthokonta + 24.19 6301 0 K 33208 Metazoa + 24.19 6301 0 K1 6072 Eumetazoa + 24.19 6301 0 K2 33213 Bilateria + 24.19 
6301 0 K3 33511 Deuterostomia + 24.19 6301 0 P 7711 Chordata + 24.19 6301 0 P1 89593 Craniata + 24.19 6301 0 P2 7742 Vertebrata + 24.19 6301 0 P3 7776 Gnathostomata + 24.19 6301 0 P4 117570 Teleostomi + 24.19 6301 0 P5 117571 Euteleostomi + 24.19 6301 0 P6 8287 Sarcopterygii + 24.19 6301 0 P7 1338369 Dipnotetrapodomorpha + 24.19 6301 0 P8 32523 Tetrapoda + 24.19 6301 0 P9 32524 Amniota + 24.19 6301 0 C 40674 Mammalia + 24.19 6301 0 C1 32525 Theria + 24.19 6301 0 C2 9347 Eutheria + 24.19 6301 0 C3 1437010 Boreoeutheria + 24.19 6301 0 C4 314146 Euarchontoglires + 24.19 6301 0 C5 314147 Glires + 24.19 6301 0 O 9989 Rodentia + 24.19 6301 0 O1 1963758 Myomorpha + 24.19 6301 0 O2 337687 Muroidea + 24.19 6301 0 F 10066 Muridae + 24.19 6301 0 F1 39107 Murinae + 24.19 6301 0 G 10088 Mus + 24.19 6301 0 G1 862507 Mus + 24.19 6301 6301 S 10090 Mus musculus diff --git a/q2_types/kraken2/tests/data/kraken2-db/hash.k2d b/q2_types/kraken2/tests/data/kraken2-db/hash.k2d new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kraken2/tests/data/kraken2-db/opts.k2d b/q2_types/kraken2/tests/data/kraken2-db/opts.k2d new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kraken2/tests/data/kraken2-db/taxo.k2d b/q2_types/kraken2/tests/data/kraken2-db/taxo.k2d new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/kraken2/tests/data/outputs-contigs/output-ok-table.csv b/q2_types/kraken2/tests/data/outputs-contigs/output-ok-table.csv new file mode 100644 index 00000000..a0fdd645 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-contigs/output-ok-table.csv @@ -0,0 +1,21 @@ +classification,sequence_id,taxon_id,sequence_length,kmer_mappings +C,contig0,10090,685,10090:651 +C,contig1,10090,522,10090:21 0:66 10090:48 0:108 10090:245 +C,contig2,10090,746,10090:712 +C,contig3,10090,507,10090:132 0:145 10090:196 +C,contig4,10090,361,10090:327 +C,contig5,10090,350,10090:316 +C,contig6,10090,1000,0:13 10090:953 +C,contig7,10090,557,0:15 10090:427 0:81 
+C,contig8,10090,835,0:29 10090:113 0:139 10090:335 0:49 10090:136 +C,contig9,10090,441,10090:407 +C,contig10,10090,757,10090:723 +C,contig11,10090,486,10090:452 +C,contig12,10090,629,10090:102 0:41 10090:38 0:41 10090:373 +C,contig13,10090,482,10090:10 0:83 10090:21 0:41 10090:293 +C,contig14,10090,896,10090:556 0:54 10090:252 +C,contig15,10090,205,10090:171 +C,contig16,10090,471,10090:437 +C,contig17,10090,302,10090:268 +C,contig18,10090,626,0:11 10090:581 +C,contig19,10090,322,10090:288 diff --git a/q2_types/kraken2/tests/data/outputs-contigs/output-ok.txt b/q2_types/kraken2/tests/data/outputs-contigs/output-ok.txt new file mode 100644 index 00000000..ffc98b6f --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-contigs/output-ok.txt @@ -0,0 +1,20 @@ +C contig0 10090 685 10090:651 +C contig1 10090 522 10090:21 0:66 10090:48 0:108 10090:245 +C contig2 10090 746 10090:712 +C contig3 10090 507 10090:132 0:145 10090:196 +C contig4 10090 361 10090:327 +C contig5 10090 350 10090:316 +C contig6 10090 1000 0:13 10090:953 +C contig7 10090 557 0:15 10090:427 0:81 +C contig8 10090 835 0:29 10090:113 0:139 10090:335 0:49 10090:136 +C contig9 10090 441 10090:407 +C contig10 10090 757 10090:723 +C contig11 10090 486 10090:452 +C contig12 10090 629 10090:102 0:41 10090:38 0:41 10090:373 +C contig13 10090 482 10090:10 0:83 10090:21 0:41 10090:293 +C contig14 10090 896 10090:556 0:54 10090:252 +C contig15 10090 205 10090:171 +C contig16 10090 471 10090:437 +C contig17 10090 302 10090:268 +C contig18 10090 626 0:11 10090:581 +C contig19 10090 322 10090:288 diff --git a/q2_types/kraken2/tests/data/outputs-mags/sample1/bin1_output.txt b/q2_types/kraken2/tests/data/outputs-mags/sample1/bin1_output.txt new file mode 100755 index 00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-mags/sample1/bin1_output.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 
17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-mags/sample1/bin2_output.txt b/q2_types/kraken2/tests/data/outputs-mags/sample1/bin2_output.txt new file mode 100755 index 00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-mags/sample1/bin2_output.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 
2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-mags/sample2/bin1_output.txt b/q2_types/kraken2/tests/data/outputs-mags/sample2/bin1_output.txt new file mode 100755 index 00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-mags/sample2/bin1_output.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-reads/sample1/output.txt b/q2_types/kraken2/tests/data/outputs-reads/sample1/output.txt new file mode 100755 index 
00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-reads/sample1/output.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-reads/sample2/output.txt b/q2_types/kraken2/tests/data/outputs-reads/sample2/output.txt new file mode 100755 index 00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-reads/sample2/output.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q 
+C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-single/output-missing-column.txt b/q2_types/kraken2/tests/data/outputs-single/output-missing-column.txt new file mode 100755 index 00000000..3c0406b0 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-single/output-missing-column.txt @@ -0,0 +1,32 @@ +C 1912795 10855 1912795:Q +C 1583098 5698 1583098:Q +C 1323375 5173 1323375:Q +C 182217 17101 182217:Q +C 1472 19997 1472:Q +C 29388 23523 29388:Q +C 545501 25821 545501:Q +C 1218 4423 1218:Q +C 2518177 31450 2518177:Q +C 221027 2908 221027:Q +C 59919 2856 59919:Q +U 0 3045 0:Q +C 851 19053 851:Q +C 2647897 2589 2647897:Q +C 2653681 4515 2653681:Q +C 131567 19174 131567:Q +C 2682541 11848 2682541:Q +C 1977865 3665 1977865:Q +C 2770780 5030 2770780:Q +C 400634 2807 400634:Q +C 2490633 6493 2490633:Q +C 111780 8356 111780:Q +C 2305987 3774 2305987:Q +C 983544 27806 983544:Q +C 2563896 3473 2563896:Q +C 332101 3409 332101:Q +C 2593542 29942 2593542:Q +C 34105 8793 34105:Q +C 1301 4680 1301:Q +C 1547445 10430 1547445:Q +C 491950 68731 491950:Q +C 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-single/output-ok.txt b/q2_types/kraken2/tests/data/outputs-single/output-ok.txt new file mode 100755 index 00000000..21a59921 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-single/output-ok.txt @@ -0,0 +1,32 @@ +C 
k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-single/output-only-classified.txt b/q2_types/kraken2/tests/data/outputs-single/output-only-classified.txt new file mode 100755 index 00000000..6f1814d2 --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-single/output-only-classified.txt @@ -0,0 +1,31 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C 
k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +C k119_10711 52959 8685 52959:Q diff --git a/q2_types/kraken2/tests/data/outputs-single/output-wrong-first-col.txt b/q2_types/kraken2/tests/data/outputs-single/output-wrong-first-col.txt new file mode 100755 index 00000000..274140fd --- /dev/null +++ b/q2_types/kraken2/tests/data/outputs-single/output-wrong-first-col.txt @@ -0,0 +1,32 @@ +C k119_33069 1912795 10855 1912795:Q +C k119_55515 1583098 5698 1583098:Q +C k119_66468 1323375 5173 1323375:Q +C k119_33506 182217 17101 182217:Q +C k119_22814 1472 19997 1472:Q +C k119_23274 29388 23523 29388:Q +C k119_45180 545501 25821 545501:Q +C k119_34380 1218 4423 1218:Q +C k119_1654 2518177 31450 2518177:Q +C k119_45407 221027 2908 221027:Q +C k119_12788 59919 2856 59919:Q +U k119_34900 0 3045 0:Q +C k119_45855 851 19053 851:Q +C k119_90411 2647897 2589 2647897:Q +C k119_57806 2653681 4515 2653681:Q +C k119_58481 131567 19174 131567:Q +C k119_47669 2682541 11848 2682541:Q +C k119_59208 1977865 3665 1977865:Q +C k119_16398 2770780 5030 2770780:Q +C k119_60835 400634 2807 400634:Q +C k119_49584 2490633 6493 2490633:Q +C k119_28869 111780 8356 111780:Q +C k119_94747 2305987 3774 2305987:Q +C k119_40414 983544 27806 983544:Q +C k119_73618 2563896 3473 2563896:Q +C k119_84540 332101 3409 332101:Q +C k119_73768 2593542 29942 2593542:Q +C k119_41848 34105 8793 34105:Q +C k119_43035 1301 4680 1301:Q +C k119_65066 1547445 10430 1547445:Q +C k119_10361 491950 68731 491950:Q +Z k119_10711 52959 8685 52959:Q diff --git 
a/q2_types/kraken2/tests/data/report-ok-table.csv b/q2_types/kraken2/tests/data/report-ok-table.csv new file mode 100644 index 00000000..ee1a3c38 --- /dev/null +++ b/q2_types/kraken2/tests/data/report-ok-table.csv @@ -0,0 +1,12 @@ +perc_frags_covered,n_frags_covered,n_frags_assigned,rank,taxon_id,name +100.0,229,17,R,1,root +92.58,212,0,R1,131567, cellular organisms +92.58,212,2,D,2, Bacteria +91.7,210,0,D1,1783272, Terrabacteria group +91.7,210,0,P,1239, Firmicutes +91.7,210,0,C,91061, Bacilli +91.27,209,2,O,1385, Bacillales +90.39,207,1,F,90964, Staphylococcaceae +89.96,206,195,G,1279, Staphylococcus +4.8,11,11,S,1280, Staphylococcus aureus +0.44,1,1,O,186826, Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-mags/sample1/bin1_report.txt b/q2_types/kraken2/tests/data/reports-mags/sample1/bin1_report.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-mags/sample1/bin1_report.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 Staphylococcus + 4.80 11 11 S 1280 Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-mags/sample1/bin2_report.txt b/q2_types/kraken2/tests/data/reports-mags/sample1/bin2_report.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-mags/sample1/bin2_report.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 Staphylococcus + 4.80 11 11 S 1280 
Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-mags/sample2/bin1_report.txt b/q2_types/kraken2/tests/data/reports-mags/sample2/bin1_report.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-mags/sample2/bin1_report.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 Staphylococcus + 4.80 11 11 S 1280 Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-reads/sample1/report.txt b/q2_types/kraken2/tests/data/reports-reads/sample1/report.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-reads/sample1/report.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 Staphylococcus + 4.80 11 11 S 1280 Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-reads/sample2/report.txt b/q2_types/kraken2/tests/data/reports-reads/sample2/report.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-reads/sample2/report.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 
Staphylococcus + 4.80 11 11 S 1280 Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-single/report-missing-column.txt b/q2_types/kraken2/tests/data/reports-single/report-missing-column.txt new file mode 100644 index 00000000..c71c0510 --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-single/report-missing-column.txt @@ -0,0 +1,11 @@ +100.00 17 R 1 root + 92.58 0 R1 131567 cellular organisms + 92.58 2 D 2 Bacteria + 91.70 0 D1 1783272 Terrabacteria group + 91.70 0 P 1239 Firmicutes + 91.70 0 C 91061 Bacilli + 91.27 2 O 1385 Bacillales + 90.39 1 F 90964 Staphylococcaceae + 89.96 195 G 1279 Staphylococcus + 4.80 11 S 1280 Staphylococcus aureus + 0.44 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-single/report-ok.txt b/q2_types/kraken2/tests/data/reports-single/report-ok.txt new file mode 100644 index 00000000..05398b8e --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-single/report-ok.txt @@ -0,0 +1,11 @@ +100.00 229 17 R 1 root + 92.58 212 0 R1 131567 cellular organisms + 92.58 212 2 D 2 Bacteria + 91.70 210 0 D1 1783272 Terrabacteria group + 91.70 210 0 P 1239 Firmicutes + 91.70 210 0 C 91061 Bacilli + 91.27 209 2 O 1385 Bacillales + 90.39 207 1 F 90964 Staphylococcaceae + 89.96 206 195 G 1279 Staphylococcus + 4.80 11 11 S 1280 Staphylococcus aureus + 0.44 1 1 O 186826 Lactobacillales diff --git a/q2_types/kraken2/tests/data/reports-single/report-wrong-types.txt b/q2_types/kraken2/tests/data/reports-single/report-wrong-types.txt new file mode 100644 index 00000000..42b175c8 --- /dev/null +++ b/q2_types/kraken2/tests/data/reports-single/report-wrong-types.txt @@ -0,0 +1,11 @@ +100 229 17 R 1 root + 92 212 0 R1 131567 cellular organisms + 92 212 2 D 2 Bacteria + 91 210 0 D1 1783272 Terrabacteria group + 91 210 0 P 1239 Firmicutes + 91 210 0 C 91061 Bacilli + 91 209 2 O 1385 Bacillales + 90 207 1 F 90964 Staphylococcaceae + 89 206 195 G 1279 Staphylococcus + 4 11 11 S 
class TestFormats(TestPluginBase):
    """Validation tests for the Kraken 2 and Bracken formats.

    Every test resolves a fixture through ``get_data_path``, wraps it in the
    format under test (opened with ``mode='r'``) and then either expects
    ``validate()`` to succeed or to raise a ``ValidationError`` whose message
    matches the given pattern.
    """
    package = 'q2_types.kraken2.tests'

    # ----------------------------------------------------------- reports --
    def test_report_format_ok(self):
        report_fp = self.get_data_path('reports-single/report-ok.txt')
        fmt = Kraken2ReportFormat(report_fp, mode='r')
        fmt.validate()

    def test_db_report_format_ok(self):
        report_fp = self.get_data_path(
            os.path.join('db-reports', 'report.txt')
        )
        fmt = Kraken2DBReportFormat(report_fp, mode='r')
        fmt.validate()

    def test_report_format_missing_col(self):
        report_fp = self.get_data_path(
            'reports-single/report-missing-column.txt'
        )
        fmt = Kraken2ReportFormat(report_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError, 'found 5'
        ):
            fmt.validate()

    def test_db_report_format_missing_col(self):
        report_fp = self.get_data_path(
            os.path.join('db-reports', 'report-missing-column.txt')
        )
        fmt = Kraken2DBReportFormat(report_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError, 'found 5'
        ):
            fmt.validate()

    def test_report_format_wrong_types(self):
        report_fp = self.get_data_path(
            'reports-single/report-wrong-types.txt'
        )
        fmt = Kraken2ReportFormat(report_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError,
                'Expected type in the '
                '"perc_frags_covered" column, got int64'
        ):
            fmt.validate()

    def test_db_report_format_wrong_types(self):
        report_fp = self.get_data_path(
            os.path.join('db-reports', 'report-wrong-types.txt')
        )
        fmt = Kraken2DBReportFormat(report_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError,
                'Expected type in the '
                '"perc_minimizers_covered" column, got int64'
        ):
            fmt.validate()

    def test_report_dirfmt_from_reads(self):
        dirpath = self.get_data_path('reports-reads')
        fmt = Kraken2ReportDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    def test_report_dirfmt_from_mags(self):
        dirpath = self.get_data_path('reports-mags')
        fmt = Kraken2ReportDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    def test_db_report_dirfmt(self):
        dirpath = self.get_data_path(
            os.path.join('db-reports', 'report-dir')
        )
        fmt = Kraken2DBReportDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    # ----------------------------------------------------------- outputs --
    def test_output_format_ok(self):
        output_fp = self.get_data_path('outputs-single/output-ok.txt')
        fmt = Kraken2OutputFormat(output_fp, mode='r')
        fmt.validate()

    def test_output_format_missing_col(self):
        output_fp = self.get_data_path(
            'outputs-single/output-missing-column.txt'
        )
        fmt = Kraken2OutputFormat(output_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError, '4 were found'
        ):
            fmt.validate()

    def test_output_format_wrong_first_col(self):
        output_fp = self.get_data_path(
            'outputs-single/output-wrong-first-col.txt'
        )
        fmt = Kraken2OutputFormat(output_fp, mode='r')

        with self.assertRaisesRegex(
                ValidationError, 'Expected the first column to contain only'
        ):
            fmt.validate()

    def test_output_format_only_classified(self):
        output_fp = self.get_data_path(
            'outputs-single/output-only-classified.txt'
        )
        fmt = Kraken2OutputFormat(output_fp, mode='r')
        fmt.validate()

    # The four tests below used to bind the format instance to ``format``,
    # shadowing the builtin; they now use ``fmt`` like the rest of the class.
    def test_output_dirfmt_from_reads(self):
        dirpath = self.get_data_path('outputs-reads')
        fmt = Kraken2OutputDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    def test_output_dirfmt_from_mags(self):
        dirpath = self.get_data_path('outputs-mags')
        fmt = Kraken2OutputDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    # --------------------------------------------------------- databases --
    def test_kraken2db_dirfmt(self):
        dirpath = self.get_data_path('kraken2-db')
        fmt = Kraken2DBDirectoryFormat(dirpath, mode='r')
        fmt.validate()

    def test_brackendb_dirfmt(self):
        dirpath = self.get_data_path('bracken-db')
        fmt = BrackenDBDirectoryFormat(dirpath, mode='r')
        fmt.validate()
class TestTransformers(TestPluginBase):
    """Checks that Kraken 2 file formats transform into the expected tables.

    Each test converts a fixture file into a ``pd.DataFrame`` through the
    registered transformer and compares it with a CSV fixture read by pandas.
    """
    package = "q2_types.kraken2.tests"

    def setUp(self):
        super().setUp()

    def apply_transformation(self, from_fmt, to_fmt, datafile_fp):
        """Run the ``from_fmt`` -> ``to_fmt`` transformer on a fixture.

        Parameters
        ----------
        from_fmt : type
            Format class used to open the fixture (read mode).
        to_fmt : type
            Target type of the transformation.
        datafile_fp : str
            Fixture path, resolved through ``get_data_path``.

        Returns
        -------
        The object produced by the transformer.
        """
        convert = self.get_transformer(from_fmt, to_fmt)
        source = from_fmt(self.get_data_path(datafile_fp), 'r')
        return convert(source)

    def test_kraken2_report_to_df(self):
        observed = self.apply_transformation(
            Kraken2ReportFormat, pd.DataFrame, 'reports-single/report-ok.txt')
        expected_fp = self.get_data_path('report-ok-table.csv')
        expected = pd.read_csv(expected_fp)
        assert_frame_equal(expected, observed)

    def test_kraken2_output_to_df(self):
        observed = self.apply_transformation(
            Kraken2OutputFormat, pd.DataFrame, 'outputs-contigs/output-ok.txt')
        expected_fp = self.get_data_path(
            os.path.join('outputs-contigs', 'output-ok-table.csv'))
        expected = pd.read_csv(expected_fp)
        assert_frame_equal(expected, observed)

    def test_kraken2_db_report_to_df(self):
        report_relpath = os.path.join('db-reports', 'report.txt')
        observed = self.apply_transformation(
            Kraken2DBReportFormat, pd.DataFrame, report_relpath)
        expected_fp = self.get_data_path(
            os.path.join('db-reports', 'report-ok.csv'))
        expected = pd.read_csv(expected_fp)
        assert_frame_equal(expected, observed)
class TestTypes(TestPluginBase):
    """Registration checks for the Kraken 2 / Bracken semantic types.

    For each semantic type there is one test asserting that the type itself
    is registered and one (or two, for the ``SampleData`` / ``FeatureData``
    variants) asserting which directory format it is registered to.
    """
    package = "q2_types.kraken2.tests"

    # Kraken2Reports
    def test_reports_semantic_type_registration(self):
        semantic_type = Kraken2Reports
        self.assertRegisteredSemanticType(semantic_type)

    def test_reports_semantic_type_to_format_registration_sd(self):
        semantic_type = SampleData[Kraken2Reports]
        self.assertSemanticTypeRegisteredToFormat(
            semantic_type, Kraken2ReportDirectoryFormat)

    def test_reports_semantic_type_to_format_registration_fd(self):
        semantic_type = FeatureData[Kraken2Reports]
        self.assertSemanticTypeRegisteredToFormat(
            semantic_type, Kraken2ReportDirectoryFormat)

    # Kraken2Outputs
    def test_outputs_semantic_type_registration(self):
        semantic_type = Kraken2Outputs
        self.assertRegisteredSemanticType(semantic_type)

    def test_outputs_semantic_type_to_format_registration_sd(self):
        semantic_type = SampleData[Kraken2Outputs]
        self.assertSemanticTypeRegisteredToFormat(
            semantic_type, Kraken2OutputDirectoryFormat)

    def test_outputs_semantic_type_to_format_registration_fd(self):
        semantic_type = FeatureData[Kraken2Outputs]
        self.assertSemanticTypeRegisteredToFormat(
            semantic_type, Kraken2OutputDirectoryFormat)

    # Kraken2DB
    def test_kraken2db_semantic_type_registration(self):
        semantic_type = Kraken2DB
        self.assertRegisteredSemanticType(semantic_type)

    def test_kraken2db_semantic_type_to_format_registration(self):
        self.assertSemanticTypeRegisteredToFormat(
            Kraken2DB, Kraken2DBDirectoryFormat)

    # Kraken2DBReport
    def test_kraken2dbreport_semantic_type_registration(self):
        semantic_type = Kraken2DBReport
        self.assertRegisteredSemanticType(semantic_type)

    def test_kraken2dbreport_semantic_type_to_format_registration(self):
        self.assertSemanticTypeRegisteredToFormat(
            Kraken2DBReport, Kraken2DBReportDirectoryFormat)

    # BrackenDB
    def test_brackendb_semantic_type_registration(self):
        semantic_type = BrackenDB
        self.assertRegisteredSemanticType(semantic_type)

    def test_brackendb_semantic_type_to_format_registration(self):
        self.assertSemanticTypeRegisteredToFormat(
            BrackenDB, BrackenDBDirectoryFormat)
# TODO: that's a copy of the _FastqManifestBase from q2-types
#  without the direction check. That could potentially be generalised.
class _FastaManifestBase(model.TextFileFormat):
    """
    Base class for mapping of sample and mag identifiers to filepaths.

    Subclasses set ``EXPECTED_HEADER`` (the exact list of column labels the
    header row must contain) and ``PATH_HEADER_LABEL`` (the label of the
    column holding the file path).
    """
    EXPECTED_HEADER = None
    PATH_HEADER_LABEL = None

    def _check_n_records(self, root, n=None):
        """Validate the header and the records of the manifest.

        Parameters
        ----------
        root : str
            Directory the manifest lives in. Referenced files are only
            checked for existence when this is the empty string (see the
            TODO below).
        n : int, optional
            Maximum number of lines to read from the file; ``None`` reads
            the whole file.

        Raises
        ------
        ValidationError
            If the header is missing or differs from ``EXPECTED_HEADER``, a
            record has a different number of cells than the header, a
            checked file does not exist, or no record was seen.
        """
        with self.open() as fh:
            header = None
            records_seen = 0
            file_ = enumerate(fh) if n is None else zip(range(n), fh)
            for i, line in file_:
                i = i + 1  # For easier reporting
                if line.lstrip(' ') == '\n':
                    continue  # Blank line
                elif line.startswith('#'):
                    continue  # Comment line

                cells = [c.strip() for c in line.rstrip('\n').split(',')]
                if header is None:
                    # The first non-blank, non-comment line is the header.
                    if cells != self.EXPECTED_HEADER:
                        raise ValidationError(
                            'Found header on line %d with the following '
                            'labels: %s, expected: %s'
                            % (i, cells, self.EXPECTED_HEADER))
                    else:
                        header = cells
                else:
                    if len(cells) != len(header):
                        raise ValidationError(
                            'Line %d has %s cells (%s), expected %s.'
                            % (i, len(cells), cells, len(header)))

                    # Structure checks out, so let's make lookup easy
                    cells = dict(zip(header, cells))

                    # TODO: a bunch of tests in this subpackage aren't well
                    # behaved --- many tests fail on this check because the
                    # test data isn't constructed correctly. As well, there
                    # appear to be framework-related issues preventing us from
                    # making this kind of validation work for the relative
                    # manifest formats at this time.
                    if root == '':
                        fp = os.path.join(root, cells[self.PATH_HEADER_LABEL])
                        if not os.path.exists(os.path.expandvars(fp)):
                            raise ValidationError(
                                'File referenced on line %d could not be '
                                'found (%s).'
                                % (i, fp))

                    records_seen += 1

            if header is None:
                raise ValidationError('No header found, expected: %s.'
                                      % self.EXPECTED_HEADER)

            if records_seen == 0:
                raise ValidationError('No sample records found in manifest, '
                                      'only observed comments, blank lines, '
                                      'and/or a header row.')


class MultiMAGManifestFormat(_FastaManifestBase):
    """Manifest with one ``sample-id, mag-id, filename`` record per MAG."""
    EXPECTED_HEADER = ['sample-id', 'mag-id', 'filename']
    PATH_HEADER_LABEL = 'filename'

    def _validate_(self, level):
        # 'min' validation reads at most the first 10 lines, 'max' reads all.
        self._check_n_records(root=str(self.path.parent),
                              n={'min': 10, 'max': None}[level])


class MultiDirValidationMixin:
    """Mixin requiring files to live in sub-directories.

    Validation fails when the top level of the directory contains anything
    that is not a directory, except for an entry named ``MANIFEST``.
    """

    def _validate_(self, level):
        for p in self.path.iterdir():
            if not p.is_dir() and p.name not in ['MANIFEST']:
                raise ValidationError(
                    "Files should be organised in per-sample directories")


class MultiFASTADirectoryFormat(MultiDirValidationMixin,
                                model.DirectoryFormat):
    """Collection of DNA FASTA files written as ``<sample>/<mag>.fasta``."""
    sequences = model.FileCollection(r'.+\.(fa|fasta)$', format=DNAFASTAFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, mag_id):
        # write out with fasta extension, regardless if input was fa or fasta
        return '%s/%s.fasta' % (sample_id, mag_id)


class MultiMAGSequencesDirFmt(MultiFASTADirectoryFormat):
    """``MultiFASTADirectoryFormat`` plus a top-level ``MANIFEST`` file."""
    manifest = model.File('MANIFEST', format=MultiMAGManifestFormat)


class MultiBowtie2IndexDirFmt(MultiDirValidationMixin, Bowtie2IndexDirFmt):
    """``Bowtie2IndexDirFmt`` with the per-directory layout check added."""
    pass


class ContigSequencesDirFmt(model.DirectoryFormat):
    """Directory of per-sample contig files named ``<sample>_contigs.fasta``.
    """
    # NOTE(review): the '.' in front of the extension group is not escaped,
    # so it matches any single character - confirm whether that is intended.
    pathspec = r'[^\.].+_contigs.(fasta|fa)$'

    sequences = model.FileCollection(pathspec, format=DNAFASTAFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id):
        return r'%s_contigs.fasta' % sample_id

    def sample_dict(self, relative=False):
        '''
        Returns a mapping of sample id to filepath for each set of per-sample
        contigs.

        Parameters
        ---------
        relative : bool
            Whether to return filepaths relative to the directory's location.
            Returns absolute filepaths by default.

        Returns
        -------
        dict
            Mapping of sample id -> filepath as described above. Sorted
            alphabetically by key.
        '''
        contigs_pattern = re.compile(self.pathspec)
        base_dir = self.path.absolute()
        paths_by_sample = {}
        for path in self.path.iterdir():
            if not contigs_pattern.match(path.name):
                continue

            # Everything in front of the last '_contigs' is the sample id
            # (named ``sample_id`` so the ``id`` builtin is not shadowed).
            sample_id = path.name.rsplit('_contigs', 1)[0]
            absolute_path = path.absolute()
            if relative:
                paths_by_sample[sample_id] = str(
                    absolute_path.relative_to(base_dir)
                )
            else:
                paths_by_sample[sample_id] = str(absolute_path)

        return dict(sorted(paths_by_sample.items()))


# borrowed from q2-phylogenomics
class BAMFormat(model.BinaryFileFormat):
    """Binary file accepted when ``samtools quickcheck -v`` exits with 0."""

    def _validate_(self, level):
        # Argument list, no shell: the file path is passed as a single token.
        cmd = ['samtools', 'quickcheck', '-v', str(self)]
        result = subprocess.run(cmd)
        if result.returncode != 0:
            # Same exception class the other formats in this module raise.
            raise ValidationError(
                'samtools quickcheck -v failed on %s' % self.path.name)


# borrowed from q2-phylogenomics
class BAMDirFmt(model.DirectoryFormat):
    """Flat collection of BAM files written as ``<sample>.bam``."""
    bams = model.FileCollection(r'.+\.bam', format=BAMFormat)

    @bams.set_path_maker
    def bams_path_maker(self, sample_id):
        return '%s.bam' % sample_id


class MultiBAMDirFmt(MultiDirValidationMixin, model.DirectoryFormat):
    """Collection of BAM files written as ``<sample>/<genome>.bam``."""
    bams = model.FileCollection(r'.+\/.+\.bam', format=BAMFormat)

    @bams.set_path_maker
    def bams_path_maker(self, sample_id, genome_id):
        # Both values must be passed to '%' as one tuple. Without the
        # parentheses only ``sample_id`` is formatted, which raises
        # "not enough arguments for format string" for a string id.
        return '%s/%s.bam' % (sample_id, genome_id)
FastqManifestFormat, YamlFormat, FastqGzFormat, CasavaOneEightSingleLanePerSampleDirFmt, @@ -517,5 +694,7 @@ def _validate_seq(self, seq): PairedEndFastqManifestPhred64, SingleEndFastqManifestPhred33V2, SingleEndFastqManifestPhred64V2, PairedEndFastqManifestPhred33V2, PairedEndFastqManifestPhred64V2, QIIME1DemuxFormat, QIIME1DemuxDirFmt, - SampleIdIndexedSingleEndPerSampleDirFmt + SampleIdIndexedSingleEndPerSampleDirFmt, MultiFASTADirectoryFormat, + MultiMAGSequencesDirFmt, ContigSequencesDirFmt, MultiBowtie2IndexDirFmt, + BAMDirFmt, MultiBAMDirFmt ) diff --git a/q2_types/per_sample_sequences/_transformer.py b/q2_types/per_sample_sequences/_transformer.py index f8245a8c..1a3e1581 100644 --- a/q2_types/per_sample_sequences/_transformer.py +++ b/q2_types/per_sample_sequences/_transformer.py @@ -6,6 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import os import functools import re import warnings @@ -14,6 +15,7 @@ import yaml import pandas as pd import qiime2.util +from q2_types.feature_data import DNAFASTAFormat from ..plugin_setup import plugin from . 
@plugin.register_transformer
def _28(dirfmt: MultiFASTADirectoryFormat) \
        -> MultiMAGSequencesDirFmt:
    """Delegate to ``_mag_manifest_helper`` to build the MAG directory."""
    output_cls = MultiMAGSequencesDirFmt
    manifest_cls = MultiMAGManifestFormat
    return _mag_manifest_helper(
        dirfmt, output_cls, manifest_cls, DNAFASTAFormat)


@plugin.register_transformer
def _29(ff: MultiMAGManifestFormat) -> pd.DataFrame:
    """Read a MAG manifest into a table indexed by sample id and MAG id.

    Lines starting with ``#`` are skipped and the first remaining row is
    used as the header. Every ``filename`` entry is joined onto the
    directory containing the manifest.
    """
    manifest_dir = ff.path.parent

    def _anchor(filename):
        # Join the manifest's own directory in front of the entry.
        return os.path.join(manifest_dir, filename)

    table = pd.read_csv(str(ff), header=0, comment='#')
    table['filename'] = table['filename'].apply(_anchor)
    return table.set_index(['sample-id', 'mag-id'])
import (QIIME1DemuxDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, - SingleLanePerSamplePairedEndFastqDirFmt) + SingleLanePerSamplePairedEndFastqDirFmt, + MultiMAGSequencesDirFmt, ContigSequencesDirFmt, + MultiBowtie2IndexDirFmt, BAMDirFmt, MultiBAMDirFmt) Sequences = SemanticType('Sequences', variant_of=SampleData.field['type']) @@ -21,10 +26,23 @@ 'PairedEndSequencesWithQuality', variant_of=SampleData.field['type']) JoinedSequencesWithQuality = SemanticType( 'JoinedSequencesWithQuality', variant_of=SampleData.field['type']) +MAGs = SemanticType( + 'MAGs', variant_of=SampleData.field['type']) +Contigs = SemanticType( + 'Contigs', variant_of=SampleData.field['type']) +SingleBowtie2Index = SemanticType( + 'SingleBowtie2Index', variant_of=SampleData.field['type']) +MultiBowtie2Index = SemanticType( + 'MultiBowtie2Index', variant_of=SampleData.field['type']) +AlignmentMap = SemanticType( + 'AlignmentMap', variant_of=SampleData.field['type']) +MultiAlignmentMap = SemanticType( + 'MultiAlignmentMap', variant_of=SampleData.field['type']) -plugin.register_semantic_types(Sequences, SequencesWithQuality, - PairedEndSequencesWithQuality, - JoinedSequencesWithQuality) +plugin.register_semantic_types( + Sequences, SequencesWithQuality, PairedEndSequencesWithQuality, + JoinedSequencesWithQuality, MAGs, Contigs, SingleBowtie2Index, + MultiBowtie2Index, AlignmentMap, MultiAlignmentMap) plugin.register_artifact_class( SampleData[Sequences], @@ -52,3 +70,31 @@ "scores associated with specified samples (i.e., " "demultiplexed sequences).") ) +plugin.register_semantic_type_to_format( + SampleData[MAGs], + artifact_format=MultiMAGSequencesDirFmt +) +plugin.register_semantic_type_to_format( + SampleData[Contigs], + artifact_format=ContigSequencesDirFmt +) +plugin.register_semantic_type_to_format( + SampleData[SingleBowtie2Index], + artifact_format=Bowtie2IndexDirFmt +) +plugin.register_semantic_type_to_format( + SampleData[MultiBowtie2Index], + artifact_format=MultiBowtie2IndexDirFmt 
+) +plugin.register_semantic_type_to_format( + SampleData[AlignmentMap], + artifact_format=BAMDirFmt +) +plugin.register_semantic_type_to_format( + SampleData[MultiAlignmentMap], + artifact_format=MultiBAMDirFmt +) +plugin.register_semantic_type_to_format( + SampleData[BLAST6], + artifact_format=SeedOrthologDirFmt +) diff --git a/q2_types/per_sample_sequences/_util.py b/q2_types/per_sample_sequences/_util.py index 4f2b229e..0bb1c0a0 100644 --- a/q2_types/per_sample_sequences/_util.py +++ b/q2_types/per_sample_sequences/_util.py @@ -9,6 +9,7 @@ import collections import gzip import os +import re import shutil import pandas as pd @@ -327,3 +328,54 @@ def _manifest_to_df(ff, base_dir): values='filename') df.columns.name = None return df + + +def _parse_mag_filename(path): + filename = re.sub(r'\.(fa|fasta)$', '', str(path)) + sample_id, mag_id = filename.rsplit('/', maxsplit=2) + return sample_id, mag_id + + +# TODO: should there be any metadata written as well? +def _mag_manifest_helper(dirfmt, output_cls, manifest_fmt, + fastq_fmt): + result = output_cls() + manifest = manifest_fmt() + manifest_fh = manifest.open() + manifest_fh.write('sample-id,mag-id,filename\n') + for path, view in dirfmt.sequences.iter_views(fastq_fmt): + sample_id, mag_id = _parse_mag_filename(path) + result.sequences.write_data(view, fastq_fmt, + sample_id=sample_id, + mag_id=mag_id) + + filepath = result.sequences.path_maker(sample_id=sample_id, + mag_id=mag_id) + name = f"{filepath.parent.name}/{filepath.name}" + + manifest_fh.write('%s,%s,%s\n' % (sample_id, mag_id, name)) + + manifest_fh.close() + result.manifest.write_data(manifest, manifest_fmt) + + return result + + +# def _bowtie2_fmt_helper(dirfmt, output_cls, bowtie_fmt): +# result = output_cls() +# for path, view in dirfmt.sequences.iter_views(bowtie_fmt): +# sample_id, mag_id = _parse_mag_filename(path) +# result.sequences.write_data(view, bowtie_fmt, +# sample_id=sample_id, +# mag_id=mag_id) +# +# filepath = 
result.sequences.path_maker(sample_id=sample_id, +# mag_id=mag_id) +# name = f"{filepath.parent.name}/{filepath.name}" +# +# manifest_fh.write('%s,%s,%s\n' % (sample_id, mag_id, name)) +# +# manifest_fh.close() +# result.manifest.write_data(manifest, manifest_fmt) +# +# return result diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.2.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.3.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.3.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.4.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.4.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.rev.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.rev.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.rev.2.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-unorganized/contigs.rev.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.2.bt2 
b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.3.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.3.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.4.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.4.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.rev.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.rev.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.rev.2.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample1/mag1/contigs.rev.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.2.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.3.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.3.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.4.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.4.bt2 new file mode 
100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.rev.1.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.rev.1.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.rev.2.bt2 b/q2_types/per_sample_sequences/tests/data/bowtie/index-valid/sample2/mag1/contigs.rev.2.bt2 new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-invalid/sample1.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-invalid/sample1.bam new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-invalid/sample2.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-invalid/sample2.bam new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map1.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map1.bam new file mode 100644 index 00000000..fd652071 Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map1.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map2.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map2.bam new file mode 100644 index 00000000..e6f915fa Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample1/map2.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map1.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map1.bam new file mode 100644 index 00000000..fd652071 Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map1.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map2.bam 
b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map2.bam new file mode 100644 index 00000000..e6f915fa Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-multi/sample2/map2.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample1.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample1.bam new file mode 100644 index 00000000..fd652071 Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample1.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample2.bam b/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample2.bam new file mode 100644 index 00000000..e6f915fa Binary files /dev/null and b/q2_types/per_sample_sequences/tests/data/bowtie/maps-single/sample2.bam differ diff --git a/q2_types/per_sample_sequences/tests/data/contigs/sample1_contigs.fa b/q2_types/per_sample_sequences/tests/data/contigs/sample1_contigs.fa new file mode 100644 index 00000000..484d44ec --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/contigs/sample1_contigs.fa @@ -0,0 +1,51 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAACTTTCCCATGCTCATTTTCTGCTTCAATCAATGACAATACATCTTCGCCTGTG +AGCGGCTCGAAATATAATCTGTCAGAGGTATCATAATCCGTTGAAACGGTTTCAGGATTA +CAATTAACCATGATTGTTTCATAACCCGCCTCTTTTAGTGCATAGGCGGCATGGACACAG +CAATAATCAAATTCAATACCTTGCCCGATACGGTTTGGCCCGCCACCTAGAATAACGATT +TTGTCTTTTTTAGTTGCTGTAATTTCAGAAGTAGAATTAAGTGTTTCATAGGTGCCGTAC +ATATACGATGTTAATGACGGTATTTCTGCCGCACAGCTATCCACCCGCTTATAAACAGGT +TTTACTTTGTGCATCAAACGTGTTTTACGGATCGTTGCTTCTGCAACCCCTACCAATTCA +GCCAGACGCGCATCCGAAAATCCTGCGCGCTTCAATGCCATCCATCCCTGAGGATCTTTC +GGCAGGCCGTTTTTCTTAATGGAGGCCTCAGTATCAATAAGAGATTTTATACGCTCTAAA +TACCACATATCAAATTTTGTTAATTGATAGATAGTTTCTAAATCCATACCGTGTCGCATC +GCTTCGGCTGCATAGAGTAAGCGGGCTGGCGTTGGACGTGAAAGTGCTGCCCGAATATCG +TCCATATCAGGCTCAGACTTACCAGCAATCGGAATGGAGCTAAGCCCCTCTAAGCCCTTT 
+TCTAAAGAGCGCAAAGCTTTTTGCAGAGACTCTTCGAAGCTACGCCCTATAGCCATGGCT +TCACCGACTGACTTCATTGCTGTGGTTAAGGTGTTATCAGAGCCTTTAAATTTCTCGAAA +GCAAAACGAGGCACTTTTGTCACGACATAATCAATGGATGGCTCAAAGGCTGCGGGTGTT +TTGCCGCCTGTAATATCATTGCCTAATTCATCAAGTGTATACCCTACCGCCAATTTCGCT +GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA +TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA +ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC +GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT +ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT 
+TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC +TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/contigs/sample2_contigs.fa b/q2_types/per_sample_sequences/tests/data/contigs/sample2_contigs.fa new file mode 100644 index 00000000..524cddb5 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/contigs/sample2_contigs.fa @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC +CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG +TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG +CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC 
+GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG +ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG +CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC +CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC +GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG +GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG 
+GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC +GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG +TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/contigs/sample3_contigs.fa b/q2_types/per_sample_sequences/tests/data/contigs/sample3_contigs.fa new file mode 100644 index 00000000..489e644b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/contigs/sample3_contigs.fa @@ -0,0 +1,46 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGACTTCTGTCTCCAATCGATATATTGATGGAGAAGAAGGGCTCCATAAAAGAGGATT +ATTAAGTTCTAAATTAACCTCTGCTGTCTCTTTATTTTCCGCTTCTACATTAAGCGAATG +ATCCATCTGCAGTAATGGTTTATCGCCTTCATTAGAAAATATTTTAAATACTACAGTAAC +TGCGCTCCAATGATTATACGTATTAAAGACCTCTGCGCGTAACAATAATTCTACTTTATT +TAGGTCTTGCTGATCATGGAAATGGCAGCGCACTTGAGTTCCGCAGTAGGGCACATAGAC +GCAATTAGTAGCTATTAGTCGAACATCGCGGTAGATGCCGCCGCCCTCATACGACCATAG +TTCAAATTCTCTGGCATCGCAGCGAACCGCGACCACGTTCGGCACATTCGCATCGCAAAG +CTCAGTGATATCGAGAGTGAAACTGGTGTAGCCTGATAGATGCCGCCCAGCTAAATGGCC +ATTTACCCATATTGTTGCATCTCGGTAAATGCCATCAAATTCAAGGTGAATACGCTGCTT +GCTAGCTTCTTTGGGAATTTCAAACGTTTTGCGATACCAGCCTACATCAGTCGGCAGTGA +GCCATGCACAGCATTCGCGGATGCCCGAAATTCGCCTTCAATTACGAAATCATGAGGTAG +GTTTATGTCGCGCCATGCTTCATCTGGATAGCCCAAGCGCGCGACCCCATGGTTTCCTGC +CTTTAACCACTCGGCTCGCTTAAAACGATTAGCATGAATGGCTTGATGGTTAGTGCTGTC +TAGCTCACCTCGATGAAACTTCCAACCTTGGTTAAATTCATAAGTGGTACGCATAGAATT +ACTGATGTCTTTTAAAAGATTCTACAAGTGGAGTCTATTAATTATTTGATAAGTTACTCT +GATTATTTTTAGAGATTTCTAATACAACTCCGCTGCACGTGCCGTAACGTCCGCCTTGGT +ATGCGCAAAACAGATGGGTGGGGACGCCTTCAGAGTTAATCAGTAACTGCGGTCGTTCGA 
+ATCGCCCTTCACGATCCAGTCCAGGCAATGTCTCGTCGAAGTAAGTTCCAGCATCTTTGT +AAGCAACCTGCGGATTTTGCCAAATAAGGCCATCGCTGGATTCCATATACAGCCCATACT +CGTGGTTATAAAAACCCATGTCACGCATGATTATTTTATATGGTGCGCAATCCTTCGGTT +CGTACCATGTGTACGCGTC +>k129_6531 +TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGTTGCCGTTAAATCGCCGCCCAGTGTCCCAATCCCAACCTTTATAGTAGAGCC +AATATTCGCCATTAGGATTTTGTAATAGCGATGGGTTGCTAACGACGGCATCGTCCCAAT +CGCCATCGCTGCCAACATTAATAACAGGTTCATCTCCAACGCGTCGCCAAGGTCCATTTA +TATTATCGGCAATGGCTAAACCGATGCGTTTGGTGTAGACTAATTGATTGAAGTATTTTT +CGTACTCTGCAGTAGATAAATTGGGTAGCTCATTTTGCTCGATATCTAGTTTCGAGCCAT +CTGCTCCCATGTAGAAAAGAGCATACTTGTCGCCGACCTTTTGCACAGTCGGATTGTGGA +TTGCCCATGAGTCCCAAGCATTTGCACCGCTGCCTTTTAGAACGACTCCTAAATCTTCGT +AGGGGCCTTCCGGAAGATCAGCAACCGCATGGGCCACTTCGCAGGCACTTACCCAACCAG +AAAATGTATACTCGTTTCGCCAACGTGAGTAAAAAACATGAATGCGCCCGTCGGGTCCGT +AGATAGGCGAACAGCACCAAACATGATAGCCTTCTACTTCAAGAATTCGCCCCAGTGGTT +TGAGTTTTTGCTCGAAGTTCGAAGTGCTTACTTCAGAGGTGATGGGACGTAGCTTCTGTA +AATTAATGAGCGACTTATTGCTAACTGTAGAGTCCATGAAAAAAAGGTAAACTTTATACG +AGTAATGTTATGCTCCTTAAAACTGTCAAGGTTTAGGCATTTTGCTGAGCATTATGGTGT +TTAATGGGCTTGAATCATAACAGGATTAAGCGACATTTAAATATTAATGATAAGAATTAG +TGATATAGCTAAAGAGTTAGGGCTTTCGAGGGTTACAGTCTCGGCTATTTTAAATGGACG +ACACCAGAAAATAGGTATTTCCGAAAAGACCGCGCAAAGGGTTCGTTCGAGTGCAAAGGC +TATGGGTTATCTACCCAATCAGAATGCATTGAGTATGAAGAGAGGTCGAAGCATGACTAT +TGGTATGCTGAGTAGTGCGCTATCGGAGGAGTGGGGTGCTAAAATTCTTGTTGGTGCATT +AAGTGCGATAAAGAACACGCCTTATTCACTGCGCGTTGAGTCAGTACAGGGAGCAGCAGA +AGAGCGCGGTGCCCTAGAGCGCCTCTTGGGGTCACGAATTGAAGGGTTGTTGTGCTGCAA +TATAAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag1.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag1.fa new file mode 100644 index 00000000..484d44ec --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag1.fa @@ -0,0 +1,51 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAACTTTCCCATGCTCATTTTCTGCTTCAATCAATGACAATACATCTTCGCCTGTG 
+AGCGGCTCGAAATATAATCTGTCAGAGGTATCATAATCCGTTGAAACGGTTTCAGGATTA +CAATTAACCATGATTGTTTCATAACCCGCCTCTTTTAGTGCATAGGCGGCATGGACACAG +CAATAATCAAATTCAATACCTTGCCCGATACGGTTTGGCCCGCCACCTAGAATAACGATT +TTGTCTTTTTTAGTTGCTGTAATTTCAGAAGTAGAATTAAGTGTTTCATAGGTGCCGTAC +ATATACGATGTTAATGACGGTATTTCTGCCGCACAGCTATCCACCCGCTTATAAACAGGT +TTTACTTTGTGCATCAAACGTGTTTTACGGATCGTTGCTTCTGCAACCCCTACCAATTCA +GCCAGACGCGCATCCGAAAATCCTGCGCGCTTCAATGCCATCCATCCCTGAGGATCTTTC +GGCAGGCCGTTTTTCTTAATGGAGGCCTCAGTATCAATAAGAGATTTTATACGCTCTAAA +TACCACATATCAAATTTTGTTAATTGATAGATAGTTTCTAAATCCATACCGTGTCGCATC +GCTTCGGCTGCATAGAGTAAGCGGGCTGGCGTTGGACGTGAAAGTGCTGCCCGAATATCG +TCCATATCAGGCTCAGACTTACCAGCAATCGGAATGGAGCTAAGCCCCTCTAAGCCCTTT +TCTAAAGAGCGCAAAGCTTTTTGCAGAGACTCTTCGAAGCTACGCCCTATAGCCATGGCT +TCACCGACTGACTTCATTGCTGTGGTTAAGGTGTTATCAGAGCCTTTAAATTTCTCGAAA +GCAAAACGAGGCACTTTTGTCACGACATAATCAATGGATGGCTCAAAGGCTGCGGGTGTT +TTGCCGCCTGTAATATCATTGCCTAATTCATCAAGTGTATACCCTACCGCCAATTTCGCT +GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA +TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA +ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC +GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT 
+ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT +TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC +TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag2.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag2.fa new file mode 100644 index 00000000..524cddb5 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag2.fa @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC +CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG +TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG 
+CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC +GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG +ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG +CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC +CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC +GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG 
+GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG +GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC +GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG +TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag3.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag3.fa new file mode 100644 index 00000000..489e644b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample1/mag3.fa @@ -0,0 +1,46 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGACTTCTGTCTCCAATCGATATATTGATGGAGAAGAAGGGCTCCATAAAAGAGGATT +ATTAAGTTCTAAATTAACCTCTGCTGTCTCTTTATTTTCCGCTTCTACATTAAGCGAATG +ATCCATCTGCAGTAATGGTTTATCGCCTTCATTAGAAAATATTTTAAATACTACAGTAAC +TGCGCTCCAATGATTATACGTATTAAAGACCTCTGCGCGTAACAATAATTCTACTTTATT +TAGGTCTTGCTGATCATGGAAATGGCAGCGCACTTGAGTTCCGCAGTAGGGCACATAGAC +GCAATTAGTAGCTATTAGTCGAACATCGCGGTAGATGCCGCCGCCCTCATACGACCATAG 
+TTCAAATTCTCTGGCATCGCAGCGAACCGCGACCACGTTCGGCACATTCGCATCGCAAAG +CTCAGTGATATCGAGAGTGAAACTGGTGTAGCCTGATAGATGCCGCCCAGCTAAATGGCC +ATTTACCCATATTGTTGCATCTCGGTAAATGCCATCAAATTCAAGGTGAATACGCTGCTT +GCTAGCTTCTTTGGGAATTTCAAACGTTTTGCGATACCAGCCTACATCAGTCGGCAGTGA +GCCATGCACAGCATTCGCGGATGCCCGAAATTCGCCTTCAATTACGAAATCATGAGGTAG +GTTTATGTCGCGCCATGCTTCATCTGGATAGCCCAAGCGCGCGACCCCATGGTTTCCTGC +CTTTAACCACTCGGCTCGCTTAAAACGATTAGCATGAATGGCTTGATGGTTAGTGCTGTC +TAGCTCACCTCGATGAAACTTCCAACCTTGGTTAAATTCATAAGTGGTACGCATAGAATT +ACTGATGTCTTTTAAAAGATTCTACAAGTGGAGTCTATTAATTATTTGATAAGTTACTCT +GATTATTTTTAGAGATTTCTAATACAACTCCGCTGCACGTGCCGTAACGTCCGCCTTGGT +ATGCGCAAAACAGATGGGTGGGGACGCCTTCAGAGTTAATCAGTAACTGCGGTCGTTCGA +ATCGCCCTTCACGATCCAGTCCAGGCAATGTCTCGTCGAAGTAAGTTCCAGCATCTTTGT +AAGCAACCTGCGGATTTTGCCAAATAAGGCCATCGCTGGATTCCATATACAGCCCATACT +CGTGGTTATAAAAACCCATGTCACGCATGATTATTTTATATGGTGCGCAATCCTTCGGTT +CGTACCATGTGTACGCGTC +>k129_6531 +TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGTTGCCGTTAAATCGCCGCCCAGTGTCCCAATCCCAACCTTTATAGTAGAGCC +AATATTCGCCATTAGGATTTTGTAATAGCGATGGGTTGCTAACGACGGCATCGTCCCAAT +CGCCATCGCTGCCAACATTAATAACAGGTTCATCTCCAACGCGTCGCCAAGGTCCATTTA +TATTATCGGCAATGGCTAAACCGATGCGTTTGGTGTAGACTAATTGATTGAAGTATTTTT +CGTACTCTGCAGTAGATAAATTGGGTAGCTCATTTTGCTCGATATCTAGTTTCGAGCCAT +CTGCTCCCATGTAGAAAAGAGCATACTTGTCGCCGACCTTTTGCACAGTCGGATTGTGGA +TTGCCCATGAGTCCCAAGCATTTGCACCGCTGCCTTTTAGAACGACTCCTAAATCTTCGT +AGGGGCCTTCCGGAAGATCAGCAACCGCATGGGCCACTTCGCAGGCACTTACCCAACCAG +AAAATGTATACTCGTTTCGCCAACGTGAGTAAAAAACATGAATGCGCCCGTCGGGTCCGT +AGATAGGCGAACAGCACCAAACATGATAGCCTTCTACTTCAAGAATTCGCCCCAGTGGTT +TGAGTTTTTGCTCGAAGTTCGAAGTGCTTACTTCAGAGGTGATGGGACGTAGCTTCTGTA +AATTAATGAGCGACTTATTGCTAACTGTAGAGTCCATGAAAAAAAGGTAAACTTTATACG +AGTAATGTTATGCTCCTTAAAACTGTCAAGGTTTAGGCATTTTGCTGAGCATTATGGTGT +TTAATGGGCTTGAATCATAACAGGATTAAGCGACATTTAAATATTAATGATAAGAATTAG +TGATATAGCTAAAGAGTTAGGGCTTTCGAGGGTTACAGTCTCGGCTATTTTAAATGGACG +ACACCAGAAAATAGGTATTTCCGAAAAGACCGCGCAAAGGGTTCGTTCGAGTGCAAAGGC 
+TATGGGTTATCTACCCAATCAGAATGCATTGAGTATGAAGAGAGGTCGAAGCATGACTAT +TGGTATGCTGAGTAGTGCGCTATCGGAGGAGTGGGGTGCTAAAATTCTTGTTGGTGCATT +AAGTGCGATAAAGAACACGCCTTATTCACTGCGCGTTGAGTCAGTACAGGGAGCAGCAGA +AGAGCGCGGTGCCCTAGAGCGCCTCTTGGGGTCACGAATTGAAGGGTTGTTGTGCTGCAA +TATAAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag1.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag1.fa new file mode 100644 index 00000000..84f04b0b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag1.fa @@ -0,0 +1,41 @@ +>k129_4684 +TGATACCGACGCGGCACTTGAGTGCGCGCTATCCTTCAAGGAAGCCACATGCGTTATTGT +TAAACATGCAAACCCCTGTGGTGTTGCAACTGGGTCATCATTACTTGAGGCATATGAGGG +AGCTTACGCTACAGACCCAACATCTGCTTTTGGTGGCATTATTGCTTTTAATCGAGAATT +AGATTCAAAAACAGCAAAGGCAGTTATTGATAGACAGTTTGTTGAGGTAATCATTGCCCC +ATCTATATCACCCGACAGCATTAAAATTATTGCAAAAAAAGATGGTATACGTTTATTAGA +AGCTGGTTCACGACAAGAAGACATCAAAACTCTTAACATGAAGCGAGTCAGCGGAGGTTT +ATTACTGCAAGACAATGATATTGGGATTATTGATCGTGGTGATATAAAAATTGTTTCAAA +CGAGGTAATTG +>k129_5618 +GTGCTAATCGCACCCTCATGAGCGACACCATTATTCTTTATTTTTGAGTCTTCAGCAAAA +ATAAATACAGTCAAGTCACAACTCTTAGCGTATTCAAATGCGCGTCTTAATACTTCTGTA +TTTTCAATAGCAACATCACCATTACTTACGCCGATGCAGCCTGCAGCTTTTAATAAAAAC +ATCTCTGTCAGTTCTTTTCCATTTAGTTCTTGTGTTAAAGCGCCAAGTGGAAAAATATTT +GCGCGGTTAGATTCACTAGCACGCCGATTTATAAATTCCACAATGGCCGGCGTATCAATA +GTTGGTTGTATATCAGGTGGGACACAAATTGATGTTATACCACTACGGTTTGCAGCTTTA +AGTTCATTTTTGATAGCTATGTTTTTTTCTGAACCAATTTCGCCAAGCCTACCGCAAATA +TCAACTAATCCAGGTAAAATTATTTTATCTTTTGCGTTTATATCCAGATCCGATTTAAAA +>k129_5631 +TCATGATGATCCAAAAGCAGTTGCGGAAGCATCTGGGATAATTACGCGGAGTGGATGTCG +CCGAATCGCAAGATTTACTTTTGATTATGCTATTAAAACAGGAAGAAAAAAAATTACAAT +AGTTCATAAAGCAAATATCCTAAAAGCTCTAACAGGTCTGTTTCTAGAAACAGCAATGGA +AATCGGCAAAGAGTATGGAAATAAACTGGAAATTGAAGAGCGAATTGTCGACAACACAGC +AATGCAATTAGTAATCGATCCAGCGCAATTTAATATAATACTAACAACAAATATGTTCGG +TGATATTCTCTCAGATGAGATTGCGGGTCTAATAGGCGGACTCGGGTTGGCGCCAGGGGG +GAATATTGGTGATGATATAGCAATTTTTGAAGCGGTACACGGAACGGCTCCTGATATTGC 
+TGGAAAAGGGATTGCGAATCCAACAGCACTTTACCTAGCTTCAGCAATGATGTTGGAACA +TATAAATCAAAATAATATGGCCAATAACCTAAGGAAAGCAATTAGAGAAACATTGAAGAA +TAAAAAAAATCGCACAATCGATCTAGGTGGCGAAGCATCCACAAAAGATTATATGTCATA +TGTTATCGATAATTTAAACTAGAAAACAAATGAATGCACTTATACTCTTAGCACATGGAA +GTCGAAGAAGTGAATCTAACCTCGAAGTAGAGAGTTTATCAAATGAAATTTATGCGCTGA +TTAGCAACAAATT +>k129_2817 +GTCGCCAATTAGCAACTATGATGTCTTCTGGAGTACCTTTGGTCCAATCATTTGAAATCA +CAGGTCGTGGCCATGACAACCCAGGAATGCAGAGCCTAATTTTAGCCATCAAGGCTGATG +TTGAATCTGGAAATAGTTTGGTTGATGCCCTTAGAAAACATCCATTACATTTTAACTCGC +TTTATTGCAATTTAATTGAAGCTGGTGAACACGCCGGTATTTTAGAGGCAATTTTACACA +AATTAGCAACATACTTAGAAAAGACAGAAGCTCTGAAATCAAAAATAAAATCGGCTTTAT +TTTATCCAATGGCAGTTATTGTCGCAGCAATTATTGTGGTAACAATTCTGATGATATTTG +TAATACCTCAATTTTCTGAATTATTTGGAAGTTTTGGTGCTGACTTACCGGGTTTGACAC +AATTTTTAATAGATGCATCAGATTTCTTTGTTAGCCACTGGTGGAAATTATTTGGGTTAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag2.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag2.fa new file mode 100644 index 00000000..4a6f71b8 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fa/sample2/mag2.fa @@ -0,0 +1,42 @@ +>k129_5401 +CCATTGTATGTCTTTAGGTAGCTCCTCATGTTTGAGGTTCATGTCTTGGATTTTGTTTTC +TCCAAAAATCCTTTGTCCTGCATCATAAGCTTGCATTACTTCCTCATTGGATTTAGTTTT +AGAAACAGCCACCAGTTGGACTGATGGGGGTATCTCCTTGATTATCCTAGAAATATTTTC +TGCTATATTCATAATACAAACTTACAATTTTCACAGAGTATTTTTTAAAGAATGAATTGA +AATTGAAGTTGAATTAAAGCATTTAAAATTTACAACATTCCATGATTTGATGTGCAAGTT +TCAAAGCACGGGTTCCGCTTTCCAGAGGAACAATAATTGGATCATTATTTTCGATGGCAT +CAGCAAAAACCTCTAACTCTTCCAGTATGGCGTTGGAATCTTCAATTCGTGGATTTTCAA +AATAGATTTGTTTTTTTTCTCCCTGTGCATTTTCAAGAATCATGGCAAAGTCTTTCGGTT +TTTTTGGAGCTTTTTTCATTTTTACTACCTCGACTTTTTTCTCTAGAAAATCTACTGCGA +TATAAGTGTTTTTCTGGAAAAAGCGCGTTTTACGCATTTTTTTTAGTGAAATCCTGCTTG +CGGTAAGGTTGGCAACACAGCCGTTTTCAAACTCAATTCTTGCGTTAGTTATATCGGGGG +TGGAGCTTATTACCGCCACCCCAGAAGCGGATACGGATTTTACTTTGGCATCGACAACAC +TCAACAATACATCAATATCGTGAATCATCAAATCTAAAACCACTGGAACATCGGTTCCTC 
+TTGGATTGAATTCTGCCAAACGATGGGACTCTATAAATTTTGGGTGGGTAATCGATGATT +TTACCGCCTTAAAAGCCGGATTAAAACGTTCTACGTGTCCCACCTGACCAAGAACCATTT +TTTTTGTAGCCATTTCGGTAATTTCAAGGGCTTCGGTAACATTATTGGCAATAGGCTTTT +CTATAAATATATGCTTTCCTTTTTGGATTGCATTTATGGCATTTTTATGGTGGAAAAATG +TAGGAGTGACAATATCTACCATATCGCAAGCTTGGATCAAATCGGCTTCACTTTTAAAAG +CGGTATAGCCATTTTCTTTAGCTAAGGCCTTGGTGTTTTTTTTATCCTGATCATAAAACC +CCACTAATTGGTATCGTTTAGAGGCCTCTAGTAAACGCAGATGAATTTTACCCAAATGTC +CTGCACCCAAGACGCCGACTTTTATCATAGCACTTAATTTTTAATCAAAAATACCATCTT +TTTCTGATTTTTTTTTGGAAGTAATTACATTTGTCCTCATGATTGATTCAACCAAGCATC +AAGGACAAAGAAGACAGTTGGTAAAATTGTTAGAGGAGAAAGGAGTCTACGACAAAAGGG +TTTTGAATGCTGTTGGAAGTGTTCCCCGTCATCTGTTTATGGATTCGGGTTTGGAGGAGT +ATTCCTATATTGACAAAGCCTATCCCATTGCGGCTAATCAGACCATATCACAGCCTTACA +CCGTAGCTTTTCAGACCCAATTGCTGGAACTTCAGAAAGGGGATCGAGTTTTGGAAATCG +GAACGGGTTCGGGCTATCAAACAGCTATTTTAATCGCCCTTGAAGGTCTAAAAGTGTATA +CCATTGAACGCCAACTGGAGTTATATAAAACAACTGTTTTGTTATTTAAAAAGTTGGGGT +TAAATCCCAAAAAAGTGATATTTGGTGATGGTTACCAAGGTTTACCAGATCAAGCACCTT +TTGATGCCATCATCGTTACTGCAGGTGCGCCTCAAGTACCCAAACCTTTGTTGGAACAAT +TGACCATTGGAGGGAGACTCGTAATCCCTGTGGGAGAGAAAGACCAAGTCATGACCCGAT +ATATGCGAACAGGGGAAAAGACCTTTGATCGACAAACCTTTGGGAATTTCAGATTTGTCC +CTTTGCTAAAGGATGAGAGATAGAGCTTGTTAAGTACTTCGTGAATATCGGATTTTCCTT +ACTGAATTTATAGCTCTTGACAATATCAATTGTTTGAAGATGGAAGGTGAAGTATACTTC +AGGCTTCGTAGCTGATAAGAATATTCACTCTTCGATTTTATAAATTTTGTTAAAAAATTG +CTCTACGTCGGTAGTTTTTTGGATTGATTAGAGCAGGTTTACTGTGTGTTGTAAAATTTT +TATAATCTTTAATTTGAGGTTGTTCACTATAATTTGGTGAGAAAAACTATTTATTGAAAT +TTTTTTTAATCCTATCTAAATCCCTTTTATTGTCTAAATCTTTAAGGGCTTCTCTTTTAT +CGTAAAGTTTTTTCCCCCGAGCTAAAGCGATCAACATTTTGGCAAAACCTTTTTCGTTGA +TGAAAAGCTTAAGTGGTACAATGGTCAAGCCGGAATTTTTCACCTGTTTGAAAAGTTTGT +TTAATTCTCTTTTTTTAA diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag1.fasta b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag1.fasta new file mode 100644 index 00000000..484d44ec --- /dev/null +++ 
b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag1.fasta @@ -0,0 +1,51 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAACTTTCCCATGCTCATTTTCTGCTTCAATCAATGACAATACATCTTCGCCTGTG +AGCGGCTCGAAATATAATCTGTCAGAGGTATCATAATCCGTTGAAACGGTTTCAGGATTA +CAATTAACCATGATTGTTTCATAACCCGCCTCTTTTAGTGCATAGGCGGCATGGACACAG +CAATAATCAAATTCAATACCTTGCCCGATACGGTTTGGCCCGCCACCTAGAATAACGATT +TTGTCTTTTTTAGTTGCTGTAATTTCAGAAGTAGAATTAAGTGTTTCATAGGTGCCGTAC +ATATACGATGTTAATGACGGTATTTCTGCCGCACAGCTATCCACCCGCTTATAAACAGGT +TTTACTTTGTGCATCAAACGTGTTTTACGGATCGTTGCTTCTGCAACCCCTACCAATTCA +GCCAGACGCGCATCCGAAAATCCTGCGCGCTTCAATGCCATCCATCCCTGAGGATCTTTC +GGCAGGCCGTTTTTCTTAATGGAGGCCTCAGTATCAATAAGAGATTTTATACGCTCTAAA +TACCACATATCAAATTTTGTTAATTGATAGATAGTTTCTAAATCCATACCGTGTCGCATC +GCTTCGGCTGCATAGAGTAAGCGGGCTGGCGTTGGACGTGAAAGTGCTGCCCGAATATCG +TCCATATCAGGCTCAGACTTACCAGCAATCGGAATGGAGCTAAGCCCCTCTAAGCCCTTT +TCTAAAGAGCGCAAAGCTTTTTGCAGAGACTCTTCGAAGCTACGCCCTATAGCCATGGCT +TCACCGACTGACTTCATTGCTGTGGTTAAGGTGTTATCAGAGCCTTTAAATTTCTCGAAA +GCAAAACGAGGCACTTTTGTCACGACATAATCAATGGATGGCTCAAAGGCTGCGGGTGTT +TTGCCGCCTGTAATATCATTGCCTAATTCATCAAGTGTATACCCTACCGCCAATTTCGCT +GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA +TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA +ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC 
+GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT +ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT +TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC +TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag2.fasta b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag2.fasta new file mode 100644 index 00000000..524cddb5 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag2.fasta @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC +CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG 
+TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG +CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC +GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG +ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG +CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC +CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC 
+GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG +GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG +GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC +GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG +TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag3.fasta b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag3.fasta new file mode 100644 index 00000000..489e644b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample1/mag3.fasta @@ -0,0 +1,46 @@ +>k129_6525 +AAACTCTATCAAGCGTATACCAAAGTGAGTGGTGTATTGATCAGTCAGCTCATTATTGAA +TCGGACTTCTGTCTCCAATCGATATATTGATGGAGAAGAAGGGCTCCATAAAAGAGGATT +ATTAAGTTCTAAATTAACCTCTGCTGTCTCTTTATTTTCCGCTTCTACATTAAGCGAATG 
+ATCCATCTGCAGTAATGGTTTATCGCCTTCATTAGAAAATATTTTAAATACTACAGTAAC +TGCGCTCCAATGATTATACGTATTAAAGACCTCTGCGCGTAACAATAATTCTACTTTATT +TAGGTCTTGCTGATCATGGAAATGGCAGCGCACTTGAGTTCCGCAGTAGGGCACATAGAC +GCAATTAGTAGCTATTAGTCGAACATCGCGGTAGATGCCGCCGCCCTCATACGACCATAG +TTCAAATTCTCTGGCATCGCAGCGAACCGCGACCACGTTCGGCACATTCGCATCGCAAAG +CTCAGTGATATCGAGAGTGAAACTGGTGTAGCCTGATAGATGCCGCCCAGCTAAATGGCC +ATTTACCCATATTGTTGCATCTCGGTAAATGCCATCAAATTCAAGGTGAATACGCTGCTT +GCTAGCTTCTTTGGGAATTTCAAACGTTTTGCGATACCAGCCTACATCAGTCGGCAGTGA +GCCATGCACAGCATTCGCGGATGCCCGAAATTCGCCTTCAATTACGAAATCATGAGGTAG +GTTTATGTCGCGCCATGCTTCATCTGGATAGCCCAAGCGCGCGACCCCATGGTTTCCTGC +CTTTAACCACTCGGCTCGCTTAAAACGATTAGCATGAATGGCTTGATGGTTAGTGCTGTC +TAGCTCACCTCGATGAAACTTCCAACCTTGGTTAAATTCATAAGTGGTACGCATAGAATT +ACTGATGTCTTTTAAAAGATTCTACAAGTGGAGTCTATTAATTATTTGATAAGTTACTCT +GATTATTTTTAGAGATTTCTAATACAACTCCGCTGCACGTGCCGTAACGTCCGCCTTGGT +ATGCGCAAAACAGATGGGTGGGGACGCCTTCAGAGTTAATCAGTAACTGCGGTCGTTCGA +ATCGCCCTTCACGATCCAGTCCAGGCAATGTCTCGTCGAAGTAAGTTCCAGCATCTTTGT +AAGCAACCTGCGGATTTTGCCAAATAAGGCCATCGCTGGATTCCATATACAGCCCATACT +CGTGGTTATAAAAACCCATGTCACGCATGATTATTTTATATGGTGCGCAATCCTTCGGTT +CGTACCATGTGTACGCGTC +>k129_6531 +TCGGATTTGCCGAATGCTTTTTGTAAGGGCCTTCAATTGATTTGGCGATAGCGAGCCCGT +ATTTACGGTTGCCGTTAAATCGCCGCCCAGTGTCCCAATCCCAACCTTTATAGTAGAGCC +AATATTCGCCATTAGGATTTTGTAATAGCGATGGGTTGCTAACGACGGCATCGTCCCAAT +CGCCATCGCTGCCAACATTAATAACAGGTTCATCTCCAACGCGTCGCCAAGGTCCATTTA +TATTATCGGCAATGGCTAAACCGATGCGTTTGGTGTAGACTAATTGATTGAAGTATTTTT +CGTACTCTGCAGTAGATAAATTGGGTAGCTCATTTTGCTCGATATCTAGTTTCGAGCCAT +CTGCTCCCATGTAGAAAAGAGCATACTTGTCGCCGACCTTTTGCACAGTCGGATTGTGGA +TTGCCCATGAGTCCCAAGCATTTGCACCGCTGCCTTTTAGAACGACTCCTAAATCTTCGT +AGGGGCCTTCCGGAAGATCAGCAACCGCATGGGCCACTTCGCAGGCACTTACCCAACCAG +AAAATGTATACTCGTTTCGCCAACGTGAGTAAAAAACATGAATGCGCCCGTCGGGTCCGT +AGATAGGCGAACAGCACCAAACATGATAGCCTTCTACTTCAAGAATTCGCCCCAGTGGTT +TGAGTTTTTGCTCGAAGTTCGAAGTGCTTACTTCAGAGGTGATGGGACGTAGCTTCTGTA +AATTAATGAGCGACTTATTGCTAACTGTAGAGTCCATGAAAAAAAGGTAAACTTTATACG 
+AGTAATGTTATGCTCCTTAAAACTGTCAAGGTTTAGGCATTTTGCTGAGCATTATGGTGT +TTAATGGGCTTGAATCATAACAGGATTAAGCGACATTTAAATATTAATGATAAGAATTAG +TGATATAGCTAAAGAGTTAGGGCTTTCGAGGGTTACAGTCTCGGCTATTTTAAATGGACG +ACACCAGAAAATAGGTATTTCCGAAAAGACCGCGCAAAGGGTTCGTTCGAGTGCAAAGGC +TATGGGTTATCTACCCAATCAGAATGCATTGAGTATGAAGAGAGGTCGAAGCATGACTAT +TGGTATGCTGAGTAGTGCGCTATCGGAGGAGTGGGGTGCTAAAATTCTTGTTGGTGCATT +AAGTGCGATAAAGAACACGCCTTATTCACTGCGCGTTGAGTCAGTACAGGGAGCAGCAGA +AGAGCGCGGTGCCCTAGAGCGCCTCTTGGGGTCACGAATTGAAGGGTTGTTGTGCTGCAA +TATAAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag1.fasta b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag1.fasta new file mode 100644 index 00000000..84f04b0b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag1.fasta @@ -0,0 +1,41 @@ +>k129_4684 +TGATACCGACGCGGCACTTGAGTGCGCGCTATCCTTCAAGGAAGCCACATGCGTTATTGT +TAAACATGCAAACCCCTGTGGTGTTGCAACTGGGTCATCATTACTTGAGGCATATGAGGG +AGCTTACGCTACAGACCCAACATCTGCTTTTGGTGGCATTATTGCTTTTAATCGAGAATT +AGATTCAAAAACAGCAAAGGCAGTTATTGATAGACAGTTTGTTGAGGTAATCATTGCCCC +ATCTATATCACCCGACAGCATTAAAATTATTGCAAAAAAAGATGGTATACGTTTATTAGA +AGCTGGTTCACGACAAGAAGACATCAAAACTCTTAACATGAAGCGAGTCAGCGGAGGTTT +ATTACTGCAAGACAATGATATTGGGATTATTGATCGTGGTGATATAAAAATTGTTTCAAA +CGAGGTAATTG +>k129_5618 +GTGCTAATCGCACCCTCATGAGCGACACCATTATTCTTTATTTTTGAGTCTTCAGCAAAA +ATAAATACAGTCAAGTCACAACTCTTAGCGTATTCAAATGCGCGTCTTAATACTTCTGTA +TTTTCAATAGCAACATCACCATTACTTACGCCGATGCAGCCTGCAGCTTTTAATAAAAAC +ATCTCTGTCAGTTCTTTTCCATTTAGTTCTTGTGTTAAAGCGCCAAGTGGAAAAATATTT +GCGCGGTTAGATTCACTAGCACGCCGATTTATAAATTCCACAATGGCCGGCGTATCAATA +GTTGGTTGTATATCAGGTGGGACACAAATTGATGTTATACCACTACGGTTTGCAGCTTTA +AGTTCATTTTTGATAGCTATGTTTTTTTCTGAACCAATTTCGCCAAGCCTACCGCAAATA +TCAACTAATCCAGGTAAAATTATTTTATCTTTTGCGTTTATATCCAGATCCGATTTAAAA +>k129_5631 +TCATGATGATCCAAAAGCAGTTGCGGAAGCATCTGGGATAATTACGCGGAGTGGATGTCG +CCGAATCGCAAGATTTACTTTTGATTATGCTATTAAAACAGGAAGAAAAAAAATTACAAT +AGTTCATAAAGCAAATATCCTAAAAGCTCTAACAGGTCTGTTTCTAGAAACAGCAATGGA 
+AATCGGCAAAGAGTATGGAAATAAACTGGAAATTGAAGAGCGAATTGTCGACAACACAGC +AATGCAATTAGTAATCGATCCAGCGCAATTTAATATAATACTAACAACAAATATGTTCGG +TGATATTCTCTCAGATGAGATTGCGGGTCTAATAGGCGGACTCGGGTTGGCGCCAGGGGG +GAATATTGGTGATGATATAGCAATTTTTGAAGCGGTACACGGAACGGCTCCTGATATTGC +TGGAAAAGGGATTGCGAATCCAACAGCACTTTACCTAGCTTCAGCAATGATGTTGGAACA +TATAAATCAAAATAATATGGCCAATAACCTAAGGAAAGCAATTAGAGAAACATTGAAGAA +TAAAAAAAATCGCACAATCGATCTAGGTGGCGAAGCATCCACAAAAGATTATATGTCATA +TGTTATCGATAATTTAAACTAGAAAACAAATGAATGCACTTATACTCTTAGCACATGGAA +GTCGAAGAAGTGAATCTAACCTCGAAGTAGAGAGTTTATCAAATGAAATTTATGCGCTGA +TTAGCAACAAATT +>k129_2817 +GTCGCCAATTAGCAACTATGATGTCTTCTGGAGTACCTTTGGTCCAATCATTTGAAATCA +CAGGTCGTGGCCATGACAACCCAGGAATGCAGAGCCTAATTTTAGCCATCAAGGCTGATG +TTGAATCTGGAAATAGTTTGGTTGATGCCCTTAGAAAACATCCATTACATTTTAACTCGC +TTTATTGCAATTTAATTGAAGCTGGTGAACACGCCGGTATTTTAGAGGCAATTTTACACA +AATTAGCAACATACTTAGAAAAGACAGAAGCTCTGAAATCAAAAATAAAATCGGCTTTAT +TTTATCCAATGGCAGTTATTGTCGCAGCAATTATTGTGGTAACAATTCTGATGATATTTG +TAATACCTCAATTTTCTGAATTATTTGGAAGTTTTGGTGCTGACTTACCGGGTTTGACAC +AATTTTTAATAGATGCATCAGATTTCTTTGTTAGCCACTGGTGGAAATTATTTGGGTTAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag2.fasta b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag2.fasta new file mode 100644 index 00000000..4a6f71b8 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-fasta/sample2/mag2.fasta @@ -0,0 +1,42 @@ +>k129_5401 +CCATTGTATGTCTTTAGGTAGCTCCTCATGTTTGAGGTTCATGTCTTGGATTTTGTTTTC +TCCAAAAATCCTTTGTCCTGCATCATAAGCTTGCATTACTTCCTCATTGGATTTAGTTTT +AGAAACAGCCACCAGTTGGACTGATGGGGGTATCTCCTTGATTATCCTAGAAATATTTTC +TGCTATATTCATAATACAAACTTACAATTTTCACAGAGTATTTTTTAAAGAATGAATTGA +AATTGAAGTTGAATTAAAGCATTTAAAATTTACAACATTCCATGATTTGATGTGCAAGTT +TCAAAGCACGGGTTCCGCTTTCCAGAGGAACAATAATTGGATCATTATTTTCGATGGCAT +CAGCAAAAACCTCTAACTCTTCCAGTATGGCGTTGGAATCTTCAATTCGTGGATTTTCAA +AATAGATTTGTTTTTTTTCTCCCTGTGCATTTTCAAGAATCATGGCAAAGTCTTTCGGTT +TTTTTGGAGCTTTTTTCATTTTTACTACCTCGACTTTTTTCTCTAGAAAATCTACTGCGA 
+TATAAGTGTTTTTCTGGAAAAAGCGCGTTTTACGCATTTTTTTTAGTGAAATCCTGCTTG +CGGTAAGGTTGGCAACACAGCCGTTTTCAAACTCAATTCTTGCGTTAGTTATATCGGGGG +TGGAGCTTATTACCGCCACCCCAGAAGCGGATACGGATTTTACTTTGGCATCGACAACAC +TCAACAATACATCAATATCGTGAATCATCAAATCTAAAACCACTGGAACATCGGTTCCTC +TTGGATTGAATTCTGCCAAACGATGGGACTCTATAAATTTTGGGTGGGTAATCGATGATT +TTACCGCCTTAAAAGCCGGATTAAAACGTTCTACGTGTCCCACCTGACCAAGAACCATTT +TTTTTGTAGCCATTTCGGTAATTTCAAGGGCTTCGGTAACATTATTGGCAATAGGCTTTT +CTATAAATATATGCTTTCCTTTTTGGATTGCATTTATGGCATTTTTATGGTGGAAAAATG +TAGGAGTGACAATATCTACCATATCGCAAGCTTGGATCAAATCGGCTTCACTTTTAAAAG +CGGTATAGCCATTTTCTTTAGCTAAGGCCTTGGTGTTTTTTTTATCCTGATCATAAAACC +CCACTAATTGGTATCGTTTAGAGGCCTCTAGTAAACGCAGATGAATTTTACCCAAATGTC +CTGCACCCAAGACGCCGACTTTTATCATAGCACTTAATTTTTAATCAAAAATACCATCTT +TTTCTGATTTTTTTTTGGAAGTAATTACATTTGTCCTCATGATTGATTCAACCAAGCATC +AAGGACAAAGAAGACAGTTGGTAAAATTGTTAGAGGAGAAAGGAGTCTACGACAAAAGGG +TTTTGAATGCTGTTGGAAGTGTTCCCCGTCATCTGTTTATGGATTCGGGTTTGGAGGAGT +ATTCCTATATTGACAAAGCCTATCCCATTGCGGCTAATCAGACCATATCACAGCCTTACA +CCGTAGCTTTTCAGACCCAATTGCTGGAACTTCAGAAAGGGGATCGAGTTTTGGAAATCG +GAACGGGTTCGGGCTATCAAACAGCTATTTTAATCGCCCTTGAAGGTCTAAAAGTGTATA +CCATTGAACGCCAACTGGAGTTATATAAAACAACTGTTTTGTTATTTAAAAAGTTGGGGT +TAAATCCCAAAAAAGTGATATTTGGTGATGGTTACCAAGGTTTACCAGATCAAGCACCTT +TTGATGCCATCATCGTTACTGCAGGTGCGCCTCAAGTACCCAAACCTTTGTTGGAACAAT +TGACCATTGGAGGGAGACTCGTAATCCCTGTGGGAGAGAAAGACCAAGTCATGACCCGAT +ATATGCGAACAGGGGAAAAGACCTTTGATCGACAAACCTTTGGGAATTTCAGATTTGTCC +CTTTGCTAAAGGATGAGAGATAGAGCTTGTTAAGTACTTCGTGAATATCGGATTTTCCTT +ACTGAATTTATAGCTCTTGACAATATCAATTGTTTGAAGATGGAAGGTGAAGTATACTTC +AGGCTTCGTAGCTGATAAGAATATTCACTCTTCGATTTTATAAATTTTGTTAAAAAATTG +CTCTACGTCGGTAGTTTTTTGGATTGATTAGAGCAGGTTTACTGTGTGTTGTAAAATTTT +TATAATCTTTAATTTGAGGTTGTTCACTATAATTTGGTGAGAAAAACTATTTATTGAAAT +TTTTTTTAATCCTATCTAAATCCCTTTTATTGTCTAAATCTTTAAGGGCTTCTCTTTTAT +CGTAAAGTTTTTTCCCCCGAGCTAAAGCGATCAACATTTTGGCAAAACCTTTTTCGTTGA +TGAAAAGCTTAAGTGGTACAATGGTCAAGCCGGAATTTTTCACCTGTTTGAAAAGTTTGT +TTAATTCTCTTTTTTTAA diff --git 
a/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag1.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag1.fa new file mode 100644 index 00000000..484d44ec --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag1.fa @@ -0,0 +1,51 @@ +>k129_5480 +TTATTTTCAAGATAATGAGCCAATTTAAGCGGTGTCTGGCCGCCAAGCTGCACGATCACA +CCTTTAACTTTCCCATGCTCATTTTCTGCTTCAATCAATGACAATACATCTTCGCCTGTG +AGCGGCTCGAAATATAATCTGTCAGAGGTATCATAATCCGTTGAAACGGTTTCAGGATTA +CAATTAACCATGATTGTTTCATAACCCGCCTCTTTTAGTGCATAGGCGGCATGGACACAG +CAATAATCAAATTCAATACCTTGCCCGATACGGTTTGGCCCGCCACCTAGAATAACGATT +TTGTCTTTTTTAGTTGCTGTAATTTCAGAAGTAGAATTAAGTGTTTCATAGGTGCCGTAC +ATATACGATGTTAATGACGGTATTTCTGCCGCACAGCTATCCACCCGCTTATAAACAGGT +TTTACTTTGTGCATCAAACGTGTTTTACGGATCGTTGCTTCTGCAACCCCTACCAATTCA +GCCAGACGCGCATCCGAAAATCCTGCGCGCTTCAATGCCATCCATCCCTGAGGATCTTTC +GGCAGGCCGTTTTTCTTAATGGAGGCCTCAGTATCAATAAGAGATTTTATACGCTCTAAA +TACCACATATCAAATTTTGTTAATTGATAGATAGTTTCTAAATCCATACCGTGTCGCATC +GCTTCGGCTGCATAGAGTAAGCGGGCTGGCGTTGGACGTGAAAGTGCTGCCCGAATATCG +TCCATATCAGGCTCAGACTTACCAGCAATCGGAATGGAGCTAAGCCCCTCTAAGCCCTTT +TCTAAAGAGCGCAAAGCTTTTTGCAGAGACTCTTCGAAGCTACGCCCTATAGCCATGGCT +TCACCGACTGACTTCATTGCTGTGGTTAAGGTGTTATCAGAGCCTTTAAATTTCTCGAAA +GCAAAACGAGGCACTTTTGTCACGACATAATCAATGGATGGCTCAAAGGCTGCGGGTGTT +TTGCCGCCTGTAATATCATTGCCTAATTCATCAAGTGTATACCCTACCGCCAATTTCGCT +GCCACTTTAGCAATCGGAAAACCTGTAGCTTTTGAGGCTAAAGCAGAAGAACGAGACACA +CGAGGGTTCATCTCAATCACCACCATACGGCCTGTCTCTGGATCCATTCCAAATTGGACA +TTCGATCCACCTGTTTCAACACCAATCACACGAAGTACGGCCAATGAGGCATTGCGCATG +ATTTGATACTCTTTATCTGTCAGTGTTAAGGCTGGAGCAACGGTAATAGAATCACCTGTA +TGCACGCCCATAGGGTCAATGTTTTCAATCGAACAAATAATGATAGCGTTGTCCTTTGTA +TCACGAACAACCTCCATCTCGTATTCTTTCCAACCCAATAAACTCTCATCAATCAACACT +TCATTGGTTGGTGACGCATCCAAGCCTTCACGAATGATTTGTTCAAACTCATCTTTGTTA +TAAGCAACCCCGCCACCAGAACCACCCATGGTAAAGGATGGACGAATAATCGCTGGTAAG +CCTGTATGTTTCAGAGCCTCTCTAGCCTCTTCCATAGAATGCACCACCGCACTTTTAGGA +CTTTCAAGACCAATCTTCTCCATACAATCTTTAAATAATTGGCGGTCTTCAGCCTTTTCA 
+ATGGCTTCTTTATTGGCACCGATCAGTTCAATATTGAGTCTTTTTAATACACCCATTTTA +TCAAGAGCCAGTGCAGCATTCAGTGCCGTCTGACCACCCATGGTTGGAAGCAACGCATCG +GGGCGTTCTTTTTCTAAAATCTTTGCGACAATTTCTGGGGTGATTGGCTCAATATAAGTC +GCATCAGCCATATTCGGATCAGTCATAATTGTGGCTGGATTAGAATTAATCAGGACAACG +CGGTACCCCTCTTCTTTCAGCGCTTTACAGGCTTGTGCACCTGAATAGTCAAATTCACAG +GCTTGACCTATCACGATAGGACCAGCGCCAATAATACAAATGGAGGAAATGTCGGTGCGT +TTAGGCATGTGAATCTCGGTTTCTTTTTTTTATACTTACCGAGAGTTAGTTTATGCACTT +ATCAGGGTGTGCAGACAAGCTCTTTCTTGACCTTACCCGCAAGTTTAGCTATATTCTATC +AACAGCCCGCCCTTGATGGCGGGTTATTTTATTGAAAAGGTGCAAGGCTATGCAAAAAAT +ACCCTTAACAAAACAAGGCCACACAGACCTTGAAGCAGAATTAAAAGATTTAAAACACCG +CCAACGTCCAGCGGTTATTGCTGCGATATCTGAAGCCAGAGAACATGGCGATTTATCAGA +AAACGCTGAATATCACGCCGCCCGTGAGCAGCAAAGCTTTATCGAAGGTCGTATCGAGCA +AGTCGAAGCTATTTTATCGCTCGCTGAGATTATTGACCCGGCCAAAATTTCTGGTGACAC +GGTAAAATTTGCAGCAACTGTTAAAGTCGTTGATTGTGACACAGATGATGAACATATCTA +CCAAATCGTCGGTGATGAAGAATCAGACATTGAAACAGGAAAACTGGCTATCTCGTCACC +TGTTGCCCGCGCTTTAATCGGCAAAAAAGTTGAGGACTCAGTCGAAGTCCGCACACCAAA +AGGCACAAGAGAATACGAAATTTTAGAAATTCTGTATAAGTAATTTCTATTCTTCGATCG +GTACGCCAGGCTTCTTGAAATTACGTTTCATAATAAGTGATGACTTAACAGAGCGAACAT +TTTTTAGCGCTGTCAGTTCTTCTGTAATAAAACGCTGATAAGCATCCCAATCTTTGGCCA +CAATACGGAGTGTGAAATCCATATCACCCGCAATCATGTAACAATCACGAACGAGATCCA +TTTTCTCAACGGCTTTGATAAAGGCCTGAAGGTCTTTTTCTGAAGTGTCTTCTAAAGCTA +CATTGGCAAAAACCGCCACACCATAGCCTAACATTGAAGCACTTAAATCCGCATGATAAC +TTTGGATATAACCATAATCTTCCAAT diff --git a/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag2.fa b/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag2.fa new file mode 100644 index 00000000..524cddb5 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/mags/mags-unorganized/mag2.fa @@ -0,0 +1,64 @@ +>k129_5112 +CCCCGGAAAGGGCTGGCGACCGACGATGACCTCGGGAAGCCCCAACTCGCGGCCGATGGC +GCGTACCTCGTCTTTGAAGAGGGTGCGAAGGGGCTCGACGAGGTCGAACTGGAGGTCTTC +GGGCAGCCCACCGACGTTGTGGTGGCTCTTGATGTTGGCGGTTCCAGCCCCGCCACCGGA +CTCGACGACATCCGGATACAGGGTGCCCTGCACGAGGAAGCGGATGGGTTCGCCGTCGGC 
+CTTGGCCTCATCCACGAGCTCGCGCTGCACCCGCTCGAACGCACGGATGAACTCGCGACC +GATGATCTTGCGCTTCTCTTCGGGATCGCTGACGCCGGCGAGGGCCTCGAGGAACGTCTC +GCGGGCGTCGACGGTCACGAGGCGCACACCGGTCGAGGCTACGTAATCCTGCTCGACCTG +TTCGCGTTCGCCCTTGCGCAGCAGGCCGTGGTCGACGAACACGGCAACGAGCTGGTCGCC +GACTGCCTTGTGAACGAGGGCCGTCGAGACAGCCGAGTCGACTCCGCCCGACAGCGCCGA +GAGCACACGACCCGAGCCGACCTGCGCGCGGATCCGATCGACCTGCTCGGCGATGACGTT +GCCGCTGTTCCAGTCTGCGGGGAGGCCCGCAGCCTTGTGCAGGAAGTTCTCGATGATGCG +CTGCCCGTGGTCGGAGTGCTTGACCTCGGGATGCCACTGCACACCGTACATGCGGCGAGC +GTCGTTGCCGAAAGCGGCGACCGGGGTGGCACCGGTGCGGGCGAGCACCTCGAACCCGGC +GGGGGCTTCGGACACCTGGTCACCATGGCTCATCCAGACGTTCTGCTCCGCGGGCTGGCC +ATCGAACAGTACGCTCTCGTCACGGATGATGCTGGCGTCAGTCGCCCCGTACTCGCGCAG +CCCCGTGTTCGCAACGACGCCACCGAGCGCCTGCGCCATGACCTGGAATCCGTAGCAGAT +GCCAAGGGTCGGAACGCCCAGGTCGAACACCGCCGGGTCGAGCGTCGGCGCGCCAGGCTC +GTACACCGATGACGGTCCGCCCGACAGGATGATGCCGATCGGATCTTTTGCGGCAATCTC +TTCAGCTGTCGCGGTGTGCGGAACCAGCTCGCTGTAGACGCCCGCTTCGCGCACGCGACG +GGCAATGAGCTGGGCGTACTGCGCGCCGAAGTCGACGACGAGGACGGGTCGCTGCGAGGT +CTCGGTCTGTTCTGTCACCGGATGCTTTCGGTCGGCGCCCCTGGAACCCAGGAGCGAAGG +TCAGGACACTGTGGGGTTCTGGCGGGTCACGCTGGAGTGTTCGGCGAGATCGTGGTTCTC +GGACTCGCGCGCAGCAAGGTACGTCTTGACCTCACGGGCGACCCGTGCCTCCATGAAGAA +CGACAGGAACGGGACGATTCCGCCCAGCGCGAGGGCGATGAACCGACCGAACCGCCACCG +CATCAGGCTCCAGATGCGGAAGCACGCGAAGAGGTACACGACGTAGAACCAGCCGTGGCC +GACGAGGATCGACAGCGACACATTGACGCCGTCGCCCGCCGACTCGAGGTCGCAGCCCAG +ACCCCCGGGCACGAAGAGCGAGTACCACTCGCATCCGGGCCCGACCAGCACCGGTGCGAA +CCAGAGGAAGCCACCGGACCCGCCGGCGAACAGTTCGACGTGCAGCGGCGAGTACTTGAG +GATCATCTCGGCCAGCAGCAGGAGCAGCATGACACCGGTGATGATCGAGGCGACCTGGTA +GAAGGTCAAGGCTCCGCGAATGGCCGGGAAAGACGACGGTTTCGGCTCACGGGGCATGGG +CCCATTCTAGTCGCCGGTTGCGGTCGCGCTTCCCGACGAGGATGCCGCGGCTGCGGCATC +CTCGAGCTCTTCGACTTCCTTCTCCCACGCATCCTTGGCGAGGCGGTACCAGAAATAGAA +GGCGAAGCCGGCGAAGACCACCCACTCGGCGGCGTAGAAGATGTTCAGCCAGTTGACTGT +GGACCCGGCATCCGGCGCGGGCGAGGCGATGTCCACAAGGCCCGCCGGCGCAGACTGCGA +GGCGATGTAGCTGCGATAGACGTCCAGGCCCGCGGTGTCGTGCCACTGCGACAGGAGCGC +CGCCGGCGACATCCGTGTCATCGTGAACGGCGGCTCGCCGCGCGGCGGCGGCACCGGACC 
+CTCGTCCGAGATCAACCGACCGACGACCGTCACGGACTCCCCCGCGACCGCAGTCTGCTC +GAGCGCCTCGGCGGCGGATTCGGCGACGGTGAGCGTCGGCGCCCAGCCGACGGCGACGGC +CACGGATGTCGGCGTCGCGGTGTCGGCGATACGCAGCTGACCGGTGACCCAGAAGCCTTC +GACGCCGTCGTTGAAGCGCGACGAAACGACAAGGAAATCCTCGGGAACCCACGTGCCCGT +CACCTCGACGCGCTGGCCCACGAGCGGCTCGGGAAGGTACTCGCCGGGCCCGGCGATCTC +GGCGAGCGGCCTGACCTCTTCGGTGGTCCCGGGCGGGAGTGGGTCGGTGTCGATAGCGCG +CGAGAGCTGCCACTGCCCGAGCCACGCGAACACCCCCGCTACGACGAGCGCGAGCAGCAG +GACGCCGATCCAGCGGGGTCGGAGCATGACCTCCCGCAGGGTCGGGGGAAAGACTGTCTG +GTCTGTCATCCGCCCGTATACGGCGCGACGACCACCTCGACGCGCTGGAACTCCTTGAGA +TCGGAGTACCCGGTCGTGGCCATCGACTTCTTCAGCGCCCCGATCAGGTTCGCGGTTCCG +TCAGCCACCGGAGCCGGACCGTAGAGCACGGATTCGAGGTTCGTCACCTGATCCACCTTC +ACGCGGCGACCGCGCGGGAGCTTCGAGTGGTGAGCCTCCGGCCCCCAGTGGTATCCACGA +CCGGGGGCGTCGGTTGCCCGCGAGAGAGCGACGCCGAGCATGACGGCATCCGCTCCCATC +GCGAGCGCCTTGACGATGTCGCCTGACGTTCCCACACCGCCATCGGCGATGACGTGGACG +TAGCGCCCGCCCGACTCGTCGAGGTAGTCGCGGCGCGCGCCGGCGACGTCGGCTACCGCC +GTGGCCATCGGGGCGTGGATGCCGAGAACCCCGCGCGTCGTCGAGGCTGCGCCCCCGCCG +AAGCCGACGAGCACGCCCGCGGCGCCCGTGCGCATGAGGTGCAGGGCTGCCGTGTAGGTC +GCAGCACCGCCGACGATGACAGGCACGTCGAGGTCGTAGATGAACTTCTTGAGGTTGAGG +GGCTCGTCGACGCTCGAGACGTGCTCGGCCGAAACCGTCGTGCCACGGATGACGAACAGG +TCCACACCCGCGGCGACCACGGTTTCGTACAGCTGCTGGGTGCGCTGCGGAGTCAAAGCA +CCGGCCACCGTGACTCCGGCGTCACGGATCTGCTGCAGTCGCTCACGGATGAGCTCGGGC +TTGATCGGCTCGGAGTAGAGCTCCTGCATCCGGCAGGTTGCCGTCGCCTCGTCGAGAGAC +GCGATCTCAGCCAGCAGCGGCTCGGGGTCGTCGTACCGGGTCCAGAGCCCCTCGAGATCG +AGGACACCGAGTCCGCCGAGCTGACCGAGCATGATCGCCGTCTGCGGGCTCACAACCGAA +TCCATCGGGGCGCCGAGCACCGGGATGTCGAACTGGAACGCGTCGATCGACCATGCGGTC +GAGACATCCTCGGGATTGCGGGTGCGGCGCGAGGGAACGACGGCGATGTCGTCGAACGAG +TACGCGCGGCGAGCCCGCTTGGCGCGGCCGATCTCGATC diff --git a/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-empty b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-empty new file mode 100644 index 00000000..e69de29b diff --git a/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-mags-fa b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-mags-fa 
new file mode 100644 index 00000000..e21594a3 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-mags-fa @@ -0,0 +1,6 @@ +sample-id,mag-id,filename +sample1,mag1,sample1/mag1.fasta +sample1,mag2,sample1/mag2.fasta +sample1,mag3,sample1/mag3.fasta +sample2,mag1,sample2/mag1.fasta +sample2,mag2,sample2/mag2.fasta diff --git a/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-column b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-column new file mode 100644 index 00000000..96f1847e --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-column @@ -0,0 +1,6 @@ +sample-id,filename +sample1,sample1/mag1.fa +sample1,sample1/mag2.fa +sample1,sample1/mag3.fa +sample2,sample2/mag1.fa +sample2,sample2/mag2.fa diff --git a/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-filepath b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-filepath new file mode 100644 index 00000000..54ce9a4b --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-missing-filepath @@ -0,0 +1,6 @@ +sample-id,mag-id,filename +sample1,mag1 +sample1,mag2,sample1/mag2.fasta +sample1,mag3 +sample2,mag1,sample2/mag1.fasta +sample2,mag2 diff --git a/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-no-samples b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-no-samples new file mode 100644 index 00000000..fc1abf15 --- /dev/null +++ b/q2_types/per_sample_sequences/tests/data/manifests/MANIFEST-no-samples @@ -0,0 +1 @@ +sample-id,mag-id,filename diff --git a/q2_types/per_sample_sequences/tests/test_format.py b/q2_types/per_sample_sequences/tests/test_format.py index 37d9a276..01ebe0ca 100644 --- a/q2_types/per_sample_sequences/tests/test_format.py +++ b/q2_types/per_sample_sequences/tests/test_format.py @@ -10,6 +10,8 @@ import shutil import unittest import string +from pathlib import Path +from unittest.mock import patch, Mock 
import pandas as pd from q2_types.per_sample_sequences import ( @@ -24,7 +26,9 @@ SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, QIIME1DemuxFormat, QIIME1DemuxDirFmt, - SampleIdIndexedSingleEndPerSampleDirFmt + SampleIdIndexedSingleEndPerSampleDirFmt, MultiMAGManifestFormat, + MultiFASTADirectoryFormat, MultiBowtie2IndexDirFmt, ContigSequencesDirFmt, + MultiBAMDirFmt, BAMDirFmt ) from qiime2.plugin.testing import TestPluginBase from qiime2.plugin import ValidationError @@ -600,5 +604,146 @@ def test_directory_format_wrong_filename(self): QIIME1DemuxDirFmt(self.temp_dir.name, mode='r').validate() +class TestMultiMAGManifestFormat(TestPluginBase): + package = 'q2_types.per_sample_sequences.tests' + + def template_manifest(self, filepath, ctx): + with open(filepath) as fh: + tmpl = string.Template(fh.read()) + basename = os.path.basename(filepath) + file_ = os.path.join(self.temp_dir.name, basename) + with open(file_, 'w') as fh: + fh.write(tmpl.substitute(**ctx)) + return file_ + + def test_multifasta_manifest(self): + manifest_fp = self.get_data_path('manifests/MANIFEST-mags-fa') + format = MultiMAGManifestFormat(manifest_fp, mode='r') + + format.validate() + + def test_multifasta_manifest_missing_column(self): + manifest_fp = self.get_data_path('manifests/MANIFEST-missing-column') + format = MultiMAGManifestFormat(manifest_fp, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'Found header .* with the following labels'): + format.validate() + + def test_multifasta_manifest_missing_file(self): + manifest_fp = self.get_data_path('manifests/MANIFEST-missing-filepath') + format = MultiMAGManifestFormat(manifest_fp, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'Line 2 has 2 cells .* expected 3'): + format.validate() + + def test_multifasta_manifest_no_samples(self): + manifest_fp = self.get_data_path('manifests/MANIFEST-no-samples') + format = MultiMAGManifestFormat(manifest_fp, mode='r') + + with 
self.assertRaisesRegex( + ValidationError, 'No sample records found'): + format.validate() + + def test_multifasta_manifest_empty(self): + manifest_fp = self.get_data_path('manifests/MANIFEST-empty') + format = MultiMAGManifestFormat(manifest_fp, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'No header found, expected'): + format.validate() + + +class TestMultiFormats(TestPluginBase): + package = 'q2_types.per_sample_sequences.tests' + + def test_multifasta_dirfmt_fa(self): + dirpath = self.get_data_path('mags/mags-fa') + format = MultiFASTADirectoryFormat(dirpath, mode='r') + + format.validate() + + def test_multifasta_dirfmt_fasta(self): + dirpath = self.get_data_path('mags/mags-fasta') + format = MultiFASTADirectoryFormat(dirpath, mode='r') + + format.validate() + + def test_multifasta_dirfmt_unorganized(self): + dirpath = self.get_data_path('mags/mags-unorganized') + format = MultiFASTADirectoryFormat(dirpath, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'should be .* per-sample directories'): + format.validate() + + def test_multibowtie_index_dirfmt(self): + dirpath = self.get_data_path('bowtie/index-valid') + format = MultiBowtie2IndexDirFmt(dirpath, mode='r') + + format.validate() + + def test_multibowtie_index_dirfmt_unorganized(self): + dirpath = self.get_data_path('bowtie/index-unorganized') + format = MultiBowtie2IndexDirFmt(dirpath, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'should be .* per-sample directories'): + format.validate() + + def test_contig_seqs_dirfmt(self): + filepath = self.get_data_path('contigs/') + shutil.copytree(filepath, self.temp_dir.name, dirs_exist_ok=True) + ContigSequencesDirFmt(self.temp_dir.name, mode='r').validate() + + def test_contig_seqs_dirfmt_sample_dict(self): + filepath = self.get_data_path('contigs/') + shutil.copytree(filepath, self.temp_dir.name, dirs_exist_ok=True) + contigs = ContigSequencesDirFmt(self.temp_dir.name, mode='r') + + obs = contigs.sample_dict() + 
exp = { + 'sample1': str(Path(contigs.path / 'sample1_contigs.fa')), + 'sample2': str(Path(contigs.path / 'sample2_contigs.fa')), + 'sample3': str(Path(contigs.path / 'sample3_contigs.fa')) + } + self.assertEqual(obs, exp) + + obs = contigs.sample_dict(relative=True) + exp = { + 'sample1': 'sample1_contigs.fa', + 'sample2': 'sample2_contigs.fa', + 'sample3': 'sample3_contigs.fa' + } + self.assertEqual(obs, exp) + + @patch('subprocess.run', return_value=Mock(returncode=0)) + def test_bam_dirmt(self, p): + filepath = self.get_data_path('bowtie/maps-single') + format = BAMDirFmt(filepath, mode='r') + + format.validate() + + @patch('subprocess.run', return_value=Mock(returncode=3)) + def test_bam_dirmt_invalid(self, p): + # this patch is not ideal but samtools' installation sometimes can + # be messed up and the tool returns an error regardless of the invoked + # command, so let's just assume here that it works as it should + filepath = self.get_data_path('bowtie/maps-invalid') + format = BAMDirFmt(filepath, mode='r') + + with self.assertRaisesRegex( + ValidationError, 'samtools quickcheck -v failed on'): + format.validate() + + @patch('subprocess.run', return_value=Mock(returncode=0)) + def test_multibam_dirmt(self, p): + filepath = self.get_data_path('bowtie/maps-multi') + format = MultiBAMDirFmt(filepath, mode='r') + + format.validate() + + if __name__ == "__main__": unittest.main() diff --git a/q2_types/per_sample_sequences/tests/test_transformer.py b/q2_types/per_sample_sequences/tests/test_transformer.py index dd948fdd..e4e146a7 100644 --- a/q2_types/per_sample_sequences/tests/test_transformer.py +++ b/q2_types/per_sample_sequences/tests/test_transformer.py @@ -16,6 +16,7 @@ import skbio import yaml import pandas as pd +from pandas._testing import assert_frame_equal from q2_types.per_sample_sequences import ( SingleLanePerSampleSingleEndFastqDirFmt, @@ -34,7 +35,8 @@ PairedEndFastqManifestPhred64V2, QIIME1DemuxDirFmt, FastqGzFormat, - 
SampleIdIndexedSingleEndPerSampleDirFmt) + SampleIdIndexedSingleEndPerSampleDirFmt, + MultiFASTADirectoryFormat, MultiMAGManifestFormat, MultiMAGSequencesDirFmt) from q2_types.per_sample_sequences._util import ( _validate_header, _validate_single_end_fastq_manifest_directions, @@ -1188,5 +1190,75 @@ def test_paired_end_fastq_manifest_phred64_to_slpspefdf(self): self.assertEqual(obs_manifest.read(), self.exp_pe_manifest) +class TestMAGTransformers(TestPluginBase): + package = "q2_types.per_sample_sequences.tests" + + def setUp(self): + super().setUp() + + @staticmethod + def construct_manifest(ext): + exp_manifest = ( + "sample-id,mag-id,filename\n" + f"sample1,mag1,sample1/mag1.{ext}\n" + f"sample1,mag2,sample1/mag2.{ext}\n" + f"sample1,mag3,sample1/mag3.{ext}\n" + f"sample2,mag1,sample2/mag1.{ext}\n" + f"sample2,mag2,sample2/mag2.{ext}\n" + ) + return exp_manifest + + def apply_transformation(self, from_fmt, to_fmt, datafile_fp): + transformer = self.get_transformer(from_fmt, to_fmt) + fp = self.get_data_path(datafile_fp) + return transformer(from_fmt(fp, 'r')) + + def test_multifile_dirfmt_to_mag_seqs_dirfmt_fa(self): + obs = self.apply_transformation( + MultiFASTADirectoryFormat, + MultiMAGSequencesDirFmt, + 'mags/mags-fa' + ) + with obs.manifest.view(MultiMAGManifestFormat).open() as obs_manifest: + self.assertEqual( + obs_manifest.read(), self.construct_manifest('fasta') + ) + + def test_multifile_dirfmt_to_mag_seqs_dirfmt_fasta(self): + obs = self.apply_transformation( + MultiFASTADirectoryFormat, + MultiMAGSequencesDirFmt, + 'mags/mags-fasta' + ) + with obs.manifest.view(MultiMAGManifestFormat).open() as obs_manifest: + self.assertEqual( + obs_manifest.read(), self.construct_manifest('fasta') + ) + + def test_mag_manifest_to_df(self): + obs = self.apply_transformation( + MultiMAGManifestFormat, + pd.DataFrame, + 'manifests/MANIFEST-mags-fa' + ) + exp = pd.DataFrame({ + 'sample-id': [ + 'sample1', 'sample1', 'sample1', 'sample2', 'sample2' + ], + 'mag-id': 
['mag1', 'mag2', 'mag3', 'mag1', 'mag2'], + 'filename': [ + os.path.join(self.get_data_path('manifests'), x) + for x in [ + 'sample1/mag1.fasta', 'sample1/mag2.fasta', + 'sample1/mag3.fasta', 'sample2/mag1.fasta', + 'sample2/mag2.fasta' + ] + ] + }) + exp.set_index(['sample-id', 'mag-id'], inplace=True) + + assert_frame_equal(exp, obs) + + if __name__ == '__main__': unittest.main() diff --git a/q2_types/per_sample_sequences/tests/test_type.py b/q2_types/per_sample_sequences/tests/test_type.py index a94137de..584875e5 100644 --- a/q2_types/per_sample_sequences/tests/test_type.py +++ b/q2_types/per_sample_sequences/tests/test_type.py @@ -8,13 +8,22 @@ import unittest +from q2_types.bowtie2 import Bowtie2IndexDirFmt +from q2_types.feature_data import BLAST6 from q2_types.sample_data import SampleData from q2_types.per_sample_sequences import ( Sequences, SequencesWithQuality, PairedEndSequencesWithQuality, JoinedSequencesWithQuality, QIIME1DemuxDirFmt, SingleLanePerSampleSingleEndFastqDirFmt, - SingleLanePerSamplePairedEndFastqDirFmt + SingleLanePerSamplePairedEndFastqDirFmt, + MAGs, MultiMAGSequencesDirFmt, + Contigs, ContigSequencesDirFmt, + SingleBowtie2Index, MultiBowtie2Index, MultiBowtie2IndexDirFmt, BAMDirFmt, + MultiBAMDirFmt ) +from q2_types.per_sample_sequences._type import (AlignmentMap, + MultiAlignmentMap) +from q2_types.genome_data import SeedOrthologDirFmt from qiime2.plugin.testing import TestPluginBase @@ -57,6 +66,66 @@ def test_joined_sequences_with_quality_semantic_type_to_format_reg(self): SingleLanePerSampleSingleEndFastqDirFmt ) + def test_mags_semantic_type_registration(self): + self.assertRegisteredSemanticType(MAGs) + + def test_mags_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[MAGs], + MultiMAGSequencesDirFmt + ) + + def test_contigs_semantic_type_registration(self): + self.assertRegisteredSemanticType(Contigs) + + def test_contigs_semantic_type_to_format_registration(self): + 
self.assertSemanticTypeRegisteredToFormat( + SampleData[Contigs], + ContigSequencesDirFmt + ) + + def test_singlebowtie_semantic_type_registration(self): + self.assertRegisteredSemanticType(SingleBowtie2Index) + + def test_singlebowtie_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[SingleBowtie2Index], + Bowtie2IndexDirFmt + ) + + def test_multibowtie_index_semantic_type_registration(self): + self.assertRegisteredSemanticType(MultiBowtie2Index) + + def test_multibowtie_index_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[MultiBowtie2Index], + MultiBowtie2IndexDirFmt + ) + + def test_aln_map_semantic_type_registration(self): + self.assertRegisteredSemanticType(AlignmentMap) + + def test_aln_map_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[AlignmentMap], + BAMDirFmt + ) + + def test_multi_aln_map_semantic_type_registration(self): + self.assertRegisteredSemanticType(MultiAlignmentMap) + + def test_multi_aln_map_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[MultiAlignmentMap], + MultiBAMDirFmt + ) + + def test_sdb6_semantic_type_to_format_registration(self): + self.assertSemanticTypeRegisteredToFormat( + SampleData[BLAST6], + SeedOrthologDirFmt + ) + if __name__ == '__main__': unittest.main() diff --git a/q2_types/reference_db/__init__.py b/q2_types/reference_db/__init__.py new file mode 100644 index 00000000..41eeeaa4 --- /dev/null +++ b/q2_types/reference_db/__init__.py @@ -0,0 +1,28 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + + +from q2_types.reference_db._type import ( + ReferenceDB, Diamond, Eggnog, NCBITaxonomy, + EggnogProteinSequences +) + +from q2_types.reference_db._format import ( + EggnogRefDirFmt, + EggnogRefTextFileFmt, + EggnogRefBinFileFmt, + DiamondDatabaseFileFmt, + DiamondDatabaseDirFmt, + NCBITaxonomyDirFmt, + EggnogProteinSequencesDirFmt + ) + +__all__ = ['ReferenceDB', 'Diamond', 'Eggnog', 'DiamondDatabaseFileFmt', + 'DiamondDatabaseDirFmt', 'EggnogRefDirFmt', 'EggnogRefTextFileFmt', + 'EggnogRefBinFileFmt', 'NCBITaxonomyDirFmt', 'NCBITaxonomy', + 'EggnogProteinSequencesDirFmt', 'EggnogProteinSequences'] diff --git a/q2_types/reference_db/_format.py b/q2_types/reference_db/_format.py new file mode 100644 index 00000000..ad6697f2 --- /dev/null +++ b/q2_types/reference_db/_format.py @@ -0,0 +1,296 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + + +import gzip +import re +from qiime2.plugin import model +from qiime2.core.exceptions import ValidationError +from q2_types.plugin_setup import plugin +from q2_types.reference_db._type import ( + ReferenceDB, Eggnog, Diamond, NCBITaxonomy, + EggnogProteinSequences +) +from q2_types.feature_data import MixedCaseProteinFASTAFormat + + +class EggnogRefTextFileFmt(model.TextFileFormat): + _expected_columns = [ + '# Taxid', + 'Sci.Name', + 'Rank', + 'Named Lineage', + 'Taxid Lineage' + ] + _line_pattern = re.compile( + r'^\d+\t' # Taxid + r'([^\t]*\t)' # Sci.Name + r'(no rank|species|subspecies)\t' # Rank + r'([^\t]*\t)' # Named Lineage + r'\d+(,\d+)*$' # 'Taxid Lineage' + ) + + def _validate_1st_line(self, line): + fields = line.strip("\n").split("\t") + if len(fields) > 5: + raise ValidationError( + "Too many columns.\n" + "Expected columns:\n" + f"{self._expected_columns}\n" + "Columns given:\n" + f"{fields}" + ) + + if not ( + fields[0] == '# Taxid' and + fields[1] == 'Sci.Name' and + fields[2] == 'Rank' and + fields[3] == 'Named Lineage' and + fields[4] == 'Taxid Lineage' + ): + raise ValidationError( + "Wrong columns.\n" + "Expected columns:\n" + f"{self._expected_columns}\n" + "Columns given:\n" + f"{fields}" + ) + + def _validate_Nth_line(self, line, line_no): + if not self._line_pattern.match(line): + raise ValidationError( + f"Invalid line at line {line_no}:\n" + f"{line}" + ) + + def _validate_(self, level): + with open(str(self), "r") as file: + line_no = 0 + is_fist_line = True + + for line in file: + # Validate first line + if is_fist_line: + self._validate_1st_line(line) + line_no += 1 + is_fist_line = False + + # Validate N'th line + else: + self._validate_Nth_line(line, line_no) + line_no += 1 + + +class EggnogRefBinFileFmt(model.BinaryFileFormat): + def _validate_(self, level): + pass + + +plugin.register_formats(EggnogRefTextFileFmt, EggnogRefBinFileFmt) + + +class 
EggnogRefDirFmt(model.DirectoryFormat): + eggnog = model.FileCollection(r'eggnog.*db.*', + format=EggnogRefBinFileFmt) + + @eggnog.set_path_maker + def eggnog_path_maker(self, name): + return str(name) + + +plugin.register_formats(EggnogRefDirFmt) + +plugin.register_semantic_type_to_format( + ReferenceDB[Eggnog], + EggnogRefDirFmt) + + +class DiamondDatabaseFileFmt(model.BinaryFileFormat): + def _validate_(self, level): + # TODO: have native diamond validation run on db/self.path + pass + + +DiamondDatabaseDirFmt = model.SingleFileDirectoryFormat( + 'DiamondDatabaseDirFmt', 'ref_db.dmnd', DiamondDatabaseFileFmt) + +plugin.register_formats(DiamondDatabaseFileFmt, DiamondDatabaseDirFmt) +plugin.register_semantic_type_to_format(ReferenceDB[Diamond], + DiamondDatabaseDirFmt) + + +class NCBITaxonomyNodesFormat(model.TextFileFormat): + def _validate_n_records(self, n=None): + with open(str(self), "r") as fh: + file_ = enumerate(fh) if n is None else zip(range(n), fh) + + for i, line in file_: + line = line.rstrip("\n").split("\t|\t") + if 13 > len(line) or len(line) > 18: + raise ValidationError( + "NCBI taxonomy nodes file must have 13 columns, " + f"found {len(line)} columns on line {i + 1}." + ) + if not line[0].isnumeric() or not line[1].isnumeric(): + raise ValidationError( + "NCBI taxonomy nodes file must contain a numeric " + "taxonomy ID in the first two columns, found " + f"non-numeric value on line {i + 1}." + ) + for col in (5, 7, 9, 10, 11): + if not line[col].isnumeric() or \ + not int(line[col]) in (0, 1): + raise ValidationError( + "NCBI taxonomy nodes file must contain 0 or 1 " + "in columns 6, 8, 10, 11, and 12, found a " + f"non-allowed value on line {i + 1}, column " + f"{col + 1}: {line[col]}." 
+ ) + + def _validate_(self, level): + self._validate_n_records(n={"min": 10, "max": None}[level]) + + +class NCBITaxonomyNamesFormat(model.TextFileFormat): + def _validate_n_records(self, n=None): + with open(str(self), "r") as fh: + file_ = enumerate(fh) if n is None else zip(range(n), fh) + + for i, line in file_: + line = line.rstrip("\n").split("\t|\t") + if len(line) != 4: + raise ValidationError( + "NCBI taxonomy names file must have 4 columns, " + f"found {len(line)} columns on line {i + 1}." + ) + if not line[0].isnumeric(): + raise ValidationError( + "NCBI taxonomy name file must contain a numeric " + "taxonomy ID in the first column, found non-numeric " + f"value on line {i + 1}: {line[0]}." + ) + + def _validate_(self, level): + self._validate_n_records(n={"min": 10, "max": None}[level]) + + +class NCBITaxonomyBinaryFileFmt(model.BinaryFileFormat): + _accession_regex = re.compile( + r'[OPQ][0-9][A-Z0-9]{3}[0-9]|' # UniProt + r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}|' # UniProt + r'[A-Z]{3}\d{3,7}|' # EMBL-EBI + r'[A-Z]+[-._]?\d+' # NCBI + ) + _accession_version_regex = re.compile( + r'[OPQ][0-9][A-Z0-9]{3}[0-9]\.\d+|' + r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}\.\d+|' + r'[A-Z]{3}\d{3,7}\.\d+|' + r'[A-Z]+[-._]?\d+\.\d+' + ) + _taxid_regex = r'\d{1,10}' + _gi_regex = r'\d+' + _line_regex = re.compile( + rf"^({_accession_regex.pattern})\t" + rf"({_accession_version_regex.pattern})" + rf"\t({_taxid_regex})" + rf"\t({_gi_regex})\n$" + ) + + def _validate_1st_line(self, line: list): + if not ( + line[0] == "accession" and + line[1] == "accession.version" and + line[2] == "taxid" and + line[3] == "gi" + ): + raise ValidationError( + "NCBI prot.accession2taxid file must have " + "columns: 'accession', 'accession.version'" + f", 'taxid' and 'gi'. Got {line} instead." 
+ ) + + def _validate_Nth_line(self, line: list, line_no: int): + # For every filed validate one record + splitted_line = line.rstrip("\n").split(sep="\t") + + # Raise exception if the entry does not match pattern + if not re.match(self._line_regex, line): + raise ValidationError( + f"Non-allowed value found in line {line_no}.\n" + "Printing line:\n" + f"{splitted_line}" + ) + + def _validate_(self, level): + with gzip.open(str(self), 'rt') as file: + # Flag first line + is_first_line = True + line_no = 1 + + # Set the number of rows to be parsed + max_lines = {"min": 100, "max": 10000000}[level] + + for line in file: + # Check time + if line_no >= max_lines: + break + + # Get line and split it into fields + splitted_line = line.rstrip("\n").split(sep="\t") + + # Check that it is split in 4 + if len(splitted_line) != 4: + raise ValidationError( + "NCBI prot.accession2taxid file must have 4 columns, " + f"found {len(splitted_line)} columns in line " + f"{line_no}. \nPrinting line: \n{splitted_line}" + ) + + # Parse first line + if is_first_line: + self._validate_1st_line(splitted_line) + is_first_line = False + line_no += 1 + + # Parse Nth line + else: + self._validate_Nth_line(line, line_no) + line_no += 1 + + +plugin.register_formats( + NCBITaxonomyNodesFormat, NCBITaxonomyNamesFormat, NCBITaxonomyBinaryFileFmt + ) + + +class NCBITaxonomyDirFmt(model.DirectoryFormat): + node = model.File('nodes.dmp', format=NCBITaxonomyNodesFormat) + names = model.File('names.dmp', format=NCBITaxonomyNamesFormat) + tax_map = model.File( + 'prot.accession2taxid.gz', + format=NCBITaxonomyBinaryFileFmt + ) + + +plugin.register_formats(NCBITaxonomyDirFmt) + +plugin.register_semantic_type_to_format( + ReferenceDB[NCBITaxonomy], + NCBITaxonomyDirFmt) + + +class EggnogProteinSequencesDirFmt(model.DirectoryFormat): + taxid_info = model.File("e5.taxid_info.tsv", format=EggnogRefTextFileFmt) + proteins = model.File( + "e5.proteomes.faa", format=MixedCaseProteinFASTAFormat + ) + + 
+plugin.register_formats(EggnogProteinSequencesDirFmt) +plugin.register_semantic_type_to_format(ReferenceDB[EggnogProteinSequences], + EggnogProteinSequencesDirFmt) diff --git a/q2_types/reference_db/_type.py b/q2_types/reference_db/_type.py new file mode 100644 index 00000000..0f773a79 --- /dev/null +++ b/q2_types/reference_db/_type.py @@ -0,0 +1,25 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + + +from qiime2.plugin import SemanticType +from q2_types.plugin_setup import plugin + +ReferenceDB = SemanticType('ReferenceDB', field_names='type') +Diamond = SemanticType('Diamond', variant_of=ReferenceDB.field['type']) +Eggnog = SemanticType('Eggnog', variant_of=ReferenceDB.field['type']) +NCBITaxonomy = SemanticType( + 'NCBITaxonomy', variant_of=ReferenceDB.field['type'] + ) +EggnogProteinSequences = SemanticType( + 'EggnogProteinSequences', variant_of=ReferenceDB.field['type'] +) + +plugin.register_semantic_types( + ReferenceDB, Diamond, Eggnog, NCBITaxonomy, EggnogProteinSequences +) diff --git a/q2_types/reference_db/tests/__init__.py b/q2_types/reference_db/tests/__init__.py new file mode 100644 index 00000000..afcc05c2 --- /dev/null +++ b/q2_types/reference_db/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- diff --git a/q2_types/reference_db/tests/data/bad_dmnd_db/bad_name.dmnd b/q2_types/reference_db/tests/data/bad_dmnd_db/bad_name.dmnd new file mode 100644 index 00000000..42627cc3 --- /dev/null +++ b/q2_types/reference_db/tests/data/bad_dmnd_db/bad_name.dmnd @@ -0,0 +1,2862 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + eggnog-mapper/eggnog_proteins.dmnd at master · eggnogdb/eggnog-mapper + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+ + + + + + + +
+ + + + + + +
+ + + + + + + + + +
+ + + + + + + + + + + + + + + + + +
+ +
+ + + + eggnogdb  /   + eggnog-mapper  /   + +
+
+ + + +
+ + +
+
+ Clear Command Palette +
+
+ + + +
+
+ Tip: + Type # to search pull requests +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type # to search issues +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type # to search discussions +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type ! to search projects +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type @ to search teams +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type @ to search people and organizations +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type > to activate command mode +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Go to your accessibility settings to change your keyboard shortcuts +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type author:@me to search your content +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:pr to filter to pull requests +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:issue to filter to issues +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:project to filter to projects +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:open to filter to open content +
+
+ Type ? for help and tips +
+
+
+ +
+ +
+
+ We’ve encountered an error and some results aren't available at this time. Type a new search or try again later. +
+
+ + No results matched your search + + + + + + + + + + +
+ + + + + Search for issues and pull requests + + # + + + + Search for issues, pull requests, discussions, and projects + + # + + + + Search for organizations, repositories, and users + + @ + + + + Search for projects + + ! + + + + Search for files + + / + + + + Activate command mode + + > + + + + Search your issues, pull requests, and discussions + + # author:@me + + + + Search your issues, pull requests, and discussions + + # author:@me + + + + Filter to pull requests + + # is:pr + + + + Filter to issues + + # is:issue + + + + Filter to discussions + + # is:discussion + + + + Filter to projects + + # is:project + + + + Filter to open issues, pull requests, and discussions + + # is:open + + + + + + + + + + + + + + + + +
+
+
+ +
+ + + + + + + + + + +
+ + +
+
+
+ + + + + + + + + + + + + + +
+ +
+ +
+ +
+ + + + / + + eggnog-mapper + + + Public +
+ + +
+ +
    + +
  • + +
    +
    + Edit Pins + + +
    +
    +
    + Pin to… + +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
  • + + +
  • + +
    + + + + + + + Watch + + + 14 + + + +
    +
    +

    Notifications

    + +
    + +
    +
    + + + + + + + + +
    + + +
    + + + + + Get push notifications on iOS or Android. + +
    +
    +
    +
    + + + + +
    +
    +
    + + + +
  • + +
  • +
    +
    + Fork + 98 + Fork your own copy of eggnogdb/eggnog-mapper +
    +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
  • + +
  • + + +
    +
    +
    + + +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
    +
    +
    + +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
    +
  • + + + +
+ +
+ +
+
+ + + + +
+ + + + + + +
+ Open in github.dev + Open in a new github.dev tab + + + + + +
+ + +
+ + + + + + + +Permalink + +
+ +
+
+ + + master + + + + +
+
+
+ Switch branches/tags + +
+ + + +
+ +
+ +
+ + +
+ +
+ + + + + + + + + + + + + + + + + +
+ + +
+
+
+
+ +
+ +
+ + +
+ +
+
+
+

Name already in use

+
+
+ +
+
+
+
+ +
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch? +
+ +
+
+ + +
+
+ + + + Go to file + +
+ + + + +
+
+
+ + + + + + + + + +
+ +
+
+
 
+
+ +
+
 
+ Cannot retrieve contributors at this time +
+
+ + + + + + + + + + + + + +
+ +
+ + +
+ + 3.44 MB +
+ +
+ + + + +
+
+ +
+
+ +
+ +
+
+ + + +
+ +
+ View raw +

(Sorry about that, but we can’t show files that are this big right now.)

+
+
+ +
+ + + + +
+ + +
+ + +
+
+ + + +
+
+ Give feedback + + + +
+ +

Provide feedback

+
+
+ + + + + + + +
+
+
+
+
+ +
+ + +
+ +
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + + + +
+ +
+ + + diff --git a/q2_types/reference_db/tests/data/dmnd_db/ref_db.dmnd b/q2_types/reference_db/tests/data/dmnd_db/ref_db.dmnd new file mode 100644 index 00000000..42627cc3 --- /dev/null +++ b/q2_types/reference_db/tests/data/dmnd_db/ref_db.dmnd @@ -0,0 +1,2862 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + eggnog-mapper/eggnog_proteins.dmnd at master · eggnogdb/eggnog-mapper + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +
+ + + + + + + +
+ + + + + + +
+ + + + + + + + + +
+ + + + + + + + + + + + + + + + + +
+ +
+ + + + eggnogdb  /   + eggnog-mapper  /   + +
+
+ + + +
+ + +
+
+ Clear Command Palette +
+
+ + + +
+
+ Tip: + Type # to search pull requests +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type # to search issues +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type # to search discussions +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type ! to search projects +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type @ to search teams +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type @ to search people and organizations +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type > to activate command mode +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Go to your accessibility settings to change your keyboard shortcuts +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type author:@me to search your content +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:pr to filter to pull requests +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:issue to filter to issues +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:project to filter to projects +
+
+ Type ? for help and tips +
+
+
+ +
+
+ Tip: + Type is:open to filter to open content +
+
+ Type ? for help and tips +
+
+
+ +
+ +
+
+ We’ve encountered an error and some results aren't available at this time. Type a new search or try again later. +
+
+ + No results matched your search + + + + + + + + + + +
+ + + + + Search for issues and pull requests + + # + + + + Search for issues, pull requests, discussions, and projects + + # + + + + Search for organizations, repositories, and users + + @ + + + + Search for projects + + ! + + + + Search for files + + / + + + + Activate command mode + + > + + + + Search your issues, pull requests, and discussions + + # author:@me + + + + Search your issues, pull requests, and discussions + + # author:@me + + + + Filter to pull requests + + # is:pr + + + + Filter to issues + + # is:issue + + + + Filter to discussions + + # is:discussion + + + + Filter to projects + + # is:project + + + + Filter to open issues, pull requests, and discussions + + # is:open + + + + + + + + + + + + + + + + +
+
+
+ +
+ + + + + + + + + + +
+ + +
+
+
+ + + + + + + + + + + + + + +
+ +
+ +
+ +
+ + + + / + + eggnog-mapper + + + Public +
+ + +
+ +
    + +
  • + +
    +
    + Edit Pins + + +
    +
    +
    + Pin to… + +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
  • + + +
  • + +
    + + + + + + + Watch + + + 14 + + + +
    +
    +

    Notifications

    + +
    + +
    +
    + + + + + + + + +
    + + +
    + + + + + Get push notifications on iOS or Android. + +
    +
    +
    +
    + + + + +
    +
    +
    + + + +
  • + +
  • +
    +
    + Fork + 98 + Fork your own copy of eggnogdb/eggnog-mapper +
    +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
  • + +
  • + + +
    +
    +
    + + +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
    +
    +
    + +
    + + + +
    + +
    +
    + + + + + + + +
    + +
    +
    +
    +
    +
    +
  • + + + +
+ +
+ +
+
+ + + + +
+ + + + + + +
+ Open in github.dev + Open in a new github.dev tab + + + + + +
+ + +
+ + + + + + + +Permalink + +
+ +
+
+ + + master + + + + +
+
+
+ Switch branches/tags + +
+ + + +
+ +
+ +
+ + +
+ +
+ + + + + + + + + + + + + + + + + +
+ + +
+
+
+
+ +
+ +
+ + +
+ +
+
+
+

Name already in use

+
+
+ +
+
+
+
+ +
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch? +
+ +
+
+ + +
+
+ + + + Go to file + +
+ + + + +
+
+
+ + + + + + + + + +
+ +
+
+
 
+
+ +
+
 
+ Cannot retrieve contributors at this time +
+
+ + + + + + + + + + + + + +
+ +
+ + +
+ + 3.44 MB +
+ +
+ + + + +
+
+ +
+
+ +
+ +
+
+ + + +
+ +
+ View raw +

(Sorry about that, but we can’t show files that are this big right now.)

+
+
+ +
+ + + + +
+ + +
+ + +
+
+ + + +
+
+ Give feedback + + + +
+ +

Provide feedback

+
+
+ + + + + + + +
+
+
+
+
+ +
+ + +
+ +
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + + + +
+ +
+ + + diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.proteomes.faa b/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.proteomes.faa new file mode 100644 index 00000000..d575f943 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.proteomes.faa @@ -0,0 +1,6 @@ +>sequence1 +MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA +AAAQIR +>sequence2 +MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD +VASECEVKCMPTFQFFKKGQKVGEFSGAN* \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.taxid_info.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.taxid_info.tsv new file mode 100644 index 00000000..50816631 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax/e5.taxid_info.tsv @@ -0,0 +1,10 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 
700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_col.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_col.tsv new file mode 100644 index 00000000..2ffe0e32 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_col.tsv @@ -0,0 +1,10 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineagee +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular 
organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 
no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_rank.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_rank.tsv new file mode 100644 index 00000000..01feabc1 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_rank.tsv @@ -0,0 +1,10 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 subsubspecies root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus 
kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid.tsv new file mode 100644 index 00000000..f7675333 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid.tsv @@ -0,0 +1,10 @@ 
+# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014a Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 
1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid_lineage.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid_lineage.tsv new file mode 100644 index 00000000..669ccd83 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/invalid_taxid_lineage.tsv @@ -0,0 +1,10 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no 
rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393,s, \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/too_many_cols.tsv b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/too_many_cols.tsv new file mode 100644 index 
00000000..57edf286 --- /dev/null +++ b/q2_types/reference_db/tests/data/eggnog_seq_tax_bad/too_many_cols.tsv @@ -0,0 +1,10 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage Another columnn +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular 
organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 \ No newline at end of file diff --git a/q2_types/reference_db/tests/data/good_eggnog/eggnog.db b/q2_types/reference_db/tests/data/good_eggnog/eggnog.db new file mode 100644 index 00000000..9ed31352 Binary files /dev/null and b/q2_types/reference_db/tests/data/good_eggnog/eggnog.db differ diff --git a/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db b/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db new file mode 100644 index 00000000..136b907a Binary files /dev/null and b/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db differ diff --git a/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db.traverse.pkl b/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db.traverse.pkl new file mode 100644 index 00000000..e1e3d659 Binary files /dev/null and b/q2_types/reference_db/tests/data/good_eggnog/eggnog.taxa.db.traverse.pkl differ diff --git a/q2_types/reference_db/tests/data/ncbi/db-valid/names.dmp b/q2_types/reference_db/tests/data/ncbi/db-valid/names.dmp new file mode 100644 index 00000000..b89e8a2b --- /dev/null +++ 
b/q2_types/reference_db/tests/data/ncbi/db-valid/names.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_types/reference_db/tests/data/ncbi/db-valid/nodes.dmp b/q2_types/reference_db/tests/data/ncbi/db-valid/nodes.dmp new file mode 100644 index 00000000..61a662a0 --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/db-valid/nodes.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/reference_db/tests/data/ncbi/db-valid/prot.accession2taxid.gz b/q2_types/reference_db/tests/data/ncbi/db-valid/prot.accession2taxid.gz new file mode 100644 index 00000000..8b78ac0f Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/db-valid/prot.accession2taxid.gz differ diff --git a/q2_types/reference_db/tests/data/ncbi/names-non-numeric.dmp b/q2_types/reference_db/tests/data/ncbi/names-non-numeric.dmp new file mode 100644 index 00000000..a536c76d --- /dev/null +++ 
b/q2_types/reference_db/tests/data/ncbi/names-non-numeric.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +x | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_types/reference_db/tests/data/ncbi/names-ok.dmp b/q2_types/reference_db/tests/data/ncbi/names-ok.dmp new file mode 100644 index 00000000..b89e8a2b --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/names-ok.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_types/reference_db/tests/data/ncbi/names-wrong-cols.dmp b/q2_types/reference_db/tests/data/ncbi/names-wrong-cols.dmp new file mode 100644 index 00000000..a5dd6198 --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/names-wrong-cols.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric-other.dmp b/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric-other.dmp new file mode 
100644 index 00000000..a28dca11 --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric-other.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | x | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric.dmp b/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric.dmp new file mode 100644 index 00000000..1cec0fb5 --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/nodes-non-numeric.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | x | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/reference_db/tests/data/ncbi/nodes-ok.dmp b/q2_types/reference_db/tests/data/ncbi/nodes-ok.dmp new file mode 100644 index 00000000..61a662a0 --- /dev/null +++ 
b/q2_types/reference_db/tests/data/ncbi/nodes-ok.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/reference_db/tests/data/ncbi/nodes-wrong-cols.dmp b/q2_types/reference_db/tests/data/ncbi/nodes-wrong-cols.dmp new file mode 100644 index 00000000..b6a94556 --- /dev/null +++ b/q2_types/reference_db/tests/data/ncbi/nodes-wrong-cols.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_types/reference_db/tests/data/ncbi/too_many_cols.gz b/q2_types/reference_db/tests/data/ncbi/too_many_cols.gz new file mode 100644 index 00000000..4575c613 Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/too_many_cols.gz differ diff --git 
a/q2_types/reference_db/tests/data/ncbi/wrong_accession.gz b/q2_types/reference_db/tests/data/ncbi/wrong_accession.gz new file mode 100644 index 00000000..12e8d8ca Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/wrong_accession.gz differ diff --git a/q2_types/reference_db/tests/data/ncbi/wrong_accession_version.gz b/q2_types/reference_db/tests/data/ncbi/wrong_accession_version.gz new file mode 100644 index 00000000..e8bb0f5b Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/wrong_accession_version.gz differ diff --git a/q2_types/reference_db/tests/data/ncbi/wrong_col.gz b/q2_types/reference_db/tests/data/ncbi/wrong_col.gz new file mode 100644 index 00000000..784fee98 Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/wrong_col.gz differ diff --git a/q2_types/reference_db/tests/data/ncbi/wrong_gi.gz b/q2_types/reference_db/tests/data/ncbi/wrong_gi.gz new file mode 100644 index 00000000..34a6c314 Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/wrong_gi.gz differ diff --git a/q2_types/reference_db/tests/data/ncbi/wrong_taxid.gz b/q2_types/reference_db/tests/data/ncbi/wrong_taxid.gz new file mode 100644 index 00000000..37d7dbde Binary files /dev/null and b/q2_types/reference_db/tests/data/ncbi/wrong_taxid.gz differ diff --git a/q2_types/reference_db/tests/data/single_eggnog/eggnog.db b/q2_types/reference_db/tests/data/single_eggnog/eggnog.db new file mode 100644 index 00000000..9ed31352 Binary files /dev/null and b/q2_types/reference_db/tests/data/single_eggnog/eggnog.db differ diff --git a/q2_types/reference_db/tests/test_format.py b/q2_types/reference_db/tests/test_format.py new file mode 100644 index 00000000..58b91491 --- /dev/null +++ b/q2_types/reference_db/tests/test_format.py @@ -0,0 +1,268 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. 
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+from qiime2.plugin.testing import TestPluginBase
+from q2_types.reference_db._format import (
+    DiamondDatabaseFileFmt, DiamondDatabaseDirFmt, EggnogRefBinFileFmt,
+    EggnogRefDirFmt, NCBITaxonomyNamesFormat, NCBITaxonomyNodesFormat,
+    NCBITaxonomyDirFmt, NCBITaxonomyBinaryFileFmt,
+    EggnogProteinSequencesDirFmt, EggnogRefTextFileFmt
+    )
+from qiime2.plugin import ValidationError
+
+
+class TestRefFormats(TestPluginBase):
+    package = 'q2_types.reference_db.tests'
+
+    def test_dmnd_ff(self):
+        dmd_obj = DiamondDatabaseFileFmt(
+                self.get_data_path('dmnd_db/ref_db.dmnd'),
+                mode='r'
+                )
+
+        dmd_obj.validate()
+
+    def test_dmnd_df(self):
+        dmnd_obj = DiamondDatabaseDirFmt(
+                self.get_data_path('dmnd_db'),
+                mode='r'
+                )
+
+        dmnd_obj.validate()
+
+    def test_dmnd_dir_fmt_fails_bad_name(self):
+        dmnd_obj = DiamondDatabaseDirFmt(
+
+                self.get_data_path('bad_dmnd_db'),
+                mode='r'
+                )
+        with self.assertRaisesRegex(
+                ValidationError,
+                "Missing one or more files for DiamondDatabaseDirFmt"):
+            dmnd_obj.validate()
+
+    def test_eggnog_ref_bin_main(self):
+        dirpath = self.get_data_path('good_eggnog/eggnog.db')
+        fmt_obj = EggnogRefBinFileFmt(dirpath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_eggnog_ref_bin_pickle(self):
+        dirpath = self.get_data_path('good_eggnog/eggnog.taxa.db.traverse.pkl')
+        fmt_obj = EggnogRefBinFileFmt(dirpath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_eggnog_ref_bin_taxa(self):
+        dirpath = self.get_data_path('good_eggnog/eggnog.taxa.db')
+        fmt_obj = EggnogRefBinFileFmt(dirpath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_eggnog_dir_fmt_all_files(self):
+        dirpath = self.get_data_path('good_eggnog')
+        fmt_obj = EggnogRefDirFmt(dirpath, mode='r')
+
+        self.assertEqual(
+            len([(relpath, obj) for relpath, obj
+                 in fmt_obj.eggnog.iter_views(EggnogRefBinFileFmt)]),
+            3)
+
+    def test_eggnog_dir_fmt_single_file(self):
+        dirpath = self.get_data_path('single_eggnog')
+        fmt_obj = EggnogRefDirFmt(dirpath, mode='r')
+
+        self.assertEqual(
+            len([(relpath, obj) for relpath, obj
+                 in fmt_obj.eggnog.iter_views(EggnogRefBinFileFmt)]),
+            1)
+
+        fmt_obj.validate()
+
+    def test_eggnog_dir_fmt(self):
+        dirpath = self.get_data_path('good_eggnog')
+        fmt_obj = EggnogRefDirFmt(dirpath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_eggnog_sequence_taxa_dir_fmt(self):
+        dirpath = self.get_data_path('eggnog_seq_tax')
+        fmt_obj = EggnogProteinSequencesDirFmt(dirpath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_valid(self):
+        filepath = self.get_data_path('eggnog_seq_tax/e5.taxid_info.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_invalid_col(self):
+        filepath = self.get_data_path('eggnog_seq_tax_bad/invalid_col.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                r"Wrong columns"
+        ):
+            fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_too_many_cols(self):
+        filepath = self.get_data_path('eggnog_seq_tax_bad/too_many_cols.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                r"Too many columns."
+        ):
+            fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_invalid_rank(self):
+        filepath = self.get_data_path('eggnog_seq_tax_bad/invalid_rank.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                r"Invalid line at line 3:"
+        ):
+            fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_invalid_taxid(self):
+        filepath = self.get_data_path('eggnog_seq_tax_bad/invalid_taxid.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                r"Invalid line at line 4"
+        ):
+            fmt_obj.validate()
+
+    def test_EggnogRefTextFileFmt_invalid_taxid_lineage(self):
+        filepath = self.get_data_path(
+            'eggnog_seq_tax_bad/invalid_taxid_lineage.tsv')
+        fmt_obj = EggnogRefTextFileFmt(filepath, mode='r')
+
+        with self.assertRaisesRegex(
+                ValidationError,
+                r"Invalid line at line 9"
+        ):
+            fmt_obj.validate()
+
+
+class TestNCBIFormats(TestPluginBase):
+    package = "q2_types.reference_db.tests"
+
+    def test_ncbi_tax_names_dmp_ok(self):
+        fp = self.get_data_path("ncbi/names-ok.dmp")
+        fmt_obj = NCBITaxonomyNamesFormat(fp, "r")
+        fmt_obj.validate()
+
+    def test_ncbi_tax_names_dmp_too_few_cols(self):
+        fp = self.get_data_path("ncbi/names-wrong-cols.dmp")
+        fmt_obj = NCBITaxonomyNamesFormat(fp, "r")
+        with self.assertRaisesRegex(
+            ValidationError, r"found 3 columns on line 2."
+        ):
+            fmt_obj.validate()
+
+    def test_ncbi_tax_names_dmp_nonnumeric(self):
+        fp = self.get_data_path("ncbi/names-non-numeric.dmp")
+        fmt_obj = NCBITaxonomyNamesFormat(fp, "r")
+        with self.assertRaisesRegex(
+            ValidationError, r"value on line 3: x."
+        ):
+            fmt_obj.validate()
+
+    def test_ncbi_tax_nodes_dmp_ok(self):
+        fp = self.get_data_path("ncbi/nodes-ok.dmp")
+        fmt_obj = NCBITaxonomyNodesFormat(fp, "r")
+        fmt_obj.validate()
+
+    def test_ncbi_tax_nodes_dmp_too_few_cols(self):
+        fp = self.get_data_path("ncbi/nodes-wrong-cols.dmp")
+        fmt_obj = NCBITaxonomyNodesFormat(fp, "r")
+        with self.assertRaisesRegex(
+            ValidationError, r"found 12 columns on line 2."
+        ):
+            fmt_obj.validate()
+
+    def test_ncbi_tax_nodes_dmp_nonnumeric_id(self):
+        fp = self.get_data_path("ncbi/nodes-non-numeric.dmp")
+        fmt_obj = NCBITaxonomyNodesFormat(fp, "r")
+        with self.assertRaisesRegex(ValidationError, r"value on line 3."):
+            fmt_obj.validate()
+
+    def test_ncbi_tax_nodes_dmp_nonnumeric_other(self):
+        fp = self.get_data_path("ncbi/nodes-non-numeric-other.dmp")
+        fmt_obj = NCBITaxonomyNodesFormat(fp, "r")
+        with self.assertRaisesRegex(ValidationError, r"line 2, column 6: x."):
+            fmt_obj.validate()
+
+    def test_ncbi_taxonomy_dir_fmt(self):
+        dirpath = self.get_data_path("ncbi/db-valid")
+        fmt_obj = NCBITaxonomyDirFmt(dirpath, mode="r")
+        fmt_obj.validate()
+
+    def test_binary_file_fmt_positive(self):
+        dirpath = self.get_data_path("ncbi/db-valid/prot.accession2taxid.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        fmt_obj.validate()
+
+    def test_binary_file_fmt_wrong_col(self):
+        dirpath = self.get_data_path("ncbi/wrong_col.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['accession', 'accession_version', 'taxid', 'gi'\]"
+        ):
+            fmt_obj.validate()
+
+    def test_binary_file_fmt_extra_col(self):
+        dirpath = self.get_data_path("ncbi/too_many_cols.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['accession', 'accession.version', "
+            r"'taxid', 'gi', 'something_else'\]"
+        ):
+            fmt_obj.validate()
+
+    def test_binary_file_fmt_wrong_accession(self):
+        dirpath = self.get_data_path("ncbi/wrong_accession.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['P1ABC1234', 'A0A009IHW8.1', '1310613', '1835922267'\]"
+        ):
+            fmt_obj.validate()
+
+    def test_binary_file_fmt_wrong_accession_version(self):
+        dirpath = self.get_data_path("ncbi/wrong_accession_version.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['A0A009IHW8', 'A0A009IHW8.1a', '1310613', '1835922267'\]"
+        ):
+            fmt_obj.validate()
+
+    def test_binary_file_fmt_wrong_taxid(self):
+        dirpath = self.get_data_path("ncbi/wrong_taxid.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['A0A009IHW8', 'A0A009IHW8.1', '1310613a', '1835922267'\]"
+        ):
+            fmt_obj.validate()
+
+    def test_binary_file_fmt_wrong_gi(self):
+        dirpath = self.get_data_path("ncbi/wrong_gi.gz")
+        fmt_obj = NCBITaxonomyBinaryFileFmt(dirpath, mode="r")
+        with self.assertRaisesRegex(
+            ValidationError,
+            r"\['A0A009IHW8', 'A0A009IHW8.1', '1310613', '1835922267s'\]"
+        ):
+            fmt_obj.validate()
diff --git a/q2_types/reference_db/tests/test_type.py b/q2_types/reference_db/tests/test_type.py
new file mode 100644
index 00000000..01d3a44e
--- /dev/null
+++ b/q2_types/reference_db/tests/test_type.py
@@ -0,0 +1,56 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_types.reference_db._format import (
+    DiamondDatabaseDirFmt, EggnogRefDirFmt, NCBITaxonomyDirFmt,
+    EggnogProteinSequencesDirFmt
+)
+from q2_types.reference_db._type import (
+    ReferenceDB, Diamond, Eggnog, NCBITaxonomy, EggnogProteinSequences
+)
+
+
+class TestReferenceType(TestPluginBase):
+    package = 'q2_types.reference_db.tests'
+
+    def test_ref_db_registration(self):
+        self.assertRegisteredSemanticType(ReferenceDB)
+
+    def test_diamond_registration(self):
+        self.assertRegisteredSemanticType(Diamond)
+
+    def test_diamond_semantic_type_registered_to_dmnd_db_dir_fmt(self):
+        self.assertSemanticTypeRegisteredToFormat(
+            ReferenceDB[Diamond],
+            DiamondDatabaseDirFmt)
+
+    def test_eggnog_registration(self):
+        self.assertRegisteredSemanticType(Eggnog)
+
+    def test_eggnog_semantic_type_registered_to_eggnog_dir_fmt(self):
+        self.assertSemanticTypeRegisteredToFormat(
+            ReferenceDB[Eggnog],
+            EggnogRefDirFmt)
+
+    def test_ncbi_registration(self):
+        self.assertRegisteredSemanticType(NCBITaxonomy)
+
+    def test_ncbi_semantic_type_registered_to_ncbi_dir_fmt(self):
+        self.assertSemanticTypeRegisteredToFormat(
+            ReferenceDB[NCBITaxonomy],
+            NCBITaxonomyDirFmt)
+
+    def test_EggnogProteinSequences_registration(self):
+        self.assertRegisteredSemanticType(EggnogProteinSequences)
+
+    def test_EggnogProteinSequences_semantic_type_registered_to_DirFmt(self):
+        self.assertSemanticTypeRegisteredToFormat(
+            ReferenceDB[EggnogProteinSequences],
+            EggnogProteinSequencesDirFmt)
diff --git a/setup.py b/setup.py index 16b8006c..9a3617bd 100644 --- a/setup.py +++ b/setup.py @@ -45,9 +45,42 @@ 'data/qiime1-demux-format/*', 'data/single-end-two-sample-data1/*', 'data/single-end-two-sample-data2/*', - 'data/single-end-two-sample-data3/*'], + 'data/single-end-two-sample-data3/*', + 'data/mags/*/*', 'data/mags/*/*/*', + 'data/manifests/*',
'data/contigs/*', + 'data/diamond_hit/*', + 'data/bowtie/*/*', 'data/bowtie/*/*/*/*', 'data/bowtie/*/*/*'], 'q2_types.sample_data.tests': ['data/*'], - 'q2_types.tree.tests': ['data/*'] + 'q2_types.tree.tests': ['data/*'], + 'q2_types.feature_data_mag.tests': + ['data/*', 'data/*/*', + 'data/mags-fa/*', 'data/mags-fasta/*'], + 'q2_types.genome_data.tests': + ['data/*/', 'data/genes-with-prefix/*', + 'data/genes-with-suffix/*', 'data/genes-with-wrong-prefix/*', + 'data/loci-invalid/*', 'data/loci-with-prefix/*', + 'data/loci-with-suffix/*', 'data/loci-with-wrong-prefix/*', + 'data/ortholog/*', 'data/proteins-with-suffix/*', + 'data/proteins-with-prefix/*', + 'data/proteins-with-wrong-prefix/*', + ], + 'q2_types.kraken2.tests': [ + 'data/*', + 'data/kraken2-db/*', + 'data/bracken-db/*', + 'data/outputs-single/*', + 'data/outputs-reads/*/*', + 'data/outputs-contigs/*', + 'data/outputs-mags/*/*', + 'data/reports-single/*', + 'data/reports-reads/*/*', + 'data/reports-mags/*/*', + 'data/db-reports/**/*' + ], + 'q2_types.kaiju.tests': + ['data/*', 'data/db-valid/*'], + 'q2_types.reference_db.tests': + ['data/*', 'data/*/*', 'data/*/*/*'], }, zip_safe=False, )