diff --git a/q2_annotate/kraken2/select.py b/q2_annotate/kraken2/select.py index 2cd21b46..4c5f9273 100644 --- a/q2_annotate/kraken2/select.py +++ b/q2_annotate/kraken2/select.py @@ -230,6 +230,14 @@ def _combine_ncbi_trees(trees): continue # for clarity else: parents = list(tip.ancestors())[:-1] # ignore unnamed root + + # Check if the node is an infra-clade (infra-clades have length 0). + # If it is an infra-clade, add self to the ancestor list. + # This mimics what happens if the node isn't an infra-clade, + # i.e. the node had an id_node and then self gets added to the + # list of ancestors if you call .parent on an id_node. + if tip.length == 0: + parents.insert(0, tip) matching = full_tree subtree_inserted = False while parents and not subtree_inserted: @@ -243,6 +251,12 @@ def _combine_ncbi_trees(trees): break if not ancestor_found: matching.append(node) + # This may be overkill, but this checks to make sure + # that the tip is an infra-clade (tip.length == 0) + # and doesn't have children. If both of these are + # true, then add this tip to the tip cache.
+ if len(tip.children) == 0 and tip.length == 0: + tip_cache[tip.name] = tip for t in node.tips(): tip_cache[t.name] = t assert tip.name in tip_cache diff --git a/q2_annotate/kraken2/tests/data/infra-clade/first-wo-infra-clade.report.txt b/q2_annotate/kraken2/tests/data/infra-clade/first-wo-infra-clade.report.txt new file mode 100644 index 00000000..ce84fb0f --- /dev/null +++ b/q2_annotate/kraken2/tests/data/infra-clade/first-wo-infra-clade.report.txt @@ -0,0 +1,44 @@ +100 9332144 0 R 1 root +96.08 8966446 0 R1 131567 cellular organisms +5.3 494861 0 D 2759 Eukaryota +4.81 448908 0 D1 33154 Opisthokonta +4.54 423718 0 K 33208 Metazoa +4.53 422551 0 K1 6072 Eumetazoa +4.5 419698 0 K2 33213 Bilateria +3.42 318923 0 K3 33317 Protostomia +3.1 289656 0 K4 1206794 Ecdysozoa +3.07 286854 0 K5 88770 Panarthropoda +3.07 286847 0 P 6656 Arthropoda +3.01 281163 0 P1 197563 Mandibulata +3.01 281054 0 P2 197562 Pancrustacea +2.94 274151 0 P3 6960 Hexapoda +2.93 273688 0 C 50557 Insecta +2.93 273688 0 C1 85512 Dicondylia +2.93 273688 0 C2 7496 Pterygota +2.92 272146 0 C3 33340 Neoptera +0.06 5786 0 C4 33342 Paraneoptera +0.06 5578 0 O 7524 Hemiptera +0.02 2099 0 O1 33343 Prosorrhyncha +0.02 2099 0 O2 33345 Heteroptera +0.02 2099 0 O3 33347 Euheteroptera +0.02 2099 0 O4 33349 Neoheteroptera +0.02 1673 0 O5 33351 Panheteroptera +0.01 962 0 O6 33354 Cimicomorpha +0.01 935 0 O7 33355 Cimicoidea +0.01 927 0 F 30083 Miridae +0 228 0 F1 236635 Phylinae +0 228 0 F2 236648 Pilophorini +0 228 0 G 237084 Pilophorus +0.27 25092 0 K 4751 Fungi +0.24 22558 0 K1 451864 Dikarya +0.22 20530 0 P 4890 Ascomycota +0.22 20419 0 P1 716545 saccharomyceta +0.2 18909 0 P2 147538 Pezizomycotina +0.2 18778 0 P3 716546 leotiomyceta +0 58 0 C 147547 Lecanoromycetes +0 58 0 C1 1520881 OSLEUM clade +0 58 0 C2 388435 Lecanoromycetidae +0 58 0 O 5197 Lecanorales +0 58 0 O1 157822 Lecanorineae +0 53 0 F 5198 Cladoniaceae +0 53 0 G 5199 Cladonia \ No newline at end of file diff --git 
a/q2_annotate/kraken2/tests/data/infra-clade/second-with-infra-clade.report.txt b/q2_annotate/kraken2/tests/data/infra-clade/second-with-infra-clade.report.txt new file mode 100644 index 00000000..89beb1a0 --- /dev/null +++ b/q2_annotate/kraken2/tests/data/infra-clade/second-with-infra-clade.report.txt @@ -0,0 +1,45 @@ +100 9332144 0 R 1 root +96.08 8966446 0 R1 131567 cellular organisms +5.3 494861 0 D 2759 Eukaryota +4.81 448908 0 D1 33154 Opisthokonta +4.54 423718 0 K 33208 Metazoa +4.53 422551 0 K1 6072 Eumetazoa +4.5 419698 0 K2 33213 Bilateria +3.42 318923 0 K3 33317 Protostomia +3.1 289656 0 K4 1206794 Ecdysozoa +3.07 286854 0 K5 88770 Panarthropoda +3.07 286847 0 P 6656 Arthropoda +3.01 281163 0 P1 197563 Mandibulata +3.01 281054 0 P2 197562 Pancrustacea +2.94 274151 0 P3 6960 Hexapoda +2.93 273688 0 C 50557 Insecta +2.93 273688 0 C1 85512 Dicondylia +2.93 273688 0 C2 7496 Pterygota +2.92 272146 0 C3 33340 Neoptera +0.06 5786 0 C4 33342 Paraneoptera +0.06 5578 0 O 7524 Hemiptera +0.02 2099 0 O1 33343 Prosorrhyncha +0.02 2099 0 O2 33345 Heteroptera +0.02 2099 0 O3 33347 Euheteroptera +0.02 2099 0 O4 33349 Neoheteroptera +0.02 1673 0 O5 33351 Panheteroptera +0.01 962 0 O6 33354 Cimicomorpha +0.01 935 0 O7 33355 Cimicoidea +0.01 927 0 F 30083 Miridae +0 228 0 F1 236635 Phylinae +0 228 0 F2 236648 Pilophorini +0 228 0 G 237084 Pilophorus +0.27 25092 0 K 4751 Fungi +0.24 22558 0 K1 451864 Dikarya +0.22 20530 0 P 4890 Ascomycota +0.22 20419 0 P1 716545 saccharomyceta +0.2 18909 0 P2 147538 Pezizomycotina +0.2 18778 0 P3 716546 leotiomyceta +0 58 0 C 147547 Lecanoromycetes +0 58 0 C1 1520881 OSLEUM clade +0 58 0 C2 388435 Lecanoromycetidae +0 58 0 O 5197 Lecanorales +0 58 0 O1 157822 Lecanorineae +0 53 0 F 5198 Cladoniaceae +0 53 0 G 5199 Cladonia +0 53 0 G1 51991 Cladonia 2 diff --git a/q2_annotate/kraken2/tests/test_selection.py b/q2_annotate/kraken2/tests/test_selection.py index b1e87896..bcadcdf7 100644 --- a/q2_annotate/kraken2/tests/test_selection.py +++ 
b/q2_annotate/kraken2/tests/test_selection.py @@ -86,6 +86,18 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.temp_dir) + def test_kraken2_to_features_infra_clade(self): + reports = Kraken2ReportDirectoryFormat( + self.get_data_path("infra-clade/"), "r" + ) + obs_table, obs_taxonomy = kraken2_to_features( + reports, coverage_threshold=0.0) + # Check that the expected taxa (tips of the tree) are in the taxonomy/table + assert '237084' in obs_taxonomy.index + assert '237084' in obs_table.columns + assert '5199' in obs_taxonomy.index + assert '5199' in obs_table.columns + def test_kraken2_to_features_duplicated_genus(self): reports = Kraken2ReportDirectoryFormat( self.get_data_path("duplicated-genus/"), "r"