Skip to content

Commit

Permalink
BUG: kraken2-to-features errors on infra-clades as terminal taxons in…
Browse files Browse the repository at this point in the history
… some situations (#237)

Co-authored-by: Colin Wood <[email protected]>
  • Loading branch information
cherman2 and colinvwood authored Feb 7, 2025
1 parent 39373a2 commit 37d04de
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 0 deletions.
14 changes: 14 additions & 0 deletions q2_annotate/kraken2/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,14 @@ def _combine_ncbi_trees(trees):
continue # for clarity
else:
parents = list(tip.ancestors())[:-1] # ignore unnamed root

# Check whether the tip is an infra-clade (infra-clades have length 0).
# If it is, add the tip itself to the front of the ancestor list.
# This mimics what happens when the node isn't an infra-clade,
# i.e. the node has an id_node, and the node itself then ends up in
# the list of ancestors when you call .parent on that id_node.
if tip.length == 0:
parents.insert(0, tip)
matching = full_tree
subtree_inserted = False
while parents and not subtree_inserted:
Expand All @@ -243,6 +251,12 @@ def _combine_ncbi_trees(trees):
break
if not ancestor_found:
matching.append(node)
# This may be overkill, but check that the tip is an
# infra-clade (tip.length == 0) and has no children.
# If both are true, add this tip to the tip cache.
if len(tip.children) == 0 and tip.length == 0:
tip_cache[tip.name] = tip
for t in node.tips():
tip_cache[t.name] = t
assert tip.name in tip_cache
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
100 9332144 0 R 1 root
96.08 8966446 0 R1 131567 cellular organisms
5.3 494861 0 D 2759 Eukaryota
4.81 448908 0 D1 33154 Opisthokonta
4.54 423718 0 K 33208 Metazoa
4.53 422551 0 K1 6072 Eumetazoa
4.5 419698 0 K2 33213 Bilateria
3.42 318923 0 K3 33317 Protostomia
3.1 289656 0 K4 1206794 Ecdysozoa
3.07 286854 0 K5 88770 Panarthropoda
3.07 286847 0 P 6656 Arthropoda
3.01 281163 0 P1 197563 Mandibulata
3.01 281054 0 P2 197562 Pancrustacea
2.94 274151 0 P3 6960 Hexapoda
2.93 273688 0 C 50557 Insecta
2.93 273688 0 C1 85512 Dicondylia
2.93 273688 0 C2 7496 Pterygota
2.92 272146 0 C3 33340 Neoptera
0.06 5786 0 C4 33342 Paraneoptera
0.06 5578 0 O 7524 Hemiptera
0.02 2099 0 O1 33343 Prosorrhyncha
0.02 2099 0 O2 33345 Heteroptera
0.02 2099 0 O3 33347 Euheteroptera
0.02 2099 0 O4 33349 Neoheteroptera
0.02 1673 0 O5 33351 Panheteroptera
0.01 962 0 O6 33354 Cimicomorpha
0.01 935 0 O7 33355 Cimicoidea
0.01 927 0 F 30083 Miridae
0 228 0 F1 236635 Phylinae
0 228 0 F2 236648 Pilophorini
0 228 0 G 237084 Pilophorus
0.27 25092 0 K 4751 Fungi
0.24 22558 0 K1 451864 Dikarya
0.22 20530 0 P 4890 Ascomycota
0.22 20419 0 P1 716545 saccharomyceta
0.2 18909 0 P2 147538 Pezizomycotina
0.2 18778 0 P3 716546 leotiomyceta
0 58 0 C 147547 Lecanoromycetes
0 58 0 C1 1520881 OSLEUM clade
0 58 0 C2 388435 Lecanoromycetidae
0 58 0 O 5197 Lecanorales
0 58 0 O1 157822 Lecanorineae
0 53 0 F 5198 Cladoniaceae
0 53 0 G 5199 Cladonia
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
100 9332144 0 R 1 root
96.08 8966446 0 R1 131567 cellular organisms
5.3 494861 0 D 2759 Eukaryota
4.81 448908 0 D1 33154 Opisthokonta
4.54 423718 0 K 33208 Metazoa
4.53 422551 0 K1 6072 Eumetazoa
4.5 419698 0 K2 33213 Bilateria
3.42 318923 0 K3 33317 Protostomia
3.1 289656 0 K4 1206794 Ecdysozoa
3.07 286854 0 K5 88770 Panarthropoda
3.07 286847 0 P 6656 Arthropoda
3.01 281163 0 P1 197563 Mandibulata
3.01 281054 0 P2 197562 Pancrustacea
2.94 274151 0 P3 6960 Hexapoda
2.93 273688 0 C 50557 Insecta
2.93 273688 0 C1 85512 Dicondylia
2.93 273688 0 C2 7496 Pterygota
2.92 272146 0 C3 33340 Neoptera
0.06 5786 0 C4 33342 Paraneoptera
0.06 5578 0 O 7524 Hemiptera
0.02 2099 0 O1 33343 Prosorrhyncha
0.02 2099 0 O2 33345 Heteroptera
0.02 2099 0 O3 33347 Euheteroptera
0.02 2099 0 O4 33349 Neoheteroptera
0.02 1673 0 O5 33351 Panheteroptera
0.01 962 0 O6 33354 Cimicomorpha
0.01 935 0 O7 33355 Cimicoidea
0.01 927 0 F 30083 Miridae
0 228 0 F1 236635 Phylinae
0 228 0 F2 236648 Pilophorini
0 228 0 G 237084 Pilophorus
0.27 25092 0 K 4751 Fungi
0.24 22558 0 K1 451864 Dikarya
0.22 20530 0 P 4890 Ascomycota
0.22 20419 0 P1 716545 saccharomyceta
0.2 18909 0 P2 147538 Pezizomycotina
0.2 18778 0 P3 716546 leotiomyceta
0 58 0 C 147547 Lecanoromycetes
0 58 0 C1 1520881 OSLEUM clade
0 58 0 C2 388435 Lecanoromycetidae
0 58 0 O 5197 Lecanorales
0 58 0 O1 157822 Lecanorineae
0 53 0 F 5198 Cladoniaceae
0 53 0 G 5199 Cladonia
0 53 0 G1 51991 Cladonia 2
12 changes: 12 additions & 0 deletions q2_annotate/kraken2/tests/test_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,18 @@ def setUp(self):
def tearDown(self):
    """Remove the directory tree at ``self.temp_dir`` after each test."""
    shutil.rmtree(self.temp_dir)

def test_kraken2_to_features_infra_clade(self):
    """Check that expected tip taxa survive ``kraken2_to_features``.

    Loads the reports under the ``infra-clade/`` test data directory,
    converts them with a coverage threshold of 0.0, and verifies that
    each expected taxon ID is present in both outputs.
    """
    report_dir = self.get_data_path("infra-clade/")
    reports = Kraken2ReportDirectoryFormat(report_dir, "r")

    obs_table, obs_taxonomy = kraken2_to_features(
        reports, coverage_threshold=0.0
    )

    # Every expected taxon (tip of the tree) must show up in the
    # taxonomy index and in the feature table columns.
    for taxon_id in ('237084', '5199'):
        assert taxon_id in obs_taxonomy.index
        assert taxon_id in obs_table.columns

def test_kraken2_to_features_duplicated_genus(self):
reports = Kraken2ReportDirectoryFormat(
self.get_data_path("duplicated-genus/"), "r"
Expand Down

0 comments on commit 37d04de

Please sign in to comment.