diff --git a/sc2ts/info.py b/sc2ts/info.py index f30057c..f9b44b5 100644 --- a/sc2ts/info.py +++ b/sc2ts/info.py @@ -54,7 +54,7 @@ class LineageDetails: "20H", "Beta", "2020-09", - None, # TODO: From which source? + None, # TODO: From which source? ), LineageDetails( "B.1.617.2", @@ -74,7 +74,7 @@ class LineageDetails: "20J", "Gamma", "2020-12", - None, # TODO: From which source? + None, # TODO: From which source? ), LineageDetails( "BA.1", @@ -274,7 +274,9 @@ def get_num_muts(ts): assert np.min(tree_nodes_preorder) >= 0 tree_parent_array = tree.parent_array mut_pos = ts.sites_position[ts.mutations_site] - is_mut_in_tree = (tree.interval.left <= mut_pos) & (mut_pos < tree.interval.right) + is_mut_in_tree = (tree.interval.left <= mut_pos) & ( + mut_pos < tree.interval.right + ) tree_nodes_num_muts = np.bincount( ts.mutations_node[is_mut_in_tree], minlength=ts.num_nodes, @@ -893,6 +895,50 @@ def recombinants_summary(self): ) return pd.DataFrame(data) + def deletions_summary(self): + deletion_ids = np.where(self.mutations_derived_state == "-")[0] + df = pd.DataFrame( + { + "mutation": deletion_ids, + "position": self.mutations_position[deletion_ids], + "node": self.ts.mutations_node[deletion_ids], + } + ) + df = df.sort_values(["position", "node"]) + events = {} + for row in df.itertuples(): + if row.node not in events: + events[row.node] = [ + DeletionEvent(row.position, row.node, 1, [row.mutation]) + ] + else: + for e in events[row.node]: + if row.position == e.start + e.length: + e.length += 1 + e.mutations.append(row.mutation) + break + else: + # Didn't find an event to extend, add another one + events[row.node].append( + DeletionEvent(row.position, row.node, 1, [row.mutation]) + ) + # Now unwrap the events and compute summaries + data = [] + for event_list in events.values(): + for e in event_list: + num_inheritors = self.mutations_num_inheritors[e.mutations] + data.append( + { + "start": e.start, + "node": e.node, + "length": e.length, + "max_inheritors": np.max(num_inheritors), + "min_inheritors": np.min(num_inheritors), + } + ) + + return pd.DataFrame(data) + def combine_recombinant_info(self): def get_imputed_pango(u, pango_source): # Can set pango_source to "Nextclade_pango" or "GISAID_lineage" @@ -1383,7 +1429,9 @@ def plot_mutations_per_site(self, annotate_threshold=0.9, select=None): if select is None: count = self.sites_num_mutations else: - count = np.bincount(self.ts.mutations_site[select], minlength=self.ts.num_sites) + count = np.bincount( + self.ts.mutations_site[select], minlength=self.ts.num_sites + ) pos = self.ts.sites_position zero_fraction = np.sum(count == 0) / self.ts.num_sites @@ -1518,6 +1566,14 @@ def get_sample_group_info(self, group_id): ) +@dataclasses.dataclass +class DeletionEvent: + start: int + node: int + length: int + mutations: List + + def country_abbr(country): return { "United Kingdom": "UK",