Skip to content

Commit

Permalink
Merge pull request #341 from jeromekelleher/deletions-summary
Browse files Browse the repository at this point in the history
Add deletions summary data frame
  • Loading branch information
jeromekelleher authored Oct 7, 2024
2 parents 56c98c3 + b1761bf commit 15539f8
Showing 1 changed file with 60 additions and 4 deletions.
64 changes: 60 additions & 4 deletions sc2ts/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class LineageDetails:
"20H",
"Beta",
"2020-09",
None, # TODO: From which source?
None, # TODO: From which source?
),
LineageDetails(
"B.1.617.2",
Expand All @@ -74,7 +74,7 @@ class LineageDetails:
"20J",
"Gamma",
"2020-12",
None, # TODO: From which source?
None, # TODO: From which source?
),
LineageDetails(
"BA.1",
Expand Down Expand Up @@ -274,7 +274,9 @@ def get_num_muts(ts):
assert np.min(tree_nodes_preorder) >= 0
tree_parent_array = tree.parent_array
mut_pos = ts.sites_position[ts.mutations_site]
is_mut_in_tree = (tree.interval.left <= mut_pos) & (mut_pos < tree.interval.right)
is_mut_in_tree = (tree.interval.left <= mut_pos) & (
mut_pos < tree.interval.right
)
tree_nodes_num_muts = np.bincount(
ts.mutations_node[is_mut_in_tree],
minlength=ts.num_nodes,
Expand Down Expand Up @@ -893,6 +895,50 @@ def recombinants_summary(self):
)
return pd.DataFrame(data)

def deletions_summary(self):
deletion_ids = np.where(self.mutations_derived_state == "-")[0]
df = pd.DataFrame(
{
"mutation": deletion_ids,
"position": self.mutations_position[deletion_ids],
"node": self.ts.mutations_node[deletion_ids],
}
)
df = df.sort_values(["position", "node"])
events = {}
for row in df.itertuples():
if row.node not in events:
events[row.node] = [
DeletionEvent(row.position, row.node, 1, [row.mutation])
]
else:
for e in events[row.node]:
if row.position == e.start + e.length:
e.length += 1
e.mutations.append(row.mutation)
break
else:
# Didn't find an event to extend, add another one
events[row.node].append(
DeletionEvent(row.position, row.node, 1, [row.mutation])
)
# Now unwrap the events and compute summaries
data = []
for event_list in events.values():
for e in event_list:
num_inheritors = self.mutations_num_inheritors[e.mutations]
data.append(
{
"start": e.start,
"node": e.node,
"length": e.length,
"max_inheritors": np.max(num_inheritors),
"min_inheritors": np.min(num_inheritors),
}
)

return pd.DataFrame(data)

def combine_recombinant_info(self):
def get_imputed_pango(u, pango_source):
# Can set pango_source to "Nextclade_pango" or "GISAID_lineage"
Expand Down Expand Up @@ -1383,7 +1429,9 @@ def plot_mutations_per_site(self, annotate_threshold=0.9, select=None):
if select is None:
count = self.sites_num_mutations
else:
count = np.bincount(self.ts.mutations_site[select], minlength=self.ts.num_sites)
count = np.bincount(
self.ts.mutations_site[select], minlength=self.ts.num_sites
)

pos = self.ts.sites_position
zero_fraction = np.sum(count == 0) / self.ts.num_sites
Expand Down Expand Up @@ -1518,6 +1566,14 @@ def get_sample_group_info(self, group_id):
)


@dataclasses.dataclass
class DeletionEvent:
start: int
node: int
length: int
mutations: List


def country_abbr(country):
return {
"United Kingdom": "UK",
Expand Down

0 comments on commit 15539f8

Please sign in to comment.