Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add deletions summary data frame #341

Merged
merged 1 commit into from
Oct 7, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 60 additions & 4 deletions sc2ts/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class LineageDetails:
"20H",
"Beta",
"2020-09",
None, # TODO: From which source?
None, # TODO: From which source?
),
LineageDetails(
"B.1.617.2",
Expand All @@ -74,7 +74,7 @@ class LineageDetails:
"20J",
"Gamma",
"2020-12",
None, # TODO: From which source?
None, # TODO: From which source?
),
LineageDetails(
"BA.1",
Expand Down Expand Up @@ -274,7 +274,9 @@ def get_num_muts(ts):
assert np.min(tree_nodes_preorder) >= 0
tree_parent_array = tree.parent_array
mut_pos = ts.sites_position[ts.mutations_site]
is_mut_in_tree = (tree.interval.left <= mut_pos) & (mut_pos < tree.interval.right)
is_mut_in_tree = (tree.interval.left <= mut_pos) & (
mut_pos < tree.interval.right
)
tree_nodes_num_muts = np.bincount(
ts.mutations_node[is_mut_in_tree],
minlength=ts.num_nodes,
Expand Down Expand Up @@ -893,6 +895,50 @@ def recombinants_summary(self):
)
return pd.DataFrame(data)

def deletions_summary(self):
    """
    Return a data frame summarising runs of deletion mutations.

    Mutations whose derived state is "-" are grouped, per node, into
    events. Mutations are visited in increasing (position, node) order; a
    mutation extends the first existing event on its node for which
    ``position == start + length``, otherwise it starts a new event of
    length 1.

    The result has one row per event with the columns ``start``, ``node``,
    ``length``, ``max_inheritors`` and ``min_inheritors``, the last two
    being the extremes of ``self.mutations_num_inheritors`` over the
    event's mutations. The columns are present even when there are no
    deletion mutations, in which case the frame has zero rows.
    """
    columns = ["start", "node", "length", "max_inheritors", "min_inheritors"]

    deletion_ids = np.where(self.mutations_derived_state == "-")[0]
    df = pd.DataFrame(
        {
            "mutation": deletion_ids,
            "position": self.mutations_position[deletion_ids],
            "node": self.ts.mutations_node[deletion_ids],
        }
    )
    # Visiting mutations left to right lets each event grow by one
    # position at a time.
    df = df.sort_values(["position", "node"])

    # Maps node -> list of DeletionEvent found on that node so far.
    events = {}
    for row in df.itertuples():
        node_events = events.setdefault(row.node, [])
        for e in node_events:
            if row.position == e.start + e.length:
                e.length += 1
                e.mutations.append(row.mutation)
                break
        else:
            # Didn't find an event to extend (or this is the first
            # mutation seen on this node), so start another one.
            node_events.append(
                DeletionEvent(row.position, row.node, 1, [row.mutation])
            )

    # Now unwrap the events and compute summaries
    data = []
    for event_list in events.values():
        for e in event_list:
            num_inheritors = self.mutations_num_inheritors[e.mutations]
            data.append(
                {
                    "start": e.start,
                    "node": e.node,
                    "length": e.length,
                    "max_inheritors": np.max(num_inheritors),
                    "min_inheritors": np.min(num_inheritors),
                }
            )

    # Passing the columns explicitly keeps the schema stable when ``data``
    # is empty; a bare DataFrame([]) would have no columns at all.
    return pd.DataFrame(data, columns=columns)

def combine_recombinant_info(self):
def get_imputed_pango(u, pango_source):
# Can set pango_source to "Nextclade_pango" or "GISAID_lineage"
Expand Down Expand Up @@ -1383,7 +1429,9 @@ def plot_mutations_per_site(self, annotate_threshold=0.9, select=None):
if select is None:
count = self.sites_num_mutations
else:
count = np.bincount(self.ts.mutations_site[select], minlength=self.ts.num_sites)
count = np.bincount(
self.ts.mutations_site[select], minlength=self.ts.num_sites
)

pos = self.ts.sites_position
zero_fraction = np.sum(count == 0) / self.ts.num_sites
Expand Down Expand Up @@ -1518,6 +1566,14 @@ def get_sample_group_info(self, group_id):
)


@dataclasses.dataclass
class DeletionEvent:
    """
    A run of deletion mutations on a single node.

    Instances are mutable: ``deletions_summary`` grows an event in place by
    incrementing ``length`` and appending to ``mutations`` whenever the next
    mutation's position equals ``start + length``.
    """

    # Position of the first mutation in the run.
    start: int
    # ID of the node the mutations in this run are attached to.
    node: int
    # Number of mutations merged into the run so far.
    length: int
    # IDs of the mutations that make up the run, in the order they were added.
    mutations: List[int]


def country_abbr(country):
return {
"United Kingdom": "UK",
Expand Down