Skip to content

Commit

Permalink
Add the script (#40)
Browse files Browse the repository at this point in the history
  • Loading branch information
pawel-czyz authored Jan 25, 2025
1 parent eaa6dc9 commit cc05a57
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 13 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@ matplotlib = "^3.8.0"
scipy = "^1.11.3"
jax = "^0.4.20"
jaxlib = "^0.4.20"
diffrax = "^0.4.1"
# diffrax = "^0.4.1"
# Pin dependencies for PyMC
numpy = "==1.24.3"
numpy = "^1.24.3"
#pytensor = "==2.11.1"
#pymc = "==5.3.0"
seaborn = "^0.13.2"
numpyro = "^0.14.0"
subplots-from-axsize = "^0.1.9"
pango-aliasor = "^0.3.0"
typer = "^0.15.1"
jaxtyping = "^0.2.36"

[tool.poetry.scripts]
covvfit = "covvfit.cli:main"
Expand Down
155 changes: 144 additions & 11 deletions src/covvfit/_cli/freyja.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,151 @@
"""Script gathering Freyja files into one CSV."""
from typing import Annotated
from dataclasses import dataclass
from pathlib import Path
from typing import Annotated, Optional

import pandas as pd
import typer


@dataclass
class _FreyjaData:
lineages: list[str]
abundances: list[float]
resid: float
coverage: float

location: str | None
timepoint: str | None


def _read_freyja(file_path: str, /) -> _FreyjaData:
with open(file_path) as fp:
content = fp.read()
lines = content.split("\n")
data = {}
for line in lines:
name, _, line_content = line.partition("\t")
data[name.strip()] = line_content.strip()

return _FreyjaData(
lineages=data["lineages"].split(),
abundances=[float(x) for x in data["abundances"].strip().split()],
resid=float(data["resid"]),
coverage=float(data["coverage"]),
location=data.get("location", None),
timepoint=data.get("timepoint", None),
)


def _get_abundances(content: _FreyjaData, /) -> pd.DataFrame:
abund, names = content.abundances, content.lineages

# Validate that the abundances are within bounds
for ab, name in zip(abund, names):
if not (0 <= ab <= 1):
raise ValueError(f"Abundance {ab} of variant {name} is out of bounds.")

return pd.DataFrame(
{
"name": names,
"abundance": abund,
}
)


_SAMPLE_COL: str = "sample"


def _parse_freyja(
path: Path,
*,
sample: str,
min_coverage: float,
) -> pd.DataFrame:
"""Parses Freyja file into a data frame.
If the coverage is not matched, an empty data frame is returned.
Adds the column for sample names.
"""
content = _read_freyja(path)
coverage = content.coverage

if coverage < min_coverage:
return pd.DataFrame()

abundances = _get_abundances(content)
abundances[_SAMPLE_COL] = sample

return abundances


def freyja_gather(
directory: Annotated[str, typer.Argument(help="Directory with Freyja output")],
metadata: Annotated[str, typer.Argument(help="Metadata")],
output: Annotated[str, typer.Argument(help="Desired output location")],
):
"""Gathers Freyja-demixed files from the given directory into the output CSV,
adding the metadata."""
typer.echo(
"This function is not implemented yet. It will be added at a later time."
directory: Annotated[str, typer.Option(help="Directory with Freyja output")],
output: Annotated[str, typer.Option(help="Desired output location for the CSV")],
metadata: Optional[
Annotated[
Optional[str],
typer.Option("--metadata", help="File with sample-specific metadata"),
]
] = None,
metadata_sep: Annotated[
str, typer.Option("--metadata-sep", help="Separator used in the metadata file.")
] = "\t",
metadata_sample_col: Annotated[
str,
typer.Option(
"--metadata-sample-col",
help="Name of the column storing sample names in the metadata file",
),
] = "sample",
min_coverage: Annotated[
Optional[float],
typer.Option(
"--min-coverage",
help="Minimum coverage threshold needed to use the sample. By default no threshold is applied.",
),
] = None,
output_sep: Annotated[
str, typer.Option("--output-sep", help="Separator used in the output file.")
] = "\t",
) -> None:
"""Gathers Freyja-demixed files from the given directory into the output CSV, adding the metadata from the file."""
# TODO(Pawel): Add reading timepoint and location directly from the Freyja files, rather than metadata CSV

if min_coverage is None:
min_coverage = -1e9

path = Path(directory)

# Obtain the filenames
sample_paths = list(path.glob("*.tsv"))

# Get the large data frame with abundances
abundance = pd.concat(
[
_parse_freyja(path=pth, sample=pth.stem, min_coverage=min_coverage)
for pth in sample_paths
],
ignore_index=True,
)
typer.Exit(code=1)
# Add your processing logic here

if metadata is not None:
metadata = pd.read_csv(metadata, sep=metadata_sep)

if metadata_sample_col not in metadata.columns:
raise ValueError(
f"Column {metadata_sample_col} not available in the metadata file with columns {metadata.columns}."
)

abundance = pd.merge(
abundance,
metadata,
left_on=_SAMPLE_COL,
right_on=metadata_sample_col,
how="left",
)

# Generate the parent directories (if not exist)
output_path = Path(output)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Save file
abundance.to_csv(output_path, index=False, sep=output_sep)

0 comments on commit cc05a57

Please sign in to comment.