From 99d9709f1bc965cfce7ef8d3d4a251cdc95a9fa5 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 21 Dec 2019 07:18:15 -0800 Subject: [PATCH 1/4] added `select_target_names` option to `Targets` The addition of `select_target_names` makes it possible to create a `Targets` object for just some of the sequences in the input `seqsfile`, which is useful if your Genbank file contains more sequences than you want to align against. Two cells have been added to `lasv_pilot.ipynb` that demonstrate and test this option. In this process a previously silent bug in a minor check was found on [this line](https://github.com/jbloomlab/alignparse/blob/0.1.1/alignparse/targets.py#L445), and has now been fixed. --- CHANGELOG.rst | 12 ++++- alignparse/__init__.py | 2 +- alignparse/targets.py | 17 ++++++- notebooks/lasv_pilot.ipynb | 90 ++++++++++++++++++++++++++------------ 4 files changed, 90 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ccfc3fd..9e774b9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,9 +6,19 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog `_. +0.2.dev0 +-------- + +Added ++++++ +* Added ``select_target_names`` option to ``Targets``. 
+ 0.1.1 --------------------------- -* Fixed DataFrame querying bug in `./alignparse/ccs.py` + +Fixed +++++++ +* Fixed DataFrame querying bug in ``./alignparse/ccs.py`` 0.1.0 --------------------------- diff --git a/alignparse/__init__.py b/alignparse/__init__.py index 3fbfd23..1b4f837 100644 --- a/alignparse/__init__.py +++ b/alignparse/__init__.py @@ -7,5 +7,5 @@ __author__ = '`the Bloom lab `_' __email__ = 'jbloom@fredhutch.org' -__version__ = '0.1.1' +__version__ = '0.2.dev0' __url__ = 'https://github.com/jbloomlab/alignparse' diff --git a/alignparse/targets.py b/alignparse/targets.py index d42a16e..7072962 100644 --- a/alignparse/targets.py +++ b/alignparse/targets.py @@ -342,6 +342,9 @@ class Targets: ignore_feature_parse_specs_keys : None or list Ignore these target-level keys in `feature_parse_specs`. Useful for YAML with default keys that don't represent actual targets. + select_target_names : None or list + If `None`, the created object is for all sequences in `seqsfile`. + Otherwise pass a list with names of just the sequences of interest. 
Attributes ---------- @@ -361,7 +364,9 @@ def __repr__(self): def __init__(self, *, seqsfile, feature_parse_specs, allow_extra_features=False, seqsfileformat='genbank', allow_clipped_muts_seqs=False, - ignore_feature_parse_specs_keys=None): + ignore_feature_parse_specs_keys=None, + select_target_names=None, + ): """See main class docstring.""" # read feature_parse_specs if isinstance(feature_parse_specs, str): @@ -393,6 +398,10 @@ def __init__(self, *, seqsfile, feature_parse_specs, 'mutation_op_count'] # get targets from seqsfile + if select_target_names is not None: + if len(select_target_names) < 1: + raise ValueError('`select_target_names` must be none or ' + 'non-empty list') if isinstance(seqsfile, str): seqrecords = list(Bio.SeqIO.parse(seqsfile, format=seqsfileformat)) else: @@ -403,6 +412,8 @@ def __init__(self, *, seqsfile, feature_parse_specs, self._target_dict = {} for seqrecord in seqrecords: tname = Target.get_name(seqrecord) + if select_target_names and (tname not in select_target_names): + continue target = Target(seqrecord=seqrecord, req_features=self.features_to_parse(tname, 'name'), allow_extra_features=allow_extra_features, @@ -421,6 +432,8 @@ def __init__(self, *, seqsfile, feature_parse_specs, str(self._return_suffixes)) self.target_names = [target.name for target in self.targets] self.target_seqs = {target.name: target.seq for target in self.targets} + if not self.targets: + raise ValueError('no targets found') # check needed for `to_csv` option of `parse_alignment`. 
if len(self.target_names) != len({tname.replace(' ', '_') for @@ -442,7 +455,7 @@ def __init__(self, *, seqsfile, feature_parse_specs, # features unless flag to do this explicitly set if not allow_clipped_muts_seqs: for t in self.target_names: - for f in self.features_to_parse(tname, 'name'): + for f in self.features_to_parse(t, 'name'): for return_name in ['sequence', 'mutations']: if return_name in self._parse_returnvals(t, f): filt = self._parse_filters[t][f] diff --git a/notebooks/lasv_pilot.ipynb b/notebooks/lasv_pilot.ipynb index 312426e..7ea7207 100644 --- a/notebooks/lasv_pilot.ipynb +++ b/notebooks/lasv_pilot.ipynb @@ -461,6 +461,42 @@ "_ = targets.plot(ax_width=10)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that if needed, it is also possible to get a [Targets](https://jbloomlab.github.io/alignparse/alignparse.targets.html#alignparse.targets.Targets) object for just some of the sequences specified in `seqsfile` or `feature_parse_specs`.\n", + "This is most commonly useful when `seqsfile` contains additional sequences that are not of interest.\n", + "Below we illustrate how to do this by:\n", + " - Setting `select_target_names` to only keep the *LASV_Josiah_WT* sequence in `seqsfile`\n", + " - Setting `ignore_feature_parse_specs_keys` to ignore the other targets (in this case, *LASV_Josiah_OPT*) in `feature_parse_specs`" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here are the names of the retained targets: ['LASV_Josiah_WT']\n" + ] + } + ], + "source": [ + "targets_subset = alignparse.targets.Targets(\n", + " seqsfile=targetfiles,\n", + " feature_parse_specs=lasv_parse_specs_file,\n", + " allow_extra_features=True,\n", + " select_target_names=['LASV_Josiah_WT'],\n", + " ignore_feature_parse_specs_keys=['LASV_Josiah_OPT']\n", + " )\n", + "\n", + "print(f\"Here are the names of the retained targets: 
{targets_subset.target_names}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -485,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -533,7 +569,7 @@ "0 input_files/lasv_example_ccs.fastq.gz " ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -561,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -577,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -598,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -684,7 +720,7 @@ "4 0.0000 " ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -702,7 +738,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -757,7 +793,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -786,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "scrolled": true }, @@ -841,7 +877,7 @@ "0 ./output_files/lasv_pilot_alignments.sam " ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -861,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -891,7 +927,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -925,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -976,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "scrolled": true }, @@ -1051,7 +1087,7 @@ 
"4 unmapped 0 lasv_pilot" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1062,7 +1098,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1098,7 +1134,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1184,7 +1220,7 @@ "4 lasv_pilot " ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1195,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": { "scrolled": true }, @@ -1255,7 +1291,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1321,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1344,7 +1380,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1526,7 +1562,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1552,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1603,7 +1639,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "scrolled": true }, @@ -1954,7 +1990,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -1976,7 +2012,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [ { From 74f12de89767a243e36cc176f323a09bc8d4c668 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 21 Dec 2019 14:56:59 -0800 Subject: [PATCH 2/4] make `select_target_names` version 0.1.2 --- CHANGELOG.rst | 10 +++++----- alignparse/__init__.py | 2 +- 2 files changed, 
6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9e774b9..9d3057a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,21 +6,21 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog `_. -0.2.dev0 --------- +0.1.2 +----- Added +++++ * Added ``select_target_names`` option to ``Targets``. 0.1.1 ---------------------------- +----- Fixed -++++++ ++++++ * Fixed DataFrame querying bug in ``./alignparse/ccs.py`` 0.1.0 ---------------------------- +----- Initial release diff --git a/alignparse/__init__.py b/alignparse/__init__.py index 1b4f837..c160344 100644 --- a/alignparse/__init__.py +++ b/alignparse/__init__.py @@ -7,5 +7,5 @@ __author__ = '`the Bloom lab `_' __email__ = 'jbloom@fredhutch.org' -__version__ = '0.2.dev0' +__version__ = '0.1.2' __url__ = 'https://github.com/jbloomlab/alignparse' From 8a65923717665cfca6b7e4a4f4478f8d0781f7d4 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 21 Dec 2019 14:57:23 -0800 Subject: [PATCH 3/4] fix doc typos --- alignparse/targets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/alignparse/targets.py b/alignparse/targets.py index 7072962..614b4cd 100644 --- a/alignparse/targets.py +++ b/alignparse/targets.py @@ -770,7 +770,7 @@ def align_and_parse(self, Returns ------- (readstats, aligned, filtered) : tuple - Same meaning as for :meth:`Targets.parse_alignments` except + Same meaning as for :meth:`Targets.parse_alignment` except the data frames / CSV files all have additional columns indicating name of each query set (`name_cols`) as well as any `group_cols`. @@ -1280,7 +1280,7 @@ def _parse_alignment_cs(self, samfile, *, multi_align='primary', Note ---- This method returns the same information that can be better - obtained via :meth:`Targets.parse_alignments` by setting + obtained via :meth:`Targets.parse_alignment` by setting to return 'cs', 'clip5', 'clip3' for every feature. 
It is currently retained only for debugging / testing purposes, and may eventually be removed. From f72d03b0e57656edb4c28f748905985d45946441 Mon Sep 17 00:00:00 2001 From: jbloom Date: Sat, 21 Dec 2019 14:58:54 -0800 Subject: [PATCH 4/4] check that `select_target_names` is list if not None --- alignparse/targets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/alignparse/targets.py b/alignparse/targets.py index 614b4cd..26bd019 100644 --- a/alignparse/targets.py +++ b/alignparse/targets.py @@ -399,7 +399,8 @@ def __init__(self, *, seqsfile, feature_parse_specs, # get targets from seqsfile if select_target_names is not None: - if len(select_target_names) < 1: + if not (isinstance(select_target_names, list) and + len(select_target_names) >= 1): raise ValueError('`select_target_names` must be none or ' 'non-empty list') if isinstance(seqsfile, str):