From 99d9709f1bc965cfce7ef8d3d4a251cdc95a9fa5 Mon Sep 17 00:00:00 2001
From: jbloom <jbloom@fredhutch.org>
Date: Sat, 21 Dec 2019 07:18:15 -0800
Subject: [PATCH 1/4] added `select_target_names` option to `Targets`

The addition of `select_target_names` makes it possible
to create a `Targets` object for just some of the sequences
in the input `seqsfile`, which is useful if your Genbank file
contains more sequences than you want to align against.

Two cells have been added to `lasv_pilot.ipynb` that demonstrate
and test this option.

In this process a previously silent bug in a minor check was found
on [this line](https://github.com/jbloomlab/alignparse/blob/0.1.1/alignparse/targets.py#L445),
and has now been fixed.
---
 CHANGELOG.rst              | 12 ++++-
 alignparse/__init__.py     |  2 +-
 alignparse/targets.py      | 17 ++++++-
 notebooks/lasv_pilot.ipynb | 90 ++++++++++++++++++++++++++------------
 4 files changed, 90 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index ccfc3fd..9e774b9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,9 +6,19 @@ All notable changes to this project will be documented in this file.
 
 The format is based on `Keep a Changelog <https://keepachangelog.com>`_.
 
+0.2.dev0
+--------
+
+Added
++++++
+* Added ``select_target_names`` option to ``Targets``.
+
 0.1.1
 ---------------------------
-* Fixed DataFrame querying bug in `./alignparse/ccs.py`
+
+Fixed
+++++++
+* Fixed DataFrame querying bug in ``./alignparse/ccs.py``
 
 0.1.0
 ---------------------------
diff --git a/alignparse/__init__.py b/alignparse/__init__.py
index 3fbfd23..1b4f837 100644
--- a/alignparse/__init__.py
+++ b/alignparse/__init__.py
@@ -7,5 +7,5 @@
 
 __author__ = '`the Bloom lab <https://research.fhcrc.org/bloom/en.html>`_'
 __email__ = 'jbloom@fredhutch.org'
-__version__ = '0.1.1'
+__version__ = '0.2.dev0'
 __url__ = 'https://github.com/jbloomlab/alignparse'
diff --git a/alignparse/targets.py b/alignparse/targets.py
index d42a16e..7072962 100644
--- a/alignparse/targets.py
+++ b/alignparse/targets.py
@@ -342,6 +342,9 @@ class Targets:
     ignore_feature_parse_specs_keys : None or list
         Ignore these target-level keys in `feature_parse_specs`. Useful for
         YAML with default keys that don't represent actual targets.
+    select_target_names : None or list
+        If `None`, the created object is for all sequences in `seqsfile`.
+        Otherwise pass a list with names of just the sequences of interest.
 
     Attributes
     ----------
@@ -361,7 +364,9 @@ def __repr__(self):
     def __init__(self, *, seqsfile, feature_parse_specs,
                  allow_extra_features=False, seqsfileformat='genbank',
                  allow_clipped_muts_seqs=False,
-                 ignore_feature_parse_specs_keys=None):
+                 ignore_feature_parse_specs_keys=None,
+                 select_target_names=None,
+                 ):
         """See main class docstring."""
         # read feature_parse_specs
         if isinstance(feature_parse_specs, str):
@@ -393,6 +398,10 @@ def __init__(self, *, seqsfile, feature_parse_specs,
                             'mutation_op_count']
 
         # get targets from seqsfile
+        if select_target_names is not None:
+            if len(select_target_names) < 1:
+                raise ValueError('`select_target_names` must be none or '
+                                 'non-empty list')
         if isinstance(seqsfile, str):
             seqrecords = list(Bio.SeqIO.parse(seqsfile, format=seqsfileformat))
         else:
@@ -403,6 +412,8 @@ def __init__(self, *, seqsfile, feature_parse_specs,
         self._target_dict = {}
         for seqrecord in seqrecords:
             tname = Target.get_name(seqrecord)
+            if select_target_names and (tname not in select_target_names):
+                continue
             target = Target(seqrecord=seqrecord,
                             req_features=self.features_to_parse(tname, 'name'),
                             allow_extra_features=allow_extra_features,
@@ -421,6 +432,8 @@ def __init__(self, *, seqsfile, feature_parse_specs,
                                      str(self._return_suffixes))
         self.target_names = [target.name for target in self.targets]
         self.target_seqs = {target.name: target.seq for target in self.targets}
+        if not self.targets:
+            raise ValueError('no targets found')
 
         # check needed for `to_csv` option of `parse_alignment`.
         if len(self.target_names) != len({tname.replace(' ', '_') for
@@ -442,7 +455,7 @@ def __init__(self, *, seqsfile, feature_parse_specs,
         # features unless flag to do this explicitly set
         if not allow_clipped_muts_seqs:
             for t in self.target_names:
-                for f in self.features_to_parse(tname, 'name'):
+                for f in self.features_to_parse(t, 'name'):
                     for return_name in ['sequence', 'mutations']:
                         if return_name in self._parse_returnvals(t, f):
                             filt = self._parse_filters[t][f]
diff --git a/notebooks/lasv_pilot.ipynb b/notebooks/lasv_pilot.ipynb
index 312426e..7ea7207 100644
--- a/notebooks/lasv_pilot.ipynb
+++ b/notebooks/lasv_pilot.ipynb
@@ -461,6 +461,42 @@
     "_ = targets.plot(ax_width=10)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that if needed, it is also possible to get a [Targets](https://jbloomlab.github.io/alignparse/alignparse.targets.html#alignparse.targets.Targets) object for just some of the sequences specified in `seqsfile` or `feature_parse_specs`.\n",
+    "This is most commonly useful when `seqsfile` contains additional sequences that are not of interest.\n",
+    "Below we illustrate how to do this by:\n",
+    " - Setting `select_target_names` to only keep the *LASV_Josiah_WT* sequence in `seqsfile`\n",
+    " - Setting `ignore_feature_parse_specs_keys` to ignore the other targets (in this case, *LASV_Josiah_OPT*) in `feature_parse_specs`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Here are the names of the retained targets: ['LASV_Josiah_WT']\n"
+     ]
+    }
+   ],
+   "source": [
+    "targets_subset = alignparse.targets.Targets(\n",
+    "                    seqsfile=targetfiles,\n",
+    "                    feature_parse_specs=lasv_parse_specs_file,\n",
+    "                    allow_extra_features=True,\n",
+    "                    select_target_names=['LASV_Josiah_WT'],\n",
+    "                    ignore_feature_parse_specs_keys=['LASV_Josiah_OPT']\n",
+    "                    )\n",
+    "\n",
+    "print(f\"Here are the names of the retained targets: {targets_subset.target_names}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -485,7 +521,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -533,7 +569,7 @@
        "0  input_files/lasv_example_ccs.fastq.gz  "
       ]
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -561,7 +597,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -577,7 +613,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -598,7 +634,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -684,7 +720,7 @@
        "4    0.0000  "
       ]
      },
-     "execution_count": 13,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -702,7 +738,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -757,7 +793,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -786,7 +822,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {
     "scrolled": true
    },
@@ -841,7 +877,7 @@
        "0  ./output_files/lasv_pilot_alignments.sam  "
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -861,7 +897,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -891,7 +927,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -925,7 +961,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -976,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {
     "scrolled": true
    },
@@ -1051,7 +1087,7 @@
        "4                  unmapped      0  lasv_pilot"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1062,7 +1098,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -1098,7 +1134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -1184,7 +1220,7 @@
        "4  lasv_pilot  "
       ]
      },
-     "execution_count": 22,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1195,7 +1231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "metadata": {
     "scrolled": true
    },
@@ -1255,7 +1291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -1321,7 +1357,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1344,7 +1380,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [
     {
@@ -1526,7 +1562,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1552,7 +1588,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
@@ -1603,7 +1639,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 30,
    "metadata": {
     "scrolled": true
    },
@@ -1954,7 +1990,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1976,7 +2012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {

From 74f12de89767a243e36cc176f323a09bc8d4c668 Mon Sep 17 00:00:00 2001
From: jbloom <jbloom@fredhutch.org>
Date: Sat, 21 Dec 2019 14:56:59 -0800
Subject: [PATCH 2/4] make `select_target_names` version 0.1.2

---
 CHANGELOG.rst          | 10 +++++-----
 alignparse/__init__.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9e774b9..9d3057a 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,21 +6,21 @@ All notable changes to this project will be documented in this file.
 
 The format is based on `Keep a Changelog <https://keepachangelog.com>`_.
 
-0.2.dev0
---------
+0.1.2
+-----
 
 Added
 +++++
 * Added ``select_target_names`` option to ``Targets``.
 
 0.1.1
----------------------------
+-----
 
 Fixed
-++++++
++++++
 * Fixed DataFrame querying bug in ``./alignparse/ccs.py``
 
 0.1.0
----------------------------
+-----
 Initial release
 
diff --git a/alignparse/__init__.py b/alignparse/__init__.py
index 1b4f837..c160344 100644
--- a/alignparse/__init__.py
+++ b/alignparse/__init__.py
@@ -7,5 +7,5 @@
 
 __author__ = '`the Bloom lab <https://research.fhcrc.org/bloom/en.html>`_'
 __email__ = 'jbloom@fredhutch.org'
-__version__ = '0.2.dev0'
+__version__ = '0.1.2'
 __url__ = 'https://github.com/jbloomlab/alignparse'

From 8a65923717665cfca6b7e4a4f4478f8d0781f7d4 Mon Sep 17 00:00:00 2001
From: jbloom <jbloom@fredhutch.org>
Date: Sat, 21 Dec 2019 14:57:23 -0800
Subject: [PATCH 3/4] fix doc typos

---
 alignparse/targets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/alignparse/targets.py b/alignparse/targets.py
index 7072962..614b4cd 100644
--- a/alignparse/targets.py
+++ b/alignparse/targets.py
@@ -770,7 +770,7 @@ def align_and_parse(self,
         Returns
         -------
         (readstats, aligned, filtered) : tuple
-            Same meaning as for :meth:`Targets.parse_alignments` except
+            Same meaning as for :meth:`Targets.parse_alignment` except
             the data frames / CSV files all have additional columns indicating
             name of each query set (`name_cols`) as well as any `group_cols`.
 
@@ -1280,7 +1280,7 @@ def _parse_alignment_cs(self, samfile, *, multi_align='primary',
         Note
         ----
         This method returns the same information that can be better
-        obtained via :meth:`Targets.parse_alignments` by setting
+        obtained via :meth:`Targets.parse_alignment` by setting
         to return 'cs', 'clip5', 'clip3' for every feature. It is
         currently retained only for debugging / testing purposes,
         and may eventually be removed.

From f72d03b0e57656edb4c28f748905985d45946441 Mon Sep 17 00:00:00 2001
From: jbloom <jbloom@fredhutch.org>
Date: Sat, 21 Dec 2019 14:58:54 -0800
Subject: [PATCH 4/4] check that `select_target_names` is list if not None

---
 alignparse/targets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/alignparse/targets.py b/alignparse/targets.py
index 614b4cd..26bd019 100644
--- a/alignparse/targets.py
+++ b/alignparse/targets.py
@@ -399,7 +399,8 @@ def __init__(self, *, seqsfile, feature_parse_specs,
 
         # get targets from seqsfile
         if select_target_names is not None:
-            if len(select_target_names) < 1:
+            if not (isinstance(select_target_names, list) and
+                    len(select_target_names) >= 1):
                 raise ValueError('`select_target_names` must be none or '
                                  'non-empty list')
         if isinstance(seqsfile, str):