Merge pull request #12 from widdowquinn/strath_move

Update ncfp for the move to Strathclyde
widdowquinn · Jul 25, 2020 · 7bb4027 · 7bb4027
2 parents fea603f + a874f36
commit 7bb4027
Show file tree

Hide file tree

Showing 31 changed files with 1,117 additions and 947 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -12,8 +12,8 @@ workflows:
   test:
     jobs:
       - test-3_8
-      # - test-3_7
-      # - test-3_6
+      - test-3_7
+      - test-3_6
 
 jobs:
   test-3_8: &test-template
@@ -27,7 +27,7 @@ jobs:
 
       - restore_cache:
           keys:
-            - ncfp-dependencies-pip-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+            - ncfp-dependencies-pip-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}-{{ checksum "requirements-pip.txt" }}
             - ncfp-dependencies-pip-
 
       - run:
@@ -45,7 +45,7 @@ jobs:
       - save_cache:
           paths:
             - ./venv
-          key: ncfp-dependencies-pip-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+          key: ncfp-dependencies-pip-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}-{{ checksum "requirements-pip.txt" }}
 
       - run:
           name: install ncfp
@@ -67,11 +67,11 @@ jobs:
       - codecov/upload:
           file: .coverage.xml
 
-  # test-3_6:
-  #   <<: *test-template
-  #   docker:
-  #     - image: circleci/python:3.6.8
-  # test-3_8:
-  #   <<: *test-template
-  #   docker:
-  #     - image: circleci/python:3.8.0
+  test-3_6:
+    <<: *test-template
+    docker:
+      - image: circleci/python:3.6.8
+  test-3_7:
+    <<: *test-template
+    docker:
+      - image: circleci/python:3.7.8
diff --git a/.gitignore b/.gitignore
@@ -16,19 +16,24 @@ venv*
 *__pycache__*
 
 # Exclude test files
+tests/examples
 tests/test_output
 skipped.fas
 .ncfp_cache
 test.log
-.coverage
 cover/
-example_output
+example_output/
 
 # Exclude local VS Code settings
 .vscode/
 
 # Exclude unneeded documentation files
 docs/_build/
+htmlcov/
+.coverage
+.coverage.*
+classes_ncbi_cds_from_protein.pdf
+packages_ncbi_cds_from_protein.pdf
 
 # Exclude distribution files
 dist/

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,8 +3,18 @@ repos:
     rev: stable
     hooks:
     - id: black
-      language_version: python3.6
+      language_version: python3.7
+-   repo: https://github.com/asottile/blacken-docs
+    rev: v0.3.0
+    hooks:
+    -   id: blacken-docs
+        additional_dependencies: [black==18.9b0]
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v1.2.3
     hooks:
-    - id: flake8
+    - id: flake8
+    - id: check-case-conflict
+    - id: check-docstring-first
+    - id: check-json
+    - id: check-merge-conflict
+#    - id: check-yaml
diff --git a/.travis.yml b/.travis.yml
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,12 +1,23 @@
 # CHANGES.md - `ncfp`
 
-## v0.1.1dev1
+## v0.2.0-a1
 
-* Add installation instructions and other improvements to documentation
-* Tidied codebase in some places (removing `print` statements, unused functions, etc.).
-* Add CLI tests.
-* Correct `ncfp` program name in help/usage text.
+- Update copyright notices
+- Convert parsers to use `pathlib`
+- Add requirements file for development tools
+- Convert tests to use `pytest`, not `nose`
+- Revise logging usage
+- Change continuous integration from TravisCI to CircleCI
+- Guess sequence origin rather than asking user to provide at CLI (allows mixed origin files)
+- Update CLI parser and docs to reflect new sequence origin guessing
+
+## v0.1.1
+
+- Add installation instructions and other improvements to documentation
+- Tidied codebase in some places (removing `print` statements, unused functions, etc.).
+- Add CLI tests.
+- Correct `ncfp` program name in help/usage text.
 
 ## v0.1.0
 
-* First release of `ncfp`
+- First release of `ncfp`
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,7 @@
 The MIT License
 
-Copyright (c) 2017-2018 The James Hutton Institute
+Copyright (c) 2017-2019 The James Hutton Institute
+Copyright (c) 2019-2020 The University of Strathclyde
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/Makefile b/Makefile
@@ -0,0 +1,72 @@
+# Makefile
+#
+# This file is part of the ncfp package distribution
+# (https://github.com/widdowquinn/ncfp)
+
+# Set up all development dependencies in the current conda environment.
+# Order: conda packages from requirements-dev.txt, then requirements.txt,
+# then the pip-only packages from requirements-pip.txt. Afterwards the
+# pre-commit hooks are installed and this package is installed in
+# editable mode (pip -e .).
+setup_env:
+	@conda install --file requirements-dev.txt --yes
+	@conda install --file requirements.txt --yes
+	@pip install -r requirements-pip.txt
+	@pre-commit install
+	@pip install -U -e .
+
+# Run all tests and display coverage report in a browser.
+# pytest collects coverage for the ncbi_cds_from_protein package and writes
+# an HTML report; htmlcov/index.html is opened only if the run succeeds (&&).
+# NOTE(review): `open` is presumably the macOS launcher - confirm whether
+# other platforms (e.g. xdg-open on Linux) need to be supported.
+test:
+	@pytest --cov-report=html --cov=ncbi_cds_from_protein -v tests/ && open htmlcov/index.html
+
+# Build and display documentation.
+# Runs the clean_docs and uml targets first, then builds the HTML docs from
+# within docs/ and opens the generated index page if the build succeeds.
+docs: clean_docs uml
+	@cd docs && make html && open _build/html/index.html
+
+# Generate UML diagrams of the ncbi_cds_from_protein package as PDF files
+# using pyreverse (project name set with -p, output format with -o).
+uml:
+	pyreverse -o pdf -p ncbi_cds_from_protein ncbi_cds_from_protein
+
+# Clean up outputs.
+# Aggregate target with no recipe of its own: it only triggers
+# clean_docs, clean_tests and clean_examples.
+clean: clean_docs clean_tests clean_examples
+
+# Remove the built HTML documentation and the pyreverse UML diagrams.
+# The PDF names must match pyreverse's output for the project name
+# ncbi_cds_from_protein exactly (lowercase "from"); with a different case
+# the files are left behind on case-sensitive filesystems.
+clean_docs:
+	@rm -rf docs/_build/html && \
+	rm -rf classes_ncbi_cds_from_protein.pdf && \
+	rm -rf packages_ncbi_cds_from_protein.pdf
+
+# Remove the output written by the `examples` target, which places all of
+# its results under tests/examples/.
+clean_examples:
+	@rm -rf tests/examples/*
+
+# Remove everything under tests/test_output/ (the directory itself is kept).
+clean_tests:
+	@rm -rf tests/test_output/*
+
+# Run examples from documentation.
+# Each command calls ncfp as `ncfp <INPUT> <OUTPUT> <EMAIL> [options]`, reading
+# from tests/test_input/sequences/ and writing below tests/examples/.
+# The "Cache reuse" commands use the same cache directory/name (-d/-c) as the
+# "Cache location" command.
+# NOTE(review): the e-mail argument shows as "[email protected]" in this
+# view, which looks like an obfuscated address rather than a literal value -
+# confirm the real argument against the repository.
+examples:
+	# NCBI no introns
+	@ncfp tests/test_input/sequences/input_ncbi.fasta \
+        tests/examples/ncbi [email protected] -v
+	# UniProt no introns
+	@ncfp tests/test_input/sequences/input_uniprot.fasta \
+        tests/examples/uniprot [email protected] -v
+	# UniProt/Stockholm no introns
+	@ncfp -s tests/test_input/sequences/input_uniprot_stockholm.fasta \
+        tests/examples/uniprot_stockholm [email protected] -v
+	# Human isoforms/intron-exon
+	@ncfp tests/test_input/sequences/human.fasta \
+        tests/examples/human [email protected] -v
+	# Logging
+	@ncfp tests/test_input/sequences/human.fasta \
+        tests/examples/logging [email protected] \
+        -l tests/examples/logging/human.log
+	# Cache location
+	@ncfp tests/test_input/sequences/human.fasta \
+        tests/examples/caches [email protected] \
+        -d tests/examples/caches \
+        -c ncfp_cache
+	# Cache reuse
+	@ncfp tests/test_input/sequences/human.fasta \
+        tests/examples/caches1 [email protected] \
+        -d tests/examples/caches \
+        -c ncfp_cache
+	@ncfp tests/test_input/sequences/human.fasta \
+        tests/examples/caches2 [email protected] \
+        -d tests/examples/caches \
+        -c ncfp_cache \
+        --filestem cached \
+        --keepcache
diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 This repository contains code for a script that identifies and writes the corresponding nucleotide sequences for each protein in an input multiple sequence file to be used, for example, in backthreading coding sequences onto protein alignments for phylogenetic analyses. `ncfp` uses the NCBI accession or UniProt gene name (as appropriate) to identify source nucleotide sequences in the NCBI databases, download them, and write them to a file.
 
-[![ncfp TravisCI build status](https://api.travis-ci.org/widdowquinn/ncfp.svg?branch=master)](https://travis-ci.org/widdowquinn/ncfp/branches)
+[![CircleCI](https://circleci.com/gh/widdowquinn/ncfp/tree/master.svg?style=shield)](https://circleci.com/gh/widdowquinn/ncfp/tree/master)
 [![ncfp codecov.io coverage](https://img.shields.io/codecov/c/github/widdowquinn/ncfp/master.svg)](https://codecov.io/github/widdowquinn/ncfp)
 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/99a037e5eb2c4ae48e27e44c8974a3f8)](https://www.codacy.com/app/widdowquinn/ncfp?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=widdowquinn/ncfp&amp;utm_campaign=Badge_Grade)
 [![CodeFactor](https://www.codefactor.io/repository/github/widdowquinn/ncfp/badge)](https://www.codefactor.io/repository/github/widdowquinn/ncfp)
@@ -49,25 +49,25 @@ For more detailed information about `ncfp` as a program, or using the underlying
 
 Unless otherwise indicated, all code is licensed under the MIT license and subject to the following agreement:
 
-    (c) The James Hutton Institute 2017-2018
+    (c) The James Hutton Institute 2017-2019
+    (c) The University of Strathclyde 2019-2020
     Author: Leighton Pritchard
 
-    Contact: leighton.pritchard@hutton.ac.uk
+    Contact: leighton.pritchard@strath.ac.uk
 
     Address:
     Leighton Pritchard,
-    Information and Computational Sciences,
-    James Hutton Institute,
-    Errol Road,
-    Invergowrie,
-    Dundee,
-    DD2 5DA,
+    Strathclyde Institute for Pharmacy and Biomedical Sciences,
+    Cathedral Street,
+    Glasgow,
+    G4 0RE,
     Scotland,
     UK
 
 The MIT License
 
-Copyright (c) 2017-2018 The James Hutton Institute
+Copyright (c) 2017-2019 The James Hutton Institute
+Copyright (c) 2019-2020 The University of Strathclyde
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.rst b/README.rst
@@ -13,8 +13,8 @@ source nucleotide sequences in the NCBI databases, download them, and write them
    :target: https://pypi.python.org/pypi/ncfp
 .. image:: https://img.shields.io/pypi/l/ncfp.svg?style=flat-square
    :target: https://pypi.python.org/pypi/ncfp
-.. image:: https://img.shields.io/travis/widdowquinn/ncfp.svg?style=flat-square
-   :target: https://travis-ci.org/widdowquinn/ncfp
+.. image:: https://circleci.com/gh/widdowquinn/ncfp/tree/master.svg?style=flat-square
+   :target: https://circleci.com/gh/widdowquinn/ncfp/tree/master
 .. image:: https://img.shields.io/codecov/c/github/widdowquinn/ncfp.svg?style=flat-square
    :target: https://codecov.io/gh/widdowquinn/ncfp
 .. image:: https://readthedocs.org/projects/ncfp/badge/?version=latest
@@ -54,33 +54,33 @@ Documentation
 -------------
 
 For more detailed information about ``ncfp`` as a program, or using the underlying ``ncbi_cds_from_protein``
-Python module, please see the most recent documentation at <https://ncfp.readthedocs.io/en/latest/>
+Python module, please see the current stable documentation at <https://ncfp.readthedocs.io/en/stable/>
 
 -------
 License
 -------
 
 Unless otherwise indicated, all code is licensed under the MIT license and subject to the following agreement:
 
-    (c) The James Hutton Institute 2017-2018
+    (c) The James Hutton Institute 2017-2019
+    (c) The University of Strathclyde 2019-2020
     Author: Leighton Pritchard
 
-    Contact: leighton.pritchard@hutton.ac.uk
+    Contact: leighton.pritchard@strath.ac.uk
 
     Address:
     Leighton Pritchard,
-    Information and Computational Sciences,
-    James Hutton Institute,
-    Errol Road,
-    Invergowrie,
-    Dundee,
-    DD6 9LH,
+    Strathclyde Institute for Pharmacy and Biomedical Sciences,
+    Cathedral Street,
+    Glasgow,
+    G4 0RE,
     Scotland,
     UK
 
 The MIT License
 
-Copyright (c) 2017-2018 The James Hutton Institute
+Copyright (c) 2017-2019 The James Hutton Institute
+Copyright (c) 2019-2020 The University of Strathclyde
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/docs/basic_use.rst b/docs/basic_use.rst
@@ -29,25 +29,22 @@ to the file ``<OUTPUT>/skipped.fas``
 Input sequence formats
 ----------------------
 
-Input protein sequences must be provided in FASTA format, and ``ncfp`` expects input sequence headers to take one of
-two forms: "NCBI" or "UniProt". By default, ``ncfp`` expects sequences to be in NCBI format:
+Input protein sequences must be provided in FASTA format.
 
-.. code-block:: bash
-
-    ncfp <INPUT>.fasta <OUTPUT> <EMAIL>
+``ncfp`` expects input sequence headers to take one of two forms: "NCBI" or "UniProt". ``ncfp`` will guess at sequence format/origin on the basis of the ID and description fields. By default, sequences will be assumed to be in NCBI format unless:
 
-For sequence input in UniProt format, one of the ``-u`` or ``--uniprot`` options must be used, e.g.
+* the sequence ID conforms to a UniProt UID
+* the sequence ID conforms to a UniParc UID (these sequences will be skipped as we should not expect a unique coding sequence)
 
 .. code-block:: bash
 
-    $ ncfp -u <INPUT>.fasta <OUTPUT> <EMAIL>
-    $ ncfp --uniprot <INPUT>.fasta <OUTPUT> <EMAIL>
+    ncfp <INPUT>.fasta <OUTPUT> <EMAIL>
 
 ^^^^^^^^^^^^^^^^^^
 NCBI header format
 ^^^^^^^^^^^^^^^^^^
 
-In NCBI header format, the sequence identifier is expected to correspond to a valid NCBI protein sequence
+In the NCBI header format, the sequence identifier is expected to correspond to a valid NCBI protein sequence
 accession, e.g.
 
 .. code-block:: bash
@@ -72,7 +69,7 @@ accession as a sequence identifier, e.g.
 UniProt header format
 ^^^^^^^^^^^^^^^^^^^^^
 
-In UniProt header format, the sequence description string is expected to correspond to a UniProt download
+In the UniProt header format, the sequence description string is expected to correspond to a UniProt download
 and contain the ``GN`` gene identifier key:value pair, e.g.
 
 .. code-block:: bash
@@ -83,7 +80,7 @@ and contain the ``GN`` gene identifier key:value pair, e.g.
     SQQSKINLINLEQEKIVNSIPVDGKFILAVAYSPDGKHLACGTFEGIVAIYDVETGKQVQ
     KYQDRAKPVRSISYSPDGSFLLAASDDMHVNIYDVLHSSLVGSVSGHISWILSVACSPDG
 
-If a coding sequence is identified successfully, the output nucleotide sequence header will have the gene
+If a coding sequence is identified successfully, the output nucleotide sequence header should have the gene
 accession as its sequence identifier, e.g.
 
 .. code-block:: bash
@@ -116,8 +113,8 @@ option, e.g.
 
 .. code-block:: bash
 
-    $ ncfp -u -s <INPUT>.fasta <OUTPUT> <EMAIL>
-    $ ncfp --uniprot --stockholm <INPUT>.fasta <OUTPUT> <EMAIL>
+    $ ncfp -s <INPUT>.fasta <OUTPUT> <EMAIL>
+    $ ncfp --stockholm <INPUT>.fasta <OUTPUT> <EMAIL>
 
 The output nucleotide sequence does not preserve the Stockholm format location information in the output, nor
 does it preserve sequence gap symbols: