diff --git a/.rtd-environment.yml b/.rtd-environment.yml new file mode 100644 index 00000000..cbcad0d7 --- /dev/null +++ b/.rtd-environment.yml @@ -0,0 +1,33 @@ +name: gimmemotifs +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - configparser + - diskcache + - feather-format + - genomepy >=0.8.3 + - jinja2 + - logomaker + - matplotlib-base >=3.1.2 + - ncurses + - numpy + - pandas >=1.0.3 + - pillow + - pyarrow >=0.16.0 + - pybedtools + - python >=3 + - python-xxhash + - pyyaml >=3.10 + - qnorm + - represent + - scikit-learn >=0.18 + - scipy >=1.3.0 + - seaborn + - statsmodels + - tqdm >=4.27.0 + - xdg + - xgboost >=0.71 + - sphinx_bootstrap_theme + - numpydoc diff --git a/.travis.yml b/.travis.yml index b452bd84..c8a17716 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,8 +21,11 @@ before_install: fi - chmod +x miniconda.sh - ./miniconda.sh -b -p $HOME/miniconda -f - - export PATH=$HOME/miniconda/bin:$PATH - - conda config --set always_yes yes + - source "$HOME/miniconda/etc/profile.d/conda.sh" + - hash -r + - conda config --set always_yes yes --set changeps1 no + - conda update -q conda + - conda info -a - if [ "$TRAVIS_OS_NAME" == "osx" ]; then ulimit -S -n 4096; ulimit -a; fi install: @@ -34,7 +37,8 @@ install: else conda env create -q -f conda_env.osx.txt -n gimme; fi - - source activate gimme + - conda activate gimme + - conda list - python setup.py build && pip install -e . before_script: diff --git a/CHANGELOG.md b/CHANGELOG.md index d0638e19..a109ed18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,33 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +## [0.15.0] - 2020-09-29 + +### Added + +- Added additional columns to `gimme maelstrom` output for better interpretation (correlation of motif to signal and % of regions with motif). +- Added support for multi-species input in `genome@chrom:start-end` format. +- `gimme maelstrom` warns if data is not row-centered and will center by default. +- `gimme maelstrom` selects a set of non-redundant (or less redundant) motifs by default. +- Added SVR regressor for `gimme maelstrom`. +- Added quantile normalization to `coverage_table`. + +### Removed + +- Removed the lightning classifiers and regressors as the package is no longer actively maintained. + +### Changed + +- Visually improved HTML output. +- Score of `maelstrom` is now an aggregate z-score based on combining z-scores from individual methods using Stouffer's method. The z-scores of individual methods are generated using the inverse normal transform. +- Reorganized some classes and functions. + +### Fixed + +- Fixed minor issues with sorting columns in HTML output. +- `gimme motifs` doesn't crash when no motifs are found. +- Fixed error with Ensembl chromosome names in `combine_peaks`.
+ ## [0.14.4] - 2020-04-02 ### Fixed diff --git a/conda_env.dev.txt b/conda_env.dev.txt index 31297494..8dc6f56e 100644 --- a/conda_env.dev.txt +++ b/conda_env.dev.txt @@ -3,12 +3,9 @@ configparser dinamo diskcache feather-format -future gadem -genomepy >=0.6.1 -ghostscript +genomepy >=0.8.3 homer -icu=58 ipywidgets # Necessary for progress bar in Jupyter notebook jinja2 logomaker @@ -18,17 +15,16 @@ ncurses numpy prosampler pillow -pyarrow +pyarrow >=0.16.0 pybedtools pysam python python-xxhash pyyaml >=3.10 -scikit-learn >=0.18 -scipy >=1.3.0 +qnorm +scikit-learn >=0.23 +scipy >=1.4.1 seaborn -six -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler diff --git a/conda_env.osx.txt b/conda_env.osx.txt index dc2adacb..dd1a2c98 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -2,13 +2,13 @@ bedtools configparser diskcache feather-format -future gadem -genomepy >=0.6.1 +genomepy >=0.8.3 ghostscript homer jinja2 logomaker +llvm-openmp matplotlib >=2.0 meme >=5 ncurses @@ -21,11 +21,10 @@ pysam python python-xxhash pyyaml >=3.10 -scikit-learn >=0.18 -scipy <1.3.0 +qnorm +scikit-learn +scipy seaborn -six -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler @@ -33,7 +32,7 @@ ucsc-bigbedtobed ucsc-genepredtobed weeder xdg -xgboost >=0.71 +xgboost=0.72 xxmotif # development-specific diff --git a/conda_env.test.txt b/conda_env.test.txt new file mode 100644 index 00000000..af2f478c --- /dev/null +++ b/conda_env.test.txt @@ -0,0 +1,37 @@ +bedtools +configparser +dinamo +diskcache +feather-format +gadem +genomepy >=0.6.1 +ghostscript +homer +icu=58 +ipywidgets # Necessary for progress bar in Jupyter notebook +jinja2 +logomaker +matplotlib >=2.0 +meme >=5 +ncurses +numpy +pillow +prosampler +pyarrow +pybedtools +python >=3.8 +python-xxhash +pyyaml >=3.10 +qnorm +scikit-learn >=0.18 +scipy <1.3.0 +seaborn +statsmodels +tqdm >=4.27.0 +trawler +ucsc-bigbedtobed +ucsc-genepredtobed +weeder +xdg +xgboost >=0.71 +xxmotif diff --git a/conda_env.txt b/conda_env.txt index 5767b058..b569daff 100644 --- a/conda_env.txt +++ b/conda_env.txt @@ -1,39 +1,35 @@ -bedtools configparser dinamo diskcache feather-format -future gadem -genomepy >=0.6.1 +genomepy >=0.8.3 ghostscript homer -icu=58 ipywidgets # Necessary for progress bar in Jupyter notebook jinja2 logomaker -matplotlib >=2.0 -meme >=5 +matplotlib-base >=3.1.2 +meme >=5.1.1 ncurses numpy +pandas >=1.0.3 pillow prosampler -pyarrow +pyarrow >=0.16.0 pybedtools pysam -python +python >=3 python-xxhash pyyaml >=3.10 +qnorm scikit-learn >=0.18 scipy <1.3.0 seaborn -six -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler ucsc-bigbedtobed -ucsc-genepredtobed weeder xdg xgboost >=0.71 diff --git a/data/templates/sortable/sortable-theme-slick.css b/data/templates/sortable/sortable-theme-slick.css index df20f5a8..a699c304 100644 --- a/data/templates/sortable/sortable-theme-slick.css +++ b/data/templates/sortable/sortable-theme-slick.css @@ -1,6 +1,7 @@ /* line 2, ../sass/_sortable.sass */ table[data-sortable] { - font-size: 80%; + font-family: 'Nunito Sans'; + font-size: 90%; border-collapse: collapse; border-spacing: 0; } diff --git a/data/templates/sortable/sortable.min.js b/data/templates/sortable/sortable.min.js index b968cf01..11d419dd 100644 --- a/data/templates/sortable/sortable.min.js +++ b/data/templates/sortable/sortable.min.js @@ -1,2 +1,2 @@ /*! 
sortable.js 0.8.0 */ -(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(-?[£$¤]?[\d,.e\-]+%?|inf)$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this); +(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(.+\>)?f;f++)c=d[f],h.push(e.initTable(c));return 
h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this); diff --git a/data/templates/table.tpl b/data/templates/table.tpl new file mode 100644 index 00000000..bf836b55 --- /dev/null +++ b/data/templates/table.tpl @@ -0,0 +1,215 @@ +{# Update the template_structure.html document too #} +{%- block before_style -%}{%- endblock before_style -%} +{% block style %} + + +{%- endblock style %} +{%- block before_table %}{% endblock before_table %} +{%- block table %} + +{%- block caption %} +{%- if caption -%} + +{%- endif -%} +{%- endblock caption %} +{%- block thead %} + + {%- block before_head_rows %}{% endblock %} + {%- for r in head %} + {%- block head_tr scoped %} 
+ + {%- for c in r %} + {%- if c.is_visible != False %} + <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}} + {%- endif %} + {%- endfor %} + + {%- endblock head_tr %} + {%- endfor %} + {%- block after_head_rows %}{% endblock %} + +{%- endblock thead %} +{%- block tbody %} + + {% block before_rows %}{% endblock before_rows %} + {% for r in body %} + {% block tr scoped %} + + {% for c in r %} + {% if c.is_visible != False %} + <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }} + {% endif %} + {%- endfor %} + + {% endblock tr %} + {%- endfor %} + {%- block after_rows %}{%- endblock after_rows %} + +{%- endblock tbody %} +
{{caption}}
+{%- endblock table %} +{%- block after_table %}{% endblock after_table %} + diff --git a/docs/examples.rst b/docs/examples.rst index 33fd06c7..9eedc21a 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -41,10 +41,10 @@ Compare motifs between data sets $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.out/ The output scores of ``gimme maelstrom`` represent the combined result of multiple methods. -The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using rank aggregation. -The score that is shown is the -log10(p-value), where the p-value (from the rank aggregation) is corrected for multiple testing. -This procedure is then repeated with the ranking reversed. These are shown as negative values. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of +the motif or a higher motif score is associated with higher signal in a specific sample. Create sequence logos --------------------- diff --git a/docs/reference.rst b/docs/reference.rst index f29e2044..2ba5b10e 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -362,11 +362,26 @@ This command can be used to identify differential motifs between two or more dat :: -h, --help show this help message and exit - -p PFMFILE, --pfmfile PFMFILE + -p pfmfile, --pfmfile pfmfile PFM file with motifs (default: - gimme.vertebrate.v5.0.pwm) + gimme.vertebrate.v5.0.pfm) + --no-filter Don't remove redundant motifs. + -F FLOAT, --filter_cutoff FLOAT + Cutoff to select non-redundant motifs. Default is 0.8, + increase this value to get fewer motifs. + --nocenter Don't mean-center the rows by default -m NAMES, --methods NAMES Run with specific methods + -a method, --aggregation method + How to combine motifs from individual methods. Default + is "int_stouffer", for inverse normal transform of + ranks, followed by Stouffer's method to combine + z-scores. Alternatively, specify "stuart" for log- + transformed rank aggregation p-values. + -N INT, --nthreads INT + Number of threads (default 12) + --rawscore Don't z-score normalize motif scores + --nogc Don't use GC% bins **Input file formats** @@ -407,16 +422,23 @@ The second option looks like this: This is a tab-separated table, with a header describing the experiments. In case of sequencing data, such as ChIP-seq, ATAC-seq or DNaseI seq, we recommend to use **log-transformed** read counts which are -**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after the log-transformatiion step, -for instance by quantile normalization or scaling. +**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after +the log-transformation step, for instance by quantile normalization or scaling. +By default, ``gimme maelstrom`` will mean-center the input, disable this with ``--nocenter``. The second input format generally gives better results than the first one and would be the recommended format. The output scores of ``gimme maelstrom`` represent the combined result of multiple methods.
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using the rank aggregation method from `Kolde, 2012 `_. -The score that is shown is the -log10(p-value), where the p-value comes from the rank aggregation. -This procedure is then repeated with the ranking reversed. These are shown as negative values. +This z-score represents the combined result of multiple methods. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of +the motif or a higher motif score is associated with higher signal in a specific sample. + +By default, ``gimme maelstrom`` selects a non-redundant set of motifs by clustering the motifs based on scores in the set of +input sequences. You can disable this by using the ``--no-filter`` argument. You can tweak the number of selected motifs by +changing the ``--filter_cutoff`` parameter. By default this is set to ``0.8``. Increase this value to select fewer motifs, +decrease it to select more motifs. Keep in mind that you may start to lose biologically relevant motifs if you set this too high. .. _`gimme_scan`: diff --git a/docs/requirements.txt b/docs/requirements.txt index 2ba060ac..9429d2a2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -12,5 +12,3 @@ six future statsmodels tqdm -xgboost >=0.71 -sklearn-contrib-lightning==0.4.0 diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 088f8710..9aae5889 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -320,7 +320,7 @@ There is also a larger file, that contains more regions ``hg19.blood.most_variab $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.blood.1k.out -There output directory contains several files: +The output directory contains several files: :: @@ -330,11 +330,11 @@ There output directory contains several files: The two motif files, ``motif.count.txt.gz`` and ``motif.score.txt.gz`` contain the motif scan results. The ``activity.*.out.txt`` files are tables with the results of the individual methods. The main result is ``final.out.txt``, which integrates all individual methods in a final score. -This score represents the combined result of multiple methods. -The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using the rank aggregation method from `Kolde, 2012 `_. -The score that is shown is the -log10(p-value). -This procedure is then repeated with the ranking reversed. These are shown as negative values. +This z-score represents the combined result of multiple methods. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of +the motif or a higher motif score is associated with higher signal in a specific sample. The file ``gimme.maelstrom.report.html`` contains a graphical summary of this file that can be opened in your web browser.
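+
+The snippet below gives a simplified illustration of this aggregation step. The motif names and scores are
+invented for illustration only, and the actual implementation differs in details such as tie handling:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+    from scipy.stats import norm
+
+    # Invented scores for five motifs, from three different methods.
+    scores = pd.DataFrame(
+        {"method1": [0.1, 0.5, 0.9, 0.2, 0.7],
+         "method2": [0.3, 0.4, 0.8, 0.1, 0.9],
+         "method3": [0.2, 0.6, 0.7, 0.3, 0.8]},
+        index=[f"motif{i}" for i in range(1, 6)],
+    )
+
+    # Rank the motifs per method and apply the inverse normal transform,
+    # so that every method contributes a comparable z-score per motif.
+    ranks = scores.rank(axis=0)
+    z = pd.DataFrame(norm.ppf((ranks - 0.5) / len(ranks)),
+                     index=scores.index, columns=scores.columns)
+
+    # Stouffer's method: sum the z-scores and divide by sqrt(number of methods).
+    combined = z.sum(axis=1) / np.sqrt(z.shape[1])
+    print(combined.sort_values(ascending=False))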
@@ -342,7 +342,6 @@ The file ``gimme.maelstrom.report.html`` contains a graphical summary of this fi You can sort on the different columns by clicking on them. - The following Python snippet will create a heatmap of the results. .. code-block:: python @@ -359,7 +358,7 @@ This will show a heatmap like this: .. image:: images/heatmap.png We see that the expected motifs for different cell types are identified. GATA/TAL1 for Erythrocytes, CEBP for monocytes, LEF/TCF for T cells (i.e. Wnt signaling), SPIB and PAX5 for B cells and so on. -Keep in mind that this shows only the most relevant motifs (-log10 p-value cutoff of 6), there are more relevant motifs. +Keep in mind that this shows only the most relevant motifs (z-score threshold of 6); there are more relevant motifs. This example was run only on 1,000 variable enhancers. A file with more regions, ``hg19.blood.most_variable.10k.txt`` for this example, will usually yield better results. The Jupyter notebook example `maelstrom.ipynb `_ shows a more extensive example on how to work with maelstrom results in Python. diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py index 3d63dd7f..fcd514c2 100644 --- a/gimmemotifs/__init__.py +++ b/gimmemotifs/__init__.py @@ -52,3 +52,25 @@ def filter(self, record): __version__ = get_versions()["version"] del get_versions + +# fmt: off +# easier import of gimme (config and cli left out) +from . import background # noqa: F401 +from . import cluster # noqa: F401 +from . import comparison # noqa: F401 +from . import denovo # noqa: F401 +from . import fasta # noqa: F401 +from . import maelstrom # noqa: F401 +from . import moap # noqa: F401 +from . import motif # noqa: F401 +from . import plot # noqa: F401 +from . import prediction # noqa: F401 +from . import rank # noqa: F401 +from . import report # noqa: F401 +from . import rocmetrics # noqa: F401 +from . import scanner # noqa: F401 +from . import shutils # noqa: F401 +from . import stats # noqa: F401 +from . import utils # noqa: F401 +from . import validation # noqa: F401 +# fmt: on diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index e308f556..46d33233 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -11,8 +11,6 @@ similar genomic distribution as the input.
""" -from __future__ import division - # Python imports import gzip import os @@ -251,15 +249,17 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None): for _i in range(k - 1): new_init = [] for x in init: - for l in alphabet: - new_init.append(x + l) + for letter in alphabet: + new_init.append(x + letter) init = new_init[:] - self.trans = dict([(word, dict([(l, 0.0) for l in alphabet])) for word in init]) + self.trans = dict( + [(word, dict([(letter, 0.0) for letter in alphabet])) for word in init] + ) new_init = [] for x in init: - for l in alphabet: - new_init.append(x + l) + for letter in alphabet: + new_init.append(x + letter) kmercount = dict([(word, 0) for word in new_init]) lettercount = dict([(word[:k], 0) for word in new_init]) @@ -286,9 +286,9 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None): for k, v in lettercount.items(): self.init[k] = v / total - def _generate_sequence(self, l): + def _generate_sequence(self, length): sequence = list(self._weighted_random(list(self.init.items()))) - for _ in range(l - self.k): + for _ in range(length - self.k): sequence.append( self._weighted_random( list(self.trans["".join(sequence[-self.k :])].items()) @@ -296,10 +296,10 @@ def _generate_sequence(self, l): ) return "".join(sequence) - def _weighted_random(self, l): + def _weighted_random(self, weighted_list): n = random.uniform(0, 1) item = None - for item, weight in l: # noqa: B007 + for item, weight in weighted_list: # noqa: B007 if n < weight: break else: @@ -360,6 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100): cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)] df.columns = cols + + # Make really sure that column 'chrom' is a string + df.dropna(subset=["chrom"], inplace=True) + df["chrom"] = df["chrom"].apply(str).astype("string") + df.reset_index()[cols].to_feather(fname) diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index bc0e602f..b24e3652 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -326,6 +326,29 @@ def cli(sys_args): default=default_pfm_file, metavar="pfmfile", ) + p.add_argument( + "--no-filter", + dest="filter_redundant", + help="Don't remove redundant motifs.", + default=True, + action="store_false", + ) + p.add_argument( + "-F", + "--filter_cutoff", + dest="filter_cutoff", + help="Cutoff to select non-redundant motifs. Default is 0.8, increase this value to get fewer motifs.", + default=0.8, + type=float, + metavar="FLOAT", + ) + p.add_argument( + "--nocenter", + dest="center", + help="Don't mean-center the rows by default", + default=True, + action="store_false", + ) p.add_argument( "-m", "--methods", @@ -334,6 +357,19 @@ def cli(sys_args): default=None, metavar="NAMES", ) + p.add_argument( + "-a", + "--aggregation", + dest="aggregation", + help=( + 'How to combine motifs from individual methods. Default is "int_stouffer", ' + "for inverse normal transform of ranks, followed by Stouffer's method to combine " + 'z-scores. Alternatively, specify "stuart" for log-transformed rank aggregation ' + "p-values." 
+ ), + default="int_stouffer", + metavar="method", + ) p.add_argument( "-N", "--nthreads", @@ -353,7 +389,7 @@ def cli(sys_args): p.add_argument( "--nogc", dest="gc", - help="Don't use GC% bins", + help="Don't use GC%% bins", action="store_false", default=True, ) diff --git a/gimmemotifs/commands/__init__.py b/gimmemotifs/commands/__init__.py index c20bb546..f944936c 100644 --- a/gimmemotifs/commands/__init__.py +++ b/gimmemotifs/commands/__init__.py @@ -1,13 +1,9 @@ import pkgutil -import six import os dirname = os.path.split(__file__)[0] -if six.PY3: - level = 0 -else: - level = -1 +level = 0 # Dynamically load all commands for _importer, cmdname, _ in pkgutil.iter_modules([dirname]): diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py index b7d8e43f..04d8cd1c 100755 --- a/gimmemotifs/commands/maelstrom.py +++ b/gimmemotifs/commands/maelstrom.py @@ -15,10 +15,14 @@ def maelstrom(args): genome = args.genome outdir = args.outdir pfmfile = args.pfmfile + filter_redundant = args.filter_redundant + filter_cutoff = args.filter_cutoff methods = args.methods ncpus = args.ncpus zscore = args.zscore + center = args.center gc = args.gc + aggregation = args.aggregation if not os.path.exists(infile): raise ValueError("file {} does not exist".format(infile)) @@ -31,8 +35,12 @@ def maelstrom(args): genome, outdir, pfmfile, + filter_redundant=filter_redundant, + filter_cutoff=filter_cutoff, methods=methods, ncpus=ncpus, zscore=zscore, gc=gc, + center=center, + aggregation=aggregation, ) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index 429fca70..f5b72737 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -7,12 +7,14 @@ """Command line function 'roc'.""" from __future__ import print_function import os +import re import sys import shutil import logging from tempfile import NamedTemporaryFile import numpy as np +import pandas as pd from gimmemotifs.background import create_background_file from gimmemotifs.comparison import MotifComparer, select_nonredundant_motifs @@ -22,7 +24,11 @@ from gimmemotifs.stats import calc_stats_iterator from gimmemotifs.report import roc_html_report from gimmemotifs.scanner import scan_to_file -from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed +from gimmemotifs.utils import ( + determine_file_type, + narrowpeak_to_bed, + write_equalsize_bedfile, +) logger = logging.getLogger("gimme.motifs") @@ -38,6 +44,19 @@ def motifs(args): if not os.path.exists(scan_dir): os.makedirs(scan_dir) + sample = args.sample + if args.size and args.size > 0: + file_type = determine_file_type(args.sample) + if file_type == "fasta": + logger.warn("size parameter will be ignored for FASTA input") + else: + outfile = os.path.join(args.outdir, f"input.w{args.size}.bed") + if file_type == "narrowpeak": + narrowpeak_to_bed(args.sample, outfile, size=args.size) + if file_type == "bed": + write_equalsize_bedfile(args.sample, args.size, outfile) + sample = outfile + genome = args.genome if genome is None: args.zscore = False @@ -70,7 +89,7 @@ def motifs(args): bg, fmt="fasta", genome=genome, - inputfile=args.sample, + inputfile=sample, size=size, number=10000, ) @@ -83,7 +102,7 @@ def motifs(args): if args.denovo: gimme_motifs( - args.sample, + sample, args.outdir, params={ "tools": args.tools, @@ -142,19 +161,19 @@ def motifs(args): # Print the metrics f_out.write( - "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. 
at 1% FPR\tRecall at 10% FDR\n" + "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n" ) logger.info("creating motif scan tables") - ftype = determine_file_type(args.sample) - sample = args.sample - delete_sample = False - if ftype == "narrowpeak": - f = NamedTemporaryFile(delete=False) - logger.debug("Using %s as temporary BED file".format(f.name)) - narrowpeak_to_bed(args.sample, f.name, size=args.size) - sample = f.name - delete_sample = True + # ftype = determine_file_type(args.sample) + # sample = args.sample + # delete_sample = False + # if ftype == "narrowpeak": + # f = NamedTemporaryFile(delete=False) + # logger.debug("Using {} as temporary BED file".format(f.name)) + # narrowpeak_to_bed(args.sample, f.name, size=args.size) + # sample = f.name + # delete_sample = True # Create a table with the best score per motif for all motifs. # This has three reasons: @@ -174,6 +193,9 @@ def motifs(args): gcnorm=True, ) + n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0] + n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0] + logger.info("calculating stats") for motif_stats in calc_stats_iterator( motifs=pfmfile, @@ -188,10 +210,14 @@ def motifs(args): if motif_stats[str(motif)]["phyper_at_fpr"] > 0: log_pvalue = -np.log10(motif_stats[str(motif)]["phyper_at_fpr"]) f_out.write( - "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format( + "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format( motif.id, motif_stats[str(motif)]["matches_at_fpr"][0], + motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100, motif_stats[str(motif)]["matches_at_fpr"][1], + motif_stats[str(motif)]["matches_at_fpr"][1] + / n_background + * 100, motif_stats[str(motif)]["phyper_at_fpr"], log_pvalue, motif_stats[str(motif)]["roc_auc"], @@ -203,7 +229,7 @@ def motifs(args): f_out.close() # Select a set of "non-redundant" motifs. - # Using Recursive Feature Elemination, a set of motifs is selected that + # Using Recursive Feature Elimination, a set of motifs is selected that # best explains the peaks in comparison to the background sequences. 
nr_motifs = select_nonredundant_motifs( args.outdir + "/gimme.roc.report.txt", @@ -221,10 +247,11 @@ def motifs(args): with NamedTemporaryFile(mode="w") as f: print(motif_dict[motif].to_pwm(), file=f) f.flush() + safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif) scan_to_file( sample, f.name, - filepath_or_buffer=os.path.join(scan_dir, f"{motif}.matches.bed"), + filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"), bed=True, fpr=0.01, genome=args.genome, @@ -232,9 +259,6 @@ def motifs(args): gcnorm=True, ) - if delete_sample: - os.unlink(sample) - if args.report: logger.info("creating statistics report") if args.outdir: diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py index 94c0b808..911cd9d6 100644 --- a/gimmemotifs/comparison.py +++ b/gimmemotifs/comparison.py @@ -6,8 +6,6 @@ """ Module to compare DNA sequence motifs (positional frequency matrices) """ -from __future__ import print_function - # Python imports import sys import os @@ -890,12 +888,14 @@ def generate_score_dist(self, motifs, match, metric, combine): f = open(score_file, "w") all_scores = {} - for l in [len(motif) for motif in motifs]: - all_scores[l] = {} + for motif_len in [len(motif) for motif in motifs]: + all_scores[motif_len] = {} sorted_motifs = {} - for l in all_scores.keys(): - sorted_motifs[l] = [motif for motif in motifs if len(motif) == l] + for motif_len in all_scores.keys(): + sorted_motifs[motif_len] = [ + motif for motif in motifs if len(motif) == motif_len + ] for l1 in all_scores.keys(): for l2 in all_scores.keys(): @@ -950,7 +950,11 @@ def select_nonredundant_motifs( y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0]))) X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=2, shuffle=True, + X, + y, + test_size=0.4, + random_state=2, + shuffle=True, ) X_bla = X_train[keep] diff --git a/gimmemotifs/denovo.py b/gimmemotifs/denovo.py index 2d1913ac..2cd1225f 100644 --- a/gimmemotifs/denovo.py +++ b/gimmemotifs/denovo.py @@ -669,11 +669,14 @@ def gimme_motifs( sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True) final_motifs, stats = rename_motifs(sorted_motifs, result.stats) - with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f: - for m in final_motifs: - f.write("{}\n".format(m.to_pwm())) + motifs_found = len(final_motifs) > 0 - if create_report: + if motifs_found: + with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f: + for m in final_motifs: + f.write("{}\n".format(m.to_pwm())) + + if motifs_found and create_report: bg = dict([(b, os.path.join(tmpdir, "bg.{}.fa".format(b))) for b in background]) create_denovo_motif_report( @@ -700,7 +703,7 @@ def gimme_motifs( logger.info("finished") logger.info("output dir: %s", outdir) - if cluster: + if motifs_found and cluster: logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html")) return final_motifs diff --git a/gimmemotifs/fasta.py b/gimmemotifs/fasta.py index 6cc1d5d4..781d1cf0 100644 --- a/gimmemotifs/fasta.py +++ b/gimmemotifs/fasta.py @@ -12,7 +12,7 @@ class Fasta(object): - def __init__(self, fname=None, split_whitespace=False): + def __init__(self, fname=None, split_whitespace=False, fdict=None): """ Instantiate fasta object. 
Optional Fasta-formatted file as argument""" self.ids = [] self.seqs = [] @@ -35,6 +35,10 @@ def __init__(self, fname=None, split_whitespace=False): if p.match(sequence): raise IOError("Not a valid FASTA file") self.seqs.append(sequence) + elif fdict is not None: + for name, seq in fdict.items(): + self.ids.append(name) + self.seqs.append(seq) def hardmask(self): """ Mask all lowercase nucleotides with N's """ diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index c4bf21ab..fbe46b03 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -22,9 +22,13 @@ import numpy as np import pandas as pd from sklearn.preprocessing import scale +from scipy.stats import pearsonr from scipy.cluster import hierarchy from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, dendrogram +from sklearn.cluster import FeatureAgglomeration + +# from scipy.spatial.distance import correlation # Plotting import matplotlib.pyplot as plt @@ -96,7 +100,9 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): mapfile = pfmfile.replace(".pwm", ".motif2factors.txt") if os.path.exists(mapfile): - m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0) + m2f = pd.read_csv( + mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#" + ) m2f["factors"] = m2f["factors"].str[:50] else: motifs = [m.id for m in read_motifs(pfmfile)] @@ -172,38 +178,29 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300) -def _rank_agg_column(exps, dfs, e): - tmp_dfs = [pd.DataFrame(), pd.DataFrame()] - - for i, sort_order in enumerate([False, True]): - for method, scoring, _ in exps: - k = "{}.{}".format(method, scoring) - if k in dfs: - v = dfs[k] - # Sample rows before sorting to shuffle - # Otherwise all ties will not have a random order due to inherent - # ordering of the motif dataframe - tmp_dfs[i][k] = ( - v.sample(frac=1).sort_values(e, ascending=sort_order).index.values - ) - return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1])) - - -def df_rank_aggregation(df, dfs, exps): +def df_rank_aggregation(df, dfs, exps, method="int_stouffer"): df_p = pd.DataFrame(index=list(dfs.values())[0].index) names = list(dfs.values())[0].columns + dfs = [ + pd.concat([v[col].rename(k, inplace=True) for k, v in dfs.items()], axis=1) + for col in names + ] pool = Pool(16) - func = partial(_rank_agg_column, exps, dfs) - ret = pool.map(func, names) + func = partial(rankagg, method=method) + ret = pool.map(func, dfs) pool.close() pool.join() - for e, result in zip(names, ret): - df_p[e] = result + for name, result in zip(names, ret): + df_p[name] = result if df.shape[1] != 1: df_p = df_p[df.columns] + if method == "int_stouffer": + df_p.columns = ["z-score " + c for c in df_p.columns] + else: + df_p.columns = ["activity " + c for c in df_p.columns] return df_p @@ -212,6 +209,8 @@ def run_maelstrom( genome, outdir, pfmfile=None, + filter_redundant=True, + filter_cutoff=0.8, plot=True, cluster=False, score_table=None, @@ -220,6 +219,8 @@ def run_maelstrom( ncpus=None, zscore=True, gc=True, + center=False, + aggregation="int_stouffer", ): """Run maelstrom on an input table. @@ -239,6 +240,12 @@ def run_maelstrom( pfmfile : str, optional Specify a PFM file for scanning. + filter_redundant : bool, optional + Create a non-redundant set of motifs based on correlation of motif scores in the input data. 
+ + filter_cutoff : float, optional + Cutoff to use for non-redundant motif selection. Default is 0.8. + + plot : bool, optional Create heatmaps. @@ -264,6 +271,15 @@ gc : bool, optional Use GC% bins to normalize motif scores. + + center : bool, optional + Mean-center the input table. + + aggregation : str, optional + How to combine scores of the predictors. The default is "int_stouffer", for + inverse normal transform followed by Stouffer's method to combine z-scores. + Alternatively, "stuart" performs rank aggregation and reports the -log10 of + the rank aggregation p-value. """ logger.info("Starting maelstrom") if infile.endswith("feather"): @@ -272,6 +288,28 @@ else: df = pd.read_table(infile, index_col=0, comment="#") + # Check if the input is mean-centered + if df.shape[1] > 1 and not np.allclose(df.mean(1), 0): + if center: + logger.info( + "Input is not mean-centered, setting the mean of all rows to 0." + ) + logger.info( + "Use --nocenter if you know what you're doing and want to change this behavior." + ) + logger.info( + "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to " + "first transform your data, for instance using log2(), and to normalize " + "between samples. To create a table suitable for maelstrom you can use the " + "coverage_table script included with GimmeMotifs." + ) + df = df.sub(df.mean(axis=1), axis=0) + else: + logger.info("Input is not mean-centered, but --nocenter was specified.") + logger.info( + "Leaving the data as-is, but make sure this is what you really want." + ) + # Check for duplicates if df.index.duplicated(keep=False).any(): logger.warning("Input file contains duplicate regions!") @@ -334,6 +372,69 @@ else: logger.info("Scores, using: %s", score_table) + counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t") + scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t") + + if filter_redundant: + logger.info("Selecting non-redundant motifs") + + fa = FeatureAgglomeration( + distance_threshold=filter_cutoff, + n_clusters=None, + affinity="correlation", + linkage="complete", + compute_full_tree=True, + ) + fa.fit(scores) + X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_}) + X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif") + selected_motifs = ( + X_cluster.sort_values("var") + .drop_duplicates(subset=["label"], keep="last")["motif"] + .values + ) + nr_motif = ( + X_cluster.sort_values("var") + .drop_duplicates(subset=["label"], keep="last")[["label", "motif"]] + .set_index("label") + ) + X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label") + motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif") + + scores = scores[selected_motifs] + counts = counts[selected_motifs] + score_table = os.path.join(outdir, "motif.nr.score.txt.gz") + scores.to_csv(score_table, sep="\t", compression="gzip") + count_table = os.path.join(outdir, "motif.nr.count.txt.gz") + counts.to_csv(count_table, sep="\t", compression="gzip") + + m2f = pd.read_table(os.path.join(outdir, mapfile), comment="#") + m2f = m2f.join(motif_map, on="Motif") + m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N" + m2f["Motif"] = m2f["motif_nr"] + m2f = m2f.drop(columns=["motif_nr"]) + + motifs = read_motifs(pfmfile) + pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm") + with open(pfmfile, "w") as f: + for motif in motifs: + f.write(f"{motif.to_pfm()}\n") + mapfile = pfmfile.replace(".pfm", ".motif2factors.txt") + with 
open(mapfile, "w") as f: + f.write( + "# Note: this mapping is specifically created for this non-redundant set of motifs.\n" + ) + f.write( + "# It also includes factors for motifs that were similar, but this can be\n" + ) + f.write("# specific to this analysis.\n") + + with open(mapfile, "a") as f: + m2f.to_csv(f, index=False, sep="\t") + logger.info(f"Selected {len(selected_motifs)} motifs") + logger.info(f"Motifs: {pfmfile}") + logger.info(f"Factor mappings: {mapfile}") + if cluster: cluster = False for method in methods: @@ -380,18 +481,14 @@ def run_maelstrom( for method, scoring, fname in exps: try: - if scoring == "count" and count_table is not None: + if scoring == "count": moap_with_table( fname, count_table, outdir, method, scoring, ncpus=ncpus ) - elif scoring == "score" and score_table is not None: + elif scoring == "score": moap_with_table( fname, score_table, outdir, method, scoring, ncpus=ncpus ) - else: - moap_with_bg( - fname, genome, outdir, method, scoring, pfmfile=pfmfile, ncpus=ncpus - ) except Exception as e: logger.warn("Method %s with scoring %s failed", method, scoring) @@ -409,14 +506,34 @@ def run_maelstrom( if len(methods) > 1: logger.info("Rank aggregation") - df_p = df_rank_aggregation(df, dfs, exps) + df_p = df_rank_aggregation(df, dfs, exps, method=aggregation) + + # Add percentage of input sequences with motif + if df.shape[1] > 1: + df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100 + else: + bla = counts.join(df).groupby(df.columns[0]).mean() * 100 + bla = bla.T + bla = bla.rename( + columns={col: f"{col} % with motif" for col in bla.columns} + ) + df_p = df_p.join(bla) + + if df.shape[1] > 1: + # Add correlation between motif score and signal + logger.info("Correlation") + for col in df.columns: + df_p[f"corr {col}"] = 0 + for motif in df_p.index: + df_p.loc[motif, f"corr {col}"] = pearsonr(df[col], scores[motif])[0] + df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t") # df_p = df_p.join(m2f) # Write motif frequency table if df.shape[1] == 1: - mcount = df.join(pd.read_table(count_table, index_col=0, comment="#")) + mcount = df.join(counts) m_group = mcount.groupby(df.columns[0]) freq = m_group.sum() / m_group.count() freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t") @@ -455,7 +572,10 @@ def __init__(self, outdir): raise FileNotFoundError("No such directory: " + outdir) # Load motifs - fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) + fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m")) + + if len(fnames) == 0: + fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) if len(fnames) > 0: pfmfile = fnames[0] with open(pfmfile) as fin: @@ -474,6 +594,15 @@ def __init__(self, outdir): self.result = pd.read_table( os.path.join(outdir, "final.out.txt"), comment="#", index_col=0 ) + self.correlation = self.result.loc[:, self.result.columns.str.contains("corr")] + self.percent_match = self.result.loc[ + :, self.result.columns.str.contains("% with motif") + ] + self.result = self.result.loc[ + :, + ~self.result.columns.str.contains("corr") + & ~self.result.columns.str.contains("% with motif"), + ] # Read motif results self.scores = pd.read_table( @@ -502,11 +631,12 @@ def plot_heatmap( min_freq=0.01, threshold=2, name=True, - indirect=False, + indirect=True, figsize=None, - max_len=50, + max_number_factors=5, aspect=1, - **kwargs + cmap="RdBu_r", + **kwargs, ): """Plot clustered heatmap of predicted motif activity. 
@@ -514,7 +644,7 @@ def plot_heatmap( ---------- kind : str, optional Which data type to use for plotting. Default is 'final', which will - plot the result of the rang aggregation. Other options are 'freq' + plot the result of the rank aggregation. Other options are 'freq' for the motif frequencies, or any of the individual activities such as 'rf.score'. @@ -528,10 +658,10 @@ def plot_heatmap( Use factor names instead of motif names for plotting. indirect : bool, optional - Include indirect factors. Default is False. + Include indirect factors (computationally predicted or non-curated). Default is True. - max_len : int, optional - Truncate the list of factors to this maximum length. + max_number_factors : int, optional + Truncate the list of factors to this maximum size. figsize : tuple, optional Tuple of figure size (width, height). @@ -539,8 +669,11 @@ def plot_heatmap( aspect : int, optional Aspect ratio for tweaking the plot. + cmap : str, optional + Color paletter to use, RdBu_r by default. + kwargs : other keyword arguments - All other keyword arguments are passed to sns.clustermap + All other keyword arguments are passed to sns.heatmap Returns ------- @@ -555,13 +688,17 @@ def plot_heatmap( filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq) idx = self.result.loc[filt].index + + if idx.shape[0] == 0: + logger.warning("Empty matrix, try lowering the threshold") + return + if idx.shape[0] >= 100: logger.warning("The filtered matrix has more than 100 rows.") logger.warning( "It might be worthwhile to increase the threshold for visualization" ) - cmap = "RdBu_r" if kind == "final": data = self.result elif kind == "freq": @@ -579,18 +716,25 @@ def plot_heatmap( else: raise ValueError("Unknown dtype") - # print(data.head()) - # plt.figure( m = data.loc[idx] - vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99)) - vmin = -vmax + + if "vmax" in kwargs: + vmax = kwargs.pop("vmax") + else: + vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99)) + + if "vmin" in kwargs: + vmin = kwargs.pop("vmin") + else: + vmin = -vmax + if name: m["factors"] = [ - join_max( - _get_factor_list(self.motifs[n], indirect), - max_len, - ",", - suffix=",(...)", + self.motifs[n].format_factors( + max_length=max_number_factors, + html=False, + include_indirect=indirect, + extra_str=",..", ) for n in m.index ] @@ -598,7 +742,8 @@ def plot_heatmap( h, w = m.shape if figsize is None: - figsize = (3 + m.shape[1] / 4, 1 + m.shape[0] / 3) + figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3) + fig = plt.figure(figsize=figsize) npixels = 30 g = GridSpec( @@ -606,8 +751,8 @@ def plot_heatmap( ) ax1 = fig.add_subplot(g[0, :]) ax2 = fig.add_subplot(g[1, :]) - ax2.set_title("Significance (-log10(p-value))") - dm = pdist(m, metric="euclidean") + ax2.set_title("aggregated z-score") + dm = pdist(m, metric="correlation") hc = linkage(dm, method="ward") leaves = dendrogram(hc, no_plot=True)["leaves"] cg = sns.heatmap( @@ -619,10 +764,12 @@ def plot_heatmap( linewidths=1, vmin=vmin, vmax=vmax, + **kwargs, ) + plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90) plt.tight_layout() # cg.ax_col_dendrogram.set_visible(False) - # plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) + # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) return cg def plot_scores(self, motifs, name=True, max_len=50): diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 45b87356..d4053bc4 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -4,7 +4,6 @@ # the terms of the MIT License, 
see the file COPYING included with this # distribution. """ Module for motif activity prediction """ -from __future__ import print_function def warn(*args, **kwargs): @@ -18,7 +17,6 @@ def warn(*args, **kwargs): import os import sys -import shutil try: from itertools import izip @@ -33,13 +31,14 @@ def warn(*args, **kwargs): from tqdm.auto import tqdm # scikit-learn -from sklearn.model_selection import train_test_split, GridSearchCV -from sklearn.ensemble import BaggingClassifier, RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier from sklearn.multiclass import OneVsRestClassifier -from sklearn.linear_model import MultiTaskLasso, BayesianRidge +from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge +from sklearn.multioutput import MultiOutputRegressor from sklearn.preprocessing import scale, LabelEncoder -from lightning.classification import CDClassifier -from lightning.regression import CDRegressor +from sklearn.svm import LinearSVR +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline import xgboost @@ -356,252 +355,6 @@ def fit(self, df_X, df_y): logger.info("Done") -@register_predictor("LightningRegressor") -class LightningRegressionMoap(Moap): - def __init__(self, scale=True, cv=3, ncpus=None): - """Predict motif activities using lightning CDRegressor - - Parameters - ---------- - scale : boolean, optional, default True - If ``True``, the motif scores will be scaled - before classification - - cv : int, optional, default 3 - Cross-validation k-fold parameter. - - ncpus : int, optional - Number of threads. Default is the number specified in the config. - - Attributes - ---------- - act_ : DataFrame, shape (n_motifs, n_clusters) - fitted coefficients - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation - """ - - self.act_description = "activity values: coefficients from " "fitted model" - - if ncpus is None: - ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) - self.ncpus = ncpus - self.kfolds = cv - self.scale = scale - - self.act_ = None - self.pref_table = "score" - self.supported_tables = ["score", "count"] - self.ptype = "regression" - - def fit(self, df_X, df_y, batch_size=50, shuffle=True, tmpdir=None): - logger.info("Fitting LightningRegression") - - if self.scale: - # Scale motif scores - df_X[:] = scale(df_X, axis=0) - - # Normalize across samples and features - # y = df_y.apply(scale, 1).apply(scale, 0) - y = df_y - X = df_X.loc[y.index] - - if not y.shape[0] == X.shape[0]: - raise ValueError("number of regions is not equal") - - # Define model - cd = CDRegressor(penalty="l1/l2", C=1.0) - parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 2)]} - clf = GridSearchCV(cd, parameters, n_jobs=self.ncpus) - - if shuffle: - idx = list(y.sample(y.shape[1], axis=1, random_state=42).columns) - else: - idx = list(y.columns) - - if tmpdir: - if not os.path.exists(tmpdir): - os.mkdir(tmpdir) - - coefs = pd.DataFrame(index=X.columns) - start_i = 0 - if tmpdir: - for i in range(0, len(idx), batch_size): - fname = os.path.join(tmpdir, "{}.feather".format(i)) - if os.path.exists(fname) and os.path.exists(fname + ".done"): - - tmp = pd.read_feather(fname) - tmp = tmp.set_index(tmp.columns[0]) - coefs = coefs.join(tmp) - else: - logger.info("Resuming at batch {}".format(i)) - start_i = i - break - - for i in tqdm(range(start_i, len(idx), batch_size)): - split_y = y[idx[i : i + batch_size]] - - # Fit model - 
clf.fit(X.values, split_y.values) - tmp = pd.DataFrame( - clf.best_estimator_.coef_.T, index=X.columns, columns=split_y.columns - ) - if tmpdir: - fname = os.path.join(tmpdir, "{}.feather".format(i)) - tmp.reset_index().rename(columns=str).to_feather(fname) - # Make sure we don't read corrupted files - open(fname + ".done", "a").close() - # Get coefficients - coefs = coefs.join(tmp) - - # Get coefficients - self.act_ = coefs[y.columns] - - logger.info("Done") - - -@register_predictor("LightningClassification") -class LightningClassificationMoap(Moap): - def __init__(self, scale=True, permute=False, ncpus=None): - """Predict motif activities using lightning CDClassifier - - Parameters - ---------- - scale : boolean, optional, default True - If ``True``, the motif scores will be scaled - before classification - - ncpus : int, optional - Number of threads. Default is the number specified in the config. - - Attributes - ---------- - act_ : DataFrame, shape (n_motifs, n_clusters) - fitted coefficients - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation - """ - - self.act_description = "activity values: coefficients from " "fitted model" - - # self.cdc = CDClassifier(random_state=args.seed) - self.cdc = CDClassifier() - - self.parameters = { - "penalty": ["l1/l2"], - "loss": ["squared_hinge"], - "multiclass": [True], - "max_iter": [20], - "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)], - "C": [0.001, 0.01, 0.1, 0.5, 1.0], - "tol": [1e-3], - } - - self.kfolds = 10 - - if ncpus is None: - ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) - - self.clf = GridSearchCV(self.cdc, self.parameters, cv=self.kfolds, n_jobs=ncpus) - - self.scale = scale - self.permute = permute - - self.act_ = None - self.sig_ = None - self.pref_table = "score" - self.supported_tables = ["score", "count"] - self.ptype = "classification" - - def fit(self, df_X, df_y): - logger.info("Fitting LightningClassification") - - if not df_y.shape[0] == df_X.shape[0]: - raise ValueError("number of regions is not equal") - if df_y.shape[1] != 1: - raise ValueError("y needs to have 1 label column") - - if self.scale: - # Scale motif scores - df_X[:] = scale(df_X, axis=0) - - idx = list(range(df_y.shape[0])) - - y = df_y.iloc[idx] - X = df_X.loc[y.index].values - y = y.values.flatten() - - # Convert (putative) string labels - label = LabelEncoder() - y = label.fit_transform(y) - - # Split data - X_train, X_test, y_train, y_test = train_test_split(X, y) - - logger.debug("Setting parameters through cross-validation") - # Determine best parameters based on CV - self.clf.fit(X_train, y_train) - - logger.debug( - "Average score ({} fold CV): {}".format( - self.kfolds, self.clf.score(X_test, y_test) - ) - ) - - logger.debug("Estimate coefficients using bootstrapping") - - # Estimate coefficients using bootstrappig - # b = BaggingClassifier(self.clf.best_estimator_, - # max_samples=0.75, n_jobs=-1, random_state=state) - b = BaggingClassifier(self.clf.best_estimator_, max_samples=0.75, n_jobs=-1) - b.fit(X, y) - - # Get mean coefficients - coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0) - - # Create dataframe of predicted coefficients - if len(label.classes_) == 2: - self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))) - else: - self.act_ = pd.DataFrame(coeffs.T) - - # Convert labels back to original names - self.act_.columns = label.inverse_transform(range(len(label.classes_))) - self.act_.index = df_X.columns - - if 
self.permute: - # Permutations - logger.debug("Permutations") - random_dfs = [] - for _ in range(10): - y_random = np.random.permutation(y) - b.fit(X, y_random) - coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0) - - if len(label.classes_) == 2: - random_dfs.append(pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))) - else: - random_dfs.append(pd.DataFrame(coeffs.T)) - random_df = pd.concat(random_dfs) - - # Select cutoff based on percentile - high_cutoffs = random_df.quantile(0.99) - low_cutoffs = random_df.quantile(0.01) - - # Set significance - self.sig_ = pd.DataFrame(index=df_X.columns) - self.sig_["sig"] = False - - for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs): - self.sig_["sig"].loc[self.act_[col] >= c_high] = True - self.sig_["sig"].loc[self.act_[col] <= c_low] = True - logger.info("Done") - - @register_predictor("MWU") class MWUMoap(Moap): def __init__(self, *args, **kwargs): @@ -791,22 +544,16 @@ def fit(self, df_X, df_y): logger.info("Done") -@register_predictor("Lasso") -class LassoMoap(Moap): - def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None): - """Predict motif activities using Lasso MultiTask regression +@register_predictor("MultiTaskLasso") +class MultiTaskLassoMoap(Moap): + def __init__(self, scale=True, ncpus=None): + """Predict motif activities using MultiTaskLasso. Parameters ---------- scale : boolean, optional, default True If ``True``, the motif scores will be scaled - before classification - - kfolds : integer, optional, default 5 - number of kfolds for parameter search - - alpha_stepsize : float, optional, default 1.0 - stepsize for use in alpha gridsearch + before classification. ncpus : int, optional Number of threads. Default is the number specified in the config. @@ -814,101 +561,129 @@ def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None): Attributes ---------- act_ : DataFrame, shape (n_motifs, n_clusters) - fitted motif activities - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation + Coefficients of the regression model. 
""" - self.kfolds = kfolds - self.act_description = "activity values: coefficients from " "fitted model" + self.act_description = "activity values: coefficients of the" "regression model" - self.scale = scale if ncpus is None: ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) self.ncpus = ncpus - - # initialize attributes + self.scale = scale self.act_ = None - self.sig_ = None + self.pref_table = "score" + self.supported_tables = ["score", "count"] + self.ptype = "regression" + + def fit(self, df_X, df_y): + logger.info("Fitting MultiTaskLasso") + + if not df_y.shape[0] == df_X.shape[0]: + raise ValueError("number of regions is not equal") + + if self.scale: + logger.debug("Scaling motif scores") + # Scale motif scores + df_X.loc[:, :] = scale(df_X, axis=0) + + # logger.debug("Scaling y") + + # Normalize across samples and features + # y = df_y.apply(scale, 1).apply(scale, 0) + y = df_y + + X = df_X.loc[y.index] - mtk = MultiTaskLasso() - parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]} - self.clf = GridSearchCV( - mtk, parameters, cv=kfolds, n_jobs=self.ncpus, scoring="r2" + model = Pipeline( + [ + ("scale", StandardScaler()), + ( + "reg", + MultiTaskLassoCV( + fit_intercept=False, n_alphas=20, n_jobs=self.ncpus + ), + ), + ] ) + logger.debug("Fitting model") + model.fit(df_X, df_y) + logger.info("Done") + + self.act_ = pd.DataFrame( + model.steps[1][1].coef_, index=y.columns, columns=X.columns + ).T + + def predict(self, df_X): + return df_X.dot(self.act_.loc[df_X.columns]) + + +@register_predictor("SVR") +class SVRMoap(Moap): + def __init__(self, scale=True, ncpus=None): + """Predict motif activities using Support Vector Regression. + + Parameters + ---------- + scale : boolean, optional, default True + If ``True``, the motif scores will be scaled + before classification. + + ncpus : int, optional + Number of threads. Default is the number specified in the config. + + Attributes + ---------- + act_ : DataFrame, shape (n_motifs, n_clusters) + SVR weights. 
+ """ + + self.act_description = "activity values: SVR weights" + + if ncpus is None: + ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) + self.ncpus = ncpus + self.scale = scale + self.act_ = None self.pref_table = "score" self.supported_tables = ["score", "count"] self.ptype = "regression" - def fit(self, df_X, df_y, permute=False): - logger.info("Fitting Lasso") + def fit(self, df_X, df_y): + logger.info("Fitting SVR") + if not df_y.shape[0] == df_X.shape[0]: raise ValueError("number of regions is not equal") if self.scale: + logger.debug("Scaling motif scores") # Scale motif scores - df_X[:] = scale(df_X, axis=0) + df_X.loc[:, :] = scale(df_X, axis=0) + + # logger.debug("Scaling y") - idx = list(range(df_y.shape[0])) - y = df_y.iloc[idx] - X = df_X.loc[y.index].values - y = y.values - - # fit coefficients - coefs = self._get_coefs(X, y) - self.act_ = pd.DataFrame(coefs.T) - - # convert labels back to original names - self.act_.columns = df_y.columns - self.act_.index = df_X.columns - - if permute: - # Permutations - logger.info("permutations\n") - random_dfs = [] - for _ in range(10): - y_random = y[np.random.permutation(range(y.shape[0]))] - coefs = self._get_coefs(X, y_random) - random_dfs.append(pd.DataFrame(coefs.T)) - random_df = pd.concat(random_dfs) - - # Select cutoff based on percentile - high_cutoffs = random_df.quantile(0.99) - low_cutoffs = random_df.quantile(0.01) - - # Set significance - self.sig_ = pd.DataFrame(index=df_X.columns) - self.sig_["sig"] = False - - for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs): - self.sig_["sig"].loc[self.act_[col] >= c_high] = True - self.sig_["sig"].loc[self.act_[col] <= c_low] = True + # Normalize across samples and features + # y = df_y.apply(scale, 1).apply(scale, 0) + y = df_y + self.columns = df_y.columns + X = df_X.loc[y.index] + clf = LinearSVR() + self.model = MultiOutputRegressor(clf, n_jobs=1) + logger.debug("Fitting model") + self.model.fit(df_X, df_y) logger.info("Done") - def _get_coefs(self, X, y): - logger.info("set alpha through cross-validation\n") - # Determine best parameters based on CV - self.clf.fit(X, y) - - logger.debug( - "average score ({} fold CV): {}".format(self.kfolds, self.clf.best_score_) + self.act_ = pd.DataFrame( + {c: e.coef_ for c, e in zip(df_y.columns, self.model.estimators_)}, + index=X.columns, ) - logger.info("Estimate coefficients using bootstrapping\n") + def predict(self, df_X): + # print(self.model.predict(df_X) ) - n_samples = 0.75 * X.shape[0] - max_samples = X.shape[0] - m = self.clf.best_estimator_ - coefs = [] - for _ in range(10): - idx = np.random.randint(0, n_samples, max_samples) - m.fit(X[idx], y[idx]) - coefs.append(m.coef_) - coefs = np.array(coefs).mean(axis=0) - return coefs + return pd.DataFrame( + self.model.predict(df_X), index=df_X.index, columns=self.columns + ) def moap( @@ -935,7 +710,7 @@ def moap( method : str, optional Motif activity method to use. Any of 'hypergeom', 'lasso', - 'lightningclassification', 'lightningregressor', 'bayesianridge', + 'bayesianridge', 'rf', 'xgboost'. Default is 'hypergeom'. 
scoring: str, optional @@ -1058,13 +833,7 @@ def moap( motifs = motifs.loc[df.index] - if method == "lightningregressor": - outdir = os.path.dirname(outfile) - tmpname = os.path.join(outdir, ".lightning.tmp") - clf.fit(motifs, df, tmpdir=tmpname) - shutil.rmtree(tmpname) - else: - clf.fit(motifs, df) + clf.fit(motifs, df) if outfile: with open(outfile, "w") as f: diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index 3065a6cb..253bc410 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -10,8 +10,8 @@ import sys import random from math import log, sqrt +from collections import Counter from warnings import warn -import six from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME from gimmemotifs.c_metrics import pfmscan @@ -1304,6 +1304,88 @@ def wiggle_pwm(self): return self.wiggled_pwm + def format_factors( + self, max_length=5, html=False, include_indirect=True, extra_str=", (...)" + ): + if html: + fmt_d = "{}" + fmt_i = "{}" + else: + fmt_d = fmt_i = "{}" + + if hasattr(self, "factor_info"): + fcount = Counter([x.upper() for x in self.factor_info["Factor"]]) + else: + fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME]) + + direct = sorted( + list( + set( + [ + x.upper() if x != "de novo" else x + for x in self.factors[DIRECT_NAME] + ] + ) + ), + key=lambda x: fcount[x], + reverse=True, + ) + + indirect = [] + if include_indirect: + indirect = sorted( + list( + set( + [ + x.upper() + for x in self.factors[INDIRECT_NAME] + if x.upper() not in direct + ] + ) + ), + key=lambda x: fcount[x], + reverse=True, + ) + + if len(direct) > max_length: + show_factors = direct[:max_length] + else: + show_factors = direct[:] + for f in sorted(indirect, key=lambda x: fcount[x], reverse=True): + if f not in show_factors: + show_factors.append(f) + if len(show_factors) >= max_length: + break + + if "de novo" in show_factors: + show_factors = ["de novo"] + sorted( + [f for f in show_factors if f != "de novo"], + key=lambda x: fcount[x], + reverse=True, + ) + else: + show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True) + + factor_str = ",".join( + [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] + ) + + if len(direct + indirect) > max_length: + factor_str += extra_str + + if html: + tooltip = "" + if len(direct) > 0: + tooltip += "direct: " + ",".join(sorted(direct)) + if len(indirect) > 0: + if tooltip != "": + tooltip += " " + tooltip += "predicted: " + ",".join(sorted(indirect)) + + factor_str = '
' + factor_str + "
" + + return factor_str + def default_motifs(): """Return list of Motif instances from default motif database.""" @@ -1393,14 +1475,8 @@ def parse_motifs(motifs): motifs : list List of Motif instances. """ - if isinstance(motifs, six.string_types): - with open(motifs) as f: - if motifs.endswith("pwm") or motifs.endswith("pfm"): - motifs = read_motifs(f, fmt="pwm") - elif motifs.endswith("transfac"): - motifs = read_motifs(f, fmt="transfac") - else: - motifs = read_motifs(f) + if isinstance(motifs, str): + return read_motifs(motifs) elif isinstance(motifs, Motif): motifs = [motifs] else: @@ -1426,6 +1502,8 @@ def _add_factors_from_handle(motifs, handle): m2f_direct = {} m2f_indirect = {} for line in open(map_file): + if line.startswith("#"): + continue try: motif, *factor_info = line.strip().split("\t") if len(factor_info) == 1: @@ -1438,7 +1516,11 @@ def _add_factors_from_handle(motifs, handle): except Exception: pass + m2f = pd.read_csv(map_file, sep="\t", comment="#", index_col=0) + for motif in motifs: + if motif.id in m2f.index: + motif.factor_info = m2f.loc[motif.id] if motif.id in m2f_direct: motif.factors[DIRECT_NAME] = m2f_direct[motif.id] if motif.id in m2f_indirect: @@ -1518,7 +1600,7 @@ def read_motifs(infile=None, fmt="pfm", as_dict=False): if fmt == "pwm": fmt = "pfm" - if infile is None or isinstance(infile, six.string_types): + if infile is None or isinstance(infile, str): infile = pfmfile_location(infile) with open(infile) as f: motifs = _read_motifs_from_filehandle(f, fmt) diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py index d161c138..d27f9d21 100644 --- a/gimmemotifs/plot.py +++ b/gimmemotifs/plot.py @@ -4,7 +4,6 @@ # the terms of the MIT License, see the file COPYING included with this # distribution. """ Various plotting functions """ -from __future__ import print_function from PIL import Image import seaborn as sns from mpl_toolkits.axes_grid1 import ImageGrid @@ -33,8 +32,7 @@ def axes_off(ax): - """Get rid of all axis ticks, lines, etc. - """ + """Get rid of all axis ticks, lines, etc.""" ax.set_frame_on(False) ax.axes.get_yaxis().set_visible(False) ax.axes.get_xaxis().set_visible(False) @@ -355,7 +353,7 @@ def _get_motif_tree(tree, data, circle=True, vmin=None, vmax=None): m = 25 / data.values.max() for node in t.traverse("levelorder"): - val = data[[l.name for l in node.get_leaves()]].values.mean() + val = data[[leaf.name for leaf in node.get_leaves()]].values.mean() style = NodeStyle() style["size"] = 0 diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py index 15076a86..5ae37690 100644 --- a/gimmemotifs/rank.py +++ b/gimmemotifs/rank.py @@ -9,6 +9,7 @@ import subprocess as sp import pandas as pd import numpy as np +from scipy.stats import rankdata, norm try: from scipy.special import factorial @@ -77,23 +78,77 @@ def qStuart(r): return factorial(N) * v[N] -def rankagg(df, method="stuart"): - """Return aggregated ranks. +def _rank_int(series, c=3.0 / 8, stochastic=True): + # Based on code by Edward Mountjoy + # See: https://github.com/edm1/rank-based-INT + """Perform rank-based inverse normal transformation on pandas series. + If stochastic is True ties are given rank randomly, otherwise ties will + share the same value. NaN values are ignored. 
+ Args: + param1 (pandas.Series): Series of values to transform + param2 (Optional[float]): Constand parameter (Bloms constant) + param3 (Optional[bool]): Whether to randomise rank of ties + + Returns: + pandas.Series + """ + + # Check input + assert isinstance(series, pd.Series) + assert isinstance(c, float) + assert isinstance(stochastic, bool) + + # Set seed + np.random.seed(123) + + # Take original series indexes + orig_idx = series.index + + # Drop NaNs + series = series.loc[~pd.isnull(series)] + + # Get ranks + if stochastic: + # Shuffle by index + series = series.loc[np.random.permutation(series.index)] + # Get rank, ties are determined by their position in the series (hence + # why we randomised the series) + rank = rankdata(series, method="ordinal") + else: + # Get rank, ties are averaged + rank = rankdata(series, method="average") + + # Convert numpy array back to series + rank = pd.Series(rank, index=series.index) + + # Convert rank to normal distribution + transformed = rank.apply(_rank_to_normal, c=c, n=len(rank)) + + return transformed[orig_idx] + +def _rank_to_normal(rank, c, n): + # Standard quantile function + x = (rank - c) / (n - 2 * c + 1) + return norm.ppf(x) + + +def _rankagg_int(df): + # Convert values to ranks + df_int = df.apply(_rank_int) + # Combine z-score using Stouffer's method + df_int = (df_int.sum(1) / np.sqrt(df_int.shape[1])).to_frame() + df_int.columns = ["z-score"] + return df_int + + +def _rankagg_stuart(df): + """ Implementation is ported from the RobustRankAggreg R package References: Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709 Stuart et al., 2003, DOI: 10.1126/science.1087447 - - Parameters - ---------- - df : pandas.DataFrame - DataFrame with values to be ranked and aggregated - - Returns - ------- - pandas.DataFrame with aggregated ranks """ rmat = pd.DataFrame(index=df.iloc[:, 0]) @@ -105,3 +160,49 @@ def rankagg(df, method="stuart"): rmat = rmat.apply(sorted, 1, result_type="expand") p = rmat.apply(qStuart, 1) return pd.DataFrame({"score": p}, index=rmat.index) + + +def rankagg(df, method="int_stouffer", include_reverse=True, log_transform=True): + """Return aggregated ranks. + + Stuart implementation is ported from the RobustRankAggreg R package + + References: + Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709 + Stuart et al., 2003, DOI: 10.1126/science.1087447 + + Parameters + ---------- + df : pandas.DataFrame + DataFrame with values to be ranked and aggregated + method : str, optional + Either "int_stouffer" or "stuart". The "int_stouffer" method is based on combining z-scores + from a inverse normal transform of ranks using Stouffer's method. 
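As a rough sketch of what the "int_stouffer" aggregation computes, using the functions defined in this module (the small DataFrame below is hypothetical):

>>> df = pd.DataFrame({"a": [3.0, 1.0, 2.0], "b": [2.5, 0.5, 1.0]}, index=["m1", "m2", "m3"])
>>> z = df.apply(_rank_int)              # per column: rank r -> norm.ppf((r - 3/8) / (n + 1/4))
>>> z.sum(1) / np.sqrt(z.shape[1])       # Stouffer's combined z-score per row
>>> rankagg(df, method="int_stouffer")   # same values, returned as a one-column "z-score" DataFrame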
+ + Returns + ------- + pandas.DataFrame with aggregated ranks + """ + method = method.lower() + if method not in ["stuart", "int_stouffer"]: + raise ValueError("unknown method for rank aggregation") + + if method == "stuart": + df_asc = pd.DataFrame() + df_desc = pd.DataFrame() + for col in df.columns: + df_asc[col] = ( + df.sample(frac=1).sort_values(col, ascending=False).index.values + ) + if include_reverse: + df_desc[col] = ( + df.sample(frac=1).sort_values(col, ascending=True).index.values + ) + + df_result = -np.log10(_rankagg_stuart(df_asc)) + if include_reverse: + df_result += np.log10(_rankagg_stuart(df_desc)) + + return df_result + if method == "int_stouffer": + return _rankagg_int(df) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 239eb441..e41951dc 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -16,11 +16,21 @@ import numpy as np import pandas as pd from statsmodels.stats.multitest import multipletests +from pandas.core.indexing import _non_reducing_slice +from pandas.io.formats.style import Styler +import seaborn as sns +from matplotlib import colors +import matplotlib.pyplot as plt + +try: + import emoji +except ImportError: + pass from gimmemotifs.comparison import MotifComparer from gimmemotifs.fasta import Fasta from gimmemotifs.motif import read_motifs -from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME +from gimmemotifs.config import MotifConfig from gimmemotifs.plot import roc_plot from gimmemotifs.stats import calc_stats, add_star, write_stats from gimmemotifs import __version__ @@ -28,6 +38,588 @@ logger = logging.getLogger("gimme.report") +FACTOR_TOOLTIP = "
factors
(direct or predicted)
" + + +def _wrap_html_str(x): + if " " not in x: + return x + + min_pos, max_pos = 0, len(x) + if ">" in x and "[^<>]*<").search(x) + min_pos, max_pos = m.start(), m.end() + + positions = [m.start() for m in re.compile(" ").finditer(x)] + + positions = [p for p in positions if min_pos < p < max_pos] + + if len(positions) == 0: + return x + + pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0] + x = x[:pos] + "
" + x[pos + 1 :] + return x + + +def relative_luminance(rgba): + """ + Calculate relative luminance of a color. + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + Parameters + ---------- + color : rgb or rgba tuple + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.03928 else ((x + 0.055) / 1.055 ** 2.4) for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b + + +def contrasting_text_color(color, text_color_threshold=0.408): + dark = relative_luminance(color) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return text_color + + +class ExtraStyler(Styler): + """ + Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS. + """ + + loader = jinja2.ChoiceLoader( + [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader] + ) + env = jinja2.Environment(loader=loader) + template = env.get_template("table.tpl") + + def __init__(self, *args, **kwargs): + self._data_todo = [] + self.circle_styles = None + self.palette_styles = None + self.col_heading_style = { + "name": "col_heading", + "props": [("border-bottom", "1px solid #e0e0e0")], + } + super(ExtraStyler, self).__init__(*args, **kwargs) + self.display_data = self.data.copy() + + # self.template = + + self._font = "Nunito Sans" + + @property + def font(self): + return self._font + + @font.setter + def font(self, font_name): + self._font = font_name + + def set_font(self, font_name): + """ + Set the font that will be used. + + Parameters + ---------- + font_name : str + Should be a font name available though the Google Font API. + + Returns + ------- + self : ExtraStyler + + Notes + ----- + ``font_name`` can contain spaces, eg. "Nunito Sans". + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) + >>> ExtraStyler(df).font("Roboto) + """ + self.font = font_name + return self + + def _current_index(self, subset): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + selected = self.data.loc[subset] + idx_slice = pd.IndexSlice[ + self.data.index.get_indexer(selected.index), + self.data.columns.get_indexer(selected.columns), + ] + return idx_slice + + def _translate(self): + self._compute_data() + d = super()._translate() + circle_styles = self.circle_styles or [] + palette_styles = self.palette_styles or [] + col_heading_style = self.col_heading_style or [] + d.update( + { + "font": self.font, + "circle_styles": circle_styles, + "palette_styles": palette_styles, + "col_heading_style": col_heading_style, + } + ) + return d + + def _compute_data(self): + r = self + for func, args, kwargs in self._data_todo: + r = func(self)(*args, **kwargs) + r.data = r.display_data + return r + + def _tooltip(self, tip, subset=None, part=None): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + if part is None: + part = "data" + + if part == "data": + self.display_data.loc[subset] = ( + "
" + + self.display_data.loc[subset].astype(str) + + "
" + ) + elif part == "columns": + idx = self._current_index(subset)[1] + rename = dict( + zip( + self.display_data.columns[idx], + "
" + + self.display_data.columns[idx].astype(str) + + "
", + ) + ) + self.display_data.rename(columns=rename, inplace=True) + elif part == "index": + idx = self._current_index(subset)[0] + rename = dict( + zip( + self.display_data.index[idx], + "
" + + self.display_data.index[idx].astype(str) + + "
", + ) + ) + self.display_data.rename(index=rename, inplace=True) + else: + raise ValueError(f"unknown value for part: {part}") + return self + + def _wrap_iterable(self, it): + return [_wrap_html_str(val) for val in it] + + def _wrap(self, subset=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + if axis in [0, "columns"]: + idx = self._current_index(subset)[1] + rename = dict( + zip( + self.display_data.columns[idx], + self._wrap_iterable(self.display_data.columns[idx]), + ) + ) + self.display_data.rename(columns=rename, inplace=True) + elif axis in [1, "index"]: + idx = self._current_index(subset)[0] + rename = dict( + zip( + self.display_data.index[idx], + self._wrap_iterable(self.display_data.index[idx]), + ) + ) + self.display_data.rename(index=rename, inplace=True) + else: + raise ValueError(f"unknown value for axis: {axis}") + return self + + def _convert_to_image(self, subset=None, height=30): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + self.display_data.loc[subset] = ( + f'
' + ) + return self + + def _border(self, idx, location="left"): + return [f"border-{location}: 2px solid #444;" for val in idx] + + def border( + self, + subset=None, + location="bottom", + part="data", + width="2px", + style="solid", + color="#444", + ): + """ + Add a border to data cells, columns or index. + + Parameters + ---------- + subset : IndexSlice, optional + An argument to ``DataFrame.loc`` that restricts which elements + ``border`` is applied to. If ``part`` is "columns" or "index" + subset should be present in either the columns or the index. + + location : str, optional + Location of the border, default is "bottom". Can be "top", "bottom", + "right" or "left". + + part : str, optional + If ``part`` is "data", the border will be applied to the data cells. + Set part to "index" or to "column" to add a border to the index or + header, respectively. + + width : str, int or float, optional + Valid CSS value for border width. + + style : str, optional + Valid CSS value for border style. + + color : str, optional + Valid CSS value for border color. + + Returns + ------- + self : ExtraStyler + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) + >>> ExtraStyler(df).border(part="columns) + """ + if part == "data": + self.apply(self._border, subset=subset, location=location) + else: + self.col_heading_style["props"].append( + (f"border-{location}", f"{width} {style} {color}") + ) + return self + + def _align(self, idx, location="center"): + return [f"text-align:{location};" for val in idx] + + def align(self, subset=None, location="center", axis=0): + """ + Align text. + + Parameters + ---------- + subset : IndexSlice, optional + An argument to ``DataFrame.loc`` that restricts which elements + ``center_align`` is applied to. + + location : str, optional + "center", "left" or "right" + + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. 
+ + Returns + ------- + self : ExtraStyler + """ + self.apply(self._align, subset=subset, location=location, axis=axis) + return self + + def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0): + rng = M - m + norm = colors.Normalize(m - (rng * low), M + (rng * high)) + normed = norm(s.values) + c = plt.cm.get_cmap(cmap)(normed) + return [ + f"background-color: {colors.rgb2hex(color)}; color: {contrasting_text_color(color)}" + for color in c + ] + + def to_precision_str(self, subset=None, precision=0, include_zero=True): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + def precision_str(x, precision=precision): + if (include_zero or x > 0) and x <= 10 ** -precision: + return f"<{10**-precision}" + else: + return f"{{0:.{precision}f}}".format(x) + + self.display_data.loc[subset] = self.data.loc[subset].applymap(precision_str) + return self + + def _circle( + self, + subset=None, + show_text=True, + color=None, + cmap=None, + vmin=None, + vmax=None, + scale=False, + size=25, + min_size=5, + morph=False, + ): + subset = pd.IndexSlice[:, :] if subset is None else subset + subslice = _non_reducing_slice(subset) + + if color: + palette = sns.color_palette([color]) + # print(palette) + elif cmap is None: + palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10) + else: + # if isinstance(palette, str): + palette = sns.color_palette(cmap) + + # Make sure we don't select text columns + if len(palette) > 1: + subslice = pd.IndexSlice[ + self.data.loc[subslice].index, + self.data.loc[subslice].select_dtypes(exclude=["object"]).columns, + ] + idx = self._current_index(subslice) + + self.circle_styles = self.circle_styles or [] + circle_id = len(self.circle_styles) + 1 + + props = [ + ("height", f"{size}px"), + ("width", f"{size}px"), + ("border-radius", "50%"), + ("color", "#000"), + ("line-height", f"{size}px"), + ("display", "inline-block"), + ("text-align", "center"), + ("vertical-align", "middle"), + ] + + self.circle_styles.append({"name": f"circle{circle_id}", "props": props}) + self.palette_styles = self.palette_styles or [] + for i, color in enumerate(palette.as_hex()): + props = [("background-color", color)] + if scale: + circle_size = min_size + ((size - min_size) / len(palette) * (i + 1)) + props += [ + ("height", f"{circle_size}px"), + ("width", f"{circle_size}px"), + ("line-height", f"{circle_size}px"), + ("text-align", "center"), + ] + if morph: + props += [("border-radius", f"{50 - int(50 / len(palette)) * i}%")] + self.palette_styles.append( + {"name": f"color{circle_id}_{i}", "props": props} + ) + + if len(palette) > 1: + vmax = ( + self.data.loc[subslice].max().max() * 1.01 + if vmax is None + else vmax * 1.01 + ) + text = self.display_data.iloc[idx].astype(str) if show_text else "" + self.display_data.iloc[idx] = ( + f"
" + + text + + "
" + ) + else: + text = self.display_data.iloc[idx].astype(str) if show_text else "" + self.display_data.iloc[idx] = ( + f"
" + text + "
" + ) + + return self + + def add_circle(self, **kwargs): + self._data_todo.append((lambda instance: instance._circle, (), kwargs)) + return self + + def wrap(self, **kwargs): + self._data_todo.append((lambda instance: instance._wrap, (), kwargs)) + return self + + def add_tooltip(self, tip, **kwargs): + self._data_todo.append((lambda instance: instance._tooltip, (tip,), kwargs)) + return self + + def convert_to_image(self, **kwargs): + self._data_todo.append( + (lambda instance: instance._convert_to_image, (), kwargs) + ) + return self + + def rename(self, columns=None, index=None): + self.display_data = self.display_data.rename(columns=columns, index=index) + return self + + def _emoji_score(self, series, emoji_str=None, bins=None): + if emoji_str is None: + emoji_str = ":star:" + if bins is None: + bins = 3 + + if isinstance(bins, int): + labels = range(1, bins + 1) + else: + labels = range(1, len(bins)) + + return [ + emoji.emojize(emoji_str * val, use_aliases=True) + for val in pd.cut(series, bins=bins, labels=labels) + ] + + def _emoji_scale(self, series, emojis=None, bins=None): + emoji_dict = { + "thumbs": [":thumbsdown:", ":thumbsup:"], + "check": [":cross_mark:", ":white_check_mark:"], + "smiley": [ + ":crying_face:", + ":slightly_frowning_face:", + ":neutral_face:", + ":slightly_smiling_face:", + ":grin:", + ], + "black_square": [ + ":black_small_square:", + ":black_medium_small_square:", + ":black_medium_square:", + ":black_large_square:", + ], + "white_square": [ + ":white_small_square:", + ":white_medium_small_square:", + ":white_medium_square:", + ":white_large_square:", + ], + } + + if emojis is None: + emojis = "smiley" + + if emojis in emoji_dict: + labels = emoji_dict[emojis] + if bins is None: + bins = len(labels) + + return [ + emoji.emojize(val, use_aliases=True) + for val in pd.cut(series, bins=bins, labels=labels) + ] + + def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + + result = self.display_data.iloc[idx].apply( + self._emoji_scale, axis=axis, result_type="expand", args=(emojis, bins) + ) + self.display_data.iloc[idx] = result.values + + return self.align(subset=subset, location="center", axis=axis) + + def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + result = self.display_data.iloc[idx].apply( + self._emoji_score, axis=axis, result_type="expand", args=(emoji_str, bins) + ) + self.display_data.iloc[idx] = result.values + + return self.align(subset=subset, location="left", axis=axis) + + def emojify(self, subset=None): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + result = self.display_data.iloc[idx].applymap(emoji.emojize) + self.display_data.iloc[idx] = result.values + + return self + + def scaled_background_gradient( + self, + subset=None, + cmap="RdBu_r", + low=0, + high=0, + center_zero=False, + vmin=None, + vmax=None, + ): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + vmax = ( + self.data.loc[subset] + .replace({np.inf: np.nan, -np.inf: np.nan}) + .max(skipna=True) + .max() + if vmax is None + else vmax + ) + vmin = ( + self.data.loc[subset] + .replace({np.inf: np.nan, 
-np.inf: np.nan}) + .min(skipna=True) + .min() + if vmin is None + else vmin + ) + + if center_zero: + vmax = max(abs(vmax), abs(vmin)) + vmin = -vmax + + r = self + for col in self.data.loc[subset].columns: + r = r.apply( + self._background_gradient, + subset=pd.IndexSlice[subset[0], col], + cmap=cmap, + m=vmin, + M=vmax, + low=low, + high=high, + ) + + return r + def get_roc_values(motif, fg_file, bg_file, genome): """Calculate ROC AUC values for ROC plots.""" @@ -167,8 +759,8 @@ class ReportMotif(object): + os.path.basename(roc_img_file % (motif.id, bg)) + ".png" } - rm.bg[bg][u"roc_img_link"] = { - u"href": "images/" + rm.bg[bg]["roc_img_link"] = { + "href": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png" } @@ -253,93 +845,114 @@ def create_denovo_motif_report( ) -def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): - df = pd.read_table(infile, index_col=0) - df = df[np.any(abs(df) >= threshold, 1)] +def motif_to_factor_series(series, pfmfile=None, motifs=None): + if motifs is None: + motifs = read_motifs(pfmfile, as_dict=True) - motifs = read_motifs(pfmfile) + if isinstance(series, pd.Index): + index = series + else: + index = series.index - df.rename_axis(None, inplace=True) - cols = df.columns + factors = [motifs[motif].format_factors(html=True) for motif in series] + return pd.Series(data=factors, index=index) + + +def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"): + if motifs is None: + motifs = read_motifs(pfmfile, as_dict=True) + + if not os.path.exists(outdir): + os.makedirs(outdir) + if not os.path.exists(os.path.join(outdir, subdir)): + os.makedirs(os.path.join(outdir, subdir)) + + img_series = [] + for motif in series: + if motif not in motifs: + raise ValueError(f"Motif {motif} does not occur in motif database") + fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)) + if not os.path.exists(fname): + motifs[motif].plot_logo(fname=os.path.join(outdir, fname)) + img_series.append(fname) + + if isinstance(series, pd.Index): + index = series + else: + index = series.index + return pd.Series(data=img_series, index=index) - motifs = read_motifs(pfmfile) - idx = [motif.id for motif in motifs] - direct = [ - ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]]))) - for motif in motifs - ] - indirect = [ - ",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]]))) - for motif in motifs - ] - m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx) - - factor_cols = [DIRECT_NAME, INDIRECT_NAME] - if True: - for factor_col in factor_cols: - f = m2f[factor_col].str.len() > 30 - m2f[factor_col] = ( - '
' - + m2f[factor_col].str.slice(0, 30) - ) - m2f.loc[f, factor_col] += "(...)" - m2f[factor_col] += "
" - df = df.join(m2f) - df["logo"] = [ - ''.format(re.sub("[()/]", "_", x)) - for x in list(df.index) +def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): + + # Read the maelstrom text report + df = pd.read_table(infile, index_col=0) + + # Columns with maelstrom rank aggregation value + value_cols = df.columns[ + ~df.columns.str.contains("corr") & ~df.columns.str.contains("% with motif") ] + # Columns with correlation values + corr_cols = df.columns[df.columns.str.contains("corr")] - if not os.path.exists(outdir + "/logos"): - os.makedirs(outdir + "/logos") - for motif in motifs: - if motif.id in df.index: - motif.plot_logo( - fname=outdir + "/logos/{}.png".format(re.sub("[()/]", "_", motif.id)) - ) + df = df[np.any(abs(df[value_cols]) >= threshold, 1)] - template_dir = MotifConfig().get_template_dir() - js = open( - os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8" - ).read() - css = open( - os.path.join(template_dir, "sortable/sortable-theme-slick.css"), - encoding="utf-8", - ).read() - df = df[factor_cols + ["logo"] + list(cols)] - - df_styled = df.style - absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min()))) - target = absmax * 1.75 - - for col in cols: - smin = df[col].min() - smax = df[col].max() - diff = smax - smin - low = abs((-target - smin) / diff) - high = (target - smax) / diff - df_styled = df_styled.background_gradient( - cmap="RdBu_r", low=low, high=high, subset=[col] - ) + # Add motif logo's + df.insert( + 0, + "logo", + motif_to_img_series(df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"), + ) + # Add factors that can bind to the motif + df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) - df_styled = df_styled.set_precision(3) - df_styled = df_styled.set_table_attributes("data-sortable") - df_styled = df_styled.render() - df_styled = df_styled.replace( - "data-sortable", 'class="sortable-theme-slick" data-sortable' + rename_columns = {"factors": FACTOR_TOOLTIP} + + df_styled = ( + ExtraStyler(df) + .set_precision(2) + .convert_to_image( + subset=["logo"], + height=30, + ) + .scaled_background_gradient( + subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75 + ) + .border(subset=list(value_cols[:1]), location="left") + .border(part="columns", location="bottom") + .set_table_attributes('class="sortable-theme-slick" data-sortable') + .align(subset=list(value_cols), location="center") + .set_font("Nunito Sans") + .rename(columns=rename_columns) ) + if len(corr_cols) > 0: + df_styled = ( + df_styled.wrap(subset=list(corr_cols)) + .align(subset=list(corr_cols), location="center") + .scaled_background_gradient( + subset=corr_cols, + cmap="PuOr_r", + center_zero=True, + low=1 / 1.75, + high=1 / 1.75, + ) + ) + + for col in df.columns: + if "% with motif" in col: + df_styled = ( + df_styled.add_circle(subset=[col], cmap="Purples", vmax=100, size=40) + .wrap(subset=[col]) + .align(subset=[col], location="center") + .border(subset=[col], location="left") + .to_precision_str(subset=[col]) + ) + + df_styled = df_styled.wrap().render() + with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f: - f.write("\n") - f.write("\n".format(css)) - f.write("\n") - f.write("\n") f.write(df_styled) - f.write("\n".format(js)) - f.write("\n") def roc_html_report( @@ -353,102 +966,104 @@ def roc_html_report( ): df = pd.read_table(infile, index_col=0) df.rename_axis(None, inplace=True) - df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1] + + motifs = 
read_motifs(pfmfile, as_dict=True) + if use_motifs is not None: + motifs = {k: v for k, v in motifs.items() if k in use_motifs} + idx = list(motifs.keys()) + df = df.loc[idx] + + df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1]) + df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"])) + df = df[df["corrected P-value"] <= threshold] cols = [ + "factors", "logo", - "# matches", - "# matches background", - "P-value", - "log10 P-value", - "corrected P-value", + "% matches input", + "%matches background", + "-log10 P-value", "ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR", ] - motifs = read_motifs(pfmfile) - if use_motifs is not None: - motifs = [m for m in motifs if m.id in use_motifs] - - idx = [motif.id for motif in motifs] - df = df.loc[idx] - direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs] - indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs] - m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx) - - factor_cols = [DIRECT_NAME, INDIRECT_NAME] - if True: - for factor_col in factor_cols: - f = m2f[factor_col].str.len() > 30 - m2f[factor_col] = ( - '
' - + m2f[factor_col].str.slice(0, 30) - ) - m2f.loc[f, factor_col] += "(...)" - m2f[factor_col] += "
" - df = df.join(m2f) - cols = factor_cols + cols - - df = df[df["corrected P-value"] <= threshold] - if link_matches: df["# matches"] = ( "" + df["# matches"].astype(str) + "" ) - df["logo"] = [ - ''.format(re.sub(r"[^-_\w]+", "_", x)) - for x in list(df.index) - ] + # Add motif logo's + df.insert( + 0, + "logo", + motif_to_img_series( + df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos" + ), + ) + # Add factors that can bind to the motif + df.insert( + 0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs) + ) + + rename_columns = {"factors": FACTOR_TOOLTIP} df = df[cols] - if not os.path.exists(outdir + "/logos"): - os.makedirs(outdir + "/logos") - for motif in motifs: - if motif.id in df.index: - motif.plot_logo( - fname=outdir - + "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id)) - ) bar_cols = [ - "log10 P-value", + "% matches input", + "%matches background", + "-log10 P-value", "ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR", ] - template_dir = MotifConfig().get_template_dir() - js = open( - os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8" - ).read() - css = open( - os.path.join(template_dir, "sortable/sortable-theme-slick.css"), - encoding="utf-8", - ).read() + + df["% matches input"] = df["% matches input"].astype(int) + df["%matches background"] = df["%matches background"].astype(int) + rename_columns = {"factors": FACTOR_TOOLTIP} + df = df.sort_values("ROC AUC", ascending=False) with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f: - f.write("\n") - f.write("\n".format(css)) - f.write("\n") - f.write("\n") if df.shape[0] > 0: f.write( - df.sort_values("ROC AUC", ascending=False) - .style.bar(bar_cols) - .set_precision(3) - .set_table_attributes("data-sortable") + ExtraStyler(df) + .convert_to_image( + subset=["logo"], + height=30, + ) + .add_circle( + subset=["% matches input", "%matches background"], + vmax=100, + cmap="Purples", + ) + .scaled_background_gradient( + "-log10 P-value", vmin=0, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "ROC AUC", vmin=0.5, vmax=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "PR AUC", vmin=0, vmax=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "Enr. 
at 1% FPR", vmin=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "Recall at 10% FDR", vmin=0, vmax=1, high=0.7, cmap="Reds" + ) + .set_precision(2) + .set_table_attributes('class="sortable-theme-slick" data-sortable') + .wrap(subset=cols) + .align(subset=bar_cols, location="center") + .rename(columns=rename_columns) + .to_precision_str(subset=["% matches input", "%matches background"]) .render() - .replace("data-sortable", 'class="sortable-theme-slick" data-sortable') ) else: - f.write("No enriched motifs found.") - f.write("\n".format(js)) - f.write("\n") + f.write("No enriched motifs found.") diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index d45d3906..8397fb8f 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -1,11 +1,11 @@ import os import re import sys +from collections import Counter from functools import partial from tempfile import mkdtemp, NamedTemporaryFile import logging import multiprocessing as mp -import six # "hidden" features, in development try: @@ -18,7 +18,8 @@ from genomepy import Genome from diskcache import Cache import numpy as np -from scipy.stats import scoreatpercentile +from sklearn.preprocessing import scale +import pandas as pd from gimmemotifs import __version__ from gimmemotifs.background import RandomGenomicFasta, gc_bin_bedfile @@ -46,7 +47,6 @@ def _pickle_method(m): # only used when using cache, should not be a requirement try: from dogpile.cache import make_region - from dogpile.cache.api import NO_VALUE import xxhash except ImportError: pass @@ -250,8 +250,7 @@ def scan_to_file( zscore=True, gcnorm=True, ): - """Scan an inputfile with motifs. - """ + """Scan an inputfile with motifs.""" should_close = False if filepath_or_buffer is None: fo = sys.stdout @@ -341,7 +340,7 @@ def scan_to_best_match( if genome: s.set_genome(genome) - if isinstance(motifs, six.string_types): + if isinstance(motifs, str): motifs = read_motifs(motifs) logger.debug("scanning %s...", fname) @@ -370,7 +369,9 @@ def parse_threshold_values(motif_file, cutoff): return threshold -def scan_sequence(seq, motifs, nreport, scan_rc): +def scan_sequence( + seq, seq_gc_bin, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False +): ret = [] # scan for motifs @@ -378,36 +379,37 @@ def scan_sequence(seq, motifs, nreport, scan_rc): if cutoff is None: ret.append([]) else: - result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc) + if zscore: + m_mean, m_std = motifs_meanstd[seq_gc_bin][motif.id] + result = pwmscan( + seq, motif.logodds, motif.pwm_min_score(), nreport, scan_rc + ) + result = [[(row[0] - m_mean) / m_std, row[1], row[2]] for row in result] + result = [row for row in result if row[0] >= cutoff] + else: + result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc) if cutoff <= motif.pwm_min_score() and len(result) == 0: result = [[motif.pwm_min_score(), 0, 1]] * nreport - ret.append(result) - - # return results - return ret - - -def scan_region(region, genome, motifs, nreport, scan_rc): - - # retrieve sequence - chrom, start, end = re.split(r"[:-]", region) - seq = genome[chrom][int(start) : int(end)].seq.upper() - - return scan_sequence(seq, motifs, nreport, scan_rc) + ret.append(result) -def scan_seq_mult(seqs, motifs, nreport, scan_rc): - ret = [] - for seq in seqs: - result = scan_sequence(seq.upper(), motifs, nreport, scan_rc) - ret.append(result) return ret -def scan_region_mult(regions, genome, motifs, nreport, scan_rc): +def scan_seq_mult( + seqs, seq_gc_bins, motifs, nreport, scan_rc, motifs_meanstd=None, 
zscore=False +): ret = [] - for region in regions: - result = scan_region(region, genome, motifs, nreport, scan_rc) + for seq, seq_gc_bin in zip(seqs, seq_gc_bins): + result = scan_sequence( + seq.upper(), + seq_gc_bin, + motifs, + nreport, + scan_rc, + motifs_meanstd=motifs_meanstd, + zscore=zscore, + ) ret.append(result) return ret @@ -529,7 +531,7 @@ class Scanner(object): def __init__(self, ncpus=None): self.config = MotifConfig() - self.threshold = None + self._threshold = None self.genome = None self.background = None self.meanstd = {} @@ -604,19 +606,15 @@ def _meanstd_from_seqs(self, motifs, seqs): def _threshold_from_seqs(self, motifs, seqs, fpr): scan_motifs = [(m, m.pwm_min_score()) for m in motifs] - table = [] - for x in self._scan_sequences_with_motif(scan_motifs, seqs, 1, True): - table.append([row[0][0] for row in x]) + seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs] + for gc_bin, result in zip( + seq_gc_bins, self._scan_sequences_with_motif(scan_motifs, seqs, 1, True) + ): + table.append([gc_bin] + [row[0][0] for row in result]) - for (motif, _), scores in zip(scan_motifs, np.array(table).transpose()): - if len(scores) > 0: - opt_score = scoreatpercentile(scores, 100 - (100 * fpr)) - yield motif, opt_score # cutoff - else: - raise ValueError( - "Could not determine threshold for motif {}".format(motif) - ) + df = pd.DataFrame(table, columns=["gc_bin"] + [m.id for m in motifs]) + return df def set_meanstd(self, gc=False): if not self.background: @@ -677,8 +675,22 @@ def set_meanstd(self, gc=False): lock.release() + for gc_bin in self.gc_bins: + gc_bin = "{:.2f}-{:.2f}".format(*gc_bin) + if gc_bin not in self.meanstd: + valid_bins = [] + for b in self.gc_bins: + bstr = "{:.2f}-{:.2f}".format(b[0], b[1]) + if bstr in self.meanstd: + valid_bins.append(((b[0] + b[1]) / 2, bstr)) + + v = float(gc_bin.split("-")[1]) + _, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0] + logger.warn(f"Using {bstr}") + self.meanstd[gc_bin] = self.meanstd[bstr] + def set_background( - self, fname=None, genome=None, size=200, nseq=10000, gc=False, gc_bins=None + self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None ): """Set the background to use for FPR and z-score calculations. @@ -705,6 +717,16 @@ def set_background( size = int(size) + if gc_bins is None: + if gc: + gc_bins = [(0.0, 0.2), (0.8, 1)] + for b in np.arange(0.2, 0.799, 0.05): + gc_bins.append((b, b + 0.05)) + else: + gc_bins = [(0, 1)] + if nseq is None: + nseq = max(10000, len(gc_bins) * 1000) + if genome and fname: raise ValueError("Need either genome or filename for background.") @@ -736,12 +758,6 @@ def set_background( if not fa: if gc: - - if gc_bins is None: - gc_bins = [(0.0, 0.2), (0.8, 1)] - for b in np.arange(0.2, 0.799, 0.05): - gc_bins.append((b, b + 0.05)) - with NamedTemporaryFile() as tmp: logger.info("using {} sequences".format(nseq)) gc_bin_bedfile( @@ -757,6 +773,12 @@ def set_background( if gc_bins: self.gc_bins = gc_bins + @property + def threshold(self): + if self._threshold is None: + self.set_threshold() + return self._threshold + def set_threshold(self, fpr=None, threshold=None, gc=False): """Set motif scanning threshold based on background sequences. 
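For orientation, a minimal sketch of the scanner workflow that these changes affect (the motif file, genome name and input file are hypothetical; the methods shown are defined on this class):

>>> s = Scanner(ncpus=4)
>>> s.set_motifs("motifs.pfm")                # hypothetical motif file
>>> s.set_genome("hg38")                      # hypothetical genome
>>> s.set_background(genome="hg38", gc=True)  # GC%-binned background sequences
>>> s.set_threshold(fpr=0.01)                 # FPR-based threshold per GC bin
>>> for matches in s.scan("peaks.bed", nreport=1, zscore=True, gc=True):
...     pass                                  # per region: one list of matches per motif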
@@ -775,6 +797,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): if threshold and fpr: raise ValueError("Need either fpr or threshold.") + if threshold is None and fpr is None: + if self.genome: + fpr = 0.01 + logger.info(f"Using default FPR of {fpr}") + else: + threshold = 0.95 + logger.info( + f"Genome not specified, using default threshold of {threshold}." + ) + logger.info("This is likely not ideal.") + if fpr: fpr = float(fpr) if not (0.0 < fpr < 1.0): @@ -783,11 +816,16 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): if not self.motifs: raise ValueError("please run set_motifs() first") - thresholds = {} motifs = read_motifs(self.motifs) + gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins] if threshold is not None: - self.threshold = parse_threshold_values(self.motifs, threshold) + d = parse_threshold_values(self.motifs, threshold) + self._threshold = pd.DataFrame(d, index=[0]) + self._threshold = self._threshold.join( + pd.DataFrame(gc_bins, index=[0] * len(gc_bins), columns=["gc_bin"]) + ) + self._threshold = self._threshold.set_index("gc_bin") return if not self.background: @@ -801,36 +839,41 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): lock.acquire() with Cache(CACHE_DIR) as cache: scan_motifs = [] + self._threshold = None for motif in motifs: - k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr) - - threshold = cache.get(k) - if threshold is None: + k = "{}|{}|{:.4f}|{}".format( + motif.hash(), self.background_hash, fpr, ",".join(sorted(gc_bins)) + ) + vals = cache.get(k) + if vals is None: scan_motifs.append(motif) else: - if np.isclose(threshold, motif.pwm_max_score()): - thresholds[motif.id] = None - elif np.isclose(threshold, motif.pwm_min_score()): - thresholds[motif.id] = 0.0 + if self._threshold is None: + self._threshold = vals.to_frame() else: - thresholds[motif.id] = threshold + self._threshold[motif.id] = vals if len(scan_motifs) > 0: logger.info("determining FPR-based threshold") - for motif, threshold in self._threshold_from_seqs( - scan_motifs, seqs, fpr - ): - k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr) - cache.set(k, threshold) - if np.isclose(threshold, motif.pwm_max_score()): - thresholds[motif.id] = None - elif np.isclose(threshold, motif.pwm_min_score()): - thresholds[motif.id] = 0.0 - else: - thresholds[motif.id] = threshold + df = self._threshold_from_seqs(scan_motifs, seqs, fpr).set_index( + "gc_bin" + ) + if self._threshold is None: + self._threshold = df + else: + self._threshold = pd.concat((self._threshold, df), axis=1) + for motif in scan_motifs: + k = "{}|{}|{:.4f}|{}".format( + motif.hash(), + self.background_hash, + fpr, + ",".join(sorted(gc_bins)), + ) + cache.set(k, df[motif.id]) lock.release() - self.threshold_str = "{}_{}_{}".format(fpr, threshold, self.background_hash) - self.threshold = thresholds + self.threshold_str = "{}_{}_{}_{}".format( + fpr, threshold, self.background_hash, ",".join(sorted(gc_bins)) + ) def set_genome(self, genome): """ @@ -887,8 +930,11 @@ def best_match(self, seqs, scan_rc=True, zscore=False, gc=False): yield [m[0] for m in matches] def get_seq_bin(self, seq): - useq = seq.upper() - gc = round((useq.count("G") + useq.count("C")) / len(useq), 2) + if len(str(seq)) == 0: + gc = 0 + else: + useq = seq.upper() + gc = round((useq.count("G") + useq.count("C")) / len(useq), 2) if gc == 0: gc = 0.01 for b_start, b_end in self.gc_bins: @@ -931,16 +977,9 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, 
gc=False): """ Scan a set of regions or sequences. """ - - if not self.threshold: - logger.info( - "Using default threshold of 0.95. " "This is likely not optimal!" - ) - self.set_threshold(threshold=0.95) - seqs = as_fasta(seqs, genome=self.genome) - it = self._scan_sequences(seqs.seqs, nreport, scan_rc) + it = self._scan_sequences(seqs.seqs, nreport, scan_rc, zscore=zscore) if zscore: if gc: @@ -950,165 +989,90 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False): if len(self.meanstd) != 1: self.set_meanstd(gc=gc) - gc_seqs = [self.get_seq_bin(seq) for seq in seqs.seqs] - logger.debug("Scanning") - for result, gc_seq in zip(it, gc_seqs): - if zscore: - zresult = [] - for i, mrow in enumerate(result): - try: - m_mean, m_std = self.get_motif_mean_std( - gc_seq, self.motif_ids[i] - ) - except Exception: - print(self.meanstd) - print(gc_seq, self.motif_ids[i]) - raise - mrow = [((x[0] - m_mean) / m_std, x[1], x[2]) for x in mrow] - zresult.append(mrow) - yield zresult - else: - yield result + for result in it: + yield result - def _scan_regions(self, regions, nreport, scan_rc): - genome = self.genome - motif_file = self.motifs - motif_digest = self.checksum.get(motif_file, None) + def get_gc_thresholds(self, seqs, motifs=None, zscore=False): + # Simple case, only one threshold + if np.all(self.threshold.nunique(axis=0) == 1): + return self.threshold.iloc[0].to_dict() - # determine which regions are not in the cache - scan_regions = regions - if self.use_cache: - scan_regions = [] - for region in regions: - key = str((region, genome, motif_digest, nreport, scan_rc)) - ret = self.cache.get(key) - if ret == NO_VALUE: - scan_regions.append(region) - - # scan the regions that are not in the cache - if len(scan_regions) > 0: - - g = Genome(genome) - - motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)] - scan_func = partial( - scan_region_mult, - genome=g, - motifs=motifs, - nreport=nreport, - scan_rc=scan_rc, - ) + if motifs is None: + motifs = read_motifs(self.motifs) + seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs] - for region, ret in self._scan_jobs(scan_func, scan_regions): - # return values or store values in cache - if self.use_cache: - # store values in cache - key = str( - ( - region, - genome, - motif_digest, - nreport, - scan_rc, - self.threshold_str, - ) - ) - self.cache.set(key, ret) - else: - # return values - yield ret + gc_bin_count = Counter(seq_gc_bins) - if self.use_cache: - # return results from cache - for region in regions: - key = str( - (region, genome, motif_digest, nreport, scan_rc, self.threshold_str) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - raise Exception( - "cache is not big enough to hold all " - "results, try increasing the cache size " - "or disable cache" - ) - yield ret + _threshold = self.threshold + if zscore: + grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0) + _threshold = pd.DataFrame( + np.vstack(grouped.values), + index=_threshold.index, + columns=_threshold.columns, + ) + + nseqs = int(20000 / np.sum(list(gc_bin_count.values()))) + t = {} + maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns) + # We do this in a loop as the DataFrame will get too big to fit in memory + # when the difference between the number of sequences per gc_bin is very + # high. 
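        # Summary of the resampling below: background rows are drawn per GC bin,
        # with replacement, in proportion to how often that bin occurs in the
        # input sequences (roughly 20,000 rows in total). The 99th percentile of
        # the resampled scores is then used as the per-motif threshold; motifs
        # whose 99th percentile is not below their maximum possible score get a
        # threshold of None.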
+ _threshold = _threshold.reset_index() + idx = np.hstack( + [ + _threshold[_threshold[_threshold.columns[0]] == gc_bin] + .sample(nseqs * count, replace=True, random_state=42) + .index.values + for gc_bin, count in gc_bin_count.items() + ] + ) + for motif in _threshold.columns[1:]: + val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher") + if val < maxt.loc[motif]: + t[motif] = val + else: + t[motif] = None + return t def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): scan_func = partial( scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc ) - for ret in self._scan_jobs(scan_func, seqs): yield ret[1] - def _scan_sequences(self, seqs, nreport, scan_rc): - - motif_file = self.motifs - motif_digest = self.checksum.get(motif_file, None) - - scan_seqs = seqs - if self.use_cache: - # determine which sequences are not in the cache - hashes = dict([(s.upper(), xxhash.xxh64(s.upper()).digest()) for s in seqs]) - scan_seqs = [] - - for seq, seq_hash in hashes.items(): - key = str( - (seq_hash, motif_digest, nreport, scan_rc, self.threshold_str) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - scan_seqs.append(seq.upper()) - - # scan the sequences that are not in the cache - if len(scan_seqs) > 0: - motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)] - scan_func = partial( - scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc - ) - - for seq, ret in self._scan_jobs(scan_func, scan_seqs): - if self.use_cache: - h = hashes[seq] - key = str((h, motif_digest, nreport, scan_rc, self.threshold_str)) - self.cache.set(key, ret) - else: - yield ret - - if self.use_cache: - # return results from cache - for seq in seqs: - key = str( - ( - hashes[seq.upper()], - motif_digest, - nreport, - scan_rc, - self.threshold_str, - ) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - raise Exception( - "cache is not big enough to hold all " - "results, try increasing the cache size " - "or disable cache" - ) + def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False): + thresholds = self.get_gc_thresholds(seqs, zscore=zscore) + motifs = [(m, thresholds[m.id]) for m in read_motifs(self.motifs)] + motifs_meanstd = None + if zscore: + motifs_meanstd = self.meanstd - yield ret + scan_func = partial( + scan_seq_mult, + motifs=motifs, + nreport=nreport, + scan_rc=scan_rc, + motifs_meanstd=motifs_meanstd, + zscore=zscore, + ) + for _, ret in self._scan_jobs(scan_func, seqs): + yield ret def _scan_jobs(self, scan_func, scan_seqs): batchsize = 1000 + if self.ncpus > 1: for i in range((len(scan_seqs) - 1) // batchsize + 1): batch = scan_seqs[i * batchsize : (i + 1) * batchsize] chunksize = len(batch) // self.ncpus + 1 jobs = [] for j in range((len(batch) - 1) // chunksize + 1): - job = self.pool.apply_async( - scan_func, (batch[j * chunksize : (j + 1) * chunksize],) - ) + batch_seqs = batch[j * chunksize : (j + 1) * chunksize] + seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs] + job = self.pool.apply_async(scan_func, (batch_seqs, seq_gc_bins)) jobs.append(job) for k, job in enumerate(jobs): @@ -1117,7 +1081,8 @@ def _scan_jobs(self, scan_func, scan_seqs): yield region, ret else: for i in range((len(scan_seqs) - 1) // batchsize + 1): - for _j, ret in enumerate( - scan_func(scan_seqs[i * batchsize : (i + 1) * batchsize]) - ): + batch_seqs = scan_seqs[i * batchsize : (i + 1) * batchsize] + seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs] + + for _j, ret in enumerate(scan_func(batch_seqs, 
seq_gc_bins)): yield scan_seqs[i], ret diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py index 0319e8ad..3bb69d60 100644 --- a/gimmemotifs/tools/meme.py +++ b/gimmemotifs/tools/meme.py @@ -1,5 +1,6 @@ from .motifprogram import MotifProgram import io +import os import re from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile @@ -75,8 +76,11 @@ def _run_program(self, bin, fastafile, params=None): if not default_params["single"]: cmd.append(strand) - # sys.stderr.write(" ".join(cmd) + "\n") - p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE) + # Fix to run in Docker + env = os.environ.copy() + env["OMPI_MCA_plm_rsh_agent"] = "sh" + + p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env) stdout, stderr = p.communicate() motifs = [] diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py index c8d17e45..5f39ddef 100644 --- a/gimmemotifs/tools/memew.py +++ b/gimmemotifs/tools/memew.py @@ -1,5 +1,6 @@ from .motifprogram import MotifProgram import io +import os import re from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile @@ -76,8 +77,12 @@ def _run_program(self, bin, fastafile, params=None): if not default_params["single"]: cmd.append(strand) + # Fix to run in Docker + env = os.environ.copy() + env["OMPI_MCA_plm_rsh_agent"] = "sh" + # sys.stderr.write(" ".join(cmd) + "\n") - p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE) + p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env) stdout, stderr = p.communicate() motifs = [] diff --git a/gimmemotifs/tools/weeder.py b/gimmemotifs/tools/weeder.py index 0361c2d2..0ad4334c 100644 --- a/gimmemotifs/tools/weeder.py +++ b/gimmemotifs/tools/weeder.py @@ -76,7 +76,7 @@ def _run_program(self, bin, fastafile, params=None): shutil.copy(fastafile, name) fastafile = name - cmd = "{} -f {} -O".format(self.cmd, fastafile, weeder_organism) + cmd = "{} -f {} -O {}".format(self.cmd, fastafile, weeder_organism) if params["single"]: cmd += " -ss" diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 17bd6fb9..a1b3c71b 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -5,8 +5,6 @@ # distribution. """ Odds and ends that for which I didn't (yet) find another place """ -from __future__ import print_function - # Python imports import os import re @@ -15,18 +13,21 @@ import logging import mmap import random -import six import tempfile import requests +from io import TextIOWrapper +from functools import singledispatch from subprocess import Popen from tempfile import NamedTemporaryFile from shutil import copyfile # External imports +import pyfaidx from scipy import special import numpy as np import pybedtools from genomepy import Genome +from Bio.SeqIO.FastaIO import SimpleFastaParser # gimme imports @@ -49,8 +50,7 @@ def rc(seq): def narrowpeak_to_bed(inputfile, bedfile, size=0): - """Convert narrowPeak file to BED file. - """ + """Convert narrowPeak file to BED file.""" p = re.compile(r"^(#|track|browser)") warn_no_summit = True with open(bedfile, "w") as f_out: @@ -88,7 +88,7 @@ def pfmfile_location(infile): "database specified in the config file." 
) - if isinstance(infile, six.string_types): + if isinstance(infile, str): if not os.path.exists(infile): motif_dir = config.get_motif_dir() checkfile = os.path.join(motif_dir, infile) @@ -132,7 +132,7 @@ def phyper_single(k, good, bad, N): def phyper(k, good, bad, N): - """ Current hypergeometric implementation in scipy is broken, + """Current hypergeometric implementation in scipy is broken, so here's the correct version. """ pvalues = [phyper_single(x, good, bad, N) for x in range(k + 1, N + 1)] @@ -293,8 +293,8 @@ def motif_localization(fastafile, motif, size, outfile, cutoff=0.9): def parse_cutoff(motifs, cutoff, default=0.9): - """ Provide either a file with one cutoff per motif or a single cutoff - returns a hash with motif id as key and cutoff as value + """Provide either a file with one cutoff per motif or a single cutoff + returns a hash with motif id as key and cutoff as value """ cutoffs = {} @@ -473,7 +473,7 @@ def get_seqs_type(seqs): - region file - BED file """ - region_p = re.compile(r"^(.+):(\d+)-(\d+)$") + region_p = re.compile(r"^([^\s:]+\@)?(.+):(\d+)-(\d+)$") if isinstance(seqs, Fasta): return "fasta" elif isinstance(seqs, list) or isinstance(seqs, np.ndarray): @@ -499,24 +499,197 @@ def get_seqs_type(seqs): raise ValueError("unknown type {}".format(type(seqs).__name__)) -def as_fasta(seqs, genome=None): - ftype = get_seqs_type(seqs) - if ftype == "fasta": - return seqs - elif ftype == "fastafile": - return Fasta(seqs) +# Regular expression to check for region (chr:start-end or genome@chr:start-end) +region_p = re.compile(r"^[^@]+@([^\s]+):(\d+)-(\d+)$") + + +def _check_minsize(fa, minsize): + """ + Raise ValueError if there is any sequence that is shorter than minsize. + If minsize is None the size will not be checked. + """ + if minsize is None: + return fa + + for name, seq in fa.items(): + if len(seq) < minsize: + raise ValueError(f"sequence {name} is shorter than {minsize}") + + return fa + + +def _genomepy_convert(to_convert, genome, minsize=None): + """ + Convert a variety of inputs using track2fasta(). + """ + if genome is None: + raise ValueError("input file is not a FASTA file, need a genome!") + + if isinstance(genome, Genome): + g = genome else: - if genome is None: - raise ValueError("need genome to convert to FASTA") + g = Genome(genome) + + tmpfile = NamedTemporaryFile() + g.track2fasta(to_convert, tmpfile.name) + + fa = as_seqdict(tmpfile.name) + return _check_minsize(fa, minsize) + + +def _as_seqdict_genome_regions(regions, minsize=None): + """ + Accepts list of regions where the genome is encoded in the region, + using the genome@chrom:start-end format. + """ + genomic_regions = {} + for region in regions: + genome, region = region.split("@") + if genome not in genomic_regions: + Genome(genome) + genomic_regions[genome] = [] + genomic_regions[genome].append(region) + + tmpfa = NamedTemporaryFile(mode="w", delete=False) + for genome, g_regions in genomic_regions.items(): + g = Genome(genome) + + fa = g.track2fasta(g_regions) + + for seq in fa: + seq.name = f"{genome}@{seq.name}" + print(seq.__repr__(), file=tmpfa) + + tmpfa.flush() + + # Open tempfile and restore original sequence order + fa = as_seqdict(tmpfa.name) + fa = {region: fa[region] for region in regions} + return _check_minsize(fa, minsize) + + +@singledispatch +def as_seqdict(to_convert, genome=None, minsize=None): + """ + Convert input to a dictionary with name as key and sequence as value. + + If the input contains genomic coordinates, the genome needs to be + specified. 
If minsize is specified all sequences will be checked if they
+    are not shorter than minsize. If regions (or a region file) are used as
+    the input, the genome can optionally be specified in the region using the
+    following format: genome@chrom:start-end.
+
+    Current supported input types include:
+    * FASTA, BED and region files.
+    * List or numpy.ndarray of regions.
+    * pyfaidx.Fasta object.
+    * pybedtools.BedTool object.
+
+    Parameters
+    ----------
+    to_convert : list, str, pyfaidx.Fasta or pybedtools.BedTool
+        Input to convert to FASTA-like dictionary
+
+    genome : str, optional
+        Genomepy genome name.
+
+    minsize : int or None, optional
+        If specified, check if all sequences have at least size minsize.
+
+    Returns
+    -------
+    dict with sequence names as key and sequences as value.
+    """
+    raise NotImplementedError(f"Not implemented for {type(to_convert)}")
+
+
+@as_seqdict.register(list)
+def _as_seqdict_list(to_convert, genome=None, minsize=None):
+    """
+    Accepts list of regions as input.
+    """
+    if region_p.match(to_convert[0]):
+        return _as_seqdict_genome_regions(to_convert, minsize)
+
+    return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(TextIOWrapper)
+def _as_seqdict_file_object(to_convert, genome=None, minsize=None):
+    """
+    Accepts file object as input, should be a FASTA file.
+    """
+    fa = {x: y for x, y in SimpleFastaParser(to_convert)}
+    return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(str)
+def _as_seqdict_filename(to_convert, genome=None, minsize=None):
+    """
+    Accepts filename as input.
+    """
+    if not os.path.exists(to_convert):
+        raise ValueError("Assuming filename, but it does not exist")
+
+    with open(to_convert) as f:
+        fa = as_seqdict(f)
+
+    if any(fa):
+        return _check_minsize(fa, minsize)
+
+    with open(to_convert) as f:
+        line = ""
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+            if not line.startswith("#"):
+                break
+
+        if line == "":
+            raise IOError(f"empty file {to_convert}")
+
+        if region_p.match(line.strip()):
+            regions = [myline.strip() for myline in [line] + f.readlines()]
+            return _as_seqdict_genome_regions(regions, minsize=None)
+
+    # Biopython parser resulted in empty dict
+    # Assuming it's a BED or region file
+    return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(pyfaidx.Fasta)
+def _as_seqdict_pyfaidx(to_convert, genome=None, minsize=None):
+    """
+    Accepts pyfaidx.Fasta object as input.
+    """
+    fa = {k: str(v) for k, v in to_convert.items()}
+    return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(pybedtools.BedTool)
+def _as_seqdict_bedtool(to_convert, genome=None, minsize=None):
+    """
+    Accepts pybedtools.BedTool as input.
+    """
+    return _genomepy_convert(
+        ["{}:{}-{}".format(*f[:3]) for f in to_convert], genome, minsize
+    )
+
+
+@as_seqdict.register(np.ndarray)
+def _as_seqdict_array(to_convert, genome=None, minsize=None):
+    """
+    Accepts numpy.ndarray with regions as input.
+ """ + return as_seqdict(list(to_convert), genome, minsize) + - tmpfa = NamedTemporaryFile() - if isinstance(genome, str): - genome = Genome(genome) +def as_fasta(to_convert, genome=None, minsize=None): + if isinstance(to_convert, Fasta): + return to_convert - if isinstance(seqs, np.ndarray): - seqs = list(seqs) - genome.track2fasta(seqs, tmpfa.name) - return Fasta(tmpfa.name) + return Fasta(fdict=as_seqdict(to_convert, genome, minsize)) def file_checksum(fname): @@ -539,11 +712,11 @@ def file_checksum(fname): return checksum -def join_max(a, l, sep="", suffix=""): +def join_max(a, length, sep="", suffix=""): lengths = [len(x) for x in a] total = 0 for i, size in enumerate(lengths + [0]): - if total > (l - len(suffix)): + if total > (length - len(suffix)): return sep.join(a[: i - 1]) + suffix if i > 0: total += 1 diff --git a/readthedocs.yml b/readthedocs.yml index 9a6a856f..e458c84e 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,18 +1,18 @@ # Required version: 2 -# Build documentation in the docs/ directory with Sphinx +conda: + environment: .rtd-environment.yml + + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py +build: + image: latest + python: - version: 3.7 - install: - - requirements: docs/requirements.txt - - method: pip - path: . - extra_requirements: - - docs - - method: setuptools - path: another/package - system_packages: true + version: 3.7 + install: + - method: setuptools + path: . diff --git a/requirements.txt b/requirements.txt index 604878ff..ec991b68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,12 +10,9 @@ pyyaml >= 3.10 pybedtools statsmodels scikit-learn -sklearn-contrib-lightning seaborn pysam xgboost diskcache xxhash -six -future pillow diff --git a/scripts/combine_peaks b/scripts/combine_peaks index 155d6e83..da2870c0 100644 --- a/scripts/combine_peaks +++ b/scripts/combine_peaks @@ -46,7 +46,7 @@ def read_peak_file_to_df(fname): "qval", "peak", ] - df = pd.read_table(fname, names=header) + df = pd.read_table(fname, names=header, dtype={"chrom": "str"}) df["chrom"] = df["chrom"].astype(str) # get the summit @@ -57,7 +57,7 @@ def read_peak_file_to_df(fname): df["value"] = df["qval"] df = df[summit_header] elif ftype == "bed": - df = pd.read_table(fname, names=summit_header) + df = pd.read_table(fname, names=summit_header, dtype={"chrom": "str"}) if ((df["end"] - df["start"]) != 1).sum() != 0: raise ValueError(f"{fname} does not contain summits.") else: @@ -116,7 +116,7 @@ def combine_peaks(peaks, genome, window, scale_value): # store summit location + associated value in col4 df_all["col4"] = ( - df_all["chrom"] + df_all["chrom"].astype(str) + ";" + df_all["start"].astype(str) + ";" diff --git a/scripts/coverage_table b/scripts/coverage_table index 64385b65..14f3128f 100644 --- a/scripts/coverage_table +++ b/scripts/coverage_table @@ -9,6 +9,7 @@ import pysam import numpy as np import pandas as pd from sklearn.preprocessing import scale +import qnorm from gimmemotifs import __version__ @@ -27,11 +28,12 @@ def make_table( datafiles, window, log_transform=True, - scale_table=True, + normalization="none", top=0, topmethod="var", rmdup=True, rmrepeats=True, + ncpus=12 ): for x in datafiles: if not os.path.isfile(x): @@ -49,7 +51,7 @@ def make_table( data = {} try: # Load data in parallel - pool = multiprocessing.Pool(processes=12) + pool = multiprocessing.Pool(processes=ncpus) jobs = [] for datafile in datafiles: jobs.append( @@ -99,9 +101,14 @@ def make_table( if log_transform: print("Log transform", 
file=sys.stderr) df = np.log1p(df) - if scale_table: - print("Scale", file=sys.stderr) + if normalization == "scale": + print("Normalization by scaling", file=sys.stderr) df[:] = scale(df, axis=0) + if normalization == "quantile": + print("Normalization by quantile normalization", file=sys.stderr) + df = qnorm.quantile_normalize(df) + else: + print("No normalization", file=sys.stderr) if top > 0: if topmethod == "var": @@ -165,12 +172,12 @@ if __name__ == "__main__": action="store_true", ) parser.add_argument( - "-s", - "--scale", - dest="scale_table", - help="Scale per datafile", - default=False, - action="store_true", + "-n", + "--normalization", + dest="normalization", + help="Normalization: none, quantile or scale", + default="none", + metavar="METHOD", ) parser.add_argument( "-t", "--top", dest="top", help="Select regions.", default=0, type=int @@ -196,6 +203,14 @@ if __name__ == "__main__": action="store_false", default=True, ) + parser.add_argument( + "--nthreads", + dest="ncpus", + help="Number of threads", + metavar="INT", + type=int, + default=12, + ) args = parser.parse_args() peakfile = args.peakfile @@ -205,11 +220,12 @@ if __name__ == "__main__": datafiles, args.window, log_transform=args.log_transform, - scale_table=args.scale_table, + normalization=args.normalization, top=args.top, topmethod=args.topmethod, rmdup=args.rmdup, rmrepeats=args.rmrepeats, + ncpus=args.ncpus ) yesno = {True: "yes", False: "no"} @@ -222,7 +238,7 @@ output.write("# Window: {}\n".format(args.window)) output.write("# Duplicates removed: {}\n".format(yesno[args.rmdup])) output.write("# MAPQ 0 removed: {}\n".format(yesno[args.rmrepeats])) output.write("# Log transformed: {}\n".format(yesno[args.log_transform])) -output.write("# Scaled: {}\n".format(yesno[args.scale_table])) +output.write("# Normalization: {}\n".format(args.normalization)) if args.top > 0: output.write("# Top {} regions selected by {}\n".format(args.top, args.topmethod)) df.to_csv(output, sep="\t", float_format="%0.5f") diff --git a/setup.py b/setup.py index 5f9fd60f..b5dc2217 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,6 @@ def run(self): "pybedtools", "statsmodels", "scikit-learn", - "sklearn-contrib-lightning", "seaborn", "pysam", "xgboost >= 0.71", @@ -132,11 +131,10 @@ def run(self): "diskcache", "xxhash", "configparser", - "six", - "future", - "genomepy >= 0.7.2", + "genomepy >= 0.8.3", "tqdm", "pillow", "logomaker", + "qnorm", ], ) diff --git a/test/data/rank/ranked.txt b/test/data/rank/ranked.txt index f5f36f40..bb52cd79 100644 --- a/test/data/rank/ranked.txt +++ b/test/data/rank/ranked.txt @@ -1,6 +1,6 @@ a b c d e -1 bZIP bZIP SOX T-box POU -2 AP2 AP2 AP2 AP2 AP2 -3 T-box SOX T-box SOX SOX -4 SOX T-box POU POU T-box -5 POU POU bZIP bZIP bZIP +bZIP 5 5 1 1 1 +AP2 4 4 4 4 4 +T-box 3 2 3 5 2 +SOX 2 3 5 3 3 +POU 1 1 2 2 4 diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py index 269e5b92..53bffc77 100644 --- a/test/test_maelstrom.py +++ b/test/test_maelstrom.py @@ -26,13 +26,30 @@ def test1_maelstrom(self): self.clusters, "mm10", self.outdir, + filter_redundant=False, score_table=self.score_table, count_table=self.count_table, plot=False, ) df = pd.read_table(self.outfile, index_col=0, comment="#") print(df.shape) - self.assertEquals((623, 4), df.shape) + + self.assertEquals((623, 8), df.shape) + + # Filter redundant motifs + run_maelstrom( + self.clusters, + "mm10", + self.outdir, + filter_redundant=True, + score_table=self.score_table, + count_table=self.count_table, + plot=False, + ) + df = 
pd.read_table(self.outfile, index_col=0, comment="#") + print(df.shape) + self.assertEquals((156, 8), df.shape) + for fname in glob(os.path.join(self.outdir, "activity*")): os.unlink(fname) diff --git a/test/test_moap.py b/test/test_moap.py index 775b3534..c6c13be3 100644 --- a/test/test_moap.py +++ b/test/test_moap.py @@ -20,7 +20,7 @@ def setUp(self): def test1_moap(self): """ Test motif activity prediction """ - for method in ["mwu", "rf", "lightningclassification"]: + for method in ["mwu", "rf"]: df = moap( self.clusters, method=method, @@ -41,7 +41,7 @@ def test1_moap(self): def test2_moap(self): """ Test motif activity prediction for two clusters """ - for method in ["mwu", "rf", "lightningclassification"]: + for method in ["mwu", "rf"]: df = moap( self.clusters2, method=method, diff --git a/test/test_rank.py b/test/test_rank.py index 5cb533e9..cca52132 100644 --- a/test/test_rank.py +++ b/test/test_rank.py @@ -2,7 +2,7 @@ import tempfile import os import pandas as pd -from gimmemotifs.rank import rankagg +from gimmemotifs.rank import rankagg, _rankagg_stuart class TestRank(unittest.TestCase): @@ -17,13 +17,15 @@ def setUp(self): def test1_rankagg(self): """ Test rank aggregation """ df = pd.read_csv(self.fname, index_col=0, sep="\t") - result = rankagg(df) - self.assertEqual("AP2", result.sort_values("score").index[0]) + result = rankagg(df, method="stuart") + self.assertEqual("AP2", result.sort_values("score").index[-1]) + result = rankagg(df, method="int_stouffer") + self.assertEqual("AP2", result.sort_values("z-score").index[-1]) def test2_rankagg(self): """ Test Python implementation of rank aggregation """ df = pd.read_csv(self.rank_in, index_col=0, sep="\t") - result = rankagg(df)["score"].values + result = _rankagg_stuart(df)["score"].values ref = pd.read_csv(self.rank_out, index_col=0, sep="\t")["score"].values for v1, v2 in zip(ref, result): self.assertAlmostEqual(v1, v2) diff --git a/test/test_tools.py b/test/test_tools.py index d6dba2db..9903ce54 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -35,6 +35,7 @@ def test_tool(tool_name): "trawler", # unpredictable, sometimes doesn't find the motif "weeder", # doesn't work at the moment "posmo", # motif doesn't predictably look like AP1 + "dreme", # current dreme in bioconda is broken ]: return diff --git a/test/test_utils.py b/test/test_utils.py index 87f7b9da..c4f72c2f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -13,7 +13,7 @@ class TestUtils(unittest.TestCase): """ A test class to test utils functions """ def setUp(self): - self.genome_dir = "test/data/genome_index" + self.genomes_dir = "test/data/genome_index" self.datadir = "test/data/utils" def test1_phyper(self): @@ -31,7 +31,7 @@ def test2_as_fasta(self): """ convert bed, regions, etc to Fasta """ tmpdir = mkdtemp() - g = Genome("genome", genome_dir=self.genome_dir) + g = Genome("genome", genomes_dir=self.genomes_dir) fafile = os.path.join(self.datadir, "test.fa") fa = Fasta(fafile) diff --git a/versioneer.py b/versioneer.py index 64fea1c8..cce201c7 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.18 """The Versioneer - like a rocketeer, but for versions. 
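For illustration, a minimal sketch of how the new `as_seqdict()` / `as_fasta()` converters added in gimmemotifs/utils.py earlier in this patch might be used, including the `genome@chrom:start-end` multi-species region format. This is not part of the patch; it assumes the referenced genomes are installed via genomepy, and the genome and file names are hypothetical:

```python
# Minimal sketch (assumptions: "hg38"/"mm10" genomes installed via genomepy,
# "peaks.fa" is a local FASTA file; all names here are hypothetical).
from gimmemotifs.utils import as_seqdict, as_fasta

# Regions resolved against a single genome passed explicitly;
# minsize raises a ValueError if any sequence is shorter.
seqs = as_seqdict(["chr1:100-200", "chr2:300-400"], genome="hg38", minsize=50)

# Multi-species input: the genome is encoded in each region as
# genome@chrom:start-end, so no separate genome argument is needed.
seqs = as_seqdict(["hg38@chr1:100-200", "mm10@chr1:100-200"])

# A FASTA file needs no genome; as_fasta() wraps the resulting dict in a Fasta object.
fa = as_fasta("peaks.fa")
```

Per the singledispatch registrations in that hunk, lists, region/BED/FASTA files, open file objects, pyfaidx.Fasta and pybedtools.BedTool inputs all converge on the same name-to-sequence dictionary before being wrapped by `as_fasta()`.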
@@ -276,7 +275,6 @@ """ -from __future__ import print_function try: import configparser except ImportError: @@ -308,11 +306,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -325,8 +325,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -348,6 +350,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -372,17 +375,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -390,10 +394,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -418,7 +425,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, p.returncode -LONG_VERSION_PY['git'] = ''' +LONG_VERSION_PY[ + "git" +] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -993,7 +1002,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. 
TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1002,7 +1011,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1010,19 +1019,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1037,8 +1053,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1046,10 +1061,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1072,17 +1096,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1091,10 +1114,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1105,13 +1130,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1167,16 +1192,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1205,11 +1236,13 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1218,8 +1251,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1251,8 +1283,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % 
(pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1366,11 +1397,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1390,9 +1423,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1415,8 +1452,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1470,9 +1508,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } def get_version(): @@ -1521,6 +1563,7 @@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1553,14 +1596,15 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1581,17 +1625,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: @@ -1610,13 +1658,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments @@ -1643,8 +1695,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1699,11 +1753,13 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1712,15 +1768,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1762,8 +1821,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: