diff --git a/.rtd-environment.yml b/.rtd-environment.yml
new file mode 100644
index 00000000..cbcad0d7
--- /dev/null
+++ b/.rtd-environment.yml
@@ -0,0 +1,33 @@
+name: gimmemotifs
+channels:
+ - defaults
+ - bioconda
+ - conda-forge
+dependencies:
+ - configparser
+ - diskcache
+ - feather-format
+ - genomepy >=0.8.3
+ - jinja2
+ - logomaker
+ - matplotlib-base >=3.1.2
+ - ncurses
+ - numpy
+ - pandas >=1.0.3
+ - pillow
+ - pyarrow >=0.16.0
+ - pybedtools
+ - python >=3
+ - python-xxhash
+ - pyyaml >=3.10
+ - qnorm
+ - represent
+ - scikit-learn >=0.18
+ - scipy >=1.3.0
+ - seaborn
+ - statsmodels
+ - tqdm >=4.27.0
+ - xdg
+ - xgboost >=0.71
+ - sphinx_bootstrap_theme
+ - numpydoc
diff --git a/.travis.yml b/.travis.yml
index b452bd84..c8a17716 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,8 +21,11 @@ before_install:
fi
- chmod +x miniconda.sh
- ./miniconda.sh -b -p $HOME/miniconda -f
- - export PATH=$HOME/miniconda/bin:$PATH
- - conda config --set always_yes yes
+ - source "$HOME/miniconda/etc/profile.d/conda.sh"
+ - hash -r
+ - conda config --set always_yes yes --set changeps1 no
+ - conda update -q conda
+ - conda info -a
- if [ "$TRAVIS_OS_NAME" == "osx" ]; then ulimit -S -n 4096; ulimit -a; fi
install:
@@ -34,7 +37,8 @@ install:
else
conda env create -q -f conda_env.osx.txt -n gimme;
fi
- - source activate gimme
+ - conda activate gimme
+ - conda list
- python setup.py build && pip install -e .
before_script:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0638e19..a109ed18 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,33 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
+## [0.15.0] - 2020-09-29
+
+### Added
+
+- Added additional columns to `gimme maelstrom` output for better interpretation (correlation of motif to signal and % of regions with motif).
+- Added support for multi-species input in `genome@chrom:start-end` format.
+- `gimme maelstrom` warns if data is not row-centered and will center by default.
+- `gimme maelstrom` selects a set of non-redundant (or less redundant) motifs by default.
+- Added SVR regressor for `gimme maelstrom`.
+- Added quantile normalization to `coverage_table`.
+
+### Removed
+
+- Removed the lightning classifiers and regressors as the package is no longer actively maintained.
+
+### Changed
+
+- Visually improved HTML output.
+- Score of `maelstrom` is now an aggregate z-score based on combining z-scores from individual methods using Stouffer's method. The z-scores of individual methods are generated using the inverse normal transform.
+- Reorganized some classes and functions.
+
+### Fixed
+
+- Fixed minor issues with sorting columns in HTML output.
+- `gimme motifs` doesn't crash when no motifs are found.
+- Fixed error with Ensembl chromosome names in `combine_peaks`.
+
## [0.14.4] - 2020-04-02
### Fixed
diff --git a/conda_env.dev.txt b/conda_env.dev.txt
index 31297494..8dc6f56e 100644
--- a/conda_env.dev.txt
+++ b/conda_env.dev.txt
@@ -3,12 +3,9 @@ configparser
dinamo
diskcache
feather-format
-future
gadem
-genomepy >=0.6.1
-ghostscript
+genomepy >=0.8.3
homer
-icu=58
ipywidgets # Necessary for progress bar in Jupyter notebook
jinja2
logomaker
@@ -18,17 +15,16 @@ ncurses
numpy
prosampler
pillow
-pyarrow
+pyarrow >=0.16.0
pybedtools
pysam
python
python-xxhash
pyyaml >=3.10
-scikit-learn >=0.18
-scipy >=1.3.0
+qnorm
+scikit-learn >=0.23
+scipy >=1.4.1
seaborn
-six
-sklearn-contrib-lightning
statsmodels
tqdm >=4.27.0
trawler
diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index dc2adacb..dd1a2c98 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -2,13 +2,13 @@ bedtools
configparser
diskcache
feather-format
-future
gadem
-genomepy >=0.6.1
+genomepy >=0.8.3
ghostscript
homer
jinja2
logomaker
+llvm-openmp
matplotlib >=2.0
meme >=5
ncurses
@@ -21,11 +21,10 @@ pysam
python
python-xxhash
pyyaml >=3.10
-scikit-learn >=0.18
-scipy <1.3.0
+qnorm
+scikit-learn
+scipy
seaborn
-six
-sklearn-contrib-lightning
statsmodels
tqdm >=4.27.0
trawler
@@ -33,7 +32,7 @@ ucsc-bigbedtobed
ucsc-genepredtobed
weeder
xdg
-xgboost >=0.71
+xgboost=0.72
xxmotif
# development-specific
diff --git a/conda_env.test.txt b/conda_env.test.txt
new file mode 100644
index 00000000..af2f478c
--- /dev/null
+++ b/conda_env.test.txt
@@ -0,0 +1,37 @@
+bedtools
+configparser
+dinamo
+diskcache
+feather-format
+gadem
+genomepy >=0.6.1
+ghostscript
+homer
+icu=58
+ipywidgets # Necessary for progress bar in Jupyter notebook
+jinja2
+logomaker
+matplotlib >=2.0
+meme >=5
+ncurses
+numpy
+pillow
+prosampler
+pyarrow
+pybedtools
+python >=3.8
+python-xxhash
+pyyaml >=3.10
+qnorm
+scikit-learn >=0.18
+scipy <1.3.0
+seaborn
+statsmodels
+tqdm >=4.27.0
+trawler
+ucsc-bigbedtobed
+ucsc-genepredtobed
+weeder
+xdg
+xgboost >=0.71
+xxmotif
diff --git a/conda_env.txt b/conda_env.txt
index 5767b058..b569daff 100644
--- a/conda_env.txt
+++ b/conda_env.txt
@@ -1,39 +1,35 @@
-bedtools
configparser
dinamo
diskcache
feather-format
-future
gadem
-genomepy >=0.6.1
+genomepy >=0.8.3
ghostscript
homer
-icu=58
ipywidgets # Necessary for progress bar in Jupyter notebook
jinja2
logomaker
-matplotlib >=2.0
-meme >=5
+matplotlib-base >=3.1.2
+meme >=5.1.1
ncurses
numpy
+pandas >=1.0.3
pillow
prosampler
-pyarrow
+pyarrow >=0.16.0
pybedtools
pysam
-python
+python >=3
python-xxhash
pyyaml >=3.10
+qnorm
scikit-learn >=0.18
scipy <1.3.0
seaborn
-six
-sklearn-contrib-lightning
statsmodels
tqdm >=4.27.0
trawler
ucsc-bigbedtobed
-ucsc-genepredtobed
weeder
xdg
xgboost >=0.71
diff --git a/data/templates/sortable/sortable-theme-slick.css b/data/templates/sortable/sortable-theme-slick.css
index df20f5a8..a699c304 100644
--- a/data/templates/sortable/sortable-theme-slick.css
+++ b/data/templates/sortable/sortable-theme-slick.css
@@ -1,6 +1,7 @@
/* line 2, ../sass/_sortable.sass */
table[data-sortable] {
- font-size: 80%;
+ font-family: 'Nunito Sans';
+ font-size: 90%;
border-collapse: collapse;
border-spacing: 0;
}
diff --git a/data/templates/sortable/sortable.min.js b/data/templates/sortable/sortable.min.js
index b968cf01..11d419dd 100644
--- a/data/templates/sortable/sortable.min.js
+++ b/data/templates/sortable/sortable.min.js
@@ -1,2 +1,2 @@
/*! sortable.js 0.8.0 */
-(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(-?[£$¤]?[\d,.e\-]+%?|inf)$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this);
+(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(.+\>)?(-?[£$¤]?[\d,.e\-]+%?|inf)(\<.+)?$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this);
diff --git a/data/templates/table.tpl b/data/templates/table.tpl
new file mode 100644
index 00000000..bf836b55
--- /dev/null
+++ b/data/templates/table.tpl
@@ -0,0 +1,215 @@
+{# Update the template_structure.html document too #}
+{%- block before_style -%}{%- endblock before_style -%}
+{% block style %}
+
+
+{%- endblock style %}
+{%- block before_table %}{% endblock before_table %}
+{%- block table %}
+<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
+{%- block caption %}
+{%- if caption -%}
+    <caption>{{caption}}</caption>
+{%- endif -%}
+{%- endblock caption %}
+{%- block thead %}
+<thead>
+    {%- block before_head_rows %}{% endblock %}
+    {%- for r in head %}
+    {%- block head_tr scoped %}
+    <tr>
+    {%- for c in r %}
+    {%- if c.is_visible != False %}
+    <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
+    {%- endif %}
+    {%- endfor %}
+    </tr>
+    {%- endblock head_tr %}
+    {%- endfor %}
+    {%- block after_head_rows %}{% endblock %}
+</thead>
+{%- endblock thead %}
+{%- block tbody %}
+<tbody>
+    {% block before_rows %}{% endblock before_rows %}
+    {% for r in body %}
+    {% block tr scoped %}
+    <tr>
+    {% for c in r %}
+    {% if c.is_visible != False %}
+    <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
+    {% endif %}
+    {%- endfor %}
+    </tr>
+    {% endblock tr %}
+    {%- endfor %}
+    {%- block after_rows %}{%- endblock after_rows %}
+</tbody>
+{%- endblock tbody %}
+</table>
+{%- endblock table %}
+{%- block after_table %}{% endblock after_table %}
+
diff --git a/docs/examples.rst b/docs/examples.rst
index 33fd06c7..9eedc21a 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -41,10 +41,10 @@ Compare motifs between data sets
$ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.out/
The output scores of ``gimme maelstrom`` represent the combined result of multiple methods.
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using rank aggregation.
-The score that is shown is the -log10(p-value), where the p-value (from the rank aggregation) is corrected for multiple testing.
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif, or a higher motif score, is associated with a higher signal in a specific sample.
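+
+To inspect the results programmatically, the final table can be loaded with pandas. This is a minimal sketch
+(the output directory name matches the command above; the exact column layout depends on the GimmeMotifs version):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df = pd.read_table("maelstrom.out/final.out.txt", index_col=0, comment="#")
+    # motifs with the highest aggregated z-score in the first sample
+    print(df.sort_values(df.columns[0], ascending=False).head())
+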
Create sequence logos
---------------------
diff --git a/docs/reference.rst b/docs/reference.rst
index f29e2044..2ba5b10e 100644
--- a/docs/reference.rst
+++ b/docs/reference.rst
@@ -362,11 +362,26 @@ This command can be used to identify differential motifs between two or more dat
::
-h, --help show this help message and exit
- -p PFMFILE, --pfmfile PFMFILE
+ -p pfmfile, --pfmfile pfmfile
PFM file with motifs (default:
- gimme.vertebrate.v5.0.pwm)
+ gimme.vertebrate.v5.0.pfm)
+ --no-filter Don't remove redundant motifs.
+ -F FLOAT, --filter_cutoff FLOAT
+ Cutoff to select non-redundant motifs. Default is 0.8,
+ increase this value to get fewer motifs.
+  --nocenter            Don't mean-center the rows (by default the rows are mean-centered)
-m NAMES, --methods NAMES
Run with specific methods
+ -a method, --aggregation method
+ How to combine motifs from individual methods. Default
+ is "int_stouffer", for inverse normal transform of
+ ranks, followed by Stouffer's method to combine
+ z-scores. Alternatively, specify "stuart" for log-
+ transformed rank aggregation p-values.
+ -N INT, --nthreads INT
+ Number of threads (default 12)
+ --rawscore Don't z-score normalize motif scores
+ --nogc Don't use GC% bins
**Input file formats**
@@ -407,16 +422,23 @@ The second option looks like this:
This is a tab-separated table, with a header describing the experiments. In case of sequencing data, such
as ChIP-seq, ATAC-seq or DNaseI seq, we recommend to use **log-transformed** read counts which are
-**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after the log-transformatiion step,
-for instance by quantile normalization or scaling.
+**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after
+the log-transformation step, for instance by quantile normalization or scaling.
+By default, ``gimme maelstrom`` will mean-center the input; disable this with ``--nocenter``.
The second input format generally gives better results than the first one and would be the recommended format.
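+
+As a minimal sketch of preparing such a table with pandas (the ``qnorm`` package and its ``quantile_normalize``
+function are assumed to be available; file and column names are illustrative):
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+    import qnorm
+
+    # raw read counts per region (rows) and experiment (columns)
+    counts = pd.read_table("counts.txt", index_col=0, comment="#")
+    # log-transform, quantile-normalize between columns, then mean-center per row
+    logged = np.log2(counts + 1)
+    normed = qnorm.quantile_normalize(logged)
+    centered = normed.sub(normed.mean(axis=1), axis=0)
+    centered.to_csv("maelstrom.input.txt", sep="\t")
+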
The output scores of ``gimme maelstrom`` represent the combined result of multiple methods.
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using the rank aggregation method from `Kolde, 2012 `_.
-The score that is shown is the -log10(p-value), where the p-value comes from the rank aggregation.
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif, or a higher motif score, is associated with a higher signal in a specific sample.
+
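+The aggregation can be illustrated with a short sketch (a simplification for clarity; the function name
+``int_stouffer`` below is illustrative, not the exact GimmeMotifs implementation). Per method, motif ranks are
+converted to z-scores with the inverse normal transform and then combined as ``(z1 + ... + zk) / sqrt(k)``:
+
+.. code-block:: python
+
+    import numpy as np
+    from scipy.stats import norm, rankdata
+
+    def int_stouffer(scores):
+        """Combine per-method motif scores (rows: motifs, columns: methods)."""
+        n, k = scores.shape
+        # inverse normal transform: ranks -> z-scores, per method (column)
+        ranks = np.apply_along_axis(rankdata, 0, scores)
+        z = norm.ppf((ranks - 0.5) / n)
+        # Stouffer's method: combine the per-method z-scores into one z-score
+        return z.sum(axis=1) / np.sqrt(k)
+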
+By default, ``gimme maelstrom`` selects a non-redundant set of motifs by clustering the motifs based on scores in the set of
+input sequences. You can disable this by using the ``--no-filter`` argument. You can tweak the number of selected motifs by
+changing the ``--filter_cutoff`` parameter. By default this is set to ``0.8``. Increase this value to select fewer motifs,
+decrease it to select more motifs. Keep in mind that you may start to lose biologically relevant motifs if you set this too high.
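+
+The sketch below mirrors the approach used in this release: motifs are clustered with ``FeatureAgglomeration``
+on the correlation of their score profiles, and the highest-variance motif of each cluster is kept. The helper
+name ``select_nonredundant`` is illustrative:
+
+.. code-block:: python
+
+    import pandas as pd
+    from sklearn.cluster import FeatureAgglomeration
+
+    def select_nonredundant(scores, cutoff=0.8):
+        """scores: DataFrame of motif scores (regions x motifs)."""
+        fa = FeatureAgglomeration(
+            n_clusters=None,
+            distance_threshold=cutoff,
+            affinity="correlation",
+            linkage="complete",
+            compute_full_tree=True,
+        )
+        fa.fit(scores)
+        clusters = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
+        clusters["var"] = scores.var().values
+        # keep the highest-variance motif of every cluster
+        return (
+            clusters.sort_values("var")
+            .drop_duplicates("label", keep="last")["motif"]
+            .tolist()
+        )
+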
.. _`gimme_scan`:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2ba060ac..9429d2a2 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -12,5 +12,3 @@ six
future
statsmodels
tqdm
-xgboost >=0.71
-sklearn-contrib-lightning==0.4.0
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 088f8710..9aae5889 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -320,7 +320,7 @@ There is also a larger file, that contains more regions ``hg19.blood.most_variab
$ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.blood.1k.out
-There output directory contains several files:
+The output directory contains several files:
::
@@ -330,11 +330,11 @@ There output directory contains several files:
The two motif files, ``motif.count.txt.gz`` and ``motif.score.txt.gz`` contain the motif scan results.
The ``activity.*.out.txt`` files are tables with the results of the individual methods.
The main result is ``final.out.txt``, which integrates all individual methods in a final score.
-This score represents the combined result of multiple methods.
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using the rank aggregation method from `Kolde, 2012 `_.
-The score that is shown is the -log10(p-value).
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+This z-score represents the combined result of multiple methods.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif, or a higher motif score, is associated with a higher signal in a specific sample.
The file ``gimme.maelstrom.report.html`` contains a graphical summary of this file that can be opened in your web browser.
@@ -342,7 +342,6 @@ The file ``gimme.maelstrom.report.html`` contains a graphical summary of this fi
You can sort on the different columns by clicking on them.
-
The following Python snippet will create a heatmap of the results.
.. code-block:: python
@@ -359,7 +358,7 @@ This will show a heatmap like this:
.. image:: images/heatmap.png
We see that the expected motifs for different cell types are identified. GATA/TAL1 for Erythrocytes, CEBP for monocytes, LEF/TCF for T cells (ie. Wnt signaling), SPIB and PAX5 for B cells and so on.
-Keep in mind that this shows only the most relevant motifs (-log10 p-value cutoff of 6), there are more relevant motifs.
+Keep in mind that this shows only the most relevant motifs (z-score threshold of 6); there are more relevant motifs.
This example was run only on 1,000 variable enhancer. A file with more regions, ``hg19.blood.most_variable.10k.txt`` for this example, will usually yield better results.
The Jupyter notebook example `maelstrom.ipynb `_ shows a more extensive example on how to work with maelstrom results in Python.
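+
+As a minimal sketch (assuming the ``MaelstromResult`` helper class from ``gimmemotifs.maelstrom``; see the
+notebook for the full workflow):
+
+.. code-block:: python
+
+    from gimmemotifs.maelstrom import MaelstromResult
+
+    mr = MaelstromResult("maelstrom.blood.1k.out")
+    # clustered heatmap of motifs passing the z-score threshold
+    mr.plot_heatmap(threshold=2)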
diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py
index 3d63dd7f..fcd514c2 100644
--- a/gimmemotifs/__init__.py
+++ b/gimmemotifs/__init__.py
@@ -52,3 +52,25 @@ def filter(self, record):
__version__ = get_versions()["version"]
del get_versions
+
+# fmt: off
+# easier import of gimme (config and cli left out)
+from . import background # noqa: F401
+from . import cluster # noqa: F401
+from . import comparison # noqa: F401
+from . import denovo # noqa: F401
+from . import fasta # noqa: F401
+from . import maelstrom # noqa: F401
+from . import moap # noqa: F401
+from . import motif # noqa: F401
+from . import plot # noqa: F401
+from . import prediction # noqa: F401
+from . import rank # noqa: F401
+from . import report # noqa: F401
+from . import rocmetrics # noqa: F401
+from . import scanner # noqa: F401
+from . import shutils # noqa: F401
+from . import stats # noqa: F401
+from . import utils # noqa: F401
+from . import validation # noqa: F401
+# fmt: on
diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
index e308f556..46d33233 100644
--- a/gimmemotifs/background.py
+++ b/gimmemotifs/background.py
@@ -11,8 +11,6 @@
similar genomic distribution as the input.
"""
-from __future__ import division
-
# Python imports
import gzip
import os
@@ -251,15 +249,17 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None):
for _i in range(k - 1):
new_init = []
for x in init:
- for l in alphabet:
- new_init.append(x + l)
+ for letter in alphabet:
+ new_init.append(x + letter)
init = new_init[:]
- self.trans = dict([(word, dict([(l, 0.0) for l in alphabet])) for word in init])
+ self.trans = dict(
+ [(word, dict([(letter, 0.0) for letter in alphabet])) for word in init]
+ )
new_init = []
for x in init:
- for l in alphabet:
- new_init.append(x + l)
+ for letter in alphabet:
+ new_init.append(x + letter)
kmercount = dict([(word, 0) for word in new_init])
lettercount = dict([(word[:k], 0) for word in new_init])
@@ -286,9 +286,9 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None):
for k, v in lettercount.items():
self.init[k] = v / total
- def _generate_sequence(self, l):
+ def _generate_sequence(self, length):
sequence = list(self._weighted_random(list(self.init.items())))
- for _ in range(l - self.k):
+ for _ in range(length - self.k):
sequence.append(
self._weighted_random(
list(self.trans["".join(sequence[-self.k :])].items())
@@ -296,10 +296,10 @@ def _generate_sequence(self, l):
)
return "".join(sequence)
- def _weighted_random(self, l):
+ def _weighted_random(self, weighted_list):
n = random.uniform(0, 1)
item = None
- for item, weight in l: # noqa: B007
+ for item, weight in weighted_list: # noqa: B007
if n < weight:
break
else:
@@ -360,6 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100):
cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)]
df.columns = cols
+
+ # Make really sure that column 'chrom' is a string
+ df.dropna(subset=["chrom"], inplace=True)
+ df["chrom"] = df["chrom"].apply(str).astype("string")
+
df.reset_index()[cols].to_feather(fname)
diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index bc0e602f..b24e3652 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -326,6 +326,29 @@ def cli(sys_args):
default=default_pfm_file,
metavar="pfmfile",
)
+ p.add_argument(
+ "--no-filter",
+ dest="filter_redundant",
+ help="Don't remove redundant motifs.",
+ default=True,
+ action="store_false",
+ )
+ p.add_argument(
+ "-F",
+ "--filter_cutoff",
+ dest="filter_cutoff",
+ help="Cutoff to select non-redundant motifs. Default is 0.8, increase this value to get fewer motifs.",
+ default=0.8,
+ type=float,
+ metavar="FLOAT",
+ )
+ p.add_argument(
+ "--nocenter",
+ dest="center",
+        help="Don't mean-center the rows (by default the rows are mean-centered)",
+ default=True,
+ action="store_false",
+ )
p.add_argument(
"-m",
"--methods",
@@ -334,6 +357,19 @@ def cli(sys_args):
default=None,
metavar="NAMES",
)
+ p.add_argument(
+ "-a",
+ "--aggregation",
+ dest="aggregation",
+ help=(
+ 'How to combine motifs from individual methods. Default is "int_stouffer", '
+ "for inverse normal transform of ranks, followed by Stouffer's method to combine "
+ 'z-scores. Alternatively, specify "stuart" for log-transformed rank aggregation '
+ "p-values."
+ ),
+ default="int_stouffer",
+ metavar="method",
+ )
p.add_argument(
"-N",
"--nthreads",
@@ -353,7 +389,7 @@ def cli(sys_args):
p.add_argument(
"--nogc",
dest="gc",
- help="Don't use GC% bins",
+ help="Don't use GC%% bins",
action="store_false",
default=True,
)
diff --git a/gimmemotifs/commands/__init__.py b/gimmemotifs/commands/__init__.py
index c20bb546..f944936c 100644
--- a/gimmemotifs/commands/__init__.py
+++ b/gimmemotifs/commands/__init__.py
@@ -1,13 +1,9 @@
import pkgutil
-import six
import os
dirname = os.path.split(__file__)[0]
-if six.PY3:
- level = 0
-else:
- level = -1
+level = 0
# Dynamically load all commands
for _importer, cmdname, _ in pkgutil.iter_modules([dirname]):
diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py
index b7d8e43f..04d8cd1c 100755
--- a/gimmemotifs/commands/maelstrom.py
+++ b/gimmemotifs/commands/maelstrom.py
@@ -15,10 +15,14 @@ def maelstrom(args):
genome = args.genome
outdir = args.outdir
pfmfile = args.pfmfile
+ filter_redundant = args.filter_redundant
+ filter_cutoff = args.filter_cutoff
methods = args.methods
ncpus = args.ncpus
zscore = args.zscore
+ center = args.center
gc = args.gc
+ aggregation = args.aggregation
if not os.path.exists(infile):
raise ValueError("file {} does not exist".format(infile))
@@ -31,8 +35,12 @@ def maelstrom(args):
genome,
outdir,
pfmfile,
+ filter_redundant=filter_redundant,
+ filter_cutoff=filter_cutoff,
methods=methods,
ncpus=ncpus,
zscore=zscore,
gc=gc,
+ center=center,
+ aggregation=aggregation,
)
diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index 429fca70..f5b72737 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -7,12 +7,14 @@
"""Command line function 'roc'."""
from __future__ import print_function
import os
+import re
import sys
import shutil
import logging
from tempfile import NamedTemporaryFile
import numpy as np
+import pandas as pd
from gimmemotifs.background import create_background_file
from gimmemotifs.comparison import MotifComparer, select_nonredundant_motifs
@@ -22,7 +24,11 @@
from gimmemotifs.stats import calc_stats_iterator
from gimmemotifs.report import roc_html_report
from gimmemotifs.scanner import scan_to_file
-from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed
+from gimmemotifs.utils import (
+ determine_file_type,
+ narrowpeak_to_bed,
+ write_equalsize_bedfile,
+)
logger = logging.getLogger("gimme.motifs")
@@ -38,6 +44,19 @@ def motifs(args):
if not os.path.exists(scan_dir):
os.makedirs(scan_dir)
+ sample = args.sample
+ if args.size and args.size > 0:
+ file_type = determine_file_type(args.sample)
+ if file_type == "fasta":
+            logger.warning("size parameter will be ignored for FASTA input")
+ else:
+ outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
+ if file_type == "narrowpeak":
+ narrowpeak_to_bed(args.sample, outfile, size=args.size)
+ if file_type == "bed":
+ write_equalsize_bedfile(args.sample, args.size, outfile)
+ sample = outfile
+
genome = args.genome
if genome is None:
args.zscore = False
@@ -70,7 +89,7 @@ def motifs(args):
bg,
fmt="fasta",
genome=genome,
- inputfile=args.sample,
+ inputfile=sample,
size=size,
number=10000,
)
@@ -83,7 +102,7 @@ def motifs(args):
if args.denovo:
gimme_motifs(
- args.sample,
+ sample,
args.outdir,
params={
"tools": args.tools,
@@ -142,19 +161,19 @@ def motifs(args):
# Print the metrics
f_out.write(
- "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
+        "Motif\t# matches\t% matches input\t# matches background\t% matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
)
logger.info("creating motif scan tables")
- ftype = determine_file_type(args.sample)
- sample = args.sample
- delete_sample = False
- if ftype == "narrowpeak":
- f = NamedTemporaryFile(delete=False)
- logger.debug("Using %s as temporary BED file".format(f.name))
- narrowpeak_to_bed(args.sample, f.name, size=args.size)
- sample = f.name
- delete_sample = True
+ # ftype = determine_file_type(args.sample)
+ # sample = args.sample
+ # delete_sample = False
+ # if ftype == "narrowpeak":
+ # f = NamedTemporaryFile(delete=False)
+ # logger.debug("Using {} as temporary BED file".format(f.name))
+ # narrowpeak_to_bed(args.sample, f.name, size=args.size)
+ # sample = f.name
+ # delete_sample = True
# Create a table with the best score per motif for all motifs.
# This has three reasons:
@@ -174,6 +193,9 @@ def motifs(args):
gcnorm=True,
)
+ n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
+ n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]
+
logger.info("calculating stats")
for motif_stats in calc_stats_iterator(
motifs=pfmfile,
@@ -188,10 +210,14 @@ def motifs(args):
if motif_stats[str(motif)]["phyper_at_fpr"] > 0:
log_pvalue = -np.log10(motif_stats[str(motif)]["phyper_at_fpr"])
f_out.write(
- "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
+ "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
motif.id,
motif_stats[str(motif)]["matches_at_fpr"][0],
+ motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100,
motif_stats[str(motif)]["matches_at_fpr"][1],
+ motif_stats[str(motif)]["matches_at_fpr"][1]
+ / n_background
+ * 100,
motif_stats[str(motif)]["phyper_at_fpr"],
log_pvalue,
motif_stats[str(motif)]["roc_auc"],
@@ -203,7 +229,7 @@ def motifs(args):
f_out.close()
# Select a set of "non-redundant" motifs.
- # Using Recursive Feature Elemination, a set of motifs is selected that
+ # Using Recursive Feature Elimination, a set of motifs is selected that
# best explains the peaks in comparison to the background sequences.
nr_motifs = select_nonredundant_motifs(
args.outdir + "/gimme.roc.report.txt",
@@ -221,10 +247,11 @@ def motifs(args):
with NamedTemporaryFile(mode="w") as f:
print(motif_dict[motif].to_pwm(), file=f)
f.flush()
+ safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
scan_to_file(
sample,
f.name,
- filepath_or_buffer=os.path.join(scan_dir, f"{motif}.matches.bed"),
+ filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"),
bed=True,
fpr=0.01,
genome=args.genome,
@@ -232,9 +259,6 @@ def motifs(args):
gcnorm=True,
)
- if delete_sample:
- os.unlink(sample)
-
if args.report:
logger.info("creating statistics report")
if args.outdir:
diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py
index 94c0b808..911cd9d6 100644
--- a/gimmemotifs/comparison.py
+++ b/gimmemotifs/comparison.py
@@ -6,8 +6,6 @@
"""
Module to compare DNA sequence motifs (positional frequency matrices)
"""
-from __future__ import print_function
-
# Python imports
import sys
import os
@@ -890,12 +888,14 @@ def generate_score_dist(self, motifs, match, metric, combine):
f = open(score_file, "w")
all_scores = {}
- for l in [len(motif) for motif in motifs]:
- all_scores[l] = {}
+ for motif_len in [len(motif) for motif in motifs]:
+ all_scores[motif_len] = {}
sorted_motifs = {}
- for l in all_scores.keys():
- sorted_motifs[l] = [motif for motif in motifs if len(motif) == l]
+ for motif_len in all_scores.keys():
+ sorted_motifs[motif_len] = [
+ motif for motif in motifs if len(motif) == motif_len
+ ]
for l1 in all_scores.keys():
for l2 in all_scores.keys():
@@ -950,7 +950,11 @@ def select_nonredundant_motifs(
y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))
X_train, X_test, y_train, y_test = train_test_split(
- X, y, test_size=0.4, random_state=2, shuffle=True,
+ X,
+ y,
+ test_size=0.4,
+ random_state=2,
+ shuffle=True,
)
X_bla = X_train[keep]
diff --git a/gimmemotifs/denovo.py b/gimmemotifs/denovo.py
index 2d1913ac..2cd1225f 100644
--- a/gimmemotifs/denovo.py
+++ b/gimmemotifs/denovo.py
@@ -669,11 +669,14 @@ def gimme_motifs(
sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
final_motifs, stats = rename_motifs(sorted_motifs, result.stats)
- with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
- for m in final_motifs:
- f.write("{}\n".format(m.to_pwm()))
+ motifs_found = len(final_motifs) > 0
- if create_report:
+ if motifs_found:
+ with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
+ for m in final_motifs:
+ f.write("{}\n".format(m.to_pwm()))
+
+ if motifs_found and create_report:
bg = dict([(b, os.path.join(tmpdir, "bg.{}.fa".format(b))) for b in background])
create_denovo_motif_report(
@@ -700,7 +703,7 @@ def gimme_motifs(
logger.info("finished")
logger.info("output dir: %s", outdir)
- if cluster:
+ if motifs_found and cluster:
logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html"))
return final_motifs
diff --git a/gimmemotifs/fasta.py b/gimmemotifs/fasta.py
index 6cc1d5d4..781d1cf0 100644
--- a/gimmemotifs/fasta.py
+++ b/gimmemotifs/fasta.py
@@ -12,7 +12,7 @@
class Fasta(object):
- def __init__(self, fname=None, split_whitespace=False):
+ def __init__(self, fname=None, split_whitespace=False, fdict=None):
""" Instantiate fasta object. Optional Fasta-formatted file as argument"""
self.ids = []
self.seqs = []
@@ -35,6 +35,10 @@ def __init__(self, fname=None, split_whitespace=False):
if p.match(sequence):
raise IOError("Not a valid FASTA file")
self.seqs.append(sequence)
+ elif fdict is not None:
+ for name, seq in fdict.items():
+ self.ids.append(name)
+ self.seqs.append(seq)
def hardmask(self):
""" Mask all lowercase nucleotides with N's """
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index c4bf21ab..fbe46b03 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -22,9 +22,13 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
+from scipy.stats import pearsonr
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram
+from sklearn.cluster import FeatureAgglomeration
+
+# from scipy.spatial.distance import correlation
# Plotting
import matplotlib.pyplot as plt
@@ -96,7 +100,9 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):
mapfile = pfmfile.replace(".pwm", ".motif2factors.txt")
if os.path.exists(mapfile):
- m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0)
+ m2f = pd.read_csv(
+ mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#"
+ )
m2f["factors"] = m2f["factors"].str[:50]
else:
motifs = [m.id for m in read_motifs(pfmfile)]
@@ -172,38 +178,29 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):
plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300)
-def _rank_agg_column(exps, dfs, e):
- tmp_dfs = [pd.DataFrame(), pd.DataFrame()]
-
- for i, sort_order in enumerate([False, True]):
- for method, scoring, _ in exps:
- k = "{}.{}".format(method, scoring)
- if k in dfs:
- v = dfs[k]
- # Sample rows before sorting to shuffle
- # Otherwise all ties will not have a random order due to inherent
- # ordering of the motif dataframe
- tmp_dfs[i][k] = (
- v.sample(frac=1).sort_values(e, ascending=sort_order).index.values
- )
- return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1]))
-
-
-def df_rank_aggregation(df, dfs, exps):
+def df_rank_aggregation(df, dfs, exps, method="int_stouffer"):
df_p = pd.DataFrame(index=list(dfs.values())[0].index)
names = list(dfs.values())[0].columns
+ dfs = [
+ pd.concat([v[col].rename(k, inplace=True) for k, v in dfs.items()], axis=1)
+ for col in names
+ ]
pool = Pool(16)
- func = partial(_rank_agg_column, exps, dfs)
- ret = pool.map(func, names)
+ func = partial(rankagg, method=method)
+ ret = pool.map(func, dfs)
pool.close()
pool.join()
- for e, result in zip(names, ret):
- df_p[e] = result
+ for name, result in zip(names, ret):
+ df_p[name] = result
if df.shape[1] != 1:
df_p = df_p[df.columns]
+ if method == "int_stouffer":
+ df_p.columns = ["z-score " + c for c in df_p.columns]
+ else:
+ df_p.columns = ["activity " + c for c in df_p.columns]
return df_p
@@ -212,6 +209,8 @@ def run_maelstrom(
genome,
outdir,
pfmfile=None,
+ filter_redundant=True,
+ filter_cutoff=0.8,
plot=True,
cluster=False,
score_table=None,
@@ -220,6 +219,8 @@ def run_maelstrom(
ncpus=None,
zscore=True,
gc=True,
+ center=False,
+ aggregation="int_stouffer",
):
"""Run maelstrom on an input table.
@@ -239,6 +240,12 @@ def run_maelstrom(
pfmfile : str, optional
Specify a PFM file for scanning.
+ filter_redundant : bool, optional
+ Create a non-redundant set of motifs based on correlation of motif scores in the input data.
+
+ filter_cutoff : float, optional
+ Cutoff to use for non-redundant motif selection. Default is 0.8.
+
plot : bool, optional
Create heatmaps.
@@ -264,6 +271,15 @@ def run_maelstrom(
gc : bool, optional
Use GC% bins to normalize motif scores.
+
+ center : bool, optional
+ Mean-center the input table.
+
+ aggregation: str, optional
+ How to combine scores of the predictors. The default is "int_stouffer", for
+        inverse normal transform followed by Stouffer's method to combine z-scores.
+ Alternatively, "stuart" performs rank aggregation and reports the -log10 of
+ the rank aggregation p-value.
"""
logger.info("Starting maelstrom")
if infile.endswith("feather"):
@@ -272,6 +288,28 @@ def run_maelstrom(
else:
df = pd.read_table(infile, index_col=0, comment="#")
+ # Check if the input is mean-centered
+ if df.shape[1] > 1 and not np.allclose(df.mean(1), 0):
+ if center:
+ logger.info(
+ "Input is not mean-centered, setting the mean of all rows to 0."
+ )
+ logger.info(
+ "Use --nocenter if you know what you're doing and want to change this behavior."
+ )
+ logger.info(
+ "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to "
+ "first transform your data, for instance using log2(), and to normalize "
+ "between samples. To create a table suitable for maelstrom you can use the "
+ "coverage_table script included with GimmeMotifs."
+ )
+ df = df.sub(df.mean(axis=1), axis=0)
+ else:
+ logger.info("Input is not mean-centered, but --nocenter was specified.")
+ logger.info(
+                "Leaving the data as-is, but make sure this is what you really want."
+ )
+
# Check for duplicates
if df.index.duplicated(keep=False).any():
logger.warning("Input file contains duplicate regions!")
@@ -334,6 +372,69 @@ def run_maelstrom(
else:
logger.info("Scores, using: %s", score_table)
+ counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
+ scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")
+
+ if filter_redundant:
+ logger.info("Selecting non-redundant motifs")
+
+ fa = FeatureAgglomeration(
+ distance_threshold=filter_cutoff,
+ n_clusters=None,
+ affinity="correlation",
+ linkage="complete",
+ compute_full_tree=True,
+ )
+ fa.fit(scores)
+ X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
+ X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif")
+ selected_motifs = (
+ X_cluster.sort_values("var")
+ .drop_duplicates(subset=["label"], keep="last")["motif"]
+ .values
+ )
+ nr_motif = (
+ X_cluster.sort_values("var")
+ .drop_duplicates(subset=["label"], keep="last")[["label", "motif"]]
+ .set_index("label")
+ )
+ X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label")
+ motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif")
+
+ scores = scores[selected_motifs]
+ counts = counts[selected_motifs]
+ score_table = os.path.join(outdir, "motif.nr.score.txt.gz")
+ scores.to_csv(score_table, sep="\t", compression="gzip")
+ count_table = os.path.join(outdir, "motif.nr.count.txt.gz")
+ counts.to_csv(count_table, sep="\t", compression="gzip")
+
+ m2f = pd.read_table(os.path.join(outdir, mapfile), comment="#")
+ m2f = m2f.join(motif_map, on="Motif")
+ m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
+ m2f["Motif"] = m2f["motif_nr"]
+ m2f = m2f.drop(columns=["motif_nr"])
+
+ motifs = read_motifs(pfmfile)
+ pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")
+ with open(pfmfile, "w") as f:
+ for motif in motifs:
+ f.write(f"{motif.to_pfm()}\n")
+ mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
+ with open(mapfile, "w") as f:
+ f.write(
+ "# Note: this mapping is specifically created for this non-redundant set of motifs.\n"
+ )
+ f.write(
+ "# It also includes factors for motifs that were similar, but this can be\n"
+ )
+ f.write("# specific to this analysis.\n")
+
+ with open(mapfile, "a") as f:
+ m2f.to_csv(f, index=False, sep="\t")
+ logger.info(f"Selected {len(selected_motifs)} motifs")
+ logger.info(f"Motifs: {pfmfile}")
+ logger.info(f"Factor mappings: {mapfile}")
+
if cluster:
cluster = False
for method in methods:
@@ -380,18 +481,14 @@ def run_maelstrom(
for method, scoring, fname in exps:
try:
- if scoring == "count" and count_table is not None:
+ if scoring == "count":
moap_with_table(
fname, count_table, outdir, method, scoring, ncpus=ncpus
)
- elif scoring == "score" and score_table is not None:
+ elif scoring == "score":
moap_with_table(
fname, score_table, outdir, method, scoring, ncpus=ncpus
)
- else:
- moap_with_bg(
- fname, genome, outdir, method, scoring, pfmfile=pfmfile, ncpus=ncpus
- )
except Exception as e:
logger.warn("Method %s with scoring %s failed", method, scoring)
@@ -409,14 +506,34 @@ def run_maelstrom(
if len(methods) > 1:
logger.info("Rank aggregation")
- df_p = df_rank_aggregation(df, dfs, exps)
+ df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)
+
+ # Add percentage of input sequences with motif
+ if df.shape[1] > 1:
+ df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
+ else:
+ bla = counts.join(df).groupby(df.columns[0]).mean() * 100
+ bla = bla.T
+ bla = bla.rename(
+ columns={col: f"{col} % with motif" for col in bla.columns}
+ )
+ df_p = df_p.join(bla)
+
+ if df.shape[1] > 1:
+ # Add correlation between motif score and signal
+ logger.info("Correlation")
+ for col in df.columns:
+ df_p[f"corr {col}"] = 0
+ for motif in df_p.index:
+ df_p.loc[motif, f"corr {col}"] = pearsonr(df[col], scores[motif])[0]
+
df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")
# df_p = df_p.join(m2f)
# Write motif frequency table
if df.shape[1] == 1:
- mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
+ mcount = df.join(counts)
m_group = mcount.groupby(df.columns[0])
freq = m_group.sum() / m_group.count()
freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")
@@ -455,7 +572,10 @@ def __init__(self, outdir):
raise FileNotFoundError("No such directory: " + outdir)
# Load motifs
- fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
+ fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m"))
+
+ if len(fnames) == 0:
+ fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
if len(fnames) > 0:
pfmfile = fnames[0]
with open(pfmfile) as fin:
@@ -474,6 +594,15 @@ def __init__(self, outdir):
self.result = pd.read_table(
os.path.join(outdir, "final.out.txt"), comment="#", index_col=0
)
+ self.correlation = self.result.loc[:, self.result.columns.str.contains("corr")]
+ self.percent_match = self.result.loc[
+ :, self.result.columns.str.contains("% with motif")
+ ]
+ self.result = self.result.loc[
+ :,
+ ~self.result.columns.str.contains("corr")
+ & ~self.result.columns.str.contains("% with motif"),
+ ]
# Read motif results
self.scores = pd.read_table(
@@ -502,11 +631,12 @@ def plot_heatmap(
min_freq=0.01,
threshold=2,
name=True,
- indirect=False,
+ indirect=True,
figsize=None,
- max_len=50,
+ max_number_factors=5,
aspect=1,
- **kwargs
+ cmap="RdBu_r",
+ **kwargs,
):
"""Plot clustered heatmap of predicted motif activity.
@@ -514,7 +644,7 @@ def plot_heatmap(
----------
kind : str, optional
Which data type to use for plotting. Default is 'final', which will
- plot the result of the rang aggregation. Other options are 'freq'
+ plot the result of the rank aggregation. Other options are 'freq'
for the motif frequencies, or any of the individual activities such
as 'rf.score'.
@@ -528,10 +658,10 @@ def plot_heatmap(
Use factor names instead of motif names for plotting.
indirect : bool, optional
- Include indirect factors. Default is False.
+ Include indirect factors (computationally predicted or non-curated). Default is True.
- max_len : int, optional
- Truncate the list of factors to this maximum length.
+ max_number_factors : int, optional
+ Truncate the list of factors to this maximum size.
figsize : tuple, optional
Tuple of figure size (width, height).
@@ -539,8 +669,11 @@ def plot_heatmap(
aspect : int, optional
Aspect ratio for tweaking the plot.
+ cmap : str, optional
+        Color palette to use, RdBu_r by default.
+
kwargs : other keyword arguments
- All other keyword arguments are passed to sns.clustermap
+ All other keyword arguments are passed to sns.heatmap
Returns
-------
@@ -555,13 +688,17 @@ def plot_heatmap(
filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq)
idx = self.result.loc[filt].index
+
+ if idx.shape[0] == 0:
+ logger.warning("Empty matrix, try lowering the threshold")
+ return
+
if idx.shape[0] >= 100:
logger.warning("The filtered matrix has more than 100 rows.")
logger.warning(
"It might be worthwhile to increase the threshold for visualization"
)
- cmap = "RdBu_r"
if kind == "final":
data = self.result
elif kind == "freq":
@@ -579,18 +716,25 @@ def plot_heatmap(
else:
raise ValueError("Unknown dtype")
- # print(data.head())
- # plt.figure(
m = data.loc[idx]
- vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99))
- vmin = -vmax
+
+ if "vmax" in kwargs:
+ vmax = kwargs.pop("vmax")
+ else:
+ vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99))
+
+ if "vmin" in kwargs:
+ vmin = kwargs.pop("vmin")
+ else:
+ vmin = -vmax
+
if name:
m["factors"] = [
- join_max(
- _get_factor_list(self.motifs[n], indirect),
- max_len,
- ",",
- suffix=",(...)",
+ self.motifs[n].format_factors(
+ max_length=max_number_factors,
+ html=False,
+ include_indirect=indirect,
+ extra_str=",..",
)
for n in m.index
]
@@ -598,7 +742,8 @@ def plot_heatmap(
h, w = m.shape
if figsize is None:
- figsize = (3 + m.shape[1] / 4, 1 + m.shape[0] / 3)
+ figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3)
+
fig = plt.figure(figsize=figsize)
npixels = 30
g = GridSpec(
@@ -606,8 +751,8 @@ def plot_heatmap(
)
ax1 = fig.add_subplot(g[0, :])
ax2 = fig.add_subplot(g[1, :])
- ax2.set_title("Significance (-log10(p-value))")
- dm = pdist(m, metric="euclidean")
+ ax2.set_title("aggregated z-score")
+ dm = pdist(m, metric="correlation")
hc = linkage(dm, method="ward")
leaves = dendrogram(hc, no_plot=True)["leaves"]
cg = sns.heatmap(
@@ -619,10 +764,12 @@ def plot_heatmap(
linewidths=1,
vmin=vmin,
vmax=vmax,
+ **kwargs,
)
+ plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90)
plt.tight_layout()
# cg.ax_col_dendrogram.set_visible(False)
- # plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
+ # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
return cg
def plot_scores(self, motifs, name=True, max_len=50):
diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 45b87356..d4053bc4 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -4,7 +4,6 @@
# the terms of the MIT License, see the file COPYING included with this
# distribution.
""" Module for motif activity prediction """
-from __future__ import print_function
def warn(*args, **kwargs):
@@ -18,7 +17,6 @@ def warn(*args, **kwargs):
import os
import sys
-import shutil
try:
from itertools import izip
@@ -33,13 +31,14 @@ def warn(*args, **kwargs):
from tqdm.auto import tqdm
# scikit-learn
-from sklearn.model_selection import train_test_split, GridSearchCV
-from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
-from sklearn.linear_model import MultiTaskLasso, BayesianRidge
+from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge
+from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import scale, LabelEncoder
-from lightning.classification import CDClassifier
-from lightning.regression import CDRegressor
+from sklearn.svm import LinearSVR
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
import xgboost
@@ -356,252 +355,6 @@ def fit(self, df_X, df_y):
logger.info("Done")
-@register_predictor("LightningRegressor")
-class LightningRegressionMoap(Moap):
- def __init__(self, scale=True, cv=3, ncpus=None):
- """Predict motif activities using lightning CDRegressor
-
- Parameters
- ----------
- scale : boolean, optional, default True
- If ``True``, the motif scores will be scaled
- before classification
-
- cv : int, optional, default 3
- Cross-validation k-fold parameter.
-
- ncpus : int, optional
- Number of threads. Default is the number specified in the config.
-
- Attributes
- ----------
- act_ : DataFrame, shape (n_motifs, n_clusters)
- fitted coefficients
-
- sig_ : DataFrame, shape (n_motifs,)
- boolean values, if coefficients are higher/lower than
- the 1%t from random permutation
- """
-
- self.act_description = "activity values: coefficients from " "fitted model"
-
- if ncpus is None:
- ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
- self.ncpus = ncpus
- self.kfolds = cv
- self.scale = scale
-
- self.act_ = None
- self.pref_table = "score"
- self.supported_tables = ["score", "count"]
- self.ptype = "regression"
-
- def fit(self, df_X, df_y, batch_size=50, shuffle=True, tmpdir=None):
- logger.info("Fitting LightningRegression")
-
- if self.scale:
- # Scale motif scores
- df_X[:] = scale(df_X, axis=0)
-
- # Normalize across samples and features
- # y = df_y.apply(scale, 1).apply(scale, 0)
- y = df_y
- X = df_X.loc[y.index]
-
- if not y.shape[0] == X.shape[0]:
- raise ValueError("number of regions is not equal")
-
- # Define model
- cd = CDRegressor(penalty="l1/l2", C=1.0)
- parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 2)]}
- clf = GridSearchCV(cd, parameters, n_jobs=self.ncpus)
-
- if shuffle:
- idx = list(y.sample(y.shape[1], axis=1, random_state=42).columns)
- else:
- idx = list(y.columns)
-
- if tmpdir:
- if not os.path.exists(tmpdir):
- os.mkdir(tmpdir)
-
- coefs = pd.DataFrame(index=X.columns)
- start_i = 0
- if tmpdir:
- for i in range(0, len(idx), batch_size):
- fname = os.path.join(tmpdir, "{}.feather".format(i))
- if os.path.exists(fname) and os.path.exists(fname + ".done"):
-
- tmp = pd.read_feather(fname)
- tmp = tmp.set_index(tmp.columns[0])
- coefs = coefs.join(tmp)
- else:
- logger.info("Resuming at batch {}".format(i))
- start_i = i
- break
-
- for i in tqdm(range(start_i, len(idx), batch_size)):
- split_y = y[idx[i : i + batch_size]]
-
- # Fit model
- clf.fit(X.values, split_y.values)
- tmp = pd.DataFrame(
- clf.best_estimator_.coef_.T, index=X.columns, columns=split_y.columns
- )
- if tmpdir:
- fname = os.path.join(tmpdir, "{}.feather".format(i))
- tmp.reset_index().rename(columns=str).to_feather(fname)
- # Make sure we don't read corrupted files
- open(fname + ".done", "a").close()
- # Get coefficients
- coefs = coefs.join(tmp)
-
- # Get coefficients
- self.act_ = coefs[y.columns]
-
- logger.info("Done")
-
-
-@register_predictor("LightningClassification")
-class LightningClassificationMoap(Moap):
- def __init__(self, scale=True, permute=False, ncpus=None):
- """Predict motif activities using lightning CDClassifier
-
- Parameters
- ----------
- scale : boolean, optional, default True
- If ``True``, the motif scores will be scaled
- before classification
-
- ncpus : int, optional
- Number of threads. Default is the number specified in the config.
-
- Attributes
- ----------
- act_ : DataFrame, shape (n_motifs, n_clusters)
- fitted coefficients
-
- sig_ : DataFrame, shape (n_motifs,)
- boolean values, if coefficients are higher/lower than
- the 1%t from random permutation
- """
-
- self.act_description = "activity values: coefficients from " "fitted model"
-
- # self.cdc = CDClassifier(random_state=args.seed)
- self.cdc = CDClassifier()
-
- self.parameters = {
- "penalty": ["l1/l2"],
- "loss": ["squared_hinge"],
- "multiclass": [True],
- "max_iter": [20],
- "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
- "C": [0.001, 0.01, 0.1, 0.5, 1.0],
- "tol": [1e-3],
- }
-
- self.kfolds = 10
-
- if ncpus is None:
- ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
-
- self.clf = GridSearchCV(self.cdc, self.parameters, cv=self.kfolds, n_jobs=ncpus)
-
- self.scale = scale
- self.permute = permute
-
- self.act_ = None
- self.sig_ = None
- self.pref_table = "score"
- self.supported_tables = ["score", "count"]
- self.ptype = "classification"
-
- def fit(self, df_X, df_y):
- logger.info("Fitting LightningClassification")
-
- if not df_y.shape[0] == df_X.shape[0]:
- raise ValueError("number of regions is not equal")
- if df_y.shape[1] != 1:
- raise ValueError("y needs to have 1 label column")
-
- if self.scale:
- # Scale motif scores
- df_X[:] = scale(df_X, axis=0)
-
- idx = list(range(df_y.shape[0]))
-
- y = df_y.iloc[idx]
- X = df_X.loc[y.index].values
- y = y.values.flatten()
-
- # Convert (putative) string labels
- label = LabelEncoder()
- y = label.fit_transform(y)
-
- # Split data
- X_train, X_test, y_train, y_test = train_test_split(X, y)
-
- logger.debug("Setting parameters through cross-validation")
- # Determine best parameters based on CV
- self.clf.fit(X_train, y_train)
-
- logger.debug(
- "Average score ({} fold CV): {}".format(
- self.kfolds, self.clf.score(X_test, y_test)
- )
- )
-
- logger.debug("Estimate coefficients using bootstrapping")
-
- # Estimate coefficients using bootstrappig
- # b = BaggingClassifier(self.clf.best_estimator_,
- # max_samples=0.75, n_jobs=-1, random_state=state)
- b = BaggingClassifier(self.clf.best_estimator_, max_samples=0.75, n_jobs=-1)
- b.fit(X, y)
-
- # Get mean coefficients
- coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)
-
- # Create dataframe of predicted coefficients
- if len(label.classes_) == 2:
- self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))
- else:
- self.act_ = pd.DataFrame(coeffs.T)
-
- # Convert labels back to original names
- self.act_.columns = label.inverse_transform(range(len(label.classes_)))
- self.act_.index = df_X.columns
-
- if self.permute:
- # Permutations
- logger.debug("Permutations")
- random_dfs = []
- for _ in range(10):
- y_random = np.random.permutation(y)
- b.fit(X, y_random)
- coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)
-
- if len(label.classes_) == 2:
- random_dfs.append(pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))))
- else:
- random_dfs.append(pd.DataFrame(coeffs.T))
- random_df = pd.concat(random_dfs)
-
- # Select cutoff based on percentile
- high_cutoffs = random_df.quantile(0.99)
- low_cutoffs = random_df.quantile(0.01)
-
- # Set significance
- self.sig_ = pd.DataFrame(index=df_X.columns)
- self.sig_["sig"] = False
-
- for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs):
- self.sig_["sig"].loc[self.act_[col] >= c_high] = True
- self.sig_["sig"].loc[self.act_[col] <= c_low] = True
- logger.info("Done")
-
-
@register_predictor("MWU")
class MWUMoap(Moap):
def __init__(self, *args, **kwargs):
@@ -791,22 +544,16 @@ def fit(self, df_X, df_y):
logger.info("Done")
-@register_predictor("Lasso")
-class LassoMoap(Moap):
- def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
- """Predict motif activities using Lasso MultiTask regression
+@register_predictor("MultiTaskLasso")
+class MultiTaskLassoMoap(Moap):
+ def __init__(self, scale=True, ncpus=None):
+ """Predict motif activities using MultiTaskLasso.
Parameters
----------
scale : boolean, optional, default True
If ``True``, the motif scores will be scaled
- before classification
-
- kfolds : integer, optional, default 5
- number of kfolds for parameter search
-
- alpha_stepsize : float, optional, default 1.0
- stepsize for use in alpha gridsearch
+ before classification.
ncpus : int, optional
Number of threads. Default is the number specified in the config.
@@ -814,101 +561,129 @@ def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
Attributes
----------
act_ : DataFrame, shape (n_motifs, n_clusters)
- fitted motif activities
-
- sig_ : DataFrame, shape (n_motifs,)
- boolean values, if coefficients are higher/lower than
- the 1%t from random permutation
+ Coefficients of the regression model.
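+
+        Examples
+        --------
+        A minimal sketch (``df_X``: DataFrame of motif scores per region,
+        ``df_y``: DataFrame of signal values sharing the same region index):
+
+        >>> clf = MultiTaskLassoMoap(ncpus=1)
+        >>> clf.fit(df_X, df_y)  # doctest: +SKIP
+        >>> activities = clf.act_  # doctest: +SKIP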
"""
- self.kfolds = kfolds
- self.act_description = "activity values: coefficients from " "fitted model"
+        self.act_description = "activity values: coefficients of the regression model"
- self.scale = scale
if ncpus is None:
ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
self.ncpus = ncpus
-
- # initialize attributes
+ self.scale = scale
self.act_ = None
- self.sig_ = None
+ self.pref_table = "score"
+ self.supported_tables = ["score", "count"]
+ self.ptype = "regression"
+
+ def fit(self, df_X, df_y):
+ logger.info("Fitting MultiTaskLasso")
+
+ if not df_y.shape[0] == df_X.shape[0]:
+ raise ValueError("number of regions is not equal")
+
+ if self.scale:
+ logger.debug("Scaling motif scores")
+ # Scale motif scores
+ df_X.loc[:, :] = scale(df_X, axis=0)
+
+ # logger.debug("Scaling y")
+
+ # Normalize across samples and features
+ # y = df_y.apply(scale, 1).apply(scale, 0)
+ y = df_y
+
+ X = df_X.loc[y.index]
- mtk = MultiTaskLasso()
- parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]}
- self.clf = GridSearchCV(
- mtk, parameters, cv=kfolds, n_jobs=self.ncpus, scoring="r2"
+ model = Pipeline(
+ [
+ ("scale", StandardScaler()),
+ (
+ "reg",
+ MultiTaskLassoCV(
+ fit_intercept=False, n_alphas=20, n_jobs=self.ncpus
+ ),
+ ),
+ ]
)
+ logger.debug("Fitting model")
+ model.fit(df_X, df_y)
+ logger.info("Done")
+
+ self.act_ = pd.DataFrame(
+ model.steps[1][1].coef_, index=y.columns, columns=X.columns
+ ).T
+
+ def predict(self, df_X):
+ return df_X.dot(self.act_.loc[df_X.columns])
+
+
+@register_predictor("SVR")
+class SVRMoap(Moap):
+ def __init__(self, scale=True, ncpus=None):
+ """Predict motif activities using Support Vector Regression.
+
+ Parameters
+ ----------
+ scale : boolean, optional, default True
+ If ``True``, the motif scores will be scaled
+ before classification.
+
+ ncpus : int, optional
+ Number of threads. Default is the number specified in the config.
+
+ Attributes
+ ----------
+ act_ : DataFrame, shape (n_motifs, n_clusters)
+ SVR weights.
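+
+        Examples
+        --------
+        A minimal sketch (``df_X``: DataFrame of motif scores per region,
+        ``df_y``: DataFrame of signal values sharing the same region index):
+
+        >>> clf = SVRMoap(ncpus=1)
+        >>> clf.fit(df_X, df_y)  # doctest: +SKIP
+        >>> predicted = clf.predict(df_X)  # doctest: +SKIP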
+ """
+
+ self.act_description = "activity values: SVR weights"
+
+ if ncpus is None:
+ ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
+ self.ncpus = ncpus
+ self.scale = scale
+ self.act_ = None
self.pref_table = "score"
self.supported_tables = ["score", "count"]
self.ptype = "regression"
- def fit(self, df_X, df_y, permute=False):
- logger.info("Fitting Lasso")
+ def fit(self, df_X, df_y):
+ logger.info("Fitting SVR")
+
if not df_y.shape[0] == df_X.shape[0]:
raise ValueError("number of regions is not equal")
if self.scale:
+ logger.debug("Scaling motif scores")
# Scale motif scores
- df_X[:] = scale(df_X, axis=0)
+ df_X.loc[:, :] = scale(df_X, axis=0)
+
+ # logger.debug("Scaling y")
- idx = list(range(df_y.shape[0]))
- y = df_y.iloc[idx]
- X = df_X.loc[y.index].values
- y = y.values
-
- # fit coefficients
- coefs = self._get_coefs(X, y)
- self.act_ = pd.DataFrame(coefs.T)
-
- # convert labels back to original names
- self.act_.columns = df_y.columns
- self.act_.index = df_X.columns
-
- if permute:
- # Permutations
- logger.info("permutations\n")
- random_dfs = []
- for _ in range(10):
- y_random = y[np.random.permutation(range(y.shape[0]))]
- coefs = self._get_coefs(X, y_random)
- random_dfs.append(pd.DataFrame(coefs.T))
- random_df = pd.concat(random_dfs)
-
- # Select cutoff based on percentile
- high_cutoffs = random_df.quantile(0.99)
- low_cutoffs = random_df.quantile(0.01)
-
- # Set significance
- self.sig_ = pd.DataFrame(index=df_X.columns)
- self.sig_["sig"] = False
-
- for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs):
- self.sig_["sig"].loc[self.act_[col] >= c_high] = True
- self.sig_["sig"].loc[self.act_[col] <= c_low] = True
+ # Normalize across samples and features
+ # y = df_y.apply(scale, 1).apply(scale, 0)
+ y = df_y
+ self.columns = df_y.columns
+ X = df_X.loc[y.index]
+ clf = LinearSVR()
+ self.model = MultiOutputRegressor(clf, n_jobs=1)
+ logger.debug("Fitting model")
+ self.model.fit(df_X, df_y)
logger.info("Done")
- def _get_coefs(self, X, y):
- logger.info("set alpha through cross-validation\n")
- # Determine best parameters based on CV
- self.clf.fit(X, y)
-
- logger.debug(
- "average score ({} fold CV): {}".format(self.kfolds, self.clf.best_score_)
+ self.act_ = pd.DataFrame(
+ {c: e.coef_ for c, e in zip(df_y.columns, self.model.estimators_)},
+ index=X.columns,
)
- logger.info("Estimate coefficients using bootstrapping\n")
+ def predict(self, df_X):
- n_samples = 0.75 * X.shape[0]
- max_samples = X.shape[0]
- m = self.clf.best_estimator_
- coefs = []
- for _ in range(10):
- idx = np.random.randint(0, n_samples, max_samples)
- m.fit(X[idx], y[idx])
- coefs.append(m.coef_)
- coefs = np.array(coefs).mean(axis=0)
- return coefs
+ return pd.DataFrame(
+ self.model.predict(df_X), index=df_X.index, columns=self.columns
+ )
def moap(
@@ -935,7 +710,7 @@ def moap(
method : str, optional
Motif activity method to use. Any of 'hypergeom', 'lasso',
- 'lightningclassification', 'lightningregressor', 'bayesianridge',
+ 'bayesianridge',
'rf', 'xgboost'. Default is 'hypergeom'.
scoring: str, optional
@@ -1058,13 +833,7 @@ def moap(
motifs = motifs.loc[df.index]
- if method == "lightningregressor":
- outdir = os.path.dirname(outfile)
- tmpname = os.path.join(outdir, ".lightning.tmp")
- clf.fit(motifs, df, tmpdir=tmpname)
- shutil.rmtree(tmpname)
- else:
- clf.fit(motifs, df)
+ clf.fit(motifs, df)
if outfile:
with open(outfile, "w") as f:
diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index 3065a6cb..253bc410 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -10,8 +10,8 @@
import sys
import random
from math import log, sqrt
+from collections import Counter
from warnings import warn
-import six
from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME
from gimmemotifs.c_metrics import pfmscan
@@ -1304,6 +1304,88 @@ def wiggle_pwm(self):
return self.wiggled_pwm
+ def format_factors(
+ self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"
+ ):
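+        """Return a compact, comma-separated string of binding factors.
+
+        Directly annotated factors take priority over predicted (indirect)
+        factors, which are only added when ``include_indirect`` is True. At most
+        ``max_length`` factors are shown, sorted by how often they occur in the
+        annotation (with any "de novo" entry listed first), and ``extra_str`` is
+        appended if the list was truncated. With ``html=True`` the string is
+        decorated with HTML markup and a tooltip listing all direct and
+        predicted factors.
+        """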
+ if html:
+ fmt_d = "{}"
+ fmt_i = "{}"
+ else:
+ fmt_d = fmt_i = "{}"
+
+ if hasattr(self, "factor_info"):
+ fcount = Counter([x.upper() for x in self.factor_info["Factor"]])
+ else:
+ fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME])
+
+ direct = sorted(
+ list(
+ set(
+ [
+ x.upper() if x != "de novo" else x
+ for x in self.factors[DIRECT_NAME]
+ ]
+ )
+ ),
+ key=lambda x: fcount[x],
+ reverse=True,
+ )
+
+ indirect = []
+ if include_indirect:
+ indirect = sorted(
+ list(
+ set(
+ [
+ x.upper()
+ for x in self.factors[INDIRECT_NAME]
+ if x.upper() not in direct
+ ]
+ )
+ ),
+ key=lambda x: fcount[x],
+ reverse=True,
+ )
+
+ if len(direct) > max_length:
+ show_factors = direct[:max_length]
+ else:
+ show_factors = direct[:]
+ for f in sorted(indirect, key=lambda x: fcount[x], reverse=True):
+ if f not in show_factors:
+ show_factors.append(f)
+ if len(show_factors) >= max_length:
+ break
+
+ if "de novo" in show_factors:
+ show_factors = ["de novo"] + sorted(
+ [f for f in show_factors if f != "de novo"],
+ key=lambda x: fcount[x],
+ reverse=True,
+ )
+ else:
+ show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True)
+
+ factor_str = ",".join(
+ [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]
+ )
+
+ if len(direct + indirect) > max_length:
+ factor_str += extra_str
+
+ if html:
+ tooltip = ""
+ if len(direct) > 0:
+ tooltip += "direct: " + ",".join(sorted(direct))
+ if len(indirect) > 0:
+ if tooltip != "":
+ tooltip += "
"
+ tooltip += "predicted: " + ",".join(sorted(indirect))
+
+ factor_str = '' + factor_str + "
"
+
+ return factor_str
+
def default_motifs():
"""Return list of Motif instances from default motif database."""
@@ -1393,14 +1475,8 @@ def parse_motifs(motifs):
motifs : list
List of Motif instances.
"""
- if isinstance(motifs, six.string_types):
- with open(motifs) as f:
- if motifs.endswith("pwm") or motifs.endswith("pfm"):
- motifs = read_motifs(f, fmt="pwm")
- elif motifs.endswith("transfac"):
- motifs = read_motifs(f, fmt="transfac")
- else:
- motifs = read_motifs(f)
+ if isinstance(motifs, str):
+ return read_motifs(motifs)
elif isinstance(motifs, Motif):
motifs = [motifs]
else:
@@ -1426,6 +1502,8 @@ def _add_factors_from_handle(motifs, handle):
m2f_direct = {}
m2f_indirect = {}
for line in open(map_file):
+ if line.startswith("#"):
+ continue
try:
motif, *factor_info = line.strip().split("\t")
if len(factor_info) == 1:
@@ -1438,7 +1516,11 @@ def _add_factors_from_handle(motifs, handle):
except Exception:
pass
+ m2f = pd.read_csv(map_file, sep="\t", comment="#", index_col=0)
+
for motif in motifs:
+ if motif.id in m2f.index:
+ motif.factor_info = m2f.loc[motif.id]
if motif.id in m2f_direct:
motif.factors[DIRECT_NAME] = m2f_direct[motif.id]
if motif.id in m2f_indirect:
@@ -1518,7 +1600,7 @@ def read_motifs(infile=None, fmt="pfm", as_dict=False):
if fmt == "pwm":
fmt = "pfm"
- if infile is None or isinstance(infile, six.string_types):
+ if infile is None or isinstance(infile, str):
infile = pfmfile_location(infile)
with open(infile) as f:
motifs = _read_motifs_from_filehandle(f, fmt)
diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py
index d161c138..d27f9d21 100644
--- a/gimmemotifs/plot.py
+++ b/gimmemotifs/plot.py
@@ -4,7 +4,6 @@
# the terms of the MIT License, see the file COPYING included with this
# distribution.
""" Various plotting functions """
-from __future__ import print_function
from PIL import Image
import seaborn as sns
from mpl_toolkits.axes_grid1 import ImageGrid
@@ -33,8 +32,7 @@
def axes_off(ax):
- """Get rid of all axis ticks, lines, etc.
- """
+ """Get rid of all axis ticks, lines, etc."""
ax.set_frame_on(False)
ax.axes.get_yaxis().set_visible(False)
ax.axes.get_xaxis().set_visible(False)
@@ -355,7 +353,7 @@ def _get_motif_tree(tree, data, circle=True, vmin=None, vmax=None):
m = 25 / data.values.max()
for node in t.traverse("levelorder"):
- val = data[[l.name for l in node.get_leaves()]].values.mean()
+ val = data[[leaf.name for leaf in node.get_leaves()]].values.mean()
style = NodeStyle()
style["size"] = 0
diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py
index 15076a86..5ae37690 100644
--- a/gimmemotifs/rank.py
+++ b/gimmemotifs/rank.py
@@ -9,6 +9,7 @@
import subprocess as sp
import pandas as pd
import numpy as np
+from scipy.stats import rankdata, norm
try:
from scipy.special import factorial
@@ -77,23 +78,77 @@ def qStuart(r):
return factorial(N) * v[N]
-def rankagg(df, method="stuart"):
- """Return aggregated ranks.
+def _rank_int(series, c=3.0 / 8, stochastic=True):
+ # Based on code by Edward Mountjoy
+ # See: https://github.com/edm1/rank-based-INT
+ """Perform rank-based inverse normal transformation on pandas series.
+    If stochastic is True, ties are given a rank randomly; otherwise ties will
+    share the same value. NaN values are ignored.
+    Args:
+        series (pandas.Series): Series of values to transform
+        c (Optional[float]): Constant parameter (Blom's constant)
+        stochastic (Optional[bool]): Whether to randomise rank of ties
+
+ Returns:
+ pandas.Series
+ """
+
+ # Check input
+ assert isinstance(series, pd.Series)
+ assert isinstance(c, float)
+ assert isinstance(stochastic, bool)
+
+ # Set seed
+ np.random.seed(123)
+
+ # Take original series indexes
+ orig_idx = series.index
+
+ # Drop NaNs
+ series = series.loc[~pd.isnull(series)]
+
+ # Get ranks
+ if stochastic:
+ # Shuffle by index
+ series = series.loc[np.random.permutation(series.index)]
+ # Get rank, ties are determined by their position in the series (hence
+ # why we randomised the series)
+ rank = rankdata(series, method="ordinal")
+ else:
+ # Get rank, ties are averaged
+ rank = rankdata(series, method="average")
+
+ # Convert numpy array back to series
+ rank = pd.Series(rank, index=series.index)
+
+ # Convert rank to normal distribution
+ transformed = rank.apply(_rank_to_normal, c=c, n=len(rank))
+
+ return transformed[orig_idx]
+
+def _rank_to_normal(rank, c, n):
+ # Standard quantile function
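+    # Blom-type plotting position: rank i of n values is mapped to (i - c) / (n - 2c + 1),
+    # which is then converted to a z-score with the standard normal quantile function.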
+ x = (rank - c) / (n - 2 * c + 1)
+ return norm.ppf(x)
+
+
+def _rankagg_int(df):
+ # Convert values to ranks
+ df_int = df.apply(_rank_int)
+    # Combine z-scores using Stouffer's method
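+    # Stouffer's method: Z = (z_1 + ... + z_k) / sqrt(k); if each column is ~N(0, 1)
+    # after the inverse normal transform, the combined Z is again ~N(0, 1).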
+ df_int = (df_int.sum(1) / np.sqrt(df_int.shape[1])).to_frame()
+ df_int.columns = ["z-score"]
+ return df_int
+
+
+def _rankagg_stuart(df):
+ """
Implementation is ported from the RobustRankAggreg R package
References:
Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709
Stuart et al., 2003, DOI: 10.1126/science.1087447
-
- Parameters
- ----------
- df : pandas.DataFrame
- DataFrame with values to be ranked and aggregated
-
- Returns
- -------
- pandas.DataFrame with aggregated ranks
"""
rmat = pd.DataFrame(index=df.iloc[:, 0])
@@ -105,3 +160,49 @@ def rankagg(df, method="stuart"):
rmat = rmat.apply(sorted, 1, result_type="expand")
p = rmat.apply(qStuart, 1)
return pd.DataFrame({"score": p}, index=rmat.index)
+
+
+def rankagg(df, method="int_stouffer", include_reverse=True, log_transform=True):
+ """Return aggregated ranks.
+
+ Stuart implementation is ported from the RobustRankAggreg R package
+
+ References:
+ Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709
+ Stuart et al., 2003, DOI: 10.1126/science.1087447
+
+ Parameters
+ ----------
+ df : pandas.DataFrame
+ DataFrame with values to be ranked and aggregated
+ method : str, optional
+ Either "int_stouffer" or "stuart". The "int_stouffer" method is based on combining z-scores
+ from a inverse normal transform of ranks using Stouffer's method.
+
+ Returns
+ -------
+ pandas.DataFrame with aggregated ranks
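+
+    Examples
+    --------
+    A minimal sketch with random data (any numeric DataFrame will do; higher
+    values are treated as better ranks):
+
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> df = pd.DataFrame(np.random.randn(50, 3), columns=["a", "b", "c"])
+    >>> rankagg(df, method="int_stouffer").columns.tolist()
+    ['z-score']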
+ """
+ method = method.lower()
+ if method not in ["stuart", "int_stouffer"]:
+ raise ValueError("unknown method for rank aggregation")
+
+ if method == "stuart":
+ df_asc = pd.DataFrame()
+ df_desc = pd.DataFrame()
+ for col in df.columns:
+ df_asc[col] = (
+ df.sample(frac=1).sort_values(col, ascending=False).index.values
+ )
+ if include_reverse:
+ df_desc[col] = (
+ df.sample(frac=1).sort_values(col, ascending=True).index.values
+ )
+
+ df_result = -np.log10(_rankagg_stuart(df_asc))
+ if include_reverse:
+ df_result += np.log10(_rankagg_stuart(df_desc))
+
+ return df_result
+ if method == "int_stouffer":
+ return _rankagg_int(df)
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 239eb441..e41951dc 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -16,11 +16,21 @@
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import multipletests
+from pandas.core.indexing import _non_reducing_slice
+from pandas.io.formats.style import Styler
+import seaborn as sns
+from matplotlib import colors
+import matplotlib.pyplot as plt
+
+try:
+ import emoji
+except ImportError:
+ pass
from gimmemotifs.comparison import MotifComparer
from gimmemotifs.fasta import Fasta
from gimmemotifs.motif import read_motifs
-from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME
+from gimmemotifs.config import MotifConfig
from gimmemotifs.plot import roc_plot
from gimmemotifs.stats import calc_stats, add_star, write_stats
from gimmemotifs import __version__
@@ -28,6 +38,588 @@
logger = logging.getLogger("gimme.report")
+FACTOR_TOOLTIP = "factors
(direct or predicted)
"
+
+
+def _wrap_html_str(x):
+ if " " not in x:
+ return x
+
+ min_pos, max_pos = 0, len(x)
+ if ">" in x and "" in x:
+ m = re.compile(r">[^<>]*<").search(x)
+ min_pos, max_pos = m.start(), m.end()
+
+ positions = [m.start() for m in re.compile(" ").finditer(x)]
+
+ positions = [p for p in positions if min_pos < p < max_pos]
+
+ if len(positions) == 0:
+ return x
+
+ pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0]
+ x = x[:pos] + "
" + x[pos + 1 :]
+ return x
+
+
+def relative_luminance(rgba):
+ """
+ Calculate relative luminance of a color.
+ The calculation adheres to the W3C standards
+ (https://www.w3.org/WAI/GL/wiki/Relative_luminance)
+ Parameters
+ ----------
+    rgba : rgb or rgba tuple
+ Returns
+ -------
+ float
+ The relative luminance as a value from 0 to 1
+ """
+ r, g, b = (
+        x / 12.92 if x <= 0.03928 else ((x + 0.055) / 1.055) ** 2.4 for x in rgba[:3]
+ )
+ return 0.2126 * r + 0.7152 * g + 0.0722 * b
+
+
+def contrasting_text_color(color, text_color_threshold=0.408):
+ dark = relative_luminance(color) < text_color_threshold
+ text_color = "#f1f1f1" if dark else "#000000"
+ return text_color
+
+
+class ExtraStyler(Styler):
+ """
+    Extra styles for a DataFrame or Series, based on the pandas ``Styler``, using HTML and CSS.
+ """
+
+ loader = jinja2.ChoiceLoader(
+ [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader]
+ )
+ env = jinja2.Environment(loader=loader)
+ template = env.get_template("table.tpl")
+
+ def __init__(self, *args, **kwargs):
+ self._data_todo = []
+ self.circle_styles = None
+ self.palette_styles = None
+ self.col_heading_style = {
+ "name": "col_heading",
+ "props": [("border-bottom", "1px solid #e0e0e0")],
+ }
+ super(ExtraStyler, self).__init__(*args, **kwargs)
+ self.display_data = self.data.copy()
+
+
+ self._font = "Nunito Sans"
+
+ @property
+ def font(self):
+ return self._font
+
+ @font.setter
+ def font(self, font_name):
+ self._font = font_name
+
+ def set_font(self, font_name):
+ """
+ Set the font that will be used.
+
+ Parameters
+ ----------
+ font_name : str
+            Should be a font name available through the Google Fonts API.
+
+ Returns
+ -------
+ self : ExtraStyler
+
+ Notes
+ -----
+        ``font_name`` can contain spaces, e.g. "Nunito Sans".
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
+ >>> ExtraStyler(df).font("Roboto)
+ """
+ self.font = font_name
+ return self
+
+ def _current_index(self, subset):
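+        # Translate a label-based subset into positional (iloc) row/column indexers
+        # on the underlying data.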
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+ selected = self.data.loc[subset]
+ idx_slice = pd.IndexSlice[
+ self.data.index.get_indexer(selected.index),
+ self.data.columns.get_indexer(selected.columns),
+ ]
+ return idx_slice
+
+ def _translate(self):
+ self._compute_data()
+ d = super()._translate()
+ circle_styles = self.circle_styles or []
+ palette_styles = self.palette_styles or []
+ col_heading_style = self.col_heading_style or []
+ d.update(
+ {
+ "font": self.font,
+ "circle_styles": circle_styles,
+ "palette_styles": palette_styles,
+ "col_heading_style": col_heading_style,
+ }
+ )
+ return d
+
+ def _compute_data(self):
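+        # Apply the deferred display transformations (queued by wrap, add_tooltip,
+        # add_circle and convert_to_image) to display_data just before rendering.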
+ r = self
+ for func, args, kwargs in self._data_todo:
+ r = func(self)(*args, **kwargs)
+ r.data = r.display_data
+ return r
+
+ def _tooltip(self, tip, subset=None, part=None):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ if part is None:
+ part = "data"
+
+ if part == "data":
+ self.display_data.loc[subset] = (
+ ""
+ + self.display_data.loc[subset].astype(str)
+ + "
"
+ )
+ elif part == "columns":
+ idx = self._current_index(subset)[1]
+ rename = dict(
+ zip(
+ self.display_data.columns[idx],
+ ""
+ + self.display_data.columns[idx].astype(str)
+ + "
",
+ )
+ )
+ self.display_data.rename(columns=rename, inplace=True)
+ elif part == "index":
+ idx = self._current_index(subset)[0]
+ rename = dict(
+ zip(
+ self.display_data.index[idx],
+ ""
+ + self.display_data.index[idx].astype(str)
+ + "
",
+ )
+ )
+ self.display_data.rename(index=rename, inplace=True)
+ else:
+ raise ValueError(f"unknown value for part: {part}")
+ return self
+
+ def _wrap_iterable(self, it):
+ return [_wrap_html_str(val) for val in it]
+
+ def _wrap(self, subset=None, axis=0):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ if axis in [0, "columns"]:
+ idx = self._current_index(subset)[1]
+ rename = dict(
+ zip(
+ self.display_data.columns[idx],
+ self._wrap_iterable(self.display_data.columns[idx]),
+ )
+ )
+ self.display_data.rename(columns=rename, inplace=True)
+ elif axis in [1, "index"]:
+ idx = self._current_index(subset)[0]
+ rename = dict(
+ zip(
+ self.display_data.index[idx],
+ self._wrap_iterable(self.display_data.index[idx]),
+ )
+ )
+ self.display_data.rename(index=rename, inplace=True)
+ else:
+ raise ValueError(f"unknown value for axis: {axis}")
+ return self
+
+ def _convert_to_image(self, subset=None, height=30):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ self.display_data.loc[subset] = (
+ f''
+ )
+ return self
+
+ def _border(self, idx, location="left"):
+ return [f"border-{location}: 2px solid #444;" for val in idx]
+
+ def border(
+ self,
+ subset=None,
+ location="bottom",
+ part="data",
+ width="2px",
+ style="solid",
+ color="#444",
+ ):
+ """
+ Add a border to data cells, columns or index.
+
+ Parameters
+ ----------
+ subset : IndexSlice, optional
+ An argument to ``DataFrame.loc`` that restricts which elements
+ ``border`` is applied to. If ``part`` is "columns" or "index"
+ subset should be present in either the columns or the index.
+
+ location : str, optional
+ Location of the border, default is "bottom". Can be "top", "bottom",
+ "right" or "left".
+
+ part : str, optional
+ If ``part`` is "data", the border will be applied to the data cells.
+ Set part to "index" or to "column" to add a border to the index or
+ header, respectively.
+
+ width : str, int or float, optional
+ Valid CSS value for border width.
+
+ style : str, optional
+ Valid CSS value for border style.
+
+ color : str, optional
+ Valid CSS value for border color.
+
+ Returns
+ -------
+ self : ExtraStyler
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
+ >>> ExtraStyler(df).border(part="columns)
+ """
+ if part == "data":
+ self.apply(self._border, subset=subset, location=location)
+ else:
+ self.col_heading_style["props"].append(
+ (f"border-{location}", f"{width} {style} {color}")
+ )
+ return self
+
+ def _align(self, idx, location="center"):
+ return [f"text-align:{location};" for val in idx]
+
+ def align(self, subset=None, location="center", axis=0):
+ """
+ Align text.
+
+ Parameters
+ ----------
+ subset : IndexSlice, optional
+ An argument to ``DataFrame.loc`` that restricts which elements
+            ``align`` is applied to.
+
+ location : str, optional
+ "center", "left" or "right"
+
+ axis : {0 or 'index', 1 or 'columns', None}, default 0
+ Apply to each column (``axis=0`` or ``'index'``), to each row
+ (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
+ with ``axis=None``.
+
+ Returns
+ -------
+ self : ExtraStyler
+ """
+ self.apply(self._align, subset=subset, location=location, axis=axis)
+ return self
+
+ def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0):
+ rng = M - m
+ norm = colors.Normalize(m - (rng * low), M + (rng * high))
+ normed = norm(s.values)
+ c = plt.cm.get_cmap(cmap)(normed)
+ return [
+ f"background-color: {colors.rgb2hex(color)}; color: {contrasting_text_color(color)}"
+ for color in c
+ ]
+
+ def to_precision_str(self, subset=None, precision=0, include_zero=True):
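+        # Render numbers as strings, collapsing values at or below 10**-precision
+        # to "<10**-precision" (e.g. with precision=0, 0.4 becomes "<1").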
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ def precision_str(x, precision=precision):
+ if (include_zero or x > 0) and x <= 10 ** -precision:
+ return f"<{10**-precision}"
+ else:
+ return f"{{0:.{precision}f}}".format(x)
+
+ self.display_data.loc[subset] = self.data.loc[subset].applymap(precision_str)
+ return self
+
+ def _circle(
+ self,
+ subset=None,
+ show_text=True,
+ color=None,
+ cmap=None,
+ vmin=None,
+ vmax=None,
+ scale=False,
+ size=25,
+ min_size=5,
+ morph=False,
+ ):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subslice = _non_reducing_slice(subset)
+
+ if color:
+ palette = sns.color_palette([color])
+ # print(palette)
+ elif cmap is None:
+ palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10)
+ else:
+ # if isinstance(palette, str):
+ palette = sns.color_palette(cmap)
+
+ # Make sure we don't select text columns
+ if len(palette) > 1:
+ subslice = pd.IndexSlice[
+ self.data.loc[subslice].index,
+ self.data.loc[subslice].select_dtypes(exclude=["object"]).columns,
+ ]
+ idx = self._current_index(subslice)
+
+ self.circle_styles = self.circle_styles or []
+ circle_id = len(self.circle_styles) + 1
+
+ props = [
+ ("height", f"{size}px"),
+ ("width", f"{size}px"),
+ ("border-radius", "50%"),
+ ("color", "#000"),
+ ("line-height", f"{size}px"),
+ ("display", "inline-block"),
+ ("text-align", "center"),
+ ("vertical-align", "middle"),
+ ]
+
+ self.circle_styles.append({"name": f"circle{circle_id}", "props": props})
+ self.palette_styles = self.palette_styles or []
+ for i, color in enumerate(palette.as_hex()):
+ props = [("background-color", color)]
+ if scale:
+ circle_size = min_size + ((size - min_size) / len(palette) * (i + 1))
+ props += [
+ ("height", f"{circle_size}px"),
+ ("width", f"{circle_size}px"),
+ ("line-height", f"{circle_size}px"),
+ ("text-align", "center"),
+ ]
+ if morph:
+ props += [("border-radius", f"{50 - int(50 / len(palette)) * i}%")]
+ self.palette_styles.append(
+ {"name": f"color{circle_id}_{i}", "props": props}
+ )
+
+ if len(palette) > 1:
+ vmax = (
+ self.data.loc[subslice].max().max() * 1.01
+ if vmax is None
+ else vmax * 1.01
+ )
+ text = self.display_data.iloc[idx].astype(str) if show_text else ""
+ self.display_data.iloc[idx] = (
+ f""
+ + text
+ + "
"
+ )
+ else:
+ text = self.display_data.iloc[idx].astype(str) if show_text else ""
+ self.display_data.iloc[idx] = (
+ f"" + text + "
"
+ )
+
+ return self
+
+ def add_circle(self, **kwargs):
+ self._data_todo.append((lambda instance: instance._circle, (), kwargs))
+ return self
+
+ def wrap(self, **kwargs):
+ self._data_todo.append((lambda instance: instance._wrap, (), kwargs))
+ return self
+
+ def add_tooltip(self, tip, **kwargs):
+ self._data_todo.append((lambda instance: instance._tooltip, (tip,), kwargs))
+ return self
+
+ def convert_to_image(self, **kwargs):
+ self._data_todo.append(
+ (lambda instance: instance._convert_to_image, (), kwargs)
+ )
+ return self
+
+ def rename(self, columns=None, index=None):
+ self.display_data = self.display_data.rename(columns=columns, index=index)
+ return self
+
+ def _emoji_score(self, series, emoji_str=None, bins=None):
+ if emoji_str is None:
+ emoji_str = ":star:"
+ if bins is None:
+ bins = 3
+
+ if isinstance(bins, int):
+ labels = range(1, bins + 1)
+ else:
+ labels = range(1, len(bins))
+
+ return [
+ emoji.emojize(emoji_str * val, use_aliases=True)
+ for val in pd.cut(series, bins=bins, labels=labels)
+ ]
+
+ def _emoji_scale(self, series, emojis=None, bins=None):
+ emoji_dict = {
+ "thumbs": [":thumbsdown:", ":thumbsup:"],
+ "check": [":cross_mark:", ":white_check_mark:"],
+ "smiley": [
+ ":crying_face:",
+ ":slightly_frowning_face:",
+ ":neutral_face:",
+ ":slightly_smiling_face:",
+ ":grin:",
+ ],
+ "black_square": [
+ ":black_small_square:",
+ ":black_medium_small_square:",
+ ":black_medium_square:",
+ ":black_large_square:",
+ ],
+ "white_square": [
+ ":white_small_square:",
+ ":white_medium_small_square:",
+ ":white_medium_square:",
+ ":white_large_square:",
+ ],
+ }
+
+ if emojis is None:
+ emojis = "smiley"
+
+        if emojis in emoji_dict:
+            labels = emoji_dict[emojis]
+        else:
+            raise ValueError(
+                f"unknown emoji scale '{emojis}', valid options: {list(emoji_dict)}"
+            )
+        if bins is None:
+            bins = len(labels)
+
+ return [
+ emoji.emojize(val, use_aliases=True)
+ for val in pd.cut(series, bins=bins, labels=labels)
+ ]
+
+ def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ idx = self._current_index(subset=subset)
+
+ result = self.display_data.iloc[idx].apply(
+ self._emoji_scale, axis=axis, result_type="expand", args=(emojis, bins)
+ )
+ self.display_data.iloc[idx] = result.values
+
+ return self.align(subset=subset, location="center", axis=axis)
+
+ def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ idx = self._current_index(subset=subset)
+ result = self.display_data.iloc[idx].apply(
+ self._emoji_score, axis=axis, result_type="expand", args=(emoji_str, bins)
+ )
+ self.display_data.iloc[idx] = result.values
+
+ return self.align(subset=subset, location="left", axis=axis)
+
+ def emojify(self, subset=None):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ idx = self._current_index(subset=subset)
+ result = self.display_data.iloc[idx].applymap(emoji.emojize)
+ self.display_data.iloc[idx] = result.values
+
+ return self
+
+ def scaled_background_gradient(
+ self,
+ subset=None,
+ cmap="RdBu_r",
+ low=0,
+ high=0,
+ center_zero=False,
+ vmin=None,
+ vmax=None,
+ ):
+ subset = pd.IndexSlice[:, :] if subset is None else subset
+ subset = _non_reducing_slice(subset)
+
+ vmax = (
+ self.data.loc[subset]
+ .replace({np.inf: np.nan, -np.inf: np.nan})
+ .max(skipna=True)
+ .max()
+ if vmax is None
+ else vmax
+ )
+ vmin = (
+ self.data.loc[subset]
+ .replace({np.inf: np.nan, -np.inf: np.nan})
+ .min(skipna=True)
+ .min()
+ if vmin is None
+ else vmin
+ )
+
+ if center_zero:
+ vmax = max(abs(vmax), abs(vmin))
+ vmin = -vmax
+
+ r = self
+ for col in self.data.loc[subset].columns:
+ r = r.apply(
+ self._background_gradient,
+ subset=pd.IndexSlice[subset[0], col],
+ cmap=cmap,
+ m=vmin,
+ M=vmax,
+ low=low,
+ high=high,
+ )
+
+ return r
+
def get_roc_values(motif, fg_file, bg_file, genome):
"""Calculate ROC AUC values for ROC plots."""
@@ -167,8 +759,8 @@ class ReportMotif(object):
+ os.path.basename(roc_img_file % (motif.id, bg))
+ ".png"
}
- rm.bg[bg][u"roc_img_link"] = {
- u"href": "images/"
+ rm.bg[bg]["roc_img_link"] = {
+ "href": "images/"
+ os.path.basename(roc_img_file % (motif.id, bg))
+ ".png"
}
@@ -253,93 +845,114 @@ def create_denovo_motif_report(
)
-def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
- df = pd.read_table(infile, index_col=0)
- df = df[np.any(abs(df) >= threshold, 1)]
+def motif_to_factor_series(series, pfmfile=None, motifs=None):
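+    # Map an iterable of motif IDs to formatted factor strings (HTML markup with
+    # a tooltip), returned as a Series indexed like the input.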
+ if motifs is None:
+ motifs = read_motifs(pfmfile, as_dict=True)
- motifs = read_motifs(pfmfile)
+ if isinstance(series, pd.Index):
+ index = series
+ else:
+ index = series.index
- df.rename_axis(None, inplace=True)
- cols = df.columns
+ factors = [motifs[motif].format_factors(html=True) for motif in series]
+ return pd.Series(data=factors, index=index)
+
+
+def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"):
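+    # Plot a sequence logo for every motif ID (skipping logos that already exist
+    # on disk) and return the relative image paths as a Series indexed like the input.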
+ if motifs is None:
+ motifs = read_motifs(pfmfile, as_dict=True)
+
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ if not os.path.exists(os.path.join(outdir, subdir)):
+ os.makedirs(os.path.join(outdir, subdir))
+
+ img_series = []
+ for motif in series:
+ if motif not in motifs:
+ raise ValueError(f"Motif {motif} does not occur in motif database")
+ fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif))
+ if not os.path.exists(fname):
+ motifs[motif].plot_logo(fname=os.path.join(outdir, fname))
+ img_series.append(fname)
+
+ if isinstance(series, pd.Index):
+ index = series
+ else:
+ index = series.index
+ return pd.Series(data=img_series, index=index)
- motifs = read_motifs(pfmfile)
- idx = [motif.id for motif in motifs]
- direct = [
- ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
- for motif in motifs
- ]
- indirect = [
- ",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]])))
- for motif in motifs
- ]
- m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)
-
- factor_cols = [DIRECT_NAME, INDIRECT_NAME]
- if True:
- for factor_col in factor_cols:
- f = m2f[factor_col].str.len() > 30
- m2f[factor_col] = (
- ''
- + m2f[factor_col].str.slice(0, 30)
- )
- m2f.loc[f, factor_col] += "(...)"
- m2f[factor_col] += "
"
- df = df.join(m2f)
- df["logo"] = [
- '
'.format(re.sub("[()/]", "_", x))
- for x in list(df.index)
+def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
+
+ # Read the maelstrom text report
+ df = pd.read_table(infile, index_col=0)
+
+    # Columns with maelstrom rank aggregation values
+ value_cols = df.columns[
+ ~df.columns.str.contains("corr") & ~df.columns.str.contains("% with motif")
]
+ # Columns with correlation values
+ corr_cols = df.columns[df.columns.str.contains("corr")]
- if not os.path.exists(outdir + "/logos"):
- os.makedirs(outdir + "/logos")
- for motif in motifs:
- if motif.id in df.index:
- motif.plot_logo(
- fname=outdir + "/logos/{}.png".format(re.sub("[()/]", "_", motif.id))
- )
+ df = df[np.any(abs(df[value_cols]) >= threshold, 1)]
- template_dir = MotifConfig().get_template_dir()
- js = open(
- os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
- ).read()
- css = open(
- os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
- encoding="utf-8",
- ).read()
- df = df[factor_cols + ["logo"] + list(cols)]
-
- df_styled = df.style
- absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
- target = absmax * 1.75
-
- for col in cols:
- smin = df[col].min()
- smax = df[col].max()
- diff = smax - smin
- low = abs((-target - smin) / diff)
- high = (target - smax) / diff
- df_styled = df_styled.background_gradient(
- cmap="RdBu_r", low=low, high=high, subset=[col]
- )
+    # Add motif logos
+ df.insert(
+ 0,
+ "logo",
+ motif_to_img_series(df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"),
+ )
+ # Add factors that can bind to the motif
+ df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
- df_styled = df_styled.set_precision(3)
- df_styled = df_styled.set_table_attributes("data-sortable")
- df_styled = df_styled.render()
- df_styled = df_styled.replace(
- "data-sortable", 'class="sortable-theme-slick" data-sortable'
+ rename_columns = {"factors": FACTOR_TOOLTIP}
+
+ df_styled = (
+ ExtraStyler(df)
+ .set_precision(2)
+ .convert_to_image(
+ subset=["logo"],
+ height=30,
+ )
+ .scaled_background_gradient(
+ subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75
+ )
+ .border(subset=list(value_cols[:1]), location="left")
+ .border(part="columns", location="bottom")
+ .set_table_attributes('class="sortable-theme-slick" data-sortable')
+ .align(subset=list(value_cols), location="center")
+ .set_font("Nunito Sans")
+ .rename(columns=rename_columns)
)
+ if len(corr_cols) > 0:
+ df_styled = (
+ df_styled.wrap(subset=list(corr_cols))
+ .align(subset=list(corr_cols), location="center")
+ .scaled_background_gradient(
+ subset=corr_cols,
+ cmap="PuOr_r",
+ center_zero=True,
+ low=1 / 1.75,
+ high=1 / 1.75,
+ )
+ )
+
+ for col in df.columns:
+ if "% with motif" in col:
+ df_styled = (
+ df_styled.add_circle(subset=[col], cmap="Purples", vmax=100, size=40)
+ .wrap(subset=[col])
+ .align(subset=[col], location="center")
+ .border(subset=[col], location="left")
+ .to_precision_str(subset=[col])
+ )
+
+ df_styled = df_styled.wrap().render()
+
with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
- f.write("\n")
- f.write("\n".format(css))
- f.write("\n")
- f.write("\n")
f.write(df_styled)
- f.write("\n".format(js))
- f.write("\n")
def roc_html_report(
@@ -353,102 +966,104 @@ def roc_html_report(
):
df = pd.read_table(infile, index_col=0)
df.rename_axis(None, inplace=True)
- df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]
+
+ motifs = read_motifs(pfmfile, as_dict=True)
+ if use_motifs is not None:
+ motifs = {k: v for k, v in motifs.items() if k in use_motifs}
+ idx = list(motifs.keys())
+ df = df.loc[idx]
+
+ df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1])
+ df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"]))
+ df = df[df["corrected P-value"] <= threshold]
cols = [
+ "factors",
"logo",
- "# matches",
- "# matches background",
- "P-value",
- "log10 P-value",
- "corrected P-value",
+ "% matches input",
+ "%matches background",
+ "-log10 P-value",
"ROC AUC",
"PR AUC",
"Enr. at 1% FPR",
"Recall at 10% FDR",
]
- motifs = read_motifs(pfmfile)
- if use_motifs is not None:
- motifs = [m for m in motifs if m.id in use_motifs]
-
- idx = [motif.id for motif in motifs]
- df = df.loc[idx]
- direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs]
- indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs]
- m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)
-
- factor_cols = [DIRECT_NAME, INDIRECT_NAME]
- if True:
- for factor_col in factor_cols:
- f = m2f[factor_col].str.len() > 30
- m2f[factor_col] = (
- ''
- + m2f[factor_col].str.slice(0, 30)
- )
- m2f.loc[f, factor_col] += "(...)"
- m2f[factor_col] += "
"
- df = df.join(m2f)
- cols = factor_cols + cols
-
- df = df[df["corrected P-value"] <= threshold]
-
if link_matches:
df["# matches"] = (
""
+ df["# matches"].astype(str)
+ ""
)
- df["logo"] = [
- '
'.format(re.sub(r"[^-_\w]+", "_", x))
- for x in list(df.index)
- ]
+    # Add motif logos
+ df.insert(
+ 0,
+ "logo",
+ motif_to_img_series(
+ df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"
+ ),
+ )
+ # Add factors that can bind to the motif
+ df.insert(
+ 0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs)
+ )
+
+ rename_columns = {"factors": FACTOR_TOOLTIP}
df = df[cols]
- if not os.path.exists(outdir + "/logos"):
- os.makedirs(outdir + "/logos")
- for motif in motifs:
- if motif.id in df.index:
- motif.plot_logo(
- fname=outdir
- + "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id))
- )
bar_cols = [
- "log10 P-value",
+ "% matches input",
+ "%matches background",
+ "-log10 P-value",
"ROC AUC",
"PR AUC",
"Enr. at 1% FPR",
"Recall at 10% FDR",
]
- template_dir = MotifConfig().get_template_dir()
- js = open(
- os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
- ).read()
- css = open(
- os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
- encoding="utf-8",
- ).read()
+
+ df["% matches input"] = df["% matches input"].astype(int)
+ df["%matches background"] = df["%matches background"].astype(int)
+ rename_columns = {"factors": FACTOR_TOOLTIP}
+ df = df.sort_values("ROC AUC", ascending=False)
with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
- f.write("\n")
- f.write("\n".format(css))
- f.write("\n")
- f.write("\n")
if df.shape[0] > 0:
f.write(
- df.sort_values("ROC AUC", ascending=False)
- .style.bar(bar_cols)
- .set_precision(3)
- .set_table_attributes("data-sortable")
+ ExtraStyler(df)
+ .convert_to_image(
+ subset=["logo"],
+ height=30,
+ )
+ .add_circle(
+ subset=["% matches input", "%matches background"],
+ vmax=100,
+ cmap="Purples",
+ )
+ .scaled_background_gradient(
+ "-log10 P-value", vmin=0, high=0.3, cmap="Reds"
+ )
+ .scaled_background_gradient(
+ "ROC AUC", vmin=0.5, vmax=1, high=0.3, cmap="Reds"
+ )
+ .scaled_background_gradient(
+ "PR AUC", vmin=0, vmax=1, high=0.3, cmap="Reds"
+ )
+ .scaled_background_gradient(
+ "Enr. at 1% FPR", vmin=1, high=0.3, cmap="Reds"
+ )
+ .scaled_background_gradient(
+ "Recall at 10% FDR", vmin=0, vmax=1, high=0.7, cmap="Reds"
+ )
+ .set_precision(2)
+ .set_table_attributes('class="sortable-theme-slick" data-sortable')
+ .wrap(subset=cols)
+ .align(subset=bar_cols, location="center")
+ .rename(columns=rename_columns)
+ .to_precision_str(subset=["% matches input", "%matches background"])
.render()
- .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
)
else:
- f.write("No enriched motifs found.")
- f.write("\n".format(js))
- f.write("\n")
+ f.write("No enriched motifs found.")
diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index d45d3906..8397fb8f 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -1,11 +1,11 @@
import os
import re
import sys
+from collections import Counter
from functools import partial
from tempfile import mkdtemp, NamedTemporaryFile
import logging
import multiprocessing as mp
-import six
# "hidden" features, in development
try:
@@ -18,7 +18,8 @@
from genomepy import Genome
from diskcache import Cache
import numpy as np
-from scipy.stats import scoreatpercentile
+from sklearn.preprocessing import scale
+import pandas as pd
from gimmemotifs import __version__
from gimmemotifs.background import RandomGenomicFasta, gc_bin_bedfile
@@ -46,7 +47,6 @@ def _pickle_method(m):
# only used when using cache, should not be a requirement
try:
from dogpile.cache import make_region
- from dogpile.cache.api import NO_VALUE
import xxhash
except ImportError:
pass
@@ -250,8 +250,7 @@ def scan_to_file(
zscore=True,
gcnorm=True,
):
- """Scan an inputfile with motifs.
- """
+ """Scan an inputfile with motifs."""
should_close = False
if filepath_or_buffer is None:
fo = sys.stdout
@@ -341,7 +340,7 @@ def scan_to_best_match(
if genome:
s.set_genome(genome)
- if isinstance(motifs, six.string_types):
+ if isinstance(motifs, str):
motifs = read_motifs(motifs)
logger.debug("scanning %s...", fname)
@@ -370,7 +369,9 @@ def parse_threshold_values(motif_file, cutoff):
return threshold
-def scan_sequence(seq, motifs, nreport, scan_rc):
+def scan_sequence(
+ seq, seq_gc_bin, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False
+):
ret = []
# scan for motifs
@@ -378,36 +379,37 @@ def scan_sequence(seq, motifs, nreport, scan_rc):
if cutoff is None:
ret.append([])
else:
- result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
+ if zscore:
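+                # Z-score scanning: scan with the motif's minimum score so that every
+                # position is reported, convert raw scores to z-scores with the
+                # background mean/std of this sequence's GC bin, then apply the cutoff.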
+ m_mean, m_std = motifs_meanstd[seq_gc_bin][motif.id]
+ result = pwmscan(
+ seq, motif.logodds, motif.pwm_min_score(), nreport, scan_rc
+ )
+ result = [[(row[0] - m_mean) / m_std, row[1], row[2]] for row in result]
+ result = [row for row in result if row[0] >= cutoff]
+ else:
+ result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
if cutoff <= motif.pwm_min_score() and len(result) == 0:
result = [[motif.pwm_min_score(), 0, 1]] * nreport
- ret.append(result)
-
- # return results
- return ret
-
-
-def scan_region(region, genome, motifs, nreport, scan_rc):
-
- # retrieve sequence
- chrom, start, end = re.split(r"[:-]", region)
- seq = genome[chrom][int(start) : int(end)].seq.upper()
-
- return scan_sequence(seq, motifs, nreport, scan_rc)
+ ret.append(result)
-def scan_seq_mult(seqs, motifs, nreport, scan_rc):
- ret = []
- for seq in seqs:
- result = scan_sequence(seq.upper(), motifs, nreport, scan_rc)
- ret.append(result)
return ret
-def scan_region_mult(regions, genome, motifs, nreport, scan_rc):
+def scan_seq_mult(
+ seqs, seq_gc_bins, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False
+):
ret = []
- for region in regions:
- result = scan_region(region, genome, motifs, nreport, scan_rc)
+ for seq, seq_gc_bin in zip(seqs, seq_gc_bins):
+ result = scan_sequence(
+ seq.upper(),
+ seq_gc_bin,
+ motifs,
+ nreport,
+ scan_rc,
+ motifs_meanstd=motifs_meanstd,
+ zscore=zscore,
+ )
ret.append(result)
return ret
@@ -529,7 +531,7 @@ class Scanner(object):
def __init__(self, ncpus=None):
self.config = MotifConfig()
- self.threshold = None
+ self._threshold = None
self.genome = None
self.background = None
self.meanstd = {}
@@ -604,19 +606,15 @@ def _meanstd_from_seqs(self, motifs, seqs):
def _threshold_from_seqs(self, motifs, seqs, fpr):
scan_motifs = [(m, m.pwm_min_score()) for m in motifs]
-
table = []
- for x in self._scan_sequences_with_motif(scan_motifs, seqs, 1, True):
- table.append([row[0][0] for row in x])
+ seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]
+ for gc_bin, result in zip(
+ seq_gc_bins, self._scan_sequences_with_motif(scan_motifs, seqs, 1, True)
+ ):
+ table.append([gc_bin] + [row[0][0] for row in result])
- for (motif, _), scores in zip(scan_motifs, np.array(table).transpose()):
- if len(scores) > 0:
- opt_score = scoreatpercentile(scores, 100 - (100 * fpr))
- yield motif, opt_score # cutoff
- else:
- raise ValueError(
- "Could not determine threshold for motif {}".format(motif)
- )
+ df = pd.DataFrame(table, columns=["gc_bin"] + [m.id for m in motifs])
+ return df
def set_meanstd(self, gc=False):
if not self.background:
@@ -677,8 +675,22 @@ def set_meanstd(self, gc=False):
lock.release()
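+        # For GC bins without background sequences (and therefore without mean/std
+        # estimates), fall back to the statistics of the closest populated GC bin.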
+ for gc_bin in self.gc_bins:
+ gc_bin = "{:.2f}-{:.2f}".format(*gc_bin)
+ if gc_bin not in self.meanstd:
+ valid_bins = []
+ for b in self.gc_bins:
+ bstr = "{:.2f}-{:.2f}".format(b[0], b[1])
+ if bstr in self.meanstd:
+ valid_bins.append(((b[0] + b[1]) / 2, bstr))
+
+ v = float(gc_bin.split("-")[1])
+ _, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0]
+ logger.warn(f"Using {bstr}")
+ self.meanstd[gc_bin] = self.meanstd[bstr]
+
def set_background(
- self, fname=None, genome=None, size=200, nseq=10000, gc=False, gc_bins=None
+ self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None
):
"""Set the background to use for FPR and z-score calculations.
@@ -705,6 +717,16 @@ def set_background(
size = int(size)
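+        # Default GC bins: a wide bin at each extreme (0-0.2 and 0.8-1.0) with
+        # 0.05-wide bins in between when GC normalization is used, otherwise a
+        # single 0-1 bin; the number of background sequences scales with the
+        # number of bins.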
+ if gc_bins is None:
+ if gc:
+ gc_bins = [(0.0, 0.2), (0.8, 1)]
+ for b in np.arange(0.2, 0.799, 0.05):
+ gc_bins.append((b, b + 0.05))
+ else:
+ gc_bins = [(0, 1)]
+ if nseq is None:
+ nseq = max(10000, len(gc_bins) * 1000)
+
if genome and fname:
raise ValueError("Need either genome or filename for background.")
@@ -736,12 +758,6 @@ def set_background(
if not fa:
if gc:
-
- if gc_bins is None:
- gc_bins = [(0.0, 0.2), (0.8, 1)]
- for b in np.arange(0.2, 0.799, 0.05):
- gc_bins.append((b, b + 0.05))
-
with NamedTemporaryFile() as tmp:
logger.info("using {} sequences".format(nseq))
gc_bin_bedfile(
@@ -757,6 +773,12 @@ def set_background(
if gc_bins:
self.gc_bins = gc_bins
+ @property
+ def threshold(self):
+ if self._threshold is None:
+ self.set_threshold()
+ return self._threshold
+
def set_threshold(self, fpr=None, threshold=None, gc=False):
"""Set motif scanning threshold based on background sequences.
@@ -775,6 +797,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
if threshold and fpr:
raise ValueError("Need either fpr or threshold.")
+ if threshold is None and fpr is None:
+ if self.genome:
+ fpr = 0.01
+ logger.info(f"Using default FPR of {fpr}")
+ else:
+ threshold = 0.95
+ logger.info(
+ f"Genome not specified, using default threshold of {threshold}."
+ )
+ logger.info("This is likely not ideal.")
+
if fpr:
fpr = float(fpr)
if not (0.0 < fpr < 1.0):
@@ -783,11 +816,16 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
if not self.motifs:
raise ValueError("please run set_motifs() first")
- thresholds = {}
motifs = read_motifs(self.motifs)
+ gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins]
if threshold is not None:
- self.threshold = parse_threshold_values(self.motifs, threshold)
+ d = parse_threshold_values(self.motifs, threshold)
+ self._threshold = pd.DataFrame(d, index=[0])
+ self._threshold = self._threshold.join(
+ pd.DataFrame(gc_bins, index=[0] * len(gc_bins), columns=["gc_bin"])
+ )
+ self._threshold = self._threshold.set_index("gc_bin")
return
if not self.background:
@@ -801,36 +839,41 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
lock.acquire()
with Cache(CACHE_DIR) as cache:
scan_motifs = []
+ self._threshold = None
for motif in motifs:
- k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr)
-
- threshold = cache.get(k)
- if threshold is None:
+ k = "{}|{}|{:.4f}|{}".format(
+ motif.hash(), self.background_hash, fpr, ",".join(sorted(gc_bins))
+ )
+ vals = cache.get(k)
+ if vals is None:
scan_motifs.append(motif)
else:
- if np.isclose(threshold, motif.pwm_max_score()):
- thresholds[motif.id] = None
- elif np.isclose(threshold, motif.pwm_min_score()):
- thresholds[motif.id] = 0.0
+ if self._threshold is None:
+ self._threshold = vals.to_frame()
else:
- thresholds[motif.id] = threshold
+ self._threshold[motif.id] = vals
if len(scan_motifs) > 0:
logger.info("determining FPR-based threshold")
- for motif, threshold in self._threshold_from_seqs(
- scan_motifs, seqs, fpr
- ):
- k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr)
- cache.set(k, threshold)
- if np.isclose(threshold, motif.pwm_max_score()):
- thresholds[motif.id] = None
- elif np.isclose(threshold, motif.pwm_min_score()):
- thresholds[motif.id] = 0.0
- else:
- thresholds[motif.id] = threshold
+ df = self._threshold_from_seqs(scan_motifs, seqs, fpr).set_index(
+ "gc_bin"
+ )
+ if self._threshold is None:
+ self._threshold = df
+ else:
+ self._threshold = pd.concat((self._threshold, df), axis=1)
+ for motif in scan_motifs:
+ k = "{}|{}|{:.4f}|{}".format(
+ motif.hash(),
+ self.background_hash,
+ fpr,
+ ",".join(sorted(gc_bins)),
+ )
+ cache.set(k, df[motif.id])
lock.release()
- self.threshold_str = "{}_{}_{}".format(fpr, threshold, self.background_hash)
- self.threshold = thresholds
+ self.threshold_str = "{}_{}_{}_{}".format(
+ fpr, threshold, self.background_hash, ",".join(sorted(gc_bins))
+ )
def set_genome(self, genome):
"""
@@ -887,8 +930,11 @@ def best_match(self, seqs, scan_rc=True, zscore=False, gc=False):
yield [m[0] for m in matches]
def get_seq_bin(self, seq):
- useq = seq.upper()
- gc = round((useq.count("G") + useq.count("C")) / len(useq), 2)
+ if len(str(seq)) == 0:
+ gc = 0
+ else:
+ useq = seq.upper()
+ gc = round((useq.count("G") + useq.count("C")) / len(useq), 2)
if gc == 0:
gc = 0.01
for b_start, b_end in self.gc_bins:
@@ -931,16 +977,9 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
"""
Scan a set of regions or sequences.
"""
-
- if not self.threshold:
- logger.info(
- "Using default threshold of 0.95. " "This is likely not optimal!"
- )
- self.set_threshold(threshold=0.95)
-
seqs = as_fasta(seqs, genome=self.genome)
- it = self._scan_sequences(seqs.seqs, nreport, scan_rc)
+ it = self._scan_sequences(seqs.seqs, nreport, scan_rc, zscore=zscore)
if zscore:
if gc:
@@ -950,165 +989,90 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
if len(self.meanstd) != 1:
self.set_meanstd(gc=gc)
- gc_seqs = [self.get_seq_bin(seq) for seq in seqs.seqs]
-
logger.debug("Scanning")
- for result, gc_seq in zip(it, gc_seqs):
- if zscore:
- zresult = []
- for i, mrow in enumerate(result):
- try:
- m_mean, m_std = self.get_motif_mean_std(
- gc_seq, self.motif_ids[i]
- )
- except Exception:
- print(self.meanstd)
- print(gc_seq, self.motif_ids[i])
- raise
- mrow = [((x[0] - m_mean) / m_std, x[1], x[2]) for x in mrow]
- zresult.append(mrow)
- yield zresult
- else:
- yield result
+ for result in it:
+ yield result
- def _scan_regions(self, regions, nreport, scan_rc):
- genome = self.genome
- motif_file = self.motifs
- motif_digest = self.checksum.get(motif_file, None)
+ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
+ # Simple case, only one threshold
+ if np.all(self.threshold.nunique(axis=0) == 1):
+ return self.threshold.iloc[0].to_dict()
- # determine which regions are not in the cache
- scan_regions = regions
- if self.use_cache:
- scan_regions = []
- for region in regions:
- key = str((region, genome, motif_digest, nreport, scan_rc))
- ret = self.cache.get(key)
- if ret == NO_VALUE:
- scan_regions.append(region)
-
- # scan the regions that are not in the cache
- if len(scan_regions) > 0:
-
- g = Genome(genome)
-
- motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
- scan_func = partial(
- scan_region_mult,
- genome=g,
- motifs=motifs,
- nreport=nreport,
- scan_rc=scan_rc,
- )
+ if motifs is None:
+ motifs = read_motifs(self.motifs)
+ seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]
- for region, ret in self._scan_jobs(scan_func, scan_regions):
- # return values or store values in cache
- if self.use_cache:
- # store values in cache
- key = str(
- (
- region,
- genome,
- motif_digest,
- nreport,
- scan_rc,
- self.threshold_str,
- )
- )
- self.cache.set(key, ret)
- else:
- # return values
- yield ret
+ gc_bin_count = Counter(seq_gc_bins)
- if self.use_cache:
- # return results from cache
- for region in regions:
- key = str(
- (region, genome, motif_digest, nreport, scan_rc, self.threshold_str)
- )
- ret = self.cache.get(key)
- if ret == NO_VALUE or ret is None:
- raise Exception(
- "cache is not big enough to hold all "
- "results, try increasing the cache size "
- "or disable cache"
- )
- yield ret
+ _threshold = self.threshold
+ if zscore:
+ grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0)
+ _threshold = pd.DataFrame(
+ np.vstack(grouped.values),
+ index=_threshold.index,
+ columns=_threshold.columns,
+ )
+
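+        # Resample the per-GC-bin background thresholds in proportion to the GC
+        # composition of the input sequences; the FPR cutoff per motif is the 99th
+        # percentile of this mixture (motifs whose cutoff would exceed their maximum
+        # possible score get no threshold).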
+ nseqs = int(20000 / np.sum(list(gc_bin_count.values())))
+ t = {}
+ maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns)
+ # We do this in a loop as the DataFrame will get too big to fit in memory
+ # when the difference between the number of sequences per gc_bin is very
+ # high.
+ _threshold = _threshold.reset_index()
+ idx = np.hstack(
+ [
+ _threshold[_threshold[_threshold.columns[0]] == gc_bin]
+ .sample(nseqs * count, replace=True, random_state=42)
+ .index.values
+ for gc_bin, count in gc_bin_count.items()
+ ]
+ )
+ for motif in _threshold.columns[1:]:
+ val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher")
+ if val < maxt.loc[motif]:
+ t[motif] = val
+ else:
+ t[motif] = None
+ return t
def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):
scan_func = partial(
scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc
)
-
for ret in self._scan_jobs(scan_func, seqs):
yield ret[1]
- def _scan_sequences(self, seqs, nreport, scan_rc):
-
- motif_file = self.motifs
- motif_digest = self.checksum.get(motif_file, None)
-
- scan_seqs = seqs
- if self.use_cache:
- # determine which sequences are not in the cache
- hashes = dict([(s.upper(), xxhash.xxh64(s.upper()).digest()) for s in seqs])
- scan_seqs = []
-
- for seq, seq_hash in hashes.items():
- key = str(
- (seq_hash, motif_digest, nreport, scan_rc, self.threshold_str)
- )
- ret = self.cache.get(key)
- if ret == NO_VALUE or ret is None:
- scan_seqs.append(seq.upper())
-
- # scan the sequences that are not in the cache
- if len(scan_seqs) > 0:
- motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
- scan_func = partial(
- scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc
- )
-
- for seq, ret in self._scan_jobs(scan_func, scan_seqs):
- if self.use_cache:
- h = hashes[seq]
- key = str((h, motif_digest, nreport, scan_rc, self.threshold_str))
- self.cache.set(key, ret)
- else:
- yield ret
-
- if self.use_cache:
- # return results from cache
- for seq in seqs:
- key = str(
- (
- hashes[seq.upper()],
- motif_digest,
- nreport,
- scan_rc,
- self.threshold_str,
- )
- )
- ret = self.cache.get(key)
- if ret == NO_VALUE or ret is None:
- raise Exception(
- "cache is not big enough to hold all "
- "results, try increasing the cache size "
- "or disable cache"
- )
+ def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False):
+ thresholds = self.get_gc_thresholds(seqs, zscore=zscore)
+ motifs = [(m, thresholds[m.id]) for m in read_motifs(self.motifs)]
+ motifs_meanstd = None
+ if zscore:
+ motifs_meanstd = self.meanstd
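+            # per-motif mean and standard deviation of the background scores,
+            # used by the scanning workers to convert raw scores to z-scores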
- yield ret
+ scan_func = partial(
+ scan_seq_mult,
+ motifs=motifs,
+ nreport=nreport,
+ scan_rc=scan_rc,
+ motifs_meanstd=motifs_meanstd,
+ zscore=zscore,
+ )
+ for _, ret in self._scan_jobs(scan_func, seqs):
+ yield ret
def _scan_jobs(self, scan_func, scan_seqs):
batchsize = 1000
+
if self.ncpus > 1:
for i in range((len(scan_seqs) - 1) // batchsize + 1):
batch = scan_seqs[i * batchsize : (i + 1) * batchsize]
chunksize = len(batch) // self.ncpus + 1
jobs = []
for j in range((len(batch) - 1) // chunksize + 1):
- job = self.pool.apply_async(
- scan_func, (batch[j * chunksize : (j + 1) * chunksize],)
- )
+ batch_seqs = batch[j * chunksize : (j + 1) * chunksize]
+ seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs]
+ job = self.pool.apply_async(scan_func, (batch_seqs, seq_gc_bins))
jobs.append(job)
for k, job in enumerate(jobs):
@@ -1117,7 +1081,8 @@ def _scan_jobs(self, scan_func, scan_seqs):
yield region, ret
else:
for i in range((len(scan_seqs) - 1) // batchsize + 1):
- for _j, ret in enumerate(
- scan_func(scan_seqs[i * batchsize : (i + 1) * batchsize])
- ):
+ batch_seqs = scan_seqs[i * batchsize : (i + 1) * batchsize]
+ seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs]
+
+ for _j, ret in enumerate(scan_func(batch_seqs, seq_gc_bins)):
yield scan_seqs[i], ret
diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py
index 0319e8ad..3bb69d60 100644
--- a/gimmemotifs/tools/meme.py
+++ b/gimmemotifs/tools/meme.py
@@ -1,5 +1,6 @@
from .motifprogram import MotifProgram
import io
+import os
import re
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
@@ -75,8 +76,11 @@ def _run_program(self, bin, fastafile, params=None):
if not default_params["single"]:
cmd.append(strand)
- # sys.stderr.write(" ".join(cmd) + "\n")
- p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE)
+        # Workaround for running MEME inside Docker: make Open MPI launch
+        # local processes with `sh` instead of the (unavailable) ssh agent.
+ env = os.environ.copy()
+ env["OMPI_MCA_plm_rsh_agent"] = "sh"
+
+ p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env)
stdout, stderr = p.communicate()
motifs = []
diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py
index c8d17e45..5f39ddef 100644
--- a/gimmemotifs/tools/memew.py
+++ b/gimmemotifs/tools/memew.py
@@ -1,5 +1,6 @@
from .motifprogram import MotifProgram
import io
+import os
import re
from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
@@ -76,8 +77,12 @@ def _run_program(self, bin, fastafile, params=None):
if not default_params["single"]:
cmd.append(strand)
+        # Workaround for running MEME inside Docker: make Open MPI launch
+        # local processes with `sh` instead of the (unavailable) ssh agent.
+ env = os.environ.copy()
+ env["OMPI_MCA_plm_rsh_agent"] = "sh"
+
# sys.stderr.write(" ".join(cmd) + "\n")
- p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE)
+ p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env)
stdout, stderr = p.communicate()
motifs = []
diff --git a/gimmemotifs/tools/weeder.py b/gimmemotifs/tools/weeder.py
index 0361c2d2..0ad4334c 100644
--- a/gimmemotifs/tools/weeder.py
+++ b/gimmemotifs/tools/weeder.py
@@ -76,7 +76,7 @@ def _run_program(self, bin, fastafile, params=None):
shutil.copy(fastafile, name)
fastafile = name
- cmd = "{} -f {} -O".format(self.cmd, fastafile, weeder_organism)
+ cmd = "{} -f {} -O {}".format(self.cmd, fastafile, weeder_organism)
if params["single"]:
cmd += " -ss"
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index 17bd6fb9..a1b3c71b 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -5,8 +5,6 @@
# distribution.
""" Odds and ends that for which I didn't (yet) find another place """
-from __future__ import print_function
-
# Python imports
import os
import re
@@ -15,18 +13,21 @@
import logging
import mmap
import random
-import six
import tempfile
import requests
+from io import TextIOWrapper
+from functools import singledispatch
from subprocess import Popen
from tempfile import NamedTemporaryFile
from shutil import copyfile
# External imports
+import pyfaidx
from scipy import special
import numpy as np
import pybedtools
from genomepy import Genome
+from Bio.SeqIO.FastaIO import SimpleFastaParser
# gimme imports
@@ -49,8 +50,7 @@ def rc(seq):
def narrowpeak_to_bed(inputfile, bedfile, size=0):
- """Convert narrowPeak file to BED file.
- """
+ """Convert narrowPeak file to BED file."""
p = re.compile(r"^(#|track|browser)")
warn_no_summit = True
with open(bedfile, "w") as f_out:
@@ -88,7 +88,7 @@ def pfmfile_location(infile):
"database specified in the config file."
)
- if isinstance(infile, six.string_types):
+ if isinstance(infile, str):
if not os.path.exists(infile):
motif_dir = config.get_motif_dir()
checkfile = os.path.join(motif_dir, infile)
@@ -132,7 +132,7 @@ def phyper_single(k, good, bad, N):
def phyper(k, good, bad, N):
- """ Current hypergeometric implementation in scipy is broken,
+ """Current hypergeometric implementation in scipy is broken,
so here's the correct version.
"""
pvalues = [phyper_single(x, good, bad, N) for x in range(k + 1, N + 1)]
@@ -293,8 +293,8 @@ def motif_localization(fastafile, motif, size, outfile, cutoff=0.9):
def parse_cutoff(motifs, cutoff, default=0.9):
- """ Provide either a file with one cutoff per motif or a single cutoff
- returns a hash with motif id as key and cutoff as value
+ """Provide either a file with one cutoff per motif or a single cutoff
+ returns a hash with motif id as key and cutoff as value
"""
cutoffs = {}
@@ -473,7 +473,7 @@ def get_seqs_type(seqs):
- region file
- BED file
"""
- region_p = re.compile(r"^(.+):(\d+)-(\d+)$")
+ region_p = re.compile(r"^([^\s:]+\@)?(.+):(\d+)-(\d+)$")
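+    # matches plain regions ("chr1:100-200") as well as genome-prefixed
+    # regions ("hg38@chr1:100-200")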
if isinstance(seqs, Fasta):
return "fasta"
elif isinstance(seqs, list) or isinstance(seqs, np.ndarray):
@@ -499,24 +499,197 @@ def get_seqs_type(seqs):
raise ValueError("unknown type {}".format(type(seqs).__name__))
-def as_fasta(seqs, genome=None):
- ftype = get_seqs_type(seqs)
- if ftype == "fasta":
- return seqs
- elif ftype == "fastafile":
- return Fasta(seqs)
+# Regular expression to check for a region with the genome encoded (genome@chrom:start-end)
+region_p = re.compile(r"^[^@]+@([^\s]+):(\d+)-(\d+)$")
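+# e.g. "hg38@chr1:100-200" matches; a plain "chr1:100-200" does not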
+
+
+def _check_minsize(fa, minsize):
+ """
+    Raise a ValueError if any sequence is shorter than minsize.
+    If minsize is None, the sizes are not checked.
+ """
+ if minsize is None:
+ return fa
+
+ for name, seq in fa.items():
+ if len(seq) < minsize:
+ raise ValueError(f"sequence {name} is shorter than {minsize}")
+
+ return fa
+
+
+def _genomepy_convert(to_convert, genome, minsize=None):
+ """
+ Convert a variety of inputs using track2fasta().
+ """
+ if genome is None:
+ raise ValueError("input file is not a FASTA file, need a genome!")
+
+ if isinstance(genome, Genome):
+ g = genome
else:
- if genome is None:
- raise ValueError("need genome to convert to FASTA")
+ g = Genome(genome)
+
+ tmpfile = NamedTemporaryFile()
+ g.track2fasta(to_convert, tmpfile.name)
+
+ fa = as_seqdict(tmpfile.name)
+ return _check_minsize(fa, minsize)
+
+
+def _as_seqdict_genome_regions(regions, minsize=None):
+ """
+ Accepts list of regions where the genome is encoded in the region,
+ using the genome@chrom:start-end format.
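+    For example: "mm10@chr1:100-200" (the mm10 assembly must be available
+    through genomepy).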
+ """
+ genomic_regions = {}
+ for region in regions:
+ genome, region = region.split("@")
+ if genome not in genomic_regions:
+ Genome(genome)
+ genomic_regions[genome] = []
+ genomic_regions[genome].append(region)
+
+ tmpfa = NamedTemporaryFile(mode="w", delete=False)
+ for genome, g_regions in genomic_regions.items():
+ g = Genome(genome)
+
+ fa = g.track2fasta(g_regions)
+
+ for seq in fa:
+ seq.name = f"{genome}@{seq.name}"
+            print(repr(seq), file=tmpfa)
+
+ tmpfa.flush()
+
+ # Open tempfile and restore original sequence order
+ fa = as_seqdict(tmpfa.name)
+ fa = {region: fa[region] for region in regions}
+ return _check_minsize(fa, minsize)
+
+
+@singledispatch
+def as_seqdict(to_convert, genome=None, minsize=None):
+ """
+ Convert input to a dictionary with name as key and sequence as value.
+
+    If the input contains genomic coordinates, the genome needs to be
+    specified. If minsize is given, a ValueError is raised when any sequence
+    is shorter than minsize. If regions (or a region file) are used as input,
+    the genome can optionally be encoded in the region using the format
+    genome@chrom:start-end.
+
+    Currently supported input types:
+ * FASTA, BED and region files.
+ * List or numpy.ndarray of regions.
+ * pyfaidx.Fasta object.
+ * pybedtools.BedTool object.
+
+ Parameters
+ ----------
+ to_convert : list, str, pyfaidx.Fasta or pybedtools.BedTool
+ Input to convert to FASTA-like dictionary
+
+ genome : str, optional
+ Genomepy genome name.
+
+ minsize : int or None, optional
+ If specified, check if all sequences have at least size minsize.
+
+ Returns
+ -------
+ dict with sequence names as key and sequences as value.
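+
+    Examples
+    --------
+    Illustrative calls only; they assume the "hg38" genome has been installed
+    with genomepy and that the files exist:
+
+    >>> as_seqdict(["chr1:100-200", "chr2:300-400"], genome="hg38")
+    >>> as_seqdict("peaks.bed", genome="hg38", minsize=200)
+    >>> as_seqdict("sequences.fa")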
+ """
+    raise NotImplementedError(f"Not implemented for {type(to_convert)}")
+
+
+@as_seqdict.register(list)
+def _as_seqdict_list(to_convert, genome=None, minsize=None):
+ """
+ Accepts list of regions as input.
+ """
+ if region_p.match(to_convert[0]):
+ return _as_seqdict_genome_regions(to_convert, minsize)
+
+ return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(TextIOWrapper)
+def _as_seqdict_file_object(to_convert, genome=None, minsize=None):
+ """
+ Accepts file object as input, should be a FASTA file.
+ """
+ fa = {x: y for x, y in SimpleFastaParser(to_convert)}
+ return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(str)
+def _as_seqdict_filename(to_convert, genome=None, minsize=None):
+ """
+ Accepts filename as input.
+ """
+ if not os.path.exists(to_convert):
+ raise ValueError("Assuming filename, but it does not exist")
+
+    with open(to_convert) as f:
+        fa = as_seqdict(f)
+
+    if fa:
+ return _check_minsize(fa, minsize)
+
+ with open(to_convert) as f:
+ line = ""
+ while True:
+ line = f.readline()
+ if line == "":
+ break
+ if not line.startswith("#"):
+ break
+
+ if line == "":
+ raise IOError(f"empty file {to_convert}")
+
+ if region_p.match(line.strip()):
+ regions = [myline.strip() for myline in [line] + f.readlines()]
+            return _as_seqdict_genome_regions(regions, minsize=minsize)
+
+ # Biopython parser resulted in empty dict
+ # Assuming it's a BED or region file
+ return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(pyfaidx.Fasta)
+def _as_seqdict_pyfaidx(to_convert, genome=None, minsize=None):
+ """
+ Accepts pyfaidx.Fasta object as input.
+ """
+ fa = {k: str(v) for k, v in to_convert.items()}
+ return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(pybedtools.BedTool)
+def _as_seqdict_bedtool(to_convert, genome=None, minsize=None):
+ """
+ Accepts pybedtools.BedTool as input.
+ """
+ return _genomepy_convert(
+ ["{}:{}-{}".format(*f[:3]) for f in to_convert], genome, minsize
+ )
+
+
+@as_seqdict.register(np.ndarray)
+def _as_seqdict_array(to_convert, genome=None, minsize=None):
+ """
+ Accepts numpy.ndarray with regions as input.
+ """
+ return as_seqdict(list(to_convert), genome, minsize)
+
- tmpfa = NamedTemporaryFile()
- if isinstance(genome, str):
- genome = Genome(genome)
+def as_fasta(to_convert, genome=None, minsize=None):
+ if isinstance(to_convert, Fasta):
+ return to_convert
- if isinstance(seqs, np.ndarray):
- seqs = list(seqs)
- genome.track2fasta(seqs, tmpfa.name)
- return Fasta(tmpfa.name)
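+    # Everything else is converted to a {name: sequence} dict via as_seqdict()
+    # and wrapped in a Fasta object, so callers keep the same interface.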
+ return Fasta(fdict=as_seqdict(to_convert, genome, minsize))
def file_checksum(fname):
@@ -539,11 +712,11 @@ def file_checksum(fname):
return checksum
-def join_max(a, l, sep="", suffix=""):
+def join_max(a, length, sep="", suffix=""):
lengths = [len(x) for x in a]
total = 0
for i, size in enumerate(lengths + [0]):
- if total > (l - len(suffix)):
+ if total > (length - len(suffix)):
return sep.join(a[: i - 1]) + suffix
if i > 0:
total += 1
diff --git a/readthedocs.yml b/readthedocs.yml
index 9a6a856f..e458c84e 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -1,18 +1,18 @@
# Required
version: 2
-# Build documentation in the docs/ directory with Sphinx
+conda:
+ environment: .rtd-environment.yml
+
+# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
+build:
+ image: latest
+
python:
- version: 3.7
- install:
- - requirements: docs/requirements.txt
- - method: pip
- path: .
- extra_requirements:
- - docs
- - method: setuptools
- path: another/package
- system_packages: true
+ version: 3.7
+ install:
+ - method: setuptools
+ path: .
diff --git a/requirements.txt b/requirements.txt
index 604878ff..ec991b68 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,12 +10,9 @@ pyyaml >= 3.10
pybedtools
statsmodels
scikit-learn
-sklearn-contrib-lightning
seaborn
pysam
xgboost
diskcache
xxhash
-six
-future
pillow
diff --git a/scripts/combine_peaks b/scripts/combine_peaks
index 155d6e83..da2870c0 100644
--- a/scripts/combine_peaks
+++ b/scripts/combine_peaks
@@ -46,7 +46,7 @@ def read_peak_file_to_df(fname):
"qval",
"peak",
]
- df = pd.read_table(fname, names=header)
+ df = pd.read_table(fname, names=header, dtype={"chrom": "str"})
df["chrom"] = df["chrom"].astype(str)
# get the summit
@@ -57,7 +57,7 @@ def read_peak_file_to_df(fname):
df["value"] = df["qval"]
df = df[summit_header]
elif ftype == "bed":
- df = pd.read_table(fname, names=summit_header)
+ df = pd.read_table(fname, names=summit_header, dtype={"chrom": "str"})
if ((df["end"] - df["start"]) != 1).sum() != 0:
raise ValueError(f"{fname} does not contain summits.")
else:
@@ -116,7 +116,7 @@ def combine_peaks(peaks, genome, window, scale_value):
# store summit location + associated value in col4
df_all["col4"] = (
- df_all["chrom"]
+ df_all["chrom"].astype(str)
+ ";"
+ df_all["start"].astype(str)
+ ";"
diff --git a/scripts/coverage_table b/scripts/coverage_table
index 64385b65..14f3128f 100644
--- a/scripts/coverage_table
+++ b/scripts/coverage_table
@@ -9,6 +9,7 @@ import pysam
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
+import qnorm
from gimmemotifs import __version__
@@ -27,11 +28,12 @@ def make_table(
datafiles,
window,
log_transform=True,
- scale_table=True,
+ normalization="none",
top=0,
topmethod="var",
rmdup=True,
rmrepeats=True,
+    ncpus=12,
):
for x in datafiles:
if not os.path.isfile(x):
@@ -49,7 +51,7 @@ def make_table(
data = {}
try:
# Load data in parallel
- pool = multiprocessing.Pool(processes=12)
+ pool = multiprocessing.Pool(processes=ncpus)
jobs = []
for datafile in datafiles:
jobs.append(
@@ -99,9 +101,14 @@ def make_table(
if log_transform:
print("Log transform", file=sys.stderr)
df = np.log1p(df)
- if scale_table:
- print("Scale", file=sys.stderr)
+ if normalization == "scale":
+ print("Normalization by scaling", file=sys.stderr)
df[:] = scale(df, axis=0)
+ if normalization == "quantile":
+ print("Normalization by quantile normalization", file=sys.stderr)
+ df = qnorm.quantile_normalize(df)
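+        # qnorm.quantile_normalize() returns a new DataFrame with the same
+        # index and columns; after normalization every column (sample) shares
+        # a common value distribution.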
+ else:
+ print("No normalization", file=sys.stderr)
if top > 0:
if topmethod == "var":
@@ -165,12 +172,12 @@ if __name__ == "__main__":
action="store_true",
)
parser.add_argument(
- "-s",
- "--scale",
- dest="scale_table",
- help="Scale per datafile",
- default=False,
- action="store_true",
+ "-n",
+ "--normalization",
+ dest="normalization",
+ help="Normalization: none, quantile or scale",
+ default="none",
+ metavar="METHOD",
)
parser.add_argument(
"-t", "--top", dest="top", help="Select regions.", default=0, type=int
@@ -196,6 +203,14 @@ if __name__ == "__main__":
action="store_false",
default=True,
)
+ parser.add_argument(
+ "--nthreads",
+ dest="ncpus",
+ help="Number of threads",
+ metavar="INT",
+ type=int,
+ default=12,
+ )
args = parser.parse_args()
peakfile = args.peakfile
@@ -205,11 +220,12 @@ if __name__ == "__main__":
datafiles,
args.window,
log_transform=args.log_transform,
- scale_table=args.scale_table,
+ normalization=args.normalization,
top=args.top,
topmethod=args.topmethod,
rmdup=args.rmdup,
rmrepeats=args.rmrepeats,
+    ncpus=args.ncpus,
)
yesno = {True: "yes", False: "no"}
@@ -222,7 +238,7 @@ output.write("# Window: {}\n".format(args.window))
output.write("# Duplicates removed: {}\n".format(yesno[args.rmdup]))
output.write("# MAPQ 0 removed: {}\n".format(yesno[args.rmrepeats]))
output.write("# Log transformed: {}\n".format(yesno[args.log_transform]))
-output.write("# Scaled: {}\n".format(yesno[args.scale_table]))
+output.write("# Normalization: {}\n".format(args.normalization))
if args.top > 0:
output.write("# Top {} regions selected by {}\n".format(args.top, args.topmethod))
df.to_csv(output, sep="\t", float_format="%0.5f")
diff --git a/setup.py b/setup.py
index 5f9fd60f..b5dc2217 100644
--- a/setup.py
+++ b/setup.py
@@ -124,7 +124,6 @@ def run(self):
"pybedtools",
"statsmodels",
"scikit-learn",
- "sklearn-contrib-lightning",
"seaborn",
"pysam",
"xgboost >= 0.71",
@@ -132,11 +131,10 @@ def run(self):
"diskcache",
"xxhash",
"configparser",
- "six",
- "future",
- "genomepy >= 0.7.2",
+ "genomepy >= 0.8.3",
"tqdm",
"pillow",
"logomaker",
+ "qnorm",
],
)
diff --git a/test/data/rank/ranked.txt b/test/data/rank/ranked.txt
index f5f36f40..bb52cd79 100644
--- a/test/data/rank/ranked.txt
+++ b/test/data/rank/ranked.txt
@@ -1,6 +1,6 @@
a b c d e
-1 bZIP bZIP SOX T-box POU
-2 AP2 AP2 AP2 AP2 AP2
-3 T-box SOX T-box SOX SOX
-4 SOX T-box POU POU T-box
-5 POU POU bZIP bZIP bZIP
+bZIP 5 5 1 1 1
+AP2 4 4 4 4 4
+T-box 3 2 3 5 2
+SOX 2 3 5 3 3
+POU 1 1 2 2 4
diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py
index 269e5b92..53bffc77 100644
--- a/test/test_maelstrom.py
+++ b/test/test_maelstrom.py
@@ -26,13 +26,30 @@ def test1_maelstrom(self):
self.clusters,
"mm10",
self.outdir,
+ filter_redundant=False,
score_table=self.score_table,
count_table=self.count_table,
plot=False,
)
df = pd.read_table(self.outfile, index_col=0, comment="#")
print(df.shape)
- self.assertEquals((623, 4), df.shape)
+
+        self.assertEqual((623, 8), df.shape)
+
+ # Filter redundant motifs
+ run_maelstrom(
+ self.clusters,
+ "mm10",
+ self.outdir,
+ filter_redundant=True,
+ score_table=self.score_table,
+ count_table=self.count_table,
+ plot=False,
+ )
+ df = pd.read_table(self.outfile, index_col=0, comment="#")
+ print(df.shape)
+        self.assertEqual((156, 8), df.shape)
+
for fname in glob(os.path.join(self.outdir, "activity*")):
os.unlink(fname)
diff --git a/test/test_moap.py b/test/test_moap.py
index 775b3534..c6c13be3 100644
--- a/test/test_moap.py
+++ b/test/test_moap.py
@@ -20,7 +20,7 @@ def setUp(self):
def test1_moap(self):
""" Test motif activity prediction """
- for method in ["mwu", "rf", "lightningclassification"]:
+ for method in ["mwu", "rf"]:
df = moap(
self.clusters,
method=method,
@@ -41,7 +41,7 @@ def test1_moap(self):
def test2_moap(self):
""" Test motif activity prediction for two clusters """
- for method in ["mwu", "rf", "lightningclassification"]:
+ for method in ["mwu", "rf"]:
df = moap(
self.clusters2,
method=method,
diff --git a/test/test_rank.py b/test/test_rank.py
index 5cb533e9..cca52132 100644
--- a/test/test_rank.py
+++ b/test/test_rank.py
@@ -2,7 +2,7 @@
import tempfile
import os
import pandas as pd
-from gimmemotifs.rank import rankagg
+from gimmemotifs.rank import rankagg, _rankagg_stuart
class TestRank(unittest.TestCase):
@@ -17,13 +17,15 @@ def setUp(self):
def test1_rankagg(self):
""" Test rank aggregation """
df = pd.read_csv(self.fname, index_col=0, sep="\t")
- result = rankagg(df)
- self.assertEqual("AP2", result.sort_values("score").index[0])
+ result = rankagg(df, method="stuart")
+ self.assertEqual("AP2", result.sort_values("score").index[-1])
+ result = rankagg(df, method="int_stouffer")
+ self.assertEqual("AP2", result.sort_values("z-score").index[-1])
def test2_rankagg(self):
""" Test Python implementation of rank aggregation """
df = pd.read_csv(self.rank_in, index_col=0, sep="\t")
- result = rankagg(df)["score"].values
+ result = _rankagg_stuart(df)["score"].values
ref = pd.read_csv(self.rank_out, index_col=0, sep="\t")["score"].values
for v1, v2 in zip(ref, result):
self.assertAlmostEqual(v1, v2)
diff --git a/test/test_tools.py b/test/test_tools.py
index d6dba2db..9903ce54 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -35,6 +35,7 @@ def test_tool(tool_name):
"trawler", # unpredictable, sometimes doesn't find the motif
"weeder", # doesn't work at the moment
"posmo", # motif doesn't predictably look like AP1
+ "dreme", # current dreme in bioconda is broken
]:
return
diff --git a/test/test_utils.py b/test/test_utils.py
index 87f7b9da..c4f72c2f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -13,7 +13,7 @@ class TestUtils(unittest.TestCase):
""" A test class to test utils functions """
def setUp(self):
- self.genome_dir = "test/data/genome_index"
+ self.genomes_dir = "test/data/genome_index"
self.datadir = "test/data/utils"
def test1_phyper(self):
@@ -31,7 +31,7 @@ def test2_as_fasta(self):
""" convert bed, regions, etc to Fasta """
tmpdir = mkdtemp()
- g = Genome("genome", genome_dir=self.genome_dir)
+ g = Genome("genome", genomes_dir=self.genomes_dir)
fafile = os.path.join(self.datadir, "test.fa")
fa = Fasta(fafile)
diff --git a/versioneer.py b/versioneer.py
index 64fea1c8..cce201c7 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -1,4 +1,3 @@
-
# Version: 0.18
"""The Versioneer - like a rocketeer, but for versions.
@@ -276,7 +275,6 @@
"""
-from __future__ import print_function
try:
import configparser
except ImportError:
@@ -308,11 +306,13 @@ def get_root():
setup_py = os.path.join(root, "setup.py")
versioneer_py = os.path.join(root, "versioneer.py")
if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
- err = ("Versioneer was unable to run the project root directory. "
- "Versioneer requires setup.py to be executed from "
- "its immediate directory (like 'python setup.py COMMAND'), "
- "or in a way that lets it use sys.argv[0] to find the root "
- "(like 'python path/to/setup.py COMMAND').")
+ err = (
+ "Versioneer was unable to run the project root directory. "
+ "Versioneer requires setup.py to be executed from "
+ "its immediate directory (like 'python setup.py COMMAND'), "
+ "or in a way that lets it use sys.argv[0] to find the root "
+ "(like 'python path/to/setup.py COMMAND')."
+ )
raise VersioneerBadRootError(err)
try:
# Certain runtime workflows (setup.py install/develop in a setuptools
@@ -325,8 +325,10 @@ def get_root():
me_dir = os.path.normcase(os.path.splitext(me)[0])
vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
if me_dir != vsr_dir:
- print("Warning: build in %s is using versioneer.py from %s"
- % (os.path.dirname(me), versioneer_py))
+ print(
+ "Warning: build in %s is using versioneer.py from %s"
+ % (os.path.dirname(me), versioneer_py)
+ )
except NameError:
pass
return root
@@ -348,6 +350,7 @@ def get(parser, name):
if parser.has_option("versioneer", name):
return parser.get("versioneer", name)
return None
+
cfg = VersioneerConfig()
cfg.VCS = VCS
cfg.style = get(parser, "style") or ""
@@ -372,17 +375,18 @@ class NotThisMethod(Exception):
def register_vcs_handler(vcs, method): # decorator
"""Decorator to mark a method as the handler for a particular VCS."""
+
def decorate(f):
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f
+
return decorate
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
- env=None):
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
"""Call the given command(s)."""
assert isinstance(commands, list)
p = None
@@ -390,10 +394,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
try:
dispcmd = str([c] + args)
# remember shell=False, so use git.cmd on windows, not just git
- p = subprocess.Popen([c] + args, cwd=cwd, env=env,
- stdout=subprocess.PIPE,
- stderr=(subprocess.PIPE if hide_stderr
- else None))
+ p = subprocess.Popen(
+ [c] + args,
+ cwd=cwd,
+ env=env,
+ stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr else None),
+ )
break
except EnvironmentError:
e = sys.exc_info()[1]
@@ -418,7 +425,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
return stdout, p.returncode
-LONG_VERSION_PY['git'] = '''
+LONG_VERSION_PY[
+ "git"
+] = '''
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
@@ -993,7 +1002,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
- tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+ tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
@@ -1002,7 +1011,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
- tags = set([r for r in refs if re.search(r'\d', r)])
+ tags = set([r for r in refs if re.search(r"\d", r)])
if verbose:
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
@@ -1010,19 +1019,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
- r = ref[len(tag_prefix):]
+ r = ref[len(tag_prefix) :]
if verbose:
print("picking %s" % r)
- return {"version": r,
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": None,
- "date": date}
+ return {
+ "version": r,
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": None,
+ "date": date,
+ }
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
- return {"version": "0+unknown",
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": "no suitable tags", "date": None}
+ return {
+ "version": "0+unknown",
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": "no suitable tags",
+ "date": None,
+ }
@register_vcs_handler("git", "pieces_from_vcs")
@@ -1037,8 +1053,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
- out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
- hide_stderr=True)
+ out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
@@ -1046,10 +1061,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
- describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long",
- "--match", "%s*" % tag_prefix],
- cwd=root)
+ describe_out, rc = run_command(
+ GITS,
+ [
+ "describe",
+ "--tags",
+ "--dirty",
+ "--always",
+ "--long",
+ "--match",
+ "%s*" % tag_prefix,
+ ],
+ cwd=root,
+ )
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
@@ -1072,17 +1096,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
- git_describe = git_describe[:git_describe.rindex("-dirty")]
+ git_describe = git_describe[: git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
- mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+ mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
- pieces["error"] = ("unable to parse git-describe output: '%s'"
- % describe_out)
+ pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
return pieces
# tag
@@ -1091,10 +1114,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
- pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
- % (full_tag, tag_prefix))
+ pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
+ full_tag,
+ tag_prefix,
+ )
return pieces
- pieces["closest-tag"] = full_tag[len(tag_prefix):]
+ pieces["closest-tag"] = full_tag[len(tag_prefix) :]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
@@ -1105,13 +1130,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
- count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
- cwd=root)
+ count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
pieces["distance"] = int(count_out) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
- date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
- cwd=root)[0].strip()
+ date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
+ 0
+ ].strip()
pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
return pieces
@@ -1167,16 +1192,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
for i in range(3):
dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
- return {"version": dirname[len(parentdir_prefix):],
- "full-revisionid": None,
- "dirty": False, "error": None, "date": None}
+ return {
+ "version": dirname[len(parentdir_prefix) :],
+ "full-revisionid": None,
+ "dirty": False,
+ "error": None,
+ "date": None,
+ }
else:
rootdirs.append(root)
root = os.path.dirname(root) # up a level
if verbose:
- print("Tried directories %s but none started with prefix %s" %
- (str(rootdirs), parentdir_prefix))
+ print(
+ "Tried directories %s but none started with prefix %s"
+ % (str(rootdirs), parentdir_prefix)
+ )
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@@ -1205,11 +1236,13 @@ def versions_from_file(filename):
contents = f.read()
except EnvironmentError:
raise NotThisMethod("unable to read _version.py")
- mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON",
- contents, re.M | re.S)
+ mo = re.search(
+ r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
+ )
if not mo:
- mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON",
- contents, re.M | re.S)
+ mo = re.search(
+ r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
+ )
if not mo:
raise NotThisMethod("no version_json in _version.py")
return json.loads(mo.group(1))
@@ -1218,8 +1251,7 @@ def versions_from_file(filename):
def write_to_version_file(filename, versions):
"""Write the given version number to the given _version.py file."""
os.unlink(filename)
- contents = json.dumps(versions, sort_keys=True,
- indent=1, separators=(",", ": "))
+ contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
with open(filename, "w") as f:
f.write(SHORT_VERSION_PY % contents)
@@ -1251,8 +1283,7 @@ def render_pep440(pieces):
rendered += ".dirty"
else:
# exception #1
- rendered = "0+untagged.%d.g%s" % (pieces["distance"],
- pieces["short"])
+ rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
@@ -1366,11 +1397,13 @@ def render_git_describe_long(pieces):
def render(pieces, style):
"""Render the given version pieces into the requested style."""
if pieces["error"]:
- return {"version": "unknown",
- "full-revisionid": pieces.get("long"),
- "dirty": None,
- "error": pieces["error"],
- "date": None}
+ return {
+ "version": "unknown",
+ "full-revisionid": pieces.get("long"),
+ "dirty": None,
+ "error": pieces["error"],
+ "date": None,
+ }
if not style or style == "default":
style = "pep440" # the default
@@ -1390,9 +1423,13 @@ def render(pieces, style):
else:
raise ValueError("unknown style '%s'" % style)
- return {"version": rendered, "full-revisionid": pieces["long"],
- "dirty": pieces["dirty"], "error": None,
- "date": pieces.get("date")}
+ return {
+ "version": rendered,
+ "full-revisionid": pieces["long"],
+ "dirty": pieces["dirty"],
+ "error": None,
+ "date": pieces.get("date"),
+ }
class VersioneerBadRootError(Exception):
@@ -1415,8 +1452,9 @@ def get_versions(verbose=False):
handlers = HANDLERS.get(cfg.VCS)
assert handlers, "unrecognized VCS '%s'" % cfg.VCS
verbose = verbose or cfg.verbose
- assert cfg.versionfile_source is not None, \
- "please set versioneer.versionfile_source"
+ assert (
+ cfg.versionfile_source is not None
+ ), "please set versioneer.versionfile_source"
assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
versionfile_abs = os.path.join(root, cfg.versionfile_source)
@@ -1470,9 +1508,13 @@ def get_versions(verbose=False):
if verbose:
print("unable to compute version")
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None, "error": "unable to compute version",
- "date": None}
+ return {
+ "version": "0+unknown",
+ "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to compute version",
+ "date": None,
+ }
def get_version():
@@ -1521,6 +1563,7 @@ def run(self):
print(" date: %s" % vers.get("date"))
if vers["error"]:
print(" error: %s" % vers["error"])
+
cmds["version"] = cmd_version
# we override "build_py" in both distutils and setuptools
@@ -1553,14 +1596,15 @@ def run(self):
# now locate _version.py in the new build/ directory and replace
# it with an updated value
if cfg.versionfile_build:
- target_versionfile = os.path.join(self.build_lib,
- cfg.versionfile_build)
+ target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
print("UPDATING %s" % target_versionfile)
write_to_version_file(target_versionfile, versions)
+
cmds["build_py"] = cmd_build_py
if "cx_Freeze" in sys.modules: # cx_freeze enabled?
from cx_Freeze.dist import build_exe as _build_exe
+
# nczeczulin reports that py2exe won't like the pep440-style string
# as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
# setup(console=[{
@@ -1581,17 +1625,21 @@ def run(self):
os.unlink(target_versionfile)
with open(cfg.versionfile_source, "w") as f:
LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG %
- {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
+ f.write(
+ LONG
+ % {
+ "DOLLAR": "$",
+ "STYLE": cfg.style,
+ "TAG_PREFIX": cfg.tag_prefix,
+ "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+ "VERSIONFILE_SOURCE": cfg.versionfile_source,
+ }
+ )
+
cmds["build_exe"] = cmd_build_exe
del cmds["build_py"]
- if 'py2exe' in sys.modules: # py2exe enabled?
+ if "py2exe" in sys.modules: # py2exe enabled?
try:
from py2exe.distutils_buildexe import py2exe as _py2exe # py3
except ImportError:
@@ -1610,13 +1658,17 @@ def run(self):
os.unlink(target_versionfile)
with open(cfg.versionfile_source, "w") as f:
LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG %
- {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
+ f.write(
+ LONG
+ % {
+ "DOLLAR": "$",
+ "STYLE": cfg.style,
+ "TAG_PREFIX": cfg.tag_prefix,
+ "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+ "VERSIONFILE_SOURCE": cfg.versionfile_source,
+ }
+ )
+
cmds["py2exe"] = cmd_py2exe
# we override different "sdist" commands for both environments
@@ -1643,8 +1695,10 @@ def make_release_tree(self, base_dir, files):
# updated value
target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
print("UPDATING %s" % target_versionfile)
- write_to_version_file(target_versionfile,
- self._versioneer_generated_versions)
+ write_to_version_file(
+ target_versionfile, self._versioneer_generated_versions
+ )
+
cmds["sdist"] = cmd_sdist
return cmds
@@ -1699,11 +1753,13 @@ def do_setup():
root = get_root()
try:
cfg = get_config_from_root(root)
- except (EnvironmentError, configparser.NoSectionError,
- configparser.NoOptionError) as e:
+ except (
+ EnvironmentError,
+ configparser.NoSectionError,
+ configparser.NoOptionError,
+ ) as e:
if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
- print("Adding sample versioneer config to setup.cfg",
- file=sys.stderr)
+ print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
with open(os.path.join(root, "setup.cfg"), "a") as f:
f.write(SAMPLE_CONFIG)
print(CONFIG_ERROR, file=sys.stderr)
@@ -1712,15 +1768,18 @@ def do_setup():
print(" creating %s" % cfg.versionfile_source)
with open(cfg.versionfile_source, "w") as f:
LONG = LONG_VERSION_PY[cfg.VCS]
- f.write(LONG % {"DOLLAR": "$",
- "STYLE": cfg.style,
- "TAG_PREFIX": cfg.tag_prefix,
- "PARENTDIR_PREFIX": cfg.parentdir_prefix,
- "VERSIONFILE_SOURCE": cfg.versionfile_source,
- })
-
- ipy = os.path.join(os.path.dirname(cfg.versionfile_source),
- "__init__.py")
+ f.write(
+ LONG
+ % {
+ "DOLLAR": "$",
+ "STYLE": cfg.style,
+ "TAG_PREFIX": cfg.tag_prefix,
+ "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+ "VERSIONFILE_SOURCE": cfg.versionfile_source,
+ }
+ )
+
+ ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
if os.path.exists(ipy):
try:
with open(ipy, "r") as f:
@@ -1762,8 +1821,10 @@ def do_setup():
else:
print(" 'versioneer.py' already in MANIFEST.in")
if cfg.versionfile_source not in simple_includes:
- print(" appending versionfile_source ('%s') to MANIFEST.in" %
- cfg.versionfile_source)
+ print(
+ " appending versionfile_source ('%s') to MANIFEST.in"
+ % cfg.versionfile_source
+ )
with open(manifest_in, "a") as f:
f.write("include %s\n" % cfg.versionfile_source)
else: