From 5b23b2ec7cce1cdb864302896078ea38e9cfed83 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Fri, 17 Apr 2020 16:30:14 +0200
Subject: [PATCH 01/85] fix for Ensembl genomes

---
 gimmemotifs/background.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
index e308f556..32deb2df 100644
--- a/gimmemotifs/background.py
+++ b/gimmemotifs/background.py
@@ -360,6 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100):
         cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)]
 
     df.columns = cols
+    
+    # Make really sure that column 'chrom' is a string
+    df.dropna(subset=['chrom'], inplace=True)
+    df['chrom'] = df['chrom'].apply(str).astype("string")
+    
     df.reset_index()[cols].to_feather(fname)
 
 

From a5265906b440fa0290dda7a334169cb8179629a3 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Fri, 17 Apr 2020 17:12:20 +0200
Subject: [PATCH 02/85] style

---
 gimmemotifs/background.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
index 32deb2df..9ecf0318 100644
--- a/gimmemotifs/background.py
+++ b/gimmemotifs/background.py
@@ -360,11 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100):
         cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)]
 
     df.columns = cols
-    
+
     # Make really sure that column 'chrom' is a string
-    df.dropna(subset=['chrom'], inplace=True)
-    df['chrom'] = df['chrom'].apply(str).astype("string")
-    
+    df.dropna(subset=["chrom"], inplace=True)
+    df["chrom"] = df["chrom"].apply(str).astype("string")
+
     df.reset_index()[cols].to_feather(fname)
 
 

From 40684d54b5d8fb114f69152d23c3e9e4d1fa76ab Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Wed, 27 May 2020 08:33:39 +0200
Subject: [PATCH 03/85] fix #118

---
 gimmemotifs/denovo.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/gimmemotifs/denovo.py b/gimmemotifs/denovo.py
index 2d1913ac..2cd1225f 100644
--- a/gimmemotifs/denovo.py
+++ b/gimmemotifs/denovo.py
@@ -669,11 +669,14 @@ def gimme_motifs(
         sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
         final_motifs, stats = rename_motifs(sorted_motifs, result.stats)
 
-    with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
-        for m in final_motifs:
-            f.write("{}\n".format(m.to_pwm()))
+    motifs_found = len(final_motifs) > 0
 
-    if create_report:
+    if motifs_found:
+        with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
+            for m in final_motifs:
+                f.write("{}\n".format(m.to_pwm()))
+
+    if motifs_found and create_report:
         bg = dict([(b, os.path.join(tmpdir, "bg.{}.fa".format(b))) for b in background])
 
         create_denovo_motif_report(
@@ -700,7 +703,7 @@ def gimme_motifs(
 
     logger.info("finished")
     logger.info("output dir: %s", outdir)
-    if cluster:
+    if motifs_found and cluster:
         logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html"))
 
     return final_motifs

From b3aec2300afac7d5fd264ddbba03a17c4a6a9784 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Wed, 27 May 2020 08:37:28 +0200
Subject: [PATCH 04/85] MEME fix in docker

---
 gimmemotifs/tools/meme.py  | 1 +
 gimmemotifs/tools/memew.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py
index 0319e8ad..a6a4d639 100644
--- a/gimmemotifs/tools/meme.py
+++ b/gimmemotifs/tools/meme.py
@@ -58,6 +58,7 @@ def _run_program(self, bin, fastafile, params=None):
         number = default_params["number"]
 
         cmd = [
+            "OMPI_MCA_plm_rsh_agent=sh",  # Fix to run in Docker
             bin,
             fastafile,
             "-text",
diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py
index c8d17e45..8cdeac7c 100644
--- a/gimmemotifs/tools/memew.py
+++ b/gimmemotifs/tools/memew.py
@@ -57,6 +57,7 @@ def _run_program(self, bin, fastafile, params=None):
         number = default_params["number"]
 
         cmd = [
+            "OMPI_MCA_plm_rsh_agent=sh",
             bin,
             fastafile,
             "-text",

From cd04aaf3a8a074dab4886d0d4e29c69188e7ac68 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 14:56:34 +0200
Subject: [PATCH 05/85] remove six

---
 gimmemotifs/commands/__init__.py | 6 +-----
 gimmemotifs/motif.py             | 5 ++---
 gimmemotifs/scanner.py           | 3 +--
 gimmemotifs/utils.py             | 3 +--
 4 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/gimmemotifs/commands/__init__.py b/gimmemotifs/commands/__init__.py
index c20bb546..f944936c 100644
--- a/gimmemotifs/commands/__init__.py
+++ b/gimmemotifs/commands/__init__.py
@@ -1,13 +1,9 @@
 import pkgutil
-import six
 import os
 
 dirname = os.path.split(__file__)[0]
 
-if six.PY3:
-    level = 0
-else:
-    level = -1
+level = 0
 
 # Dynamically load all commands
 for _importer, cmdname, _ in pkgutil.iter_modules([dirname]):
diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index 3065a6cb..cb8aba8b 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -11,7 +11,6 @@
 import random
 from math import log, sqrt
 from warnings import warn
-import six
 
 from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME
 from gimmemotifs.c_metrics import pfmscan
@@ -1393,7 +1392,7 @@ def parse_motifs(motifs):
     motifs : list
         List of Motif instances.
     """
-    if isinstance(motifs, six.string_types):
+    if isinstance(motifs, str):
         with open(motifs) as f:
             if motifs.endswith("pwm") or motifs.endswith("pfm"):
                 motifs = read_motifs(f, fmt="pwm")
@@ -1518,7 +1517,7 @@ def read_motifs(infile=None, fmt="pfm", as_dict=False):
     if fmt == "pwm":
         fmt = "pfm"
 
-    if infile is None or isinstance(infile, six.string_types):
+    if infile is None or isinstance(infile, str):
         infile = pfmfile_location(infile)
         with open(infile) as f:
             motifs = _read_motifs_from_filehandle(f, fmt)
diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index d45d3906..337c912d 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -5,7 +5,6 @@
 from tempfile import mkdtemp, NamedTemporaryFile
 import logging
 import multiprocessing as mp
-import six
 
 # "hidden" features, in development
 try:
@@ -341,7 +340,7 @@ def scan_to_best_match(
     if genome:
         s.set_genome(genome)
 
-    if isinstance(motifs, six.string_types):
+    if isinstance(motifs, str):
         motifs = read_motifs(motifs)
 
     logger.debug("scanning %s...", fname)
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index 17bd6fb9..ad4b87b8 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -15,7 +15,6 @@
 import logging
 import mmap
 import random
-import six
 import tempfile
 import requests
 from subprocess import Popen
@@ -88,7 +87,7 @@ def pfmfile_location(infile):
                 "database specified in the config file."
             )
 
-    if isinstance(infile, six.string_types):
+    if isinstance(infile, str):
         if not os.path.exists(infile):
             motif_dir = config.get_motif_dir()
             checkfile = os.path.join(motif_dir, infile)

From 1d78626d4ccaf7e18966f1092a3e09db7de5c8e1 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 15:01:19 +0200
Subject: [PATCH 06/85] black

---
 conda_env.dev.txt         |   2 -
 conda_env.osx.txt         |   2 -
 conda_env.test.txt        |  37 ++++++
 conda_env.txt             |  16 +--
 gimmemotifs/background.py |   2 -
 gimmemotifs/comparison.py |   2 -
 gimmemotifs/moap.py       |   1 -
 gimmemotifs/plot.py       |   1 -
 gimmemotifs/utils.py      |   2 -
 requirements.txt          |   2 -
 setup.py                  |   2 -
 versioneer.py             | 267 +++++++++++++++++++++++---------------
 12 files changed, 207 insertions(+), 129 deletions(-)
 create mode 100644 conda_env.test.txt

diff --git a/conda_env.dev.txt b/conda_env.dev.txt
index 31297494..9704440a 100644
--- a/conda_env.dev.txt
+++ b/conda_env.dev.txt
@@ -3,7 +3,6 @@ configparser
 dinamo
 diskcache
 feather-format
-future
 gadem
 genomepy >=0.6.1
 ghostscript
@@ -27,7 +26,6 @@ pyyaml >=3.10
 scikit-learn >=0.18
 scipy >=1.3.0
 seaborn
-six
 sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index dc2adacb..8ee94491 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -2,7 +2,6 @@ bedtools
 configparser
 diskcache
 feather-format
-future
 gadem
 genomepy >=0.6.1
 ghostscript
@@ -24,7 +23,6 @@ pyyaml >=3.10
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
-six
 sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
diff --git a/conda_env.test.txt b/conda_env.test.txt
new file mode 100644
index 00000000..81760361
--- /dev/null
+++ b/conda_env.test.txt
@@ -0,0 +1,37 @@
+bedtools
+configparser
+dinamo
+diskcache
+feather-format
+gadem
+genomepy >=0.6.1
+ghostscript
+homer
+icu=58
+ipywidgets  # Necessary for progress bar in Jupyter notebook
+jinja2
+logomaker
+matplotlib >=2.0
+meme >=5
+ncurses
+numpy
+pillow
+prosampler
+pyarrow
+pybedtools
+python >=3.8
+python-xxhash
+pyyaml >=3.10
+scikit-learn >=0.18
+scipy <1.3.0
+seaborn
+sklearn-contrib-lightning
+statsmodels
+tqdm >=4.27.0
+trawler
+ucsc-bigbedtobed
+ucsc-genepredtobed
+weeder
+xdg
+xgboost >=0.71
+xxmotif
diff --git a/conda_env.txt b/conda_env.txt
index 5767b058..3c9ea758 100644
--- a/conda_env.txt
+++ b/conda_env.txt
@@ -1,39 +1,35 @@
-bedtools
 configparser
 dinamo
 diskcache
 feather-format
-future
 gadem
-genomepy >=0.6.1
+genomepy >=0.8.3
 ghostscript
 homer
-icu=58
 ipywidgets  # Necessary for progress bar in Jupyter notebook
 jinja2
 logomaker
-matplotlib >=2.0
-meme >=5
+matplotlib-base >=3.1.2
+meme >=5.1.1
 ncurses
 numpy
+pandas >=1.0.3
 pillow
 prosampler
-pyarrow
+pyarrow >=0.16.0
 pybedtools
 pysam
-python
+python >=3
 python-xxhash
 pyyaml >=3.10
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
-six
 sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
 trawler
 ucsc-bigbedtobed
-ucsc-genepredtobed
 weeder
 xdg
 xgboost >=0.71
diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
index 9ecf0318..60c96d53 100644
--- a/gimmemotifs/background.py
+++ b/gimmemotifs/background.py
@@ -11,8 +11,6 @@
 similar genomic distribution as the input.
 
 """
-from __future__ import division
-
 # Python imports
 import gzip
 import os
diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py
index 94c0b808..40565869 100644
--- a/gimmemotifs/comparison.py
+++ b/gimmemotifs/comparison.py
@@ -6,8 +6,6 @@
 """
 Module to compare DNA sequence motifs (positional frequency matrices)
 """
-from __future__ import print_function
-
 # Python imports
 import sys
 import os
diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 45b87356..16d0517b 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -4,7 +4,6 @@
 # the terms of the MIT License, see the file COPYING included with this
 # distribution.
 """ Module for motif activity prediction """
-from __future__ import print_function
 
 
 def warn(*args, **kwargs):
diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py
index d161c138..436f48ed 100644
--- a/gimmemotifs/plot.py
+++ b/gimmemotifs/plot.py
@@ -4,7 +4,6 @@
 # the terms of the MIT License, see the file COPYING included with this
 # distribution.
 """ Various plotting functions """
-from __future__ import print_function
 from PIL import Image
 import seaborn as sns
 from mpl_toolkits.axes_grid1 import ImageGrid
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index ad4b87b8..c65b58af 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -5,8 +5,6 @@
 # distribution.
 
 """ Odds and ends that for which I didn't (yet) find another place """
-from __future__ import print_function
-
 # Python imports
 import os
 import re
diff --git a/requirements.txt b/requirements.txt
index 604878ff..2ff4f9e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,4 @@ pysam
 xgboost
 diskcache
 xxhash
-six
-future
 pillow
diff --git a/setup.py b/setup.py
index 5f9fd60f..aa961c94 100644
--- a/setup.py
+++ b/setup.py
@@ -132,8 +132,6 @@ def run(self):
         "diskcache",
         "xxhash",
         "configparser",
-        "six",
-        "future",
         "genomepy >= 0.7.2",
         "tqdm",
         "pillow",
diff --git a/versioneer.py b/versioneer.py
index 64fea1c8..cce201c7 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -1,4 +1,3 @@
-
 # Version: 0.18
 
 """The Versioneer - like a rocketeer, but for versions.
@@ -276,7 +275,6 @@
 
 """
 
-from __future__ import print_function
 try:
     import configparser
 except ImportError:
@@ -308,11 +306,13 @@ def get_root():
         setup_py = os.path.join(root, "setup.py")
         versioneer_py = os.path.join(root, "versioneer.py")
     if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
-        err = ("Versioneer was unable to run the project root directory. "
-               "Versioneer requires setup.py to be executed from "
-               "its immediate directory (like 'python setup.py COMMAND'), "
-               "or in a way that lets it use sys.argv[0] to find the root "
-               "(like 'python path/to/setup.py COMMAND').")
+        err = (
+            "Versioneer was unable to run the project root directory. "
+            "Versioneer requires setup.py to be executed from "
+            "its immediate directory (like 'python setup.py COMMAND'), "
+            "or in a way that lets it use sys.argv[0] to find the root "
+            "(like 'python path/to/setup.py COMMAND')."
+        )
         raise VersioneerBadRootError(err)
     try:
         # Certain runtime workflows (setup.py install/develop in a setuptools
@@ -325,8 +325,10 @@ def get_root():
         me_dir = os.path.normcase(os.path.splitext(me)[0])
         vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
         if me_dir != vsr_dir:
-            print("Warning: build in %s is using versioneer.py from %s"
-                  % (os.path.dirname(me), versioneer_py))
+            print(
+                "Warning: build in %s is using versioneer.py from %s"
+                % (os.path.dirname(me), versioneer_py)
+            )
     except NameError:
         pass
     return root
@@ -348,6 +350,7 @@ def get(parser, name):
         if parser.has_option("versioneer", name):
             return parser.get("versioneer", name)
         return None
+
     cfg = VersioneerConfig()
     cfg.VCS = VCS
     cfg.style = get(parser, "style") or ""
@@ -372,17 +375,18 @@ class NotThisMethod(Exception):
 
 def register_vcs_handler(vcs, method):  # decorator
     """Decorator to mark a method as the handler for a particular VCS."""
+
     def decorate(f):
         """Store f in HANDLERS[vcs][method]."""
         if vcs not in HANDLERS:
             HANDLERS[vcs] = {}
         HANDLERS[vcs][method] = f
         return f
+
     return decorate
 
 
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
-                env=None):
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
     """Call the given command(s)."""
     assert isinstance(commands, list)
     p = None
@@ -390,10 +394,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
         try:
             dispcmd = str([c] + args)
             # remember shell=False, so use git.cmd on windows, not just git
-            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
-                                 stdout=subprocess.PIPE,
-                                 stderr=(subprocess.PIPE if hide_stderr
-                                         else None))
+            p = subprocess.Popen(
+                [c] + args,
+                cwd=cwd,
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=(subprocess.PIPE if hide_stderr else None),
+            )
             break
         except EnvironmentError:
             e = sys.exc_info()[1]
@@ -418,7 +425,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
     return stdout, p.returncode
 
 
-LONG_VERSION_PY['git'] = '''
+LONG_VERSION_PY[
+    "git"
+] = '''
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
@@ -993,7 +1002,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
     TAG = "tag: "
-    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
     if not tags:
         # Either we're using git < 1.8.3, or there really are no tags. We use
         # a heuristic: assume all version tags have a digit. The old git %d
@@ -1002,7 +1011,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         # between branches and tags. By ignoring refnames without digits, we
         # filter out many common branch names like "release" and
         # "stabilization", as well as "HEAD" and "master".
-        tags = set([r for r in refs if re.search(r'\d', r)])
+        tags = set([r for r in refs if re.search(r"\d", r)])
         if verbose:
             print("discarding '%s', no digits" % ",".join(refs - tags))
     if verbose:
@@ -1010,19 +1019,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     for ref in sorted(tags):
         # sorting will prefer e.g. "2.0" over "2.0rc1"
         if ref.startswith(tag_prefix):
-            r = ref[len(tag_prefix):]
+            r = ref[len(tag_prefix) :]
             if verbose:
                 print("picking %s" % r)
-            return {"version": r,
-                    "full-revisionid": keywords["full"].strip(),
-                    "dirty": False, "error": None,
-                    "date": date}
+            return {
+                "version": r,
+                "full-revisionid": keywords["full"].strip(),
+                "dirty": False,
+                "error": None,
+                "date": date,
+            }
     # no suitable tags, so version is "0+unknown", but full hex is still there
     if verbose:
         print("no suitable tags, using unknown + full revision id")
-    return {"version": "0+unknown",
-            "full-revisionid": keywords["full"].strip(),
-            "dirty": False, "error": "no suitable tags", "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": keywords["full"].strip(),
+        "dirty": False,
+        "error": "no suitable tags",
+        "date": None,
+    }
 
 
 @register_vcs_handler("git", "pieces_from_vcs")
@@ -1037,8 +1053,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     if sys.platform == "win32":
         GITS = ["git.cmd", "git.exe"]
 
-    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
-                          hide_stderr=True)
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
     if rc != 0:
         if verbose:
             print("Directory %s not under git control" % root)
@@ -1046,10 +1061,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
 
     # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
     # if there isn't one, this yields HEX[-dirty] (no NUM)
-    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
-                                          "--always", "--long",
-                                          "--match", "%s*" % tag_prefix],
-                                   cwd=root)
+    describe_out, rc = run_command(
+        GITS,
+        [
+            "describe",
+            "--tags",
+            "--dirty",
+            "--always",
+            "--long",
+            "--match",
+            "%s*" % tag_prefix,
+        ],
+        cwd=root,
+    )
     # --long was added in git-1.5.5
     if describe_out is None:
         raise NotThisMethod("'git describe' failed")
@@ -1072,17 +1096,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     dirty = git_describe.endswith("-dirty")
     pieces["dirty"] = dirty
     if dirty:
-        git_describe = git_describe[:git_describe.rindex("-dirty")]
+        git_describe = git_describe[: git_describe.rindex("-dirty")]
 
     # now we have TAG-NUM-gHEX or HEX
 
     if "-" in git_describe:
         # TAG-NUM-gHEX
-        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
         if not mo:
             # unparseable. Maybe git-describe is misbehaving?
-            pieces["error"] = ("unable to parse git-describe output: '%s'"
-                               % describe_out)
+            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
             return pieces
 
         # tag
@@ -1091,10 +1114,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
             if verbose:
                 fmt = "tag '%s' doesn't start with prefix '%s'"
                 print(fmt % (full_tag, tag_prefix))
-            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
-                               % (full_tag, tag_prefix))
+            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
+                full_tag,
+                tag_prefix,
+            )
             return pieces
-        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+        pieces["closest-tag"] = full_tag[len(tag_prefix) :]
 
         # distance: number of commits since tag
         pieces["distance"] = int(mo.group(2))
@@ -1105,13 +1130,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     else:
         # HEX: no tags
         pieces["closest-tag"] = None
-        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
-                                    cwd=root)
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
         pieces["distance"] = int(count_out)  # total number of commits
 
     # commit date: see ISO-8601 comment in git_versions_from_keywords()
-    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
-                       cwd=root)[0].strip()
+    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
+        0
+    ].strip()
     pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
 
     return pieces
@@ -1167,16 +1192,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
     for i in range(3):
         dirname = os.path.basename(root)
         if dirname.startswith(parentdir_prefix):
-            return {"version": dirname[len(parentdir_prefix):],
-                    "full-revisionid": None,
-                    "dirty": False, "error": None, "date": None}
+            return {
+                "version": dirname[len(parentdir_prefix) :],
+                "full-revisionid": None,
+                "dirty": False,
+                "error": None,
+                "date": None,
+            }
         else:
             rootdirs.append(root)
             root = os.path.dirname(root)  # up a level
 
     if verbose:
-        print("Tried directories %s but none started with prefix %s" %
-              (str(rootdirs), parentdir_prefix))
+        print(
+            "Tried directories %s but none started with prefix %s"
+            % (str(rootdirs), parentdir_prefix)
+        )
     raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
 
 
@@ -1205,11 +1236,13 @@ def versions_from_file(filename):
             contents = f.read()
     except EnvironmentError:
         raise NotThisMethod("unable to read _version.py")
-    mo = re.search(r"version_json = '''\n(.*)'''  # END VERSION_JSON",
-                   contents, re.M | re.S)
+    mo = re.search(
+        r"version_json = '''\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
+    )
     if not mo:
-        mo = re.search(r"version_json = '''\r\n(.*)'''  # END VERSION_JSON",
-                       contents, re.M | re.S)
+        mo = re.search(
+            r"version_json = '''\r\n(.*)'''  # END VERSION_JSON", contents, re.M | re.S
+        )
     if not mo:
         raise NotThisMethod("no version_json in _version.py")
     return json.loads(mo.group(1))
@@ -1218,8 +1251,7 @@ def versions_from_file(filename):
 def write_to_version_file(filename, versions):
     """Write the given version number to the given _version.py file."""
     os.unlink(filename)
-    contents = json.dumps(versions, sort_keys=True,
-                          indent=1, separators=(",", ": "))
+    contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
     with open(filename, "w") as f:
         f.write(SHORT_VERSION_PY % contents)
 
@@ -1251,8 +1283,7 @@ def render_pep440(pieces):
                 rendered += ".dirty"
     else:
         # exception #1
-        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
-                                          pieces["short"])
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
         if pieces["dirty"]:
             rendered += ".dirty"
     return rendered
@@ -1366,11 +1397,13 @@ def render_git_describe_long(pieces):
 def render(pieces, style):
     """Render the given version pieces into the requested style."""
     if pieces["error"]:
-        return {"version": "unknown",
-                "full-revisionid": pieces.get("long"),
-                "dirty": None,
-                "error": pieces["error"],
-                "date": None}
+        return {
+            "version": "unknown",
+            "full-revisionid": pieces.get("long"),
+            "dirty": None,
+            "error": pieces["error"],
+            "date": None,
+        }
 
     if not style or style == "default":
         style = "pep440"  # the default
@@ -1390,9 +1423,13 @@ def render(pieces, style):
     else:
         raise ValueError("unknown style '%s'" % style)
 
-    return {"version": rendered, "full-revisionid": pieces["long"],
-            "dirty": pieces["dirty"], "error": None,
-            "date": pieces.get("date")}
+    return {
+        "version": rendered,
+        "full-revisionid": pieces["long"],
+        "dirty": pieces["dirty"],
+        "error": None,
+        "date": pieces.get("date"),
+    }
 
 
 class VersioneerBadRootError(Exception):
@@ -1415,8 +1452,9 @@ def get_versions(verbose=False):
     handlers = HANDLERS.get(cfg.VCS)
     assert handlers, "unrecognized VCS '%s'" % cfg.VCS
     verbose = verbose or cfg.verbose
-    assert cfg.versionfile_source is not None, \
-        "please set versioneer.versionfile_source"
+    assert (
+        cfg.versionfile_source is not None
+    ), "please set versioneer.versionfile_source"
     assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
 
     versionfile_abs = os.path.join(root, cfg.versionfile_source)
@@ -1470,9 +1508,13 @@ def get_versions(verbose=False):
     if verbose:
         print("unable to compute version")
 
-    return {"version": "0+unknown", "full-revisionid": None,
-            "dirty": None, "error": "unable to compute version",
-            "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": None,
+        "dirty": None,
+        "error": "unable to compute version",
+        "date": None,
+    }
 
 
 def get_version():
@@ -1521,6 +1563,7 @@ def run(self):
             print(" date: %s" % vers.get("date"))
             if vers["error"]:
                 print(" error: %s" % vers["error"])
+
     cmds["version"] = cmd_version
 
     # we override "build_py" in both distutils and setuptools
@@ -1553,14 +1596,15 @@ def run(self):
             # now locate _version.py in the new build/ directory and replace
             # it with an updated value
             if cfg.versionfile_build:
-                target_versionfile = os.path.join(self.build_lib,
-                                                  cfg.versionfile_build)
+                target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
                 print("UPDATING %s" % target_versionfile)
                 write_to_version_file(target_versionfile, versions)
+
     cmds["build_py"] = cmd_build_py
 
     if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
         from cx_Freeze.dist import build_exe as _build_exe
+
         # nczeczulin reports that py2exe won't like the pep440-style string
         # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
         # setup(console=[{
@@ -1581,17 +1625,21 @@ def run(self):
                 os.unlink(target_versionfile)
                 with open(cfg.versionfile_source, "w") as f:
                     LONG = LONG_VERSION_PY[cfg.VCS]
-                    f.write(LONG %
-                            {"DOLLAR": "$",
-                             "STYLE": cfg.style,
-                             "TAG_PREFIX": cfg.tag_prefix,
-                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                             })
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
         cmds["build_exe"] = cmd_build_exe
         del cmds["build_py"]
 
-    if 'py2exe' in sys.modules:  # py2exe enabled?
+    if "py2exe" in sys.modules:  # py2exe enabled?
         try:
             from py2exe.distutils_buildexe import py2exe as _py2exe  # py3
         except ImportError:
@@ -1610,13 +1658,17 @@ def run(self):
                 os.unlink(target_versionfile)
                 with open(cfg.versionfile_source, "w") as f:
                     LONG = LONG_VERSION_PY[cfg.VCS]
-                    f.write(LONG %
-                            {"DOLLAR": "$",
-                             "STYLE": cfg.style,
-                             "TAG_PREFIX": cfg.tag_prefix,
-                             "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                             "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                             })
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
         cmds["py2exe"] = cmd_py2exe
 
     # we override different "sdist" commands for both environments
@@ -1643,8 +1695,10 @@ def make_release_tree(self, base_dir, files):
             # updated value
             target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
             print("UPDATING %s" % target_versionfile)
-            write_to_version_file(target_versionfile,
-                                  self._versioneer_generated_versions)
+            write_to_version_file(
+                target_versionfile, self._versioneer_generated_versions
+            )
+
     cmds["sdist"] = cmd_sdist
 
     return cmds
@@ -1699,11 +1753,13 @@ def do_setup():
     root = get_root()
     try:
         cfg = get_config_from_root(root)
-    except (EnvironmentError, configparser.NoSectionError,
-            configparser.NoOptionError) as e:
+    except (
+        EnvironmentError,
+        configparser.NoSectionError,
+        configparser.NoOptionError,
+    ) as e:
         if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
-            print("Adding sample versioneer config to setup.cfg",
-                  file=sys.stderr)
+            print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
             with open(os.path.join(root, "setup.cfg"), "a") as f:
                 f.write(SAMPLE_CONFIG)
         print(CONFIG_ERROR, file=sys.stderr)
@@ -1712,15 +1768,18 @@ def do_setup():
     print(" creating %s" % cfg.versionfile_source)
     with open(cfg.versionfile_source, "w") as f:
         LONG = LONG_VERSION_PY[cfg.VCS]
-        f.write(LONG % {"DOLLAR": "$",
-                        "STYLE": cfg.style,
-                        "TAG_PREFIX": cfg.tag_prefix,
-                        "PARENTDIR_PREFIX": cfg.parentdir_prefix,
-                        "VERSIONFILE_SOURCE": cfg.versionfile_source,
-                        })
-
-    ipy = os.path.join(os.path.dirname(cfg.versionfile_source),
-                       "__init__.py")
+        f.write(
+            LONG
+            % {
+                "DOLLAR": "$",
+                "STYLE": cfg.style,
+                "TAG_PREFIX": cfg.tag_prefix,
+                "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                "VERSIONFILE_SOURCE": cfg.versionfile_source,
+            }
+        )
+
+    ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
     if os.path.exists(ipy):
         try:
             with open(ipy, "r") as f:
@@ -1762,8 +1821,10 @@ def do_setup():
     else:
         print(" 'versioneer.py' already in MANIFEST.in")
     if cfg.versionfile_source not in simple_includes:
-        print(" appending versionfile_source ('%s') to MANIFEST.in" %
-              cfg.versionfile_source)
+        print(
+            " appending versionfile_source ('%s') to MANIFEST.in"
+            % cfg.versionfile_source
+        )
         with open(manifest_in, "a") as f:
             f.write("include %s\n" % cfg.versionfile_source)
     else:

From c23c50fa3e9853b39ef1b0bbef4696f180c5bee9 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 19:50:18 +0200
Subject: [PATCH 07/85] remove lightning

---
 conda_env.dev.txt   |  11 +-
 conda_env.osx.txt   |   1 -
 conda_env.test.txt  |   1 -
 conda_env.txt       |   1 -
 gimmemotifs/moap.py | 258 +-------------------------------------------
 requirements.txt    |   1 -
 setup.py            |   1 -
 7 files changed, 6 insertions(+), 268 deletions(-)

diff --git a/conda_env.dev.txt b/conda_env.dev.txt
index 9704440a..cb3f62e3 100644
--- a/conda_env.dev.txt
+++ b/conda_env.dev.txt
@@ -4,10 +4,8 @@ dinamo
 diskcache
 feather-format
 gadem
-genomepy >=0.6.1
-ghostscript
+genomepy >=0.8.3
 homer
-icu=58
 ipywidgets  # Necessary for progress bar in Jupyter notebook
 jinja2
 logomaker
@@ -17,16 +15,15 @@ ncurses
 numpy
 prosampler
 pillow
-pyarrow
+pyarrow >=0.16.0
 pybedtools
 pysam
 python
 python-xxhash
 pyyaml >=3.10
-scikit-learn >=0.18
-scipy >=1.3.0
+scikit-learn >=0.23
+scipy >=1.4.1
 seaborn
-sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
 trawler
diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index 8ee94491..8eb820e6 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -23,7 +23,6 @@ pyyaml >=3.10
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
-sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
 trawler
diff --git a/conda_env.test.txt b/conda_env.test.txt
index 81760361..a61ea95d 100644
--- a/conda_env.test.txt
+++ b/conda_env.test.txt
@@ -25,7 +25,6 @@ pyyaml >=3.10
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
-sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
 trawler
diff --git a/conda_env.txt b/conda_env.txt
index 3c9ea758..441b7f03 100644
--- a/conda_env.txt
+++ b/conda_env.txt
@@ -25,7 +25,6 @@ pyyaml >=3.10
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
-sklearn-contrib-lightning
 statsmodels
 tqdm >=4.27.0
 trawler
diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 16d0517b..08efe91e 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -37,8 +37,6 @@ def warn(*args, **kwargs):
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import MultiTaskLasso, BayesianRidge
 from sklearn.preprocessing import scale, LabelEncoder
-from lightning.classification import CDClassifier
-from lightning.regression import CDRegressor
 
 import xgboost
 
@@ -355,252 +353,6 @@ def fit(self, df_X, df_y):
         logger.info("Done")
 
 
-@register_predictor("LightningRegressor")
-class LightningRegressionMoap(Moap):
-    def __init__(self, scale=True, cv=3, ncpus=None):
-        """Predict motif activities using lightning CDRegressor
-
-        Parameters
-        ----------
-        scale : boolean, optional, default True
-            If ``True``, the motif scores will be scaled
-            before classification
-
-        cv : int, optional, default 3
-            Cross-validation k-fold parameter.
-
-        ncpus : int, optional
-            Number of threads. Default is the number specified in the config.
-
-        Attributes
-        ----------
-        act_ : DataFrame, shape (n_motifs, n_clusters)
-            fitted coefficients
-
-        sig_ : DataFrame, shape (n_motifs,)
-            boolean values, if coefficients are higher/lower than
-            the 1%t from random permutation
-        """
-
-        self.act_description = "activity values: coefficients from " "fitted model"
-
-        if ncpus is None:
-            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
-        self.ncpus = ncpus
-        self.kfolds = cv
-        self.scale = scale
-
-        self.act_ = None
-        self.pref_table = "score"
-        self.supported_tables = ["score", "count"]
-        self.ptype = "regression"
-
-    def fit(self, df_X, df_y, batch_size=50, shuffle=True, tmpdir=None):
-        logger.info("Fitting LightningRegression")
-
-        if self.scale:
-            # Scale motif scores
-            df_X[:] = scale(df_X, axis=0)
-
-        # Normalize across samples and features
-        # y = df_y.apply(scale, 1).apply(scale, 0)
-        y = df_y
-        X = df_X.loc[y.index]
-
-        if not y.shape[0] == X.shape[0]:
-            raise ValueError("number of regions is not equal")
-
-        # Define model
-        cd = CDRegressor(penalty="l1/l2", C=1.0)
-        parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 2)]}
-        clf = GridSearchCV(cd, parameters, n_jobs=self.ncpus)
-
-        if shuffle:
-            idx = list(y.sample(y.shape[1], axis=1, random_state=42).columns)
-        else:
-            idx = list(y.columns)
-
-        if tmpdir:
-            if not os.path.exists(tmpdir):
-                os.mkdir(tmpdir)
-
-        coefs = pd.DataFrame(index=X.columns)
-        start_i = 0
-        if tmpdir:
-            for i in range(0, len(idx), batch_size):
-                fname = os.path.join(tmpdir, "{}.feather".format(i))
-                if os.path.exists(fname) and os.path.exists(fname + ".done"):
-
-                    tmp = pd.read_feather(fname)
-                    tmp = tmp.set_index(tmp.columns[0])
-                    coefs = coefs.join(tmp)
-                else:
-                    logger.info("Resuming at batch {}".format(i))
-                    start_i = i
-                    break
-
-        for i in tqdm(range(start_i, len(idx), batch_size)):
-            split_y = y[idx[i : i + batch_size]]
-
-            # Fit model
-            clf.fit(X.values, split_y.values)
-            tmp = pd.DataFrame(
-                clf.best_estimator_.coef_.T, index=X.columns, columns=split_y.columns
-            )
-            if tmpdir:
-                fname = os.path.join(tmpdir, "{}.feather".format(i))
-                tmp.reset_index().rename(columns=str).to_feather(fname)
-                # Make sure we don't read corrupted files
-                open(fname + ".done", "a").close()
-            # Get coefficients
-            coefs = coefs.join(tmp)
-
-        # Get coefficients
-        self.act_ = coefs[y.columns]
-
-        logger.info("Done")
-
-
-@register_predictor("LightningClassification")
-class LightningClassificationMoap(Moap):
-    def __init__(self, scale=True, permute=False, ncpus=None):
-        """Predict motif activities using lightning CDClassifier
-
-        Parameters
-        ----------
-        scale : boolean, optional, default True
-            If ``True``, the motif scores will be scaled
-            before classification
-
-        ncpus : int, optional
-            Number of threads. Default is the number specified in the config.
-
-        Attributes
-        ----------
-        act_ : DataFrame, shape (n_motifs, n_clusters)
-            fitted coefficients
-
-        sig_ : DataFrame, shape (n_motifs,)
-            boolean values, if coefficients are higher/lower than
-            the 1%t from random permutation
-        """
-
-        self.act_description = "activity values: coefficients from " "fitted model"
-
-        # self.cdc = CDClassifier(random_state=args.seed)
-        self.cdc = CDClassifier()
-
-        self.parameters = {
-            "penalty": ["l1/l2"],
-            "loss": ["squared_hinge"],
-            "multiclass": [True],
-            "max_iter": [20],
-            "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
-            "C": [0.001, 0.01, 0.1, 0.5, 1.0],
-            "tol": [1e-3],
-        }
-
-        self.kfolds = 10
-
-        if ncpus is None:
-            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
-
-        self.clf = GridSearchCV(self.cdc, self.parameters, cv=self.kfolds, n_jobs=ncpus)
-
-        self.scale = scale
-        self.permute = permute
-
-        self.act_ = None
-        self.sig_ = None
-        self.pref_table = "score"
-        self.supported_tables = ["score", "count"]
-        self.ptype = "classification"
-
-    def fit(self, df_X, df_y):
-        logger.info("Fitting LightningClassification")
-
-        if not df_y.shape[0] == df_X.shape[0]:
-            raise ValueError("number of regions is not equal")
-        if df_y.shape[1] != 1:
-            raise ValueError("y needs to have 1 label column")
-
-        if self.scale:
-            # Scale motif scores
-            df_X[:] = scale(df_X, axis=0)
-
-        idx = list(range(df_y.shape[0]))
-
-        y = df_y.iloc[idx]
-        X = df_X.loc[y.index].values
-        y = y.values.flatten()
-
-        # Convert (putative) string labels
-        label = LabelEncoder()
-        y = label.fit_transform(y)
-
-        # Split data
-        X_train, X_test, y_train, y_test = train_test_split(X, y)
-
-        logger.debug("Setting parameters through cross-validation")
-        # Determine best parameters based on CV
-        self.clf.fit(X_train, y_train)
-
-        logger.debug(
-            "Average score ({} fold CV): {}".format(
-                self.kfolds, self.clf.score(X_test, y_test)
-            )
-        )
-
-        logger.debug("Estimate coefficients using bootstrapping")
-
-        # Estimate coefficients using bootstrappig
-        # b = BaggingClassifier(self.clf.best_estimator_,
-        #        max_samples=0.75, n_jobs=-1, random_state=state)
-        b = BaggingClassifier(self.clf.best_estimator_, max_samples=0.75, n_jobs=-1)
-        b.fit(X, y)
-
-        # Get mean coefficients
-        coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)
-
-        # Create dataframe of predicted coefficients
-        if len(label.classes_) == 2:
-            self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))
-        else:
-            self.act_ = pd.DataFrame(coeffs.T)
-
-        # Convert labels back to original names
-        self.act_.columns = label.inverse_transform(range(len(label.classes_)))
-        self.act_.index = df_X.columns
-
-        if self.permute:
-            # Permutations
-            logger.debug("Permutations")
-            random_dfs = []
-            for _ in range(10):
-                y_random = np.random.permutation(y)
-                b.fit(X, y_random)
-                coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0)
-
-                if len(label.classes_) == 2:
-                    random_dfs.append(pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))))
-                else:
-                    random_dfs.append(pd.DataFrame(coeffs.T))
-            random_df = pd.concat(random_dfs)
-
-            # Select cutoff based on percentile
-            high_cutoffs = random_df.quantile(0.99)
-            low_cutoffs = random_df.quantile(0.01)
-
-            # Set significance
-            self.sig_ = pd.DataFrame(index=df_X.columns)
-            self.sig_["sig"] = False
-
-            for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs):
-                self.sig_["sig"].loc[self.act_[col] >= c_high] = True
-                self.sig_["sig"].loc[self.act_[col] <= c_low] = True
-        logger.info("Done")
-
-
 @register_predictor("MWU")
 class MWUMoap(Moap):
     def __init__(self, *args, **kwargs):
@@ -934,7 +686,7 @@ def moap(
 
     method : str, optional
         Motif activity method to use. Any of 'hypergeom', 'lasso',
-        'lightningclassification', 'lightningregressor', 'bayesianridge',
+        'bayesianridge',
         'rf', 'xgboost'. Default is 'hypergeom'.
 
     scoring:  str, optional
@@ -1057,13 +809,7 @@ def moap(
 
     motifs = motifs.loc[df.index]
 
-    if method == "lightningregressor":
-        outdir = os.path.dirname(outfile)
-        tmpname = os.path.join(outdir, ".lightning.tmp")
-        clf.fit(motifs, df, tmpdir=tmpname)
-        shutil.rmtree(tmpname)
-    else:
-        clf.fit(motifs, df)
+    clf.fit(motifs, df)
 
     if outfile:
         with open(outfile, "w") as f:
diff --git a/requirements.txt b/requirements.txt
index 2ff4f9e7..ec991b68 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,6 @@ pyyaml >= 3.10
 pybedtools
 statsmodels
 scikit-learn
-sklearn-contrib-lightning
 seaborn
 pysam
 xgboost
diff --git a/setup.py b/setup.py
index aa961c94..5d790bf1 100644
--- a/setup.py
+++ b/setup.py
@@ -124,7 +124,6 @@ def run(self):
         "pybedtools",
         "statsmodels",
         "scikit-learn",
-        "sklearn-contrib-lightning",
         "seaborn",
         "pysam",
         "xgboost >= 0.71",

From 241d22ec739a8e3934a3ce200567d8fa878fc186 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 19:51:33 +0200
Subject: [PATCH 08/85] style

---
 gimmemotifs/moap.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 08efe91e..fdcabe98 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -17,7 +17,6 @@ def warn(*args, **kwargs):
 
 import os
 import sys
-import shutil
 
 try:
     from itertools import izip
@@ -32,8 +31,8 @@ def warn(*args, **kwargs):
 from tqdm.auto import tqdm
 
 # scikit-learn
-from sklearn.model_selection import train_test_split, GridSearchCV
-from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
+from sklearn.model_selection import GridSearchCV
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import MultiTaskLasso, BayesianRidge
 from sklearn.preprocessing import scale, LabelEncoder

From 3887c9df7cbd67753c262dd07d5dec7e93133c43 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 21:27:44 +0200
Subject: [PATCH 09/85] remove lightning tests

---
 test/test_moap.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_moap.py b/test/test_moap.py
index 775b3534..c6c13be3 100644
--- a/test/test_moap.py
+++ b/test/test_moap.py
@@ -20,7 +20,7 @@ def setUp(self):
     def test1_moap(self):
         """ Test motif activity prediction """
 
-        for method in ["mwu", "rf", "lightningclassification"]:
+        for method in ["mwu", "rf"]:
             df = moap(
                 self.clusters,
                 method=method,
@@ -41,7 +41,7 @@ def test1_moap(self):
     def test2_moap(self):
         """ Test motif activity prediction for two clusters """
 
-        for method in ["mwu", "rf", "lightningclassification"]:
+        for method in ["mwu", "rf"]:
             df = moap(
                 self.clusters2,
                 method=method,

From d1fd3b99443412b13b8ae9f4d5e2b88cf4ec298d Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Mon, 8 Jun 2020 21:28:14 +0200
Subject: [PATCH 10/85] black

---
 gimmemotifs/tools/meme.py  | 9 ++++++---
 gimmemotifs/tools/memew.py | 9 +++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py
index a6a4d639..3bb69d60 100644
--- a/gimmemotifs/tools/meme.py
+++ b/gimmemotifs/tools/meme.py
@@ -1,5 +1,6 @@
 from .motifprogram import MotifProgram
 import io
+import os
 import re
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
@@ -58,7 +59,6 @@ def _run_program(self, bin, fastafile, params=None):
         number = default_params["number"]
 
         cmd = [
-            "OMPI_MCA_plm_rsh_agent=sh",  # Fix to run in Docker
             bin,
             fastafile,
             "-text",
@@ -76,8 +76,11 @@ def _run_program(self, bin, fastafile, params=None):
         if not default_params["single"]:
             cmd.append(strand)
 
-        # sys.stderr.write(" ".join(cmd) + "\n")
-        p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE)
+        # Fix to run in Docker
+        env = os.environ.copy()
+        env["OMPI_MCA_plm_rsh_agent"] = "sh"
+
+        p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env)
         stdout, stderr = p.communicate()
 
         motifs = []
diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py
index 8cdeac7c..643d737c 100644
--- a/gimmemotifs/tools/memew.py
+++ b/gimmemotifs/tools/memew.py
@@ -1,6 +1,8 @@
 from .motifprogram import MotifProgram
 import io
+import os
 import re
+import sys
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
 
@@ -57,7 +59,6 @@ def _run_program(self, bin, fastafile, params=None):
         number = default_params["number"]
 
         cmd = [
-            "OMPI_MCA_plm_rsh_agent=sh",
             bin,
             fastafile,
             "-text",
@@ -77,8 +78,12 @@ def _run_program(self, bin, fastafile, params=None):
         if not default_params["single"]:
             cmd.append(strand)
 
+        # Fix to run in Docker
+        env = os.environ.copy()
+        env["OMPI_MCA_plm_rsh_agent"] = "sh"
+
         # sys.stderr.write(" ".join(cmd) + "\n")
-        p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE)
+        p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env)
         stdout, stderr = p.communicate()
 
         motifs = []

From b3e6f211de351a26f5b048c1d84e115d3945298d Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 09:06:50 +0200
Subject: [PATCH 11/85] fix test for genomepy>=0.8.3

---
 test/test_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 87f7b9da..c4f72c2f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -13,7 +13,7 @@ class TestUtils(unittest.TestCase):
     """ A test class to test utils functions """
 
     def setUp(self):
-        self.genome_dir = "test/data/genome_index"
+        self.genomes_dir = "test/data/genome_index"
         self.datadir = "test/data/utils"
 
     def test1_phyper(self):
@@ -31,7 +31,7 @@ def test2_as_fasta(self):
         """ convert bed, regions, etc to Fasta """
         tmpdir = mkdtemp()
 
-        g = Genome("genome", genome_dir=self.genome_dir)
+        g = Genome("genome", genomes_dir=self.genomes_dir)
 
         fafile = os.path.join(self.datadir, "test.fa")
         fa = Fasta(fafile)

From bb99e8e2619671489bf3081b084ac3dd83fbf8d0 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 09:12:14 +0200
Subject: [PATCH 12/85] fix flake errors

---
 gimmemotifs/background.py      | 20 +++++++++++---------
 gimmemotifs/commands/motifs.py |  2 +-
 gimmemotifs/comparison.py      | 10 ++++++----
 gimmemotifs/plot.py            |  2 +-
 gimmemotifs/tools/memew.py     |  1 -
 gimmemotifs/tools/weeder.py    |  2 +-
 gimmemotifs/utils.py           |  4 ++--
 7 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py
index 60c96d53..46d33233 100644
--- a/gimmemotifs/background.py
+++ b/gimmemotifs/background.py
@@ -249,15 +249,17 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None):
         for _i in range(k - 1):
             new_init = []
             for x in init:
-                for l in alphabet:
-                    new_init.append(x + l)
+                for letter in alphabet:
+                    new_init.append(x + letter)
             init = new_init[:]
 
-        self.trans = dict([(word, dict([(l, 0.0) for l in alphabet])) for word in init])
+        self.trans = dict(
+            [(word, dict([(letter, 0.0) for letter in alphabet])) for word in init]
+        )
         new_init = []
         for x in init:
-            for l in alphabet:
-                new_init.append(x + l)
+            for letter in alphabet:
+                new_init.append(x + letter)
 
         kmercount = dict([(word, 0) for word in new_init])
         lettercount = dict([(word[:k], 0) for word in new_init])
@@ -284,9 +286,9 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None):
         for k, v in lettercount.items():
             self.init[k] = v / total
 
-    def _generate_sequence(self, l):
+    def _generate_sequence(self, length):
         sequence = list(self._weighted_random(list(self.init.items())))
-        for _ in range(l - self.k):
+        for _ in range(length - self.k):
             sequence.append(
                 self._weighted_random(
                     list(self.trans["".join(sequence[-self.k :])].items())
@@ -294,10 +296,10 @@ def _generate_sequence(self, l):
             )
         return "".join(sequence)
 
-    def _weighted_random(self, l):
+    def _weighted_random(self, weighted_list):
         n = random.uniform(0, 1)
         item = None
-        for item, weight in l:  # noqa: B007
+        for item, weight in weighted_list:  # noqa: B007
             if n < weight:
                 break
             else:
diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index 429fca70..a1681c8d 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -151,7 +151,7 @@ def motifs(args):
     delete_sample = False
     if ftype == "narrowpeak":
         f = NamedTemporaryFile(delete=False)
-        logger.debug("Using %s as temporary BED file".format(f.name))
+        logger.debug("Using {} as temporary BED file".format(f.name))
         narrowpeak_to_bed(args.sample, f.name, size=args.size)
         sample = f.name
         delete_sample = True
diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py
index 40565869..f964205d 100644
--- a/gimmemotifs/comparison.py
+++ b/gimmemotifs/comparison.py
@@ -888,12 +888,14 @@ def generate_score_dist(self, motifs, match, metric, combine):
         f = open(score_file, "w")
 
         all_scores = {}
-        for l in [len(motif) for motif in motifs]:
-            all_scores[l] = {}
+        for motif_len in [len(motif) for motif in motifs]:
+            all_scores[motif_len] = {}
 
         sorted_motifs = {}
-        for l in all_scores.keys():
-            sorted_motifs[l] = [motif for motif in motifs if len(motif) == l]
+        for motif_len in all_scores.keys():
+            sorted_motifs[motif_len] = [
+                motif for motif in motifs if len(motif) == motif_len
+            ]
 
         for l1 in all_scores.keys():
             for l2 in all_scores.keys():
diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py
index 436f48ed..6703b6e4 100644
--- a/gimmemotifs/plot.py
+++ b/gimmemotifs/plot.py
@@ -354,7 +354,7 @@ def _get_motif_tree(tree, data, circle=True, vmin=None, vmax=None):
     m = 25 / data.values.max()
 
     for node in t.traverse("levelorder"):
-        val = data[[l.name for l in node.get_leaves()]].values.mean()
+        val = data[[leaf.name for leaf in node.get_leaves()]].values.mean()
         style = NodeStyle()
         style["size"] = 0
 
diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py
index 643d737c..5f39ddef 100644
--- a/gimmemotifs/tools/memew.py
+++ b/gimmemotifs/tools/memew.py
@@ -2,7 +2,6 @@
 import io
 import os
 import re
-import sys
 from subprocess import Popen, PIPE
 from tempfile import NamedTemporaryFile
 
diff --git a/gimmemotifs/tools/weeder.py b/gimmemotifs/tools/weeder.py
index 0361c2d2..0ad4334c 100644
--- a/gimmemotifs/tools/weeder.py
+++ b/gimmemotifs/tools/weeder.py
@@ -76,7 +76,7 @@ def _run_program(self, bin, fastafile, params=None):
         shutil.copy(fastafile, name)
         fastafile = name
 
-        cmd = "{} -f {} -O".format(self.cmd, fastafile, weeder_organism)
+        cmd = "{} -f {} -O {}".format(self.cmd, fastafile, weeder_organism)
 
         if params["single"]:
             cmd += " -ss"
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index c65b58af..27d7bed3 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -536,11 +536,11 @@ def file_checksum(fname):
     return checksum
 
 
-def join_max(a, l, sep="", suffix=""):
+def join_max(a, length, sep="", suffix=""):
     lengths = [len(x) for x in a]
     total = 0
     for i, size in enumerate(lengths + [0]):
-        if total > (l - len(suffix)):
+        if total > (length - len(suffix)):
             return sep.join(a[: i - 1]) + suffix
         if i > 0:
             total += 1

From cb63c3ce687ff70137a2aaf8f709de081b4e7c21 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 09:47:58 +0200
Subject: [PATCH 13/85] update test

---
 test/test_tools.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_tools.py b/test/test_tools.py
index d6dba2db..9903ce54 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -35,6 +35,7 @@ def test_tool(tool_name):
         "trawler",  # unpredictable, sometimes doesn't find the motif
         "weeder",  # doesn't work at the moment
         "posmo",  # motif doesn't predictably look like AP1
+        "dreme",  # current dreme in bioconda is broken
     ]:
         return
 

From 8cbb1c318a74abdf7d20020f605c1ce37744fe8b Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 10:09:44 +0200
Subject: [PATCH 14/85] check if openmp problem is resolved

---
 conda_env.osx.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index 8eb820e6..b4944b03 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -8,6 +8,7 @@ ghostscript
 homer
 jinja2
 logomaker
+llvm-openmp
 matplotlib >=2.0
 meme >=5
 ncurses

From 40c24555e95a0cc39686ce71aae9bd938386551e Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 10:28:08 +0200
Subject: [PATCH 15/85] test osx fix

---
 conda_env.osx.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index b4944b03..dc597576 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -31,7 +31,7 @@ ucsc-bigbedtobed
 ucsc-genepredtobed
 weeder
 xdg
-xgboost >=0.71
+py-xgboost=0.90
 xxmotif
 
 # development-specific

From a42f71bb6fde00c804dfb71d101339e36187e482 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 11:00:13 +0200
Subject: [PATCH 16/85] update travis config

---
 .travis.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b452bd84..c239bba1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,8 +21,11 @@ before_install:
     fi
   - chmod +x miniconda.sh
   - ./miniconda.sh -b -p $HOME/miniconda -f
-  - export PATH=$HOME/miniconda/bin:$PATH
-  - conda config --set always_yes yes 
+  - source "$HOME/miniconda/etc/profile.d/conda.sh"
+  - hash -r
+  - conda config --set always_yes yes --set changeps1 no
+  - conda update -q conda
+  - conda info -a
   - if [ "$TRAVIS_OS_NAME" == "osx" ]; then ulimit -S -n 4096; ulimit -a; fi
 
 install:
@@ -34,7 +37,7 @@ install:
     else
       conda env create -q -f conda_env.osx.txt -n gimme;
     fi
-  - source activate gimme
+  - conda activate gimme
   - python setup.py build && pip install -e .
 
 before_script:

From e6ab1875f1a1707fefd17b5d319b07ff83f30cb9 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 11:51:12 +0200
Subject: [PATCH 17/85] struggling with osx

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index c239bba1..c8a17716 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,6 +38,7 @@ install:
       conda env create -q -f conda_env.osx.txt -n gimme;
     fi
   - conda activate gimme
+  - conda list
   - python setup.py build && pip install -e .
 
 before_script:

From fc8b26974f2d5839b461ce39ff82b00b01da03fb Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 12:17:07 +0200
Subject: [PATCH 18/85] osx

---
 conda_env.osx.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index dc597576..84842a0e 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -21,8 +21,8 @@ pysam
 python
 python-xxhash
 pyyaml >=3.10
-scikit-learn >=0.18
-scipy <1.3.0
+scikit-learn >=0.23
+scipy
 seaborn
 statsmodels
 tqdm >=4.27.0
@@ -31,7 +31,7 @@ ucsc-bigbedtobed
 ucsc-genepredtobed
 weeder
 xdg
-py-xgboost=0.90
+xgboost=0.72
 xxmotif
 
 # development-specific

From 353729d35879eb6b27e087ec14ca6fd74617ccb8 Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Tue, 9 Jun 2020 13:34:15 +0200
Subject: [PATCH 19/85] test again

---
 conda_env.osx.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index 84842a0e..9c931dd4 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -3,7 +3,7 @@ configparser
 diskcache
 feather-format
 gadem
-genomepy >=0.6.1
+genomepy >=0.8.3
 ghostscript
 homer
 jinja2
@@ -21,7 +21,7 @@ pysam
 python
 python-xxhash
 pyyaml >=3.10
-scikit-learn >=0.23
+scikit-learn
 scipy
 seaborn
 statsmodels

From 8065900f21458eae6b7fb60a3c02b3a05738c65e Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Wed, 24 Jun 2020 10:04:32 +0200
Subject: [PATCH 20/85] coverage_table configurable nr of threads

---
 scripts/coverage_table | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/scripts/coverage_table b/scripts/coverage_table
index 64385b65..4997304b 100644
--- a/scripts/coverage_table
+++ b/scripts/coverage_table
@@ -32,6 +32,7 @@ def make_table(
     topmethod="var",
     rmdup=True,
     rmrepeats=True,
+    ncpus=12
 ):
     for x in datafiles:
         if not os.path.isfile(x):
@@ -49,7 +50,7 @@ def make_table(
     data = {}
     try:
         # Load data in parallel
-        pool = multiprocessing.Pool(processes=12)
+        pool = multiprocessing.Pool(processes=ncpus)
         jobs = []
         for datafile in datafiles:
             jobs.append(
@@ -196,7 +197,14 @@ if __name__ == "__main__":
         action="store_false",
         default=True,
     )
-
+    parser.add_argument(
+        "--nthreads",
+        dest="ncpus",
+        help="Number of threads",
+        metavar="INT",
+        type=int,
+        default=12,
+    )
     args = parser.parse_args()
     peakfile = args.peakfile
     datafiles = args.datafiles
@@ -210,6 +218,7 @@ if __name__ == "__main__":
         topmethod=args.topmethod,
         rmdup=args.rmdup,
         rmrepeats=args.rmrepeats,
+        ncpus=args.ncpus
     )
 
 yesno = {True: "yes", False: "no"}

From 76e82d1a484489d896332ae430e2d270879e3396 Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Wed, 24 Jun 2020 10:06:43 +0200
Subject: [PATCH 21/85] minor style

---
 scripts/coverage_table | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/coverage_table b/scripts/coverage_table
index 4997304b..829e67df 100644
--- a/scripts/coverage_table
+++ b/scripts/coverage_table
@@ -205,6 +205,7 @@ if __name__ == "__main__":
         type=int,
         default=12,
     )
+
     args = parser.parse_args()
     peakfile = args.peakfile
     datafiles = args.datafiles

From 71e8f48e29c555128130d04a6ecc5bd142a08bc5 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 11:57:40 +0200
Subject: [PATCH 22/85] center maelstrom input by default; improved maelstrom
 report

---
 .../sortable/sortable-theme-slick.css         |   3 +-
 data/templates/table.tpl                      | 215 +++++++
 gimmemotifs/cli.py                            |   7 +
 gimmemotifs/commands/maelstrom.py             |   2 +
 gimmemotifs/maelstrom.py                      |  36 +-
 gimmemotifs/report.py                         | 553 +++++++++++++++---
 6 files changed, 719 insertions(+), 97 deletions(-)
 create mode 100644 data/templates/table.tpl

diff --git a/data/templates/sortable/sortable-theme-slick.css b/data/templates/sortable/sortable-theme-slick.css
index df20f5a8..a699c304 100644
--- a/data/templates/sortable/sortable-theme-slick.css
+++ b/data/templates/sortable/sortable-theme-slick.css
@@ -1,6 +1,7 @@
 /* line 2, ../sass/_sortable.sass */
 table[data-sortable] {
-  font-size: 80%;
+  font-family: 'Nunito Sans';
+  font-size: 90%;
   border-collapse: collapse;
   border-spacing: 0;
 }
diff --git a/data/templates/table.tpl b/data/templates/table.tpl
new file mode 100644
index 00000000..bf836b55
--- /dev/null
+++ b/data/templates/table.tpl
@@ -0,0 +1,215 @@
+{# Update the template_structure.html document too #}
+{%- block before_style -%}{%- endblock before_style -%}
+{% block style %}
+<link rel="stylesheet" href="https://fonts.googleapis.com/css?family={{font.replace(" ", "+")}}"/>
+<style  type="text/css" >
+/* line 2, ../sass/_sortable.sass */
+table[data-sortable] {
+  font-family: '{{font}}';
+  font-size: 90%;
+  border-collapse: collapse;
+  border-spacing: 0;
+}
+/* line 6, ../sass/_sortable.sass */
+table[data-sortable] th {
+  vertical-align: bottom;
+  font-weight: bold;
+}
+/* line 10, ../sass/_sortable.sass */
+table[data-sortable] th, table[data-sortable] td {
+  text-align: left;
+  padding: 2px;
+}
+/* line 14, ../sass/_sortable.sass */
+table[data-sortable] th:not([data-sortable="false"]) {
+  -webkit-user-select: none;
+  -moz-user-select: none;
+  -ms-user-select: none;
+  -o-user-select: none;
+  user-select: none;
+  -webkit-tap-highlight-color: rgba(0, 0, 0, 0);
+  -webkit-touch-callout: none;
+  cursor: pointer;
+}
+/* line 26, ../sass/_sortable.sass */
+table[data-sortable] th:after {
+  content: "";
+  visibility: hidden;
+  display: inline-block;
+  vertical-align: inherit;
+  height: 0;
+  width: 0;
+  border-width: 5px;
+  border-style: solid;
+  border-color: transparent;
+  margin-right: 1px;
+  margin-left: 10px;
+  float: right;
+}
+/* line 40, ../sass/_sortable.sass */
+table[data-sortable] th[data-sorted="true"]:after {
+  visibility: visible;
+}
+/* line 43, ../sass/_sortable.sass */
+table[data-sortable] th[data-sorted-direction="descending"]:after {
+  border-top-color: inherit;
+  margin-top: 8px;
+}
+/* line 47, ../sass/_sortable.sass */
+table[data-sortable] th[data-sorted-direction="ascending"]:after {
+  border-bottom-color: inherit;
+  margin-top: 3px;
+}
+
+/* line 6, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick {
+  color: #333333;
+  background: white;
+  border: 1px solid #e0e0e0;
+}
+/* line 11, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick thead th {
+  background-image: -webkit-gradient(linear, 50% 0%, 50% 100%, color-stop(0%, #ffffff), color-stop(100%, #eeeeee));
+  background-image: -webkit-linear-gradient(#ffffff, #eeeeee);
+  background-image: -moz-linear-gradient(#ffffff, #eeeeee);
+  background-image: -o-linear-gradient(#ffffff, #eeeeee);
+  background-image: linear-gradient(#ffffff, #eeeeee);
+  background-color: #f0f0f0;
+}
+/* line 16, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick tbody td {
+  border-top: 1px solid #e0e0e0;
+}
+/* line 19, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick tbody > tr:nth-child(odd) > td {
+  background-color: #f9f9f9;
+}
+/* line 22, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick th[data-sorted="true"] {
+  -webkit-box-shadow: inset 1px 0 #bce8f1, inset -1px 0 #bce8f1;
+  -moz-box-shadow: inset 1px 0 #bce8f1, inset -1px 0 #bce8f1;
+  box-shadow: inset 1px 0 #bce8f1, inset -1px 0 #bce8f1;
+  color: #3a87ad;
+  background: #d9edf7;
+  border-bottom-color: #bce8f1;
+}
+/* line 28, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick th[data-sorted="true"]:first-child {
+  -webkit-box-shadow: inset -1px 0 #bce8f1;
+  -moz-box-shadow: inset -1px 0 #bce8f1;
+  box-shadow: inset -1px 0 #bce8f1;
+}
+/* line 31, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick th[data-sorted="true"]:last-child {
+  -webkit-box-shadow: inset 1px 0 #bce8f1;
+  -moz-box-shadow: inset 1px 0 #bce8f1;
+  box-shadow: inset 1px 0 #bce8f1;
+}
+/* line 34, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick th[data-sorted="true"][data-sorted-direction="descending"]:after {
+  border-top-color: #3a87ad;
+}
+/* line 37, ../sass/sortable-theme-slick.sass */
+table[data-sortable].sortable-theme-slick th[data-sorted="true"][data-sorted-direction="ascending"]:after {
+  border-bottom-color: #3a87ad;
+}
+
+
+{% block col_heading_style %}
+    .{{col_heading_style.name}} {
+    {% for p,val in col_heading_style.props %}
+      {{p}}: {{val}};
+    {% endfor -%}
+    }
+{% endblock col_heading_style %}
+
+{% block circle_styles %}
+{% for s in circle_styles %}
+    .{{s.name}} {
+    {% for p,val in s.props %}
+      {{p}}: {{val}};
+    {% endfor -%}
+    }
+{%- endfor -%}
+{% endblock circle_styles %}
+{% block palette_styles %}
+{% for s in palette_styles %}
+    .{{s.name}} {
+    {% for p,val in s.props %}
+      {{p}}: {{val}};
+    {% endfor -%}
+    }
+{%- endfor -%}
+{% endblock palette_styles %}
+
+
+{% block table_styles %}
+{% for s in table_styles %}
+    #T_{{uuid}} {{s.selector}} {
+    {% for p,val in s.props %}
+      {{p}}: {{val}};
+    {% endfor -%}
+    }
+{%- endfor -%}
+{% endblock table_styles %}
+{% block before_cellstyle %}{% endblock before_cellstyle %}
+{% block cellstyle %}
+{%- for s in cellstyle %}
+    #T_{{uuid}}{{s.selector}} {
+    {% for p,val in s.props %}
+        {{p}}: {{val}};
+    {% endfor %}
+    }
+{%- endfor -%}
+{%- endblock cellstyle %}
+</style>
+{%- endblock style %}
+{%- block before_table %}{% endblock before_table %}
+{%- block table %}
+<table id="T_{{uuid}}" {% if table_attributes %}{{ table_attributes }}{% endif %}>
+{%- block caption %}
+{%- if caption -%}
+    <caption>{{caption}}</caption>
+{%- endif -%}
+{%- endblock caption %}
+{%- block thead %}
+<thead>
+    {%- block before_head_rows %}{% endblock %}
+    {%- for r in head %}
+    {%- block head_tr scoped %}
+    <tr>
+        {%- for c in r %}
+        {%- if c.is_visible != False %}
+        <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}}</{{ c.type }}>
+        {%- endif %}
+        {%- endfor %}
+    </tr>
+    {%- endblock head_tr %}
+    {%- endfor %}
+    {%- block after_head_rows %}{% endblock %}
+</thead>
+{%- endblock thead %}
+{%- block tbody %}
+<tbody>
+    {% block before_rows %}{% endblock before_rows %}
+    {% for r in body %}
+    {% block tr scoped %}
+    <tr>
+        {% for c in r %}
+        {% if c.is_visible != False %}
+        <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }}</{{ c.type }}>
+        {% endif %}
+        {%- endfor %}
+    </tr>
+    {% endblock tr %}
+    {%- endfor %}
+    {%- block after_rows %}{%- endblock after_rows %}
+</tbody>
+{%- endblock tbody %}
+</table>
+{%- endblock table %}
+{%- block after_table %}{% endblock after_table %}
+<script>
+/*! sortable.js 0.8.0 */
+(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(-?[£$¤]?[\d,.e\-]+%?|inf)$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new 
CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this);
+</script>
diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index bc0e602f..4878e96b 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -326,6 +326,13 @@ def cli(sys_args):
         default=default_pfm_file,
         metavar="pfmfile",
     )
+    p.add_argument(
+        "--nocenter",
+        dest="center",
+        help="Don't mean-center the rows by default",
+        default=True,
+        action="store_false",
+    )
     p.add_argument(
         "-m",
         "--methods",
diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py
index b7d8e43f..f8573160 100755
--- a/gimmemotifs/commands/maelstrom.py
+++ b/gimmemotifs/commands/maelstrom.py
@@ -18,6 +18,7 @@ def maelstrom(args):
     methods = args.methods
     ncpus = args.ncpus
     zscore = args.zscore
+    center = args.center
     gc = args.gc
 
     if not os.path.exists(infile):
@@ -35,4 +36,5 @@ def maelstrom(args):
         ncpus=ncpus,
         zscore=zscore,
         gc=gc,
+        center=center,
     )
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index c4bf21ab..6d24058c 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -22,6 +22,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import scale
+from scipy.stats import pearsonr
 from scipy.cluster import hierarchy
 from scipy.spatial.distance import pdist
 from scipy.cluster.hierarchy import linkage, dendrogram
@@ -220,6 +221,7 @@ def run_maelstrom(
     ncpus=None,
     zscore=True,
     gc=True,
+    center=False,
 ):
     """Run maelstrom on an input table.
 
@@ -264,6 +266,9 @@ def run_maelstrom(
 
     gc : bool, optional
         Use GC% bins to normalize motif scores.
+
+    center : bool, optional
+        Mean-center the input table.
     """
     logger.info("Starting maelstrom")
     if infile.endswith("feather"):
@@ -272,6 +277,20 @@ def run_maelstrom(
     else:
         df = pd.read_table(infile, index_col=0, comment="#")
 
+    # Check if the input is mean-centered
+    if df.shape[1] > 1 and not np.allclose(df.mean(1), 0):
+        if center:
+            logger.info(
+                "Input is not mean-centered, setting the mean of all rows to 0."
+            )
+            logger.info("Use --nocenter to change this behavior")
+            df = df.sub(df.mean(axis=1), axis=0)
+        else:
+            logger.info("Input is not mean-centered, but --nocenter was specified.")
+            logger.info(
+                "Leaving the data as-is, but make sure this is what you really want."
+            )
+
     # Check for duplicates
     if df.index.duplicated(keep=False).any():
         logger.warning("Input file contains duplicate regions!")
@@ -407,16 +426,31 @@ def run_maelstrom(
         except FileNotFoundError:
             logger.warn("Activity file for {} not found!\n".format(t))
 
+    counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
+    scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")
+    
     if len(methods) > 1:
         logger.info("Rank aggregation")
         df_p = df_rank_aggregation(df, dfs, exps)
+        
+        # Add correlation between motif score and signal
+        logger.info("Correlation")
+        cols = df_p.columns
+        for col in cols[::-1]:
+            df_p.insert(0, f"correlation {col}", 0)
+            for motif in df_p.index:
+                df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0]
+        
+        # Add percentage of input sequences with motif
+        df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100)
+        
         df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")
     # df_p = df_p.join(m2f)
 
     # Write motif frequency table
 
     if df.shape[1] == 1:
-        mcount = df.join(pd.read_table(count_table, index_col=0, comment="#"))
+        mcount = df.join(counts)
         m_group = mcount.groupby(df.columns[0])
         freq = m_group.sum() / m_group.count()
         freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t")
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 239eb441..e7f63285 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -16,6 +16,9 @@
 import numpy as np
 import pandas as pd
 from statsmodels.stats.multitest import multipletests
+from pandas.core.indexing import _non_reducing_slice
+from pandas.io.formats.style import Styler
+import seaborn as sns
 
 from gimmemotifs.comparison import MotifComparer
 from gimmemotifs.fasta import Fasta
@@ -28,6 +31,327 @@
 
 logger = logging.getLogger("gimme.report")
 
+FACTOR_TOOLTIP = "<div title='\"Direct\" means that there is direct evidence of binding or that this assignment is based on curated information. \"Predicted\" means that the motif comes from a non-curated ChIP-seq experiment or that the factor was computationally predicted to bind this motif based on its DNA binding domain.'>factors<br/>(<span style='color:black'>direct</span> or <span style='color:#666666'>predicted</span>)</div>"
+
+
+def _wrap_html_str(x):
+    if " " not in x:
+        return x
+
+    min_pos, max_pos = 0, len(x)
+    if ">" in x and "</" in x:
+        m = re.compile(r">[^<>]*<").search(x)
+        min_pos, max_pos = m.start(), m.end()
+
+    positions = [m.start() for m in re.compile(" ").finditer(x)]
+    positions = [p for p in positions if min_pos < p < max_pos]
+
+    pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0]
+    x = x[:pos] + "<br/>" + x[pos + 1 :]
+    return x
+
+
+class ExtraStyler(Styler):
+    loader = jinja2.ChoiceLoader(
+        [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader]
+    )
+    env = jinja2.Environment(loader=loader)
+    template = env.get_template("table.tpl")
+
+    def __init__(self, *args, **kwargs):
+        self._data_todo = []
+        self.circle_styles = None
+        self.palette_styles = None
+        self.col_heading_style = {
+            "name": "col_heading",
+            "props": [("border-bottom", "1px solid #e0e0e0")],
+        }
+        super(ExtraStyler, self).__init__(*args, **kwargs)
+        self.display_data = self.data.copy()
+
+        # self.template =
+
+        self._font = "Nunito Sans"
+
+    @property
+    def font(self):
+        return self._font
+
+    @font.setter
+    def font(self, font_name):
+        self._font = font_name
+
+    def set_font(self, font_name):
+        self.font = font_name
+        return self
+
+    def _current_index(self, subset, axis=0):
+        selected = self.data.loc[subset]
+        if axis == 0 or axis == "columns":
+            return self.data.columns.get_indexer(selected.columns)
+        if axis == 1 or axis == "index":
+            return self.data.index.get_indexer(selected.index)
+
+        raise ValueError(f"unknown axis {axis}")
+
+    def _translate(self):
+        self._compute_data()
+        d = super()._translate()
+        circle_styles = self.circle_styles or []
+        palette_styles = self.palette_styles or []
+        col_heading_style = self.col_heading_style or []
+        d.update(
+            {
+                "font": self.font,
+                "circle_styles": circle_styles,
+                "palette_styles": palette_styles,
+                "col_heading_style": col_heading_style,
+            }
+        )
+        return d
+
+    def _compute_data(self):
+        r = self
+        for func, args, kwargs in self._data_todo:
+            r = func(self)(*args, **kwargs)
+        r.data = r.display_data
+        return r
+
+    def _tooltip(self, tip, subset=None, part=None):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        if part is None:
+            part = "data"
+
+        if part == "data":
+            self.display_data.loc[subset] = (
+                "<div title='"
+                + tip
+                + "'>"
+                + self.display_data.loc[subset].astype(str)
+                + "</div>"
+            )
+        elif part == "columns":
+            idx = self._current_index(subset, axis="columns")
+            rename = dict(
+                zip(
+                    self.display_data.columns[idx],
+                    "<div title='"
+                    + tip
+                    + "'>"
+                    + self.display_data.columns[idx].astype(str)
+                    + "</div>",
+                )
+            )
+            self.display_data.rename(columns=rename, inplace=True)
+        elif part == "index":
+            idx = self._current_index(subset, axis="index")
+            rename = dict(
+                zip(
+                    self.display_data.index[idx],
+                    "<div title='"
+                    + tip
+                    + "'>"
+                    + self.display_data.index[idx].astype(str)
+                    + "</div>",
+                )
+            )
+            self.display_data.rename(index=rename, inplace=True)
+        else:
+            raise ValueError(f"unknown value for part: {part}")
+        return self
+
+    def _wrap_iterable(self, it):
+        return [_wrap_html_str(val) for val in it]
+
+    def _wrap(self, subset=None, axis=0):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        if axis in [0, "columns"]:
+            idx = self._current_index(subset, axis="columns")
+            rename = dict(
+                zip(
+                    self.display_data.columns[idx],
+                    self._wrap_iterable(self.display_data.columns[idx]),
+                )
+            )
+            self.display_data.rename(columns=rename, inplace=True)
+        elif axis in [1, "index"]:
+            idx = self._current_index(subset, axis="index")
+            rename = dict(
+                zip(
+                    self.display_data.index[idx],
+                    self._wrap_iterable(self.display_data.index[idx]),
+                )
+            )
+            self.display_data.rename(index=rename, inplace=True)
+        else:
+            raise ValueError(f"unknown value for axis: {axis}")
+        return self
+
+    def _convert_to_image(self, subset=None, height=30):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        self.display_data.loc[subset] = (
+            f'<div style="height:{height}px;object-fit:contain;"><img src="'
+            + self.data.loc[subset].astype(str)
+            + '" style="height:100%;width:100%;object-fit:contain;"/></div>'
+        )
+        return self
+
+    def _border(self, idx, location="left"):
+        return [f"border-{location}: 2px solid #444;" for val in idx]
+
+    def border(self, subset=None, location="left", part="data"):
+        if part == "data":
+            self.apply(self._border, subset=subset, location=location)
+        else:
+            self.col_heading_style["props"].append(
+                (f"border-{location}", "2px solid #444")
+            )
+        return self
+
+    def _center_align(self, idx):
+        return ["text-align:center;" for val in idx]
+
+    def center_align(self, subset=None, axis=0):
+        self.apply(self._center_align, subset=subset, axis=axis)
+        return self
+
+    def scaled_background_gradient(
+        self, subset=None, cmap="RdBu_r", scale_factor=1, center_zero=True
+    ):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+        absmax = np.max(
+            (
+                abs(self.data.loc[subset].max().max()),
+                abs(self.data.loc[subset].min().min()),
+            )
+        )
+        target = absmax * scale_factor
+        r = self
+        for col in self.data.loc[subset].columns:
+            smin = self.data[col].min()
+            smax = self.data[col].max()
+            diff = smax - smin
+
+            if center_zero:
+                # Make sure center of palette is at 0
+                low = abs((-target - smin) / diff)
+                high = (target - smax) / diff
+            else:
+                high = 1 / scale_factor
+                low = 1 / scale_factor
+
+            r = r.background_gradient(cmap=cmap, low=low, high=high, subset=[col])
+        return r
+
+    def _circle(
+        self,
+        subset=None,
+        show_text=True,
+        color=None,
+        palette=None,
+        vmin=None,
+        vmax=None,
+        scale=False,
+        size=25,
+        min_size=5,
+        morph=False,
+    ):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+        # Make sure we don't select text columns
+        subslice = pd.IndexSlice[
+            self.data.loc[subset].index,
+            self.data.loc[subset].select_dtypes(exclude=["object"]).columns,
+        ]
+
+        self.circle_styles = self.circle_styles or []
+        circle_id = len(self.circle_styles) + 1
+
+        props = [
+            ("height", f"{size}px"),
+            ("width", f"{size}px"),
+            ("border-radius", "50%"),
+            ("color", "#000"),
+            ("line-height", f"{size}px"),
+            ("display", "inline-block"),
+            ("text-align", "center"),
+            ("vertical-align", "middle"),
+        ]
+
+        if color:
+            palette = sns.color_palette([color])
+            # print(palette)
+        elif palette is None:
+            palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10)
+        else:
+            # if isinstance(palette, str):
+            palette = sns.color_palette(palette)
+
+        self.circle_styles.append({"name": f"circle{circle_id}", "props": props})
+        self.palette_styles = self.palette_styles or []
+        for i, color in enumerate(palette.as_hex()):
+            props = [("background-color", color)]
+            if scale:
+                circle_size = min_size + ((size - min_size) / len(palette) * (i + 1))
+                props += [
+                    ("height", f"{circle_size}px"),
+                    ("width", f"{circle_size}px"),
+                    ("line-height", f"{circle_size}px"),
+                    ("text-align", "center"),
+                ]
+            if morph:
+                props += [("border-radius", f"{50 - int(50 / len(palette)) * i}%")]
+            self.palette_styles.append(
+                {"name": f"color{circle_id}_{i}", "props": props}
+            )
+
+        vmax = vmax or self.data.loc[subslice].max().max() * 1.01
+        text = self.display_data.loc[subslice].astype(str) if show_text else ""
+        self.display_data.loc[subslice] = (
+            f"<div class='circle{circle_id} color{circle_id}_"
+            + (self.data.loc[subslice] / (vmax / len(palette))).astype(int).astype(str)
+            + "'>"
+            + text
+            + "</div>"
+        )
+
+        return self
+
+    def add_circle(self, **kwargs):
+        self._data_todo.append(
+            (lambda instance: getattr(instance, "_circle"), (), kwargs)
+        )
+        return self
+
+    def wrap(self, **kwargs):
+        self._data_todo.append(
+            (lambda instance: getattr(instance, "_wrap"), (), kwargs)
+        )
+        return self
+
+    def add_tooltip(self, tip, **kwargs):
+        self._data_todo.append(
+            (lambda instance: getattr(instance, "_tooltip"), (tip,), kwargs)
+        )
+        return self
+
+    def convert_to_image(self, **kwargs):
+        self._data_todo.append(
+            (lambda instance: getattr(instance, "_convert_to_image"), (), kwargs)
+        )
+        return self
+
+    def rename(self, columns=None, index=None):
+        self.display_data = self.display_data.rename(columns=columns, index=index)
+        return self
+
 
 def get_roc_values(motif, fg_file, bg_file, genome):
     """Calculate ROC AUC values for ROC plots."""
@@ -167,8 +491,8 @@ class ReportMotif(object):
                 + os.path.basename(roc_img_file % (motif.id, bg))
                 + ".png"
             }
-            rm.bg[bg][u"roc_img_link"] = {
-                u"href": "images/"
+            rm.bg[bg]["roc_img_link"] = {
+                "href": "images/"
                 + os.path.basename(roc_img_file % (motif.id, bg))
                 + ".png"
             }
@@ -253,93 +577,142 @@ def create_denovo_motif_report(
     )
 
 
+def format_factors(motif, max_length=5):
+    fmt_d = "<span style='color:black'>{}</span>"
+    fmt_i = "<span style='color:#666666'>{}</span>"
+
+    direct = sorted(list(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
+    indirect = sorted(
+        list(
+            set(
+                [
+                    x.upper()
+                    for x in motif.factors[INDIRECT_NAME]
+                    if x.upper() not in direct
+                ]
+            )
+        )
+    )
+
+    if len(direct) > max_length:
+        show_factors = direct[:max_length]
+    else:
+        show_factors = direct[:]
+        for f in indirect:
+            if f not in show_factors:
+                show_factors.append(f)
+            if len(show_factors) >= max_length:
+                break
+    show_factors = sorted(show_factors)
+
+    factor_str = ",".join(
+        [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]
+    )
+
+    if len(direct + indirect) > max_length:
+        factor_str += ", (...)"
+
+    tooltip = ""
+    if len(direct) > 0:
+        tooltip += "direct: " + ",".join(sorted(direct))
+    if len(indirect) > 0:
+        if tooltip != "":
+            tooltip += "&#10;"
+        tooltip += "predicted: " + ",".join(sorted(indirect))
+
+    factor_str = '<div title="' + tooltip + '">' + factor_str + "</div>"
+
+    return factor_str
+
+
+def motif_to_factor_series(series, pfmfile=None, motifs=None):
+    if motifs is None:
+        motifs = read_motifs(pfmfile, as_dict=True)
+
+    if isinstance(series, pd.Index):
+        index = series
+    else:
+        index = series.index
+
+    factors = [format_factors(motifs[motif]) for motif in series]
+    return pd.Series(data=factors, index=index)
+
+
+def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"):
+    if motifs is None:
+        motifs = read_motifs(pfmfile, as_dict=True)
+
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+    if not os.path.exists(os.path.join(outdir, subdir)):
+        os.makedirs(os.path.join(outdir, subdir))
+
+    img_series = []
+    for motif in series:
+        if motif not in motifs:
+            raise ValueError(f"Motif {motif} does not occur in motif database")
+        fname = subdir + "/{}.png".format(re.sub("[()/]", "_", motif))
+        if not os.path.exists(fname):
+            motifs[motif].plot_logo(fname=os.path.join(outdir, fname))
+        img_series.append(fname)
+
+    if isinstance(series, pd.Index):
+        index = series
+    else:
+        index = series.index
+    return pd.Series(data=img_series, index=index)
+
+
 def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
+
+    # Read the maelstrom text report
     df = pd.read_table(infile, index_col=0)
-    df = df[np.any(abs(df) >= threshold, 1)]
 
-    motifs = read_motifs(pfmfile)
+    # Columns with maelstrom rank aggregation value
+    value_cols = df.columns[
+        ~df.columns.str.contains("correlation") & ~df.columns.isin(["% with motif"])
+    ]
+    # Columns with correlation values
+    corr_cols = df.columns[df.columns.str.contains("correlation")]
 
-    df.rename_axis(None, inplace=True)
-    cols = df.columns
+    df = df[np.any(abs(df[value_cols]) >= threshold, 1)]
 
-    motifs = read_motifs(pfmfile)
-    idx = [motif.id for motif in motifs]
-    direct = [
-        ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
-        for motif in motifs
-    ]
-    indirect = [
-        ",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]])))
-        for motif in motifs
-    ]
-    m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)
-
-    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
-    if True:
-        for factor_col in factor_cols:
-            f = m2f[factor_col].str.len() > 30
-            m2f[factor_col] = (
-                '<div title="'
-                + m2f[factor_col]
-                + '">'
-                + m2f[factor_col].str.slice(0, 30)
-            )
-            m2f.loc[f, factor_col] += "(...)"
-            m2f[factor_col] += "</div>"
-        df = df.join(m2f)
+    # Add motif logo's
+    df.insert(
+        0,
+        "logo",
+        motif_to_img_series(df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"),
+    )
+    # Add factors that can bind to the motif
+    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
 
-    df["logo"] = [
-        '<img src="logos/{}.png" height=40/>'.format(re.sub("[()/]", "_", x))
-        for x in list(df.index)
-    ]
+    df["% with motif"] = df["% with motif"].astype(int)
 
-    if not os.path.exists(outdir + "/logos"):
-        os.makedirs(outdir + "/logos")
-    for motif in motifs:
-        if motif.id in df.index:
-            motif.plot_logo(
-                fname=outdir + "/logos/{}.png".format(re.sub("[()/]", "_", motif.id))
-            )
+    rename_columns = {"factors": FACTOR_TOOLTIP}
 
-    template_dir = MotifConfig().get_template_dir()
-    js = open(
-        os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
-    ).read()
-    css = open(
-        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
-        encoding="utf-8",
-    ).read()
-    df = df[factor_cols + ["logo"] + list(cols)]
-
-    df_styled = df.style
-    absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
-    target = absmax * 1.75
-
-    for col in cols:
-        smin = df[col].min()
-        smax = df[col].max()
-        diff = smax - smin
-        low = abs((-target - smin) / diff)
-        high = (target - smax) / diff
-        df_styled = df_styled.background_gradient(
-            cmap="RdBu_r", low=low, high=high, subset=[col]
+    df_styled = (
+        ExtraStyler(df)
+        .set_precision(2)
+        .convert_to_image(subset=["logo"], height=30,)
+        .scaled_background_gradient(
+            subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75
         )
-
-    df_styled = df_styled.set_precision(3)
-    df_styled = df_styled.set_table_attributes("data-sortable")
-    df_styled = df_styled.render()
-    df_styled = df_styled.replace(
-        "data-sortable", 'class="sortable-theme-slick" data-sortable'
+        .scaled_background_gradient(
+            subset=value_cols, center_zero=True, scale_factor=1.75
+        )
+        .border(subset=list(value_cols[:1]) + ["% with motif"], location="left")
+        .border(part="columns", location="bottom")
+        .add_circle(subset=["% with motif"], palette="Purples", vmax=100, size=40)
+        .set_table_attributes('class="sortable-theme-slick" data-sortable')
+        .center_align(subset=list(value_cols) + list(corr_cols) + ["% with motif"])
+        .wrap(subset=["% with motif"] + list(corr_cols))
+        .set_font("Nunito Sans")
+        .rename(columns=rename_columns,)
+        .render()
     )
 
     with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
-        f.write("<head>\n")
-        f.write("<style>{}</style>\n".format(css))
-        f.write("</head>\n")
-        f.write("<body>\n")
         f.write(df_styled)
-        f.write("<script>{}</script>\n".format(js))
-        f.write("</body>\n")
 
 
 def roc_html_report(
@@ -374,24 +747,10 @@ def roc_html_report(
 
     idx = [motif.id for motif in motifs]
     df = df.loc[idx]
-    direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs]
-    indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs]
-    m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)
-
-    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
-    if True:
-        for factor_col in factor_cols:
-            f = m2f[factor_col].str.len() > 30
-            m2f[factor_col] = (
-                '<div title="'
-                + m2f[factor_col]
-                + '">'
-                + m2f[factor_col].str.slice(0, 30)
-            )
-            m2f.loc[f, factor_col] += "(...)"
-            m2f[factor_col] += "</div>"
-        df = df.join(m2f)
-        cols = factor_cols + cols
+
+    # Add factors that can bind to the motif
+    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
+    cols = ["factors"] + cols
 
     df = df[df["corrected P-value"] <= threshold]
 
@@ -410,6 +769,8 @@ def roc_html_report(
     ]
 
     df = df[cols]
+
+    df = df.rename(columns={"factors": FACTOR_TOOLTIP})
     if not os.path.exists(outdir + "/logos"):
         os.makedirs(outdir + "/logos")
     for motif in motifs:
@@ -441,10 +802,12 @@ def roc_html_report(
         f.write("<body>\n")
         if df.shape[0] > 0:
             f.write(
-                df.sort_values("ROC AUC", ascending=False)
+                df.reset_index()
+                .sort_values("ROC AUC", ascending=False)
                 .style.bar(bar_cols)
                 .set_precision(3)
                 .set_table_attributes("data-sortable")
+                .hide_index()
                 .render()
                 .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
             )

From 85c0e8cf06cbde3c02a04bd16685015f1379b74a Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 12:03:09 +0200
Subject: [PATCH 23/85] don't add correlation if input is cluster-based

---
 gimmemotifs/maelstrom.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 6d24058c..0c78eae8 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -428,22 +428,23 @@ def run_maelstrom(
 
     counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
     scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")
-    
+
     if len(methods) > 1:
         logger.info("Rank aggregation")
         df_p = df_rank_aggregation(df, dfs, exps)
-        
-        # Add correlation between motif score and signal
-        logger.info("Correlation")
-        cols = df_p.columns
-        for col in cols[::-1]:
-            df_p.insert(0, f"correlation {col}", 0)
-            for motif in df_p.index:
-                df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0]
-        
+
+        if df.shape[1] > 1:
+            # Add correlation between motif score and signal
+            logger.info("Correlation")
+            cols = df_p.columns
+            for col in cols[::-1]:
+                df_p.insert(0, f"correlation {col}", 0)
+                for motif in df_p.index:
+                    df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0]
+
         # Add percentage of input sequences with motif
         df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100)
-        
+
         df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")
     # df_p = df_p.join(m2f)
 

From 1e4b5a598dc57c452ec3c446b3a646c385effab3 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 12:03:35 +0200
Subject: [PATCH 24/85] black: reformat maelstrom.py

---
 gimmemotifs/maelstrom.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 0c78eae8..97ecc69f 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -440,7 +440,9 @@ def run_maelstrom(
             for col in cols[::-1]:
                 df_p.insert(0, f"correlation {col}", 0)
                 for motif in df_p.index:
-                    df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0]
+                    df_p.loc[motif, f"correlation {col}"] = pearsonr(
+                        df[col], scores[motif]
+                    )[0]
 
         # Add percentage of input sequences with motif
         df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100)
@@ -541,7 +543,7 @@ def plot_heatmap(
         figsize=None,
         max_len=50,
         aspect=1,
-        **kwargs
+        **kwargs,
     ):
         """Plot clustered heatmap of predicted motif activity.
 

From 0f36a0ccd42910ba74b68d11184b7df4a0bd7c4b Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 13:22:12 +0200
Subject: [PATCH 25/85] support old maelstrom output

---
 gimmemotifs/report.py | 128 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 115 insertions(+), 13 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index e7f63285..9b21cddb 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -52,6 +52,10 @@ def _wrap_html_str(x):
 
 
 class ExtraStyler(Styler):
+    """
+    Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS.
+    """
+
     loader = jinja2.ChoiceLoader(
         [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader]
     )
@@ -82,6 +86,27 @@ def font(self, font_name):
         self._font = font_name
 
     def set_font(self, font_name):
+        """
+        Set the font that will be used.
+        
+        Parameters
+        ----------
+        font_name : str
+            Should be a font name available though the Google Font API.
+        
+        Returns
+        -------
+        self : ExtraStyler
+        
+        Notes
+        -----
+        ``font_name`` can contain spaces, eg. "Nunito Sans".
+        
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
+        >>> ExtraStyler(df).font("Roboto)  
+        """
         self.font = font_name
         return self
 
@@ -205,12 +230,57 @@ def _convert_to_image(self, subset=None, height=30):
     def _border(self, idx, location="left"):
         return [f"border-{location}: 2px solid #444;" for val in idx]
 
-    def border(self, subset=None, location="left", part="data"):
+    def border(
+        self,
+        subset=None,
+        location="bottom",
+        part="data",
+        width="2px",
+        style="solid",
+        color="#444",
+    ):
+        """
+        Add a border to data cells, columns or index.
+
+        Parameters
+        ----------
+        subset : IndexSlice, optional
+            An argument to ``DataFrame.loc`` that restricts which elements
+            ``border`` is applied to. If ``part`` is "columns" or "index"
+            subset should be present in either the columns or the index.
+        
+        location : str, optional
+            Location of the border, default is "bottom". Can be "top", "bottom",
+            "right" or "left".
+
+        part : str, optional
+            If ``part`` is "data", the border will be applied to the data cells.
+            Set part to "index" or to "column" to add a border to the index or 
+            header, respectively.
+
+        width : str, int or float, optional
+            Valid CSS value for border width.
+
+        style : str,  optional
+            Valid CSS value for border style.
+
+        color : str,  optional
+            Valid CSS value for border color.
+
+        Returns
+        -------
+        self : ExtraStyler
+        
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
+        >>> ExtraStyler(df).border(part="columns)
+        """
         if part == "data":
             self.apply(self._border, subset=subset, location=location)
         else:
             self.col_heading_style["props"].append(
-                (f"border-{location}", "2px solid #444")
+                (f"border-{location}", f"{width} {style} {color}")
             )
         return self
 
@@ -218,6 +288,24 @@ def _center_align(self, idx):
         return ["text-align:center;" for val in idx]
 
     def center_align(self, subset=None, axis=0):
+        """
+        Center align text.
+
+        Parameters
+        ----------
+        subset : IndexSlice, optional
+            An argument to ``DataFrame.loc`` that restricts which elements
+            ``center_align`` is applied to. 
+
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Apply to each column (``axis=0`` or ``'index'``), to each row
+            (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
+            with ``axis=None``.
+        
+        Returns
+        -------
+        self : ExtraStyler
+        """
         self.apply(self._center_align, subset=subset, axis=axis)
         return self
 
@@ -686,31 +774,45 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
     # Add factors that can bind to the motif
     df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
 
-    df["% with motif"] = df["% with motif"].astype(int)
-
     rename_columns = {"factors": FACTOR_TOOLTIP}
 
     df_styled = (
         ExtraStyler(df)
         .set_precision(2)
         .convert_to_image(subset=["logo"], height=30,)
-        .scaled_background_gradient(
-            subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75
-        )
         .scaled_background_gradient(
             subset=value_cols, center_zero=True, scale_factor=1.75
         )
-        .border(subset=list(value_cols[:1]) + ["% with motif"], location="left")
+        .border(subset=list(value_cols[:1]), location="left")
         .border(part="columns", location="bottom")
-        .add_circle(subset=["% with motif"], palette="Purples", vmax=100, size=40)
         .set_table_attributes('class="sortable-theme-slick" data-sortable')
-        .center_align(subset=list(value_cols) + list(corr_cols) + ["% with motif"])
-        .wrap(subset=["% with motif"] + list(corr_cols))
+        .center_align(subset=list(value_cols))
         .set_font("Nunito Sans")
-        .rename(columns=rename_columns,)
-        .render()
+        .rename(columns=rename_columns)
     )
 
+    if len(corr_cols) > 0:
+        df_styled = (
+            df_styled.wrap(subset=list(corr_cols))
+            .center_align(subset=list(corr_cols))
+            .scaled_background_gradient(
+                subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75
+            )
+        )
+
+    if "% with motif" in df.columns:
+        df["% with motif"] = df["% with motif"].astype(int)
+        df_styled = (
+            df_styled.add_circle(
+                subset=["% with motif"], palette="Purples", vmax=100, size=40
+            )
+            .wrap(subset=["% with motif"])
+            .center_align(subset=["% with motif"])
+            .border(subset=["% with motif"], location="left")
+        )
+
+    df_styled = df_styled.render()
+
     with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
         f.write(df_styled)
 

From f8b7bc5c5f8054c5f1b39fafd305da9a89c58a75 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 14:59:53 +0200
Subject: [PATCH 26/85] Fix B009 warning

---
 gimmemotifs/report.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 9b21cddb..e429f643 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -414,25 +414,25 @@ def _circle(
 
     def add_circle(self, **kwargs):
         self._data_todo.append(
-            (lambda instance: getattr(instance, "_circle"), (), kwargs)
+            (lambda instance: instance._circle, (), kwargs)
         )
         return self
 
     def wrap(self, **kwargs):
         self._data_todo.append(
-            (lambda instance: getattr(instance, "_wrap"), (), kwargs)
+            (lambda instance: instance._wrap, (), kwargs)
         )
         return self
 
     def add_tooltip(self, tip, **kwargs):
         self._data_todo.append(
-            (lambda instance: getattr(instance, "_tooltip"), (tip,), kwargs)
+            (lambda instance: instance._tooltip, (tip,), kwargs)
         )
         return self
 
     def convert_to_image(self, **kwargs):
         self._data_todo.append(
-            (lambda instance: getattr(instance, "_convert_to_image"), (), kwargs)
+            (lambda instance: instance._convert_to_image, (), kwargs)
         )
         return self
 

From e72e411ac7290adaa30e33caf32e4465c14d0a37 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 7 Jul 2020 15:01:35 +0200
Subject: [PATCH 27/85] fix test: expect 5 columns in maelstrom output

---
 test/test_maelstrom.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py
index 269e5b92..4edda61b 100644
--- a/test/test_maelstrom.py
+++ b/test/test_maelstrom.py
@@ -32,7 +32,7 @@ def test1_maelstrom(self):
         )
         df = pd.read_table(self.outfile, index_col=0, comment="#")
         print(df.shape)
-        self.assertEquals((623, 4), df.shape)
+        self.assertEquals((623, 5), df.shape)
 
         for fname in glob(os.path.join(self.outdir, "activity*")):
             os.unlink(fname)

From d8eae732cb4eacb3dec6669386f79e23c5cb095c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 8 Jul 2020 10:29:41 +0200
Subject: [PATCH 28/85] emojify: add emoji styling methods to ExtraStyler

---
 gimmemotifs/report.py | 199 +++++++++++++++++++++++++++++++-----------
 1 file changed, 150 insertions(+), 49 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index e429f643..8fdc27d8 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -20,6 +20,11 @@
 from pandas.io.formats.style import Styler
 import seaborn as sns
 
+try:
+    import emoji
+except ImportError:
+    pass
+
 from gimmemotifs.comparison import MotifComparer
 from gimmemotifs.fasta import Fasta
 from gimmemotifs.motif import read_motifs
@@ -88,36 +93,37 @@ def font(self, font_name):
     def set_font(self, font_name):
         """
         Set the font that will be used.
-        
+
         Parameters
         ----------
         font_name : str
             Should be a font name available though the Google Font API.
-        
+
         Returns
         -------
         self : ExtraStyler
-        
+
         Notes
         -----
         ``font_name`` can contain spaces, eg. "Nunito Sans".
-        
+
         Examples
         --------
         >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
-        >>> ExtraStyler(df).font("Roboto)  
+        >>> ExtraStyler(df).font("Roboto)
         """
         self.font = font_name
         return self
 
-    def _current_index(self, subset, axis=0):
+    def _current_index(self, subset):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
         selected = self.data.loc[subset]
-        if axis == 0 or axis == "columns":
-            return self.data.columns.get_indexer(selected.columns)
-        if axis == 1 or axis == "index":
-            return self.data.index.get_indexer(selected.index)
-
-        raise ValueError(f"unknown axis {axis}")
+        idx_slice = pd.IndexSlice[
+            self.data.index.get_indexer(selected.index),
+            self.data.columns.get_indexer(selected.columns),
+        ]
+        return idx_slice
 
     def _translate(self):
         self._compute_data()
@@ -158,7 +164,7 @@ def _tooltip(self, tip, subset=None, part=None):
                 + "</div>"
             )
         elif part == "columns":
-            idx = self._current_index(subset, axis="columns")
+            idx = self._current_index(subset)[1]
             rename = dict(
                 zip(
                     self.display_data.columns[idx],
@@ -171,7 +177,7 @@ def _tooltip(self, tip, subset=None, part=None):
             )
             self.display_data.rename(columns=rename, inplace=True)
         elif part == "index":
-            idx = self._current_index(subset, axis="index")
+            idx = self._current_index(subset)[0]
             rename = dict(
                 zip(
                     self.display_data.index[idx],
@@ -195,7 +201,7 @@ def _wrap(self, subset=None, axis=0):
         subset = _non_reducing_slice(subset)
 
         if axis in [0, "columns"]:
-            idx = self._current_index(subset, axis="columns")
+            idx = self._current_index(subset)[1]
             rename = dict(
                 zip(
                     self.display_data.columns[idx],
@@ -204,7 +210,7 @@ def _wrap(self, subset=None, axis=0):
             )
             self.display_data.rename(columns=rename, inplace=True)
         elif axis in [1, "index"]:
-            idx = self._current_index(subset, axis="index")
+            idx = self._current_index(subset)[0]
             rename = dict(
                 zip(
                     self.display_data.index[idx],
@@ -248,14 +254,14 @@ def border(
             An argument to ``DataFrame.loc`` that restricts which elements
             ``border`` is applied to. If ``part`` is "columns" or "index"
             subset should be present in either the columns or the index.
-        
+
         location : str, optional
             Location of the border, default is "bottom". Can be "top", "bottom",
             "right" or "left".
 
         part : str, optional
             If ``part`` is "data", the border will be applied to the data cells.
-            Set part to "index" or to "column" to add a border to the index or 
+            Set part to "index" or to "column" to add a border to the index or
             header, respectively.
 
         width : str, int or float, optional
@@ -270,7 +276,7 @@ def border(
         Returns
         -------
         self : ExtraStyler
-        
+
         Examples
         --------
         >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b'])
@@ -284,29 +290,32 @@ def border(
             )
         return self
 
-    def _center_align(self, idx):
-        return ["text-align:center;" for val in idx]
+    def _align(self, idx, location="center"):
+        return [f"text-align:{location};" for val in idx]
 
-    def center_align(self, subset=None, axis=0):
+    def align(self, subset=None, location="center", axis=0):
         """
-        Center align text.
+        Align text.
 
         Parameters
         ----------
         subset : IndexSlice, optional
             An argument to ``DataFrame.loc`` that restricts which elements
-            ``center_align`` is applied to. 
+            ``center_align`` is applied to.
+
+        location : str, optional
+            "center", "left" or "right"
 
         axis : {0 or 'index', 1 or 'columns', None}, default 0
             Apply to each column (``axis=0`` or ``'index'``), to each row
             (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
             with ``axis=None``.
-        
+
         Returns
         -------
         self : ExtraStyler
         """
-        self.apply(self._center_align, subset=subset, axis=axis)
+        self.apply(self._align, subset=subset, location=location, axis=axis)
         return self
 
     def scaled_background_gradient(
@@ -352,12 +361,13 @@ def _circle(
         morph=False,
     ):
         subset = pd.IndexSlice[:, :] if subset is None else subset
-        subset = _non_reducing_slice(subset)
+        subslice = _non_reducing_slice(subset)
         # Make sure we don't select text columns
-        subslice = pd.IndexSlice[
-            self.data.loc[subset].index,
-            self.data.loc[subset].select_dtypes(exclude=["object"]).columns,
-        ]
+        if scale or morph:
+            subslice = pd.IndexSlice[
+                self.data.loc[subset].index,
+                self.data.loc[subset].select_dtypes(exclude=["object"]).columns,
+            ]
 
         self.circle_styles = self.circle_styles or []
         circle_id = len(self.circle_styles) + 1
@@ -400,34 +410,36 @@ def _circle(
                 {"name": f"color{circle_id}_{i}", "props": props}
             )
 
-        vmax = vmax or self.data.loc[subslice].max().max() * 1.01
-        text = self.display_data.loc[subslice].astype(str) if show_text else ""
-        self.display_data.loc[subslice] = (
-            f"<div class='circle{circle_id} color{circle_id}_"
-            + (self.data.loc[subslice] / (vmax / len(palette))).astype(int).astype(str)
-            + "'>"
-            + text
-            + "</div>"
-        )
+        if scale or morph:
+            vmax = vmax or self.data.loc[subslice].max().max() * 1.01
+            text = self.display_data.loc[subslice].astype(str) if show_text else ""
+            self.display_data.loc[subslice] = (
+                f"<div class='circle{circle_id} color{circle_id}_"
+                + (self.data.loc[subslice] / (vmax / len(palette)))
+                .astype(int)
+                .astype(str)
+                + "'>"
+                + text
+                + "</div>"
+            )
+        else:
+            text = self.display_data.loc[subslice].astype(str) if show_text else ""
+            self.display_data.loc[subslice] = (
+                f"<div class='circle{circle_id} color{circle_id}_0'>" + text + "</div>"
+            )
 
         return self
 
     def add_circle(self, **kwargs):
-        self._data_todo.append(
-            (lambda instance: instance._circle, (), kwargs)
-        )
+        self._data_todo.append((lambda instance: instance._circle, (), kwargs))
         return self
 
     def wrap(self, **kwargs):
-        self._data_todo.append(
-            (lambda instance: instance._wrap, (), kwargs)
-        )
+        self._data_todo.append((lambda instance: instance._wrap, (), kwargs))
         return self
 
     def add_tooltip(self, tip, **kwargs):
-        self._data_todo.append(
-            (lambda instance: instance._tooltip, (tip,), kwargs)
-        )
+        self._data_todo.append((lambda instance: instance._tooltip, (tip,), kwargs))
         return self
 
     def convert_to_image(self, **kwargs):
@@ -440,6 +452,95 @@ def rename(self, columns=None, index=None):
         self.display_data = self.display_data.rename(columns=columns, index=index)
         return self
 
+    def _emoji_score(self, series, emoji_str=None, bins=None):
+        if emoji_str is None:
+            emoji_str = ":star:"
+        if bins is None:
+            bins = 3
+
+        if isinstance(bins, int):
+            labels = range(1, bins + 1)
+        else:
+            labels = range(1, len(bins))
+
+        return [
+            emoji.emojize(emoji_str * val, use_aliases=True)
+            for val in pd.cut(series, bins=bins, labels=labels)
+        ]
+
+    def _emoji_scale(self, series, emojis=None, bins=None):
+        emoji_dict = {
+            "thumbs": [":thumbsdown:", ":thumbsup:"],
+            "check": [":cross_mark:", ":white_check_mark:"],
+            "smiley": [
+                ":crying_face:",
+                ":slightly_frowning_face:",
+                ":neutral_face:",
+                ":slightly_smiling_face:",
+                ":grin:",
+            ],
+            "black_square": [
+                ":black_small_square:",
+                ":black_medium_small_square:",
+                ":black_medium_square:",
+                ":black_large_square:",
+            ],
+            "white_square": [
+                ":white_small_square:",
+                ":white_medium_small_square:",
+                ":white_medium_square:",
+                ":white_large_square:",
+            ],
+        }
+
+        if emojis is None:
+            emojis = "smiley"
+
+        if emojis in emoji_dict:
+            labels = emoji_dict[emojis]
+        if bins is None:
+            bins = len(labels)
+
+        return [
+            emoji.emojize(val, use_aliases=True)
+            for val in pd.cut(series, bins=bins, labels=labels)
+        ]
+
+    def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        idx = self._current_index(subset=subset)
+
+        result = self.display_data.iloc[idx].apply(
+            self._emoji_scale, axis=axis, result_type="expand", args=(emojis, bins)
+        )
+        self.display_data.iloc[idx] = result.values
+
+        return self.align(subset=subset, location="center", axis=axis)
+
+    def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        idx = self._current_index(subset=subset)
+        result = self.display_data.iloc[idx].apply(
+            self._emoji_score, axis=axis, result_type="expand", args=(emoji_str, bins)
+        )
+        self.display_data.iloc[idx] = result.values
+
+        return self.align(subset=subset, location="left", axis=axis)
+
+    def emojify(self, subset=None):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        idx = self._current_index(subset=subset)
+        result = self.display_data.iloc[idx].applymap(emoji.emojize)
+        self.display_data.iloc[idx] = result.values
+
+        return self
+
 
 def get_roc_values(motif, fg_file, bg_file, genome):
     """Calculate ROC AUC values for ROC plots."""

From 2eac4c4fcc8e4f4924e4c1eda672ca798c91c66d Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Tue, 14 Jul 2020 15:51:07 +0200
Subject: [PATCH 29/85] fix issue with chrom names as int

---
 scripts/combine_peaks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/combine_peaks b/scripts/combine_peaks
index 155d6e83..07d41040 100644
--- a/scripts/combine_peaks
+++ b/scripts/combine_peaks
@@ -116,7 +116,7 @@ def combine_peaks(peaks, genome, window, scale_value):
 
     # store summit location + associated value in col4
     df_all["col4"] = (
-        df_all["chrom"]
+        df_all["chrom"].astype(str)
         + ";"
         + df_all["start"].astype(str)
         + ";"

From bea0a24c003832a4eff8ec153dfc8c6e1565483e Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 00:05:52 +0200
Subject: [PATCH 30/85] correctly read motif file

---
 gimmemotifs/motif.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index cb8aba8b..2be35178 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -1393,13 +1393,7 @@ def parse_motifs(motifs):
         List of Motif instances.
     """
     if isinstance(motifs, str):
-        with open(motifs) as f:
-            if motifs.endswith("pwm") or motifs.endswith("pfm"):
-                motifs = read_motifs(f, fmt="pwm")
-            elif motifs.endswith("transfac"):
-                motifs = read_motifs(f, fmt="transfac")
-            else:
-                motifs = read_motifs(f)
+        return read_motifs(motifs)
     elif isinstance(motifs, Motif):
         motifs = [motifs]
     else:

From e87801aef7516a545e2fe04f8f72e887add3ebf8 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 08:43:04 +0200
Subject: [PATCH 31/85] update report

---
 gimmemotifs/commands/motifs.py |  12 ++-
 gimmemotifs/maelstrom.py       |   1 +
 gimmemotifs/report.py          | 159 +++++++++++++++++----------------
 3 files changed, 90 insertions(+), 82 deletions(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index a1681c8d..e72e7c5d 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -13,6 +13,7 @@
 from tempfile import NamedTemporaryFile
 
 import numpy as np
+import pandas as pd
 
 from gimmemotifs.background import create_background_file
 from gimmemotifs.comparison import MotifComparer, select_nonredundant_motifs
@@ -142,7 +143,7 @@ def motifs(args):
 
     # Print the metrics
     f_out.write(
-        "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
+        "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
     )
 
     logger.info("creating motif scan tables")
@@ -174,6 +175,9 @@ def motifs(args):
             gcnorm=True,
         )
 
+    n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
+    n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]
+
     logger.info("calculating stats")
     for motif_stats in calc_stats_iterator(
         motifs=pfmfile,
@@ -188,10 +192,12 @@ def motifs(args):
                 if motif_stats[str(motif)]["phyper_at_fpr"] > 0:
                     log_pvalue = -np.log10(motif_stats[str(motif)]["phyper_at_fpr"])
                 f_out.write(
-                    "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
+                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                         motif.id,
                         motif_stats[str(motif)]["matches_at_fpr"][0],
+                        motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100,
                         motif_stats[str(motif)]["matches_at_fpr"][1],
+                        motif_stats[str(motif)]["matches_at_fpr"][1] / n_background * 100,
                         motif_stats[str(motif)]["phyper_at_fpr"],
                         log_pvalue,
                         motif_stats[str(motif)]["roc_auc"],
@@ -203,7 +209,7 @@ def motifs(args):
     f_out.close()
 
     # Select a set of "non-redundant" motifs.
-    # Using Recursive Feature Elemination, a set of motifs is selected that
+    # Using Recursive Feature Elimination, a set of motifs is selected that
     # best explains the peaks in comparison to the background sequences.
     nr_motifs = select_nonredundant_motifs(
         args.outdir + "/gimme.roc.report.txt",
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 97ecc69f..4719fabb 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -187,6 +187,7 @@ def _rank_agg_column(exps, dfs, e):
                 tmp_dfs[i][k] = (
                     v.sample(frac=1).sort_values(e, ascending=sort_order).index.values
                 )
+            
     return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1]))
 
 
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 8fdc27d8..5bde42e0 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -19,6 +19,8 @@
 from pandas.core.indexing import _non_reducing_slice
 from pandas.io.formats.style import Styler
 import seaborn as sns
+from matplotlib import colors
+import matplotlib.pyplot as plt
 
 try:
     import emoji
@@ -318,41 +320,22 @@ def align(self, subset=None, location="center", axis=0):
         self.apply(self._align, subset=subset, location=location, axis=axis)
         return self
 
-    def scaled_background_gradient(
-        self, subset=None, cmap="RdBu_r", scale_factor=1, center_zero=True
-    ):
-        subset = pd.IndexSlice[:, :] if subset is None else subset
-        subset = _non_reducing_slice(subset)
-        absmax = np.max(
-            (
-                abs(self.data.loc[subset].max().max()),
-                abs(self.data.loc[subset].min().min()),
-            )
-        )
-        target = absmax * scale_factor
-        r = self
-        for col in self.data.loc[subset].columns:
-            smin = self.data[col].min()
-            smax = self.data[col].max()
-            diff = smax - smin
-
-            if center_zero:
-                # Make sure center of palette is at 0
-                low = abs((-target - smin) / diff)
-                high = (target - smax) / diff
-            else:
-                high = 1 / scale_factor
-                low = 1 / scale_factor
+    def _background_gradient(self, s, m, M, cmap='PuBu', low=0, high=0):
+        rng = M - m
+        norm = colors.Normalize(m - (rng * low),
+                            M + (rng * high))
+        normed = norm(s.values)
+        c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
+        return ['background-color: %s' % color for color in c]
+
 
-            r = r.background_gradient(cmap=cmap, low=low, high=high, subset=[col])
-        return r
 
     def _circle(
         self,
         subset=None,
         show_text=True,
         color=None,
-        palette=None,
+        cmap=None,
         vmin=None,
         vmax=None,
         scale=False,
@@ -362,13 +345,24 @@ def _circle(
     ):
         subset = pd.IndexSlice[:, :] if subset is None else subset
         subslice = _non_reducing_slice(subset)
+        
+        if color:
+            palette = sns.color_palette([color])
+            # print(palette)
+        elif cmap is None:
+            palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10)
+        else:
+            # if isinstance(palette, str):
+            palette = sns.color_palette(cmap)
+
         # Make sure we don't select text columns
-        if scale or morph:
+        if len(palette) > 1:
             subslice = pd.IndexSlice[
-                self.data.loc[subset].index,
-                self.data.loc[subset].select_dtypes(exclude=["object"]).columns,
+                self.data.loc[subslice].index,
+                self.data.loc[subslice].select_dtypes(exclude=["object"]).columns,
             ]
 
+
         self.circle_styles = self.circle_styles or []
         circle_id = len(self.circle_styles) + 1
 
@@ -383,14 +377,6 @@ def _circle(
             ("vertical-align", "middle"),
         ]
 
-        if color:
-            palette = sns.color_palette([color])
-            # print(palette)
-        elif palette is None:
-            palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10)
-        else:
-            # if isinstance(palette, str):
-            palette = sns.color_palette(palette)
 
         self.circle_styles.append({"name": f"circle{circle_id}", "props": props})
         self.palette_styles = self.palette_styles or []
@@ -410,8 +396,8 @@ def _circle(
                 {"name": f"color{circle_id}_{i}", "props": props}
             )
 
-        if scale or morph:
-            vmax = vmax or self.data.loc[subslice].max().max() * 1.01
+        if len(palette) > 1:
+            vmax = self.data.loc[subslice].max().max() * 1.01 if vmax is None else vmax * 1.01
             text = self.display_data.loc[subslice].astype(str) if show_text else ""
             self.display_data.loc[subslice] = (
                 f"<div class='circle{circle_id} color{circle_id}_"
@@ -541,6 +527,33 @@ def emojify(self, subset=None):
 
         return self
 
+    def scaled_background_gradient(
+        self, subset=None, cmap="RdBu_r", low=0, high=0, center_zero=False, 
+        vmin=None, vmax=None
+    ):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        vmax = self.data.loc[subset].max().max() if vmax is None else vmax
+        vmin = self.data.loc[subset].min().min() if vmin is None else vmin
+
+        if center_zero:
+            vmax = max(abs(vmax), abs(vmin))
+            vmin = -vmax
+
+        r = self
+        for col in self.data.loc[subset].columns:
+            r = r.apply(self._background_gradient,
+                subset=pd.IndexSlice[subset[0], col],
+                cmap=cmap,
+                m=vmin,
+                M=vmax,
+                low=low,
+                high=high)
+
+        return r
+
+
 
 def get_roc_values(motif, fg_file, bg_file, genome):
     """Calculate ROC AUC values for ROC plots."""
@@ -882,12 +895,12 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
         .set_precision(2)
         .convert_to_image(subset=["logo"], height=30,)
         .scaled_background_gradient(
-            subset=value_cols, center_zero=True, scale_factor=1.75
+            subset=value_cols, center_zero=True, min=1/1.75, max=1/1.75
         )
         .border(subset=list(value_cols[:1]), location="left")
         .border(part="columns", location="bottom")
         .set_table_attributes('class="sortable-theme-slick" data-sortable')
-        .center_align(subset=list(value_cols))
+        .align(subset=list(value_cols), location="center")
         .set_font("Nunito Sans")
         .rename(columns=rename_columns)
     )
@@ -895,9 +908,9 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
     if len(corr_cols) > 0:
         df_styled = (
             df_styled.wrap(subset=list(corr_cols))
-            .center_align(subset=list(corr_cols))
+            .align(subset=list(corr_cols), location="center")
             .scaled_background_gradient(
-                subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75
+                subset=corr_cols, cmap="PuOr_r", center_zero=True, min=1/1.75, max=1/1.75
             )
         )
 
@@ -905,10 +918,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
         df["% with motif"] = df["% with motif"].astype(int)
         df_styled = (
             df_styled.add_circle(
-                subset=["% with motif"], palette="Purples", vmax=100, size=40
+                subset=["% with motif"], cmap="Purples", vmax=100, size=40
             )
             .wrap(subset=["% with motif"])
-            .center_align(subset=["% with motif"])
+            .align(subset=["% with motif"], location="center")
             .border(subset=["% with motif"], location="left")
         )
 
@@ -944,17 +957,12 @@ def roc_html_report(
         "Recall at 10% FDR",
     ]
 
-    motifs = read_motifs(pfmfile)
+    motifs = read_motifs(pfmfile, as_dict=True)
     if use_motifs is not None:
-        motifs = [m for m in motifs if m.id in use_motifs]
-
-    idx = [motif.id for motif in motifs]
+        motifs = {k:v for k,v in motifs.items() if k in use_motifs}
+    idx = list(motifs.keys())
     df = df.loc[idx]
 
-    # Add factors that can bind to the motif
-    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
-    cols = ["factors"] + cols
-
     df = df[df["corrected P-value"] <= threshold]
 
     if link_matches:
@@ -966,23 +974,17 @@ def roc_html_report(
             + "</a>"
         )
 
-    df["logo"] = [
-        '<img src="logos/{}.png" height=40/>'.format(re.sub(r"[^-_\w]+", "_", x))
-        for x in list(df.index)
-    ]
-
-    df = df[cols]
-
-    df = df.rename(columns={"factors": FACTOR_TOOLTIP})
-    if not os.path.exists(outdir + "/logos"):
-        os.makedirs(outdir + "/logos")
-    for motif in motifs:
-        if motif.id in df.index:
-            motif.plot_logo(
-                fname=outdir
-                + "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id))
-            )
+    # Add motif logo's
+    df.insert(
+        0,
+        "logo",
+        motif_to_img_series(df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"),
+    )
+    # Add factors that can bind to the motif
+    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs))
 
+    rename_columns = {"factors": FACTOR_TOOLTIP}
+    
     bar_cols = [
         "log10 P-value",
         "ROC AUC",
@@ -998,6 +1000,8 @@ def roc_html_report(
         os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
         encoding="utf-8",
     ).read()
+    
+    df = df.reset_index().sort_values("ROC AUC", ascending=False)
     with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
         f.write("<head>\n")
         f.write("<style>{}</style>\n".format(css))
@@ -1005,16 +1009,13 @@ def roc_html_report(
         f.write("<body>\n")
         if df.shape[0] > 0:
             f.write(
-                df.reset_index()
-                .sort_values("ROC AUC", ascending=False)
-                .style.bar(bar_cols)
-                .set_precision(3)
+                ExtraStyler(df)
+                .bar(bar_cols)
+                .set_precision(2)
                 .set_table_attributes("data-sortable")
                 .hide_index()
                 .render()
-                .replace("data-sortable", 'class="sortable-theme-slick" data-sortable')
             )
         else:
-            f.write("No enriched motifs found.")
-        f.write("<script>{}</script>\n".format(js))
-        f.write("</body>\n")
+            f.write("<body>No enriched motifs found.</body>")
+        

From f6e1374c8346d6b86bc2c07fbddce41f30ef049c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 12:30:53 +0200
Subject: [PATCH 32/85] fix GC%-normalized z-score cutoff

---
 gimmemotifs/scanner.py | 380 ++++++++++++++++++-----------------------
 1 file changed, 166 insertions(+), 214 deletions(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index 337c912d..cbb05961 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -1,6 +1,7 @@
 import os
 import re
 import sys
+from collections import Counter
 from functools import partial
 from tempfile import mkdtemp, NamedTemporaryFile
 import logging
@@ -18,6 +19,8 @@
 from diskcache import Cache
 import numpy as np
 from scipy.stats import scoreatpercentile
+from sklearn.preprocessing import scale
+import pandas as pd
 
 from gimmemotifs import __version__
 from gimmemotifs.background import RandomGenomicFasta, gc_bin_bedfile
@@ -369,7 +372,9 @@ def parse_threshold_values(motif_file, cutoff):
     return threshold
 
 
-def scan_sequence(seq, motifs, nreport, scan_rc):
+def scan_sequence(
+    seq, seq_gc_bin, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False
+):
 
     ret = []
     # scan for motifs
@@ -377,36 +382,38 @@ def scan_sequence(seq, motifs, nreport, scan_rc):
         if cutoff is None:
             ret.append([])
         else:
-            result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
-            if cutoff <= motif.pwm_min_score() and len(result) == 0:
-                result = [[motif.pwm_min_score(), 0, 1]] * nreport
+            if zscore:
+                m_mean, m_std = motifs_meanstd[seq_gc_bin][motif.id]
+                result = pwmscan(
+                    seq, motif.logodds, motif.pwm_min_score(), nreport, scan_rc
+                )
+                result = [[(row[0] - m_mean) / m_std, row[1], row[2]] for row in result]
+                result = [row for row in result if row[0] >= cutoff]
+            else:
+                result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
+                if cutoff <= motif.pwm_min_score() and len(result) == 0:
+                    result = [[motif.pwm_min_score(), 0, 1]] * nreport
+
             ret.append(result)
 
     # return results
     return ret
 
 
-def scan_region(region, genome, motifs, nreport, scan_rc):
-
-    # retrieve sequence
-    chrom, start, end = re.split(r"[:-]", region)
-    seq = genome[chrom][int(start) : int(end)].seq.upper()
-
-    return scan_sequence(seq, motifs, nreport, scan_rc)
-
-
-def scan_seq_mult(seqs, motifs, nreport, scan_rc):
-    ret = []
-    for seq in seqs:
-        result = scan_sequence(seq.upper(), motifs, nreport, scan_rc)
-        ret.append(result)
-    return ret
-
-
-def scan_region_mult(regions, genome, motifs, nreport, scan_rc):
+def scan_seq_mult(
+    seqs, seq_gc_bins, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False
+):
     ret = []
-    for region in regions:
-        result = scan_region(region, genome, motifs, nreport, scan_rc)
+    for seq, seq_gc_bin in zip(seqs, seq_gc_bins):
+        result = scan_sequence(
+            seq.upper(),
+            seq_gc_bin,
+            motifs,
+            nreport,
+            scan_rc,
+            motifs_meanstd=motifs_meanstd,
+            zscore=zscore,
+        )
         ret.append(result)
     return ret
 
@@ -528,7 +535,7 @@ class Scanner(object):
 
     def __init__(self, ncpus=None):
         self.config = MotifConfig()
-        self.threshold = None
+        self._threshold = None
         self.genome = None
         self.background = None
         self.meanstd = {}
@@ -603,19 +610,15 @@ def _meanstd_from_seqs(self, motifs, seqs):
 
     def _threshold_from_seqs(self, motifs, seqs, fpr):
         scan_motifs = [(m, m.pwm_min_score()) for m in motifs]
-
         table = []
-        for x in self._scan_sequences_with_motif(scan_motifs, seqs, 1, True):
-            table.append([row[0][0] for row in x])
+        seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]
+        for gc_bin, result in zip(
+            seq_gc_bins, self._scan_sequences_with_motif(scan_motifs, seqs, 1, True)
+        ):
+            table.append([gc_bin] + [row[0][0] for row in result])
 
-        for (motif, _), scores in zip(scan_motifs, np.array(table).transpose()):
-            if len(scores) > 0:
-                opt_score = scoreatpercentile(scores, 100 - (100 * fpr))
-                yield motif, opt_score  # cutoff
-            else:
-                raise ValueError(
-                    "Could not determine threshold for motif {}".format(motif)
-                )
+        df = pd.DataFrame(table, columns=["gc_bin"] + [m.id for m in motifs])
+        return df
 
     def set_meanstd(self, gc=False):
         if not self.background:
@@ -677,7 +680,7 @@ def set_meanstd(self, gc=False):
         lock.release()
 
     def set_background(
-        self, fname=None, genome=None, size=200, nseq=10000, gc=False, gc_bins=None
+        self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None
     ):
         """Set the background to use for FPR and z-score calculations.
 
@@ -704,6 +707,16 @@ def set_background(
 
         size = int(size)
 
+        if gc_bins is None:
+            if gc:
+                gc_bins = [(0.0, 0.2), (0.8, 1)]
+                for b in np.arange(0.2, 0.799, 0.05):
+                    gc_bins.append((b, b + 0.05))
+            else:
+                gc_bins = [(0, 1)]
+        if nseq is None:
+            nseq = max(10000, len(gc_bins) * 1000)
+
         if genome and fname:
             raise ValueError("Need either genome or filename for background.")
 
@@ -735,12 +748,6 @@ def set_background(
 
             if not fa:
                 if gc:
-
-                    if gc_bins is None:
-                        gc_bins = [(0.0, 0.2), (0.8, 1)]
-                        for b in np.arange(0.2, 0.799, 0.05):
-                            gc_bins.append((b, b + 0.05))
-
                     with NamedTemporaryFile() as tmp:
                         logger.info("using {} sequences".format(nseq))
                         gc_bin_bedfile(
@@ -756,6 +763,12 @@ def set_background(
         if gc_bins:
             self.gc_bins = gc_bins
 
+    @property
+    def threshold(self):
+        if self._threshold is None:
+            self.set_threshold()
+        return self._threshold
+
     def set_threshold(self, fpr=None, threshold=None, gc=False):
         """Set motif scanning threshold based on background sequences.
 
@@ -774,6 +787,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
         if threshold and fpr:
             raise ValueError("Need either fpr or threshold.")
 
+        if threshold is None and fpr is None:
+            if self.genome:
+                fpr = 0.01
+                logger.info(f"Using default FPR of {fpr}")
+            else:
+                threshold = 0.95
+                logger.info(
+                    f"Genome not specified, using default threshold of {threshold}."
+                )
+                logger.info("This is likely not ideal.")
+
         if fpr:
             fpr = float(fpr)
             if not (0.0 < fpr < 1.0):
@@ -784,9 +808,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
 
         thresholds = {}
         motifs = read_motifs(self.motifs)
+        gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins]
 
         if threshold is not None:
-            self.threshold = parse_threshold_values(self.motifs, threshold)
+            data = []
+
+            d = parse_threshold_values(self.motifs, threshold)
+            self._threshold = pd.DataFrame(d, index=[0])
+            self._threshold = self._threshold.join(
+                pd.DataFrame(gc_bins, index=[0] * len(gc_bins), columns=["gc_bin"])
+            )
+            self._threshold = self._threshold.set_index("gc_bin")
             return
 
         if not self.background:
@@ -800,36 +832,41 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
         lock.acquire()
         with Cache(CACHE_DIR) as cache:
             scan_motifs = []
+            self._threshold = None
             for motif in motifs:
-                k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr)
-
-                threshold = cache.get(k)
-                if threshold is None:
+                k = "{}|{}|{:.4f}|{}".format(
+                    motif.hash(), self.background_hash, fpr, ",".join(sorted(gc_bins))
+                )
+                vals = cache.get(k)
+                if vals is None:
                     scan_motifs.append(motif)
                 else:
-                    if np.isclose(threshold, motif.pwm_max_score()):
-                        thresholds[motif.id] = None
-                    elif np.isclose(threshold, motif.pwm_min_score()):
-                        thresholds[motif.id] = 0.0
+                    if self._threshold is None:
+                        self._threshold = vals.to_frame()
                     else:
-                        thresholds[motif.id] = threshold
+                        self._threshold[motif.id] = vals
 
             if len(scan_motifs) > 0:
                 logger.info("determining FPR-based threshold")
-                for motif, threshold in self._threshold_from_seqs(
-                    scan_motifs, seqs, fpr
-                ):
-                    k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr)
-                    cache.set(k, threshold)
-                    if np.isclose(threshold, motif.pwm_max_score()):
-                        thresholds[motif.id] = None
-                    elif np.isclose(threshold, motif.pwm_min_score()):
-                        thresholds[motif.id] = 0.0
-                    else:
-                        thresholds[motif.id] = threshold
+                df = self._threshold_from_seqs(scan_motifs, seqs, fpr).set_index(
+                    "gc_bin"
+                )
+                if self._threshold is None:
+                    self._threshold = df
+                else:
+                    self._threshold = pd.concat((self._threshold, df), axis=1)
+                for motif in scan_motifs:
+                    k = "{}|{}|{:.4f}|{}".format(
+                        motif.hash(),
+                        self.background_hash,
+                        fpr,
+                        ",".join(sorted(gc_bins)),
+                    )
+                    cache.set(k, df[motif.id])
         lock.release()
-        self.threshold_str = "{}_{}_{}".format(fpr, threshold, self.background_hash)
-        self.threshold = thresholds
+        self.threshold_str = "{}_{}_{}_{}".format(
+            fpr, threshold, self.background_hash, ",".join(sorted(gc_bins))
+        )
 
     def set_genome(self, genome):
         """
@@ -930,16 +967,9 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
         """
         Scan a set of regions or sequences.
         """
-
-        if not self.threshold:
-            logger.info(
-                "Using default threshold of 0.95. " "This is likely not optimal!"
-            )
-            self.set_threshold(threshold=0.95)
-
         seqs = as_fasta(seqs, genome=self.genome)
 
-        it = self._scan_sequences(seqs.seqs, nreport, scan_rc)
+        it = self._scan_sequences(seqs.seqs, nreport, scan_rc, zscore=zscore)
 
         if zscore:
             if gc:
@@ -949,165 +979,86 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False):
                 if len(self.meanstd) != 1:
                     self.set_meanstd(gc=gc)
 
-        gc_seqs = [self.get_seq_bin(seq) for seq in seqs.seqs]
-
         logger.debug("Scanning")
-        for result, gc_seq in zip(it, gc_seqs):
-            if zscore:
-                zresult = []
-                for i, mrow in enumerate(result):
-                    try:
-                        m_mean, m_std = self.get_motif_mean_std(
-                            gc_seq, self.motif_ids[i]
-                        )
-                    except Exception:
-                        print(self.meanstd)
-                        print(gc_seq, self.motif_ids[i])
-                        raise
-                    mrow = [((x[0] - m_mean) / m_std, x[1], x[2]) for x in mrow]
-                    zresult.append(mrow)
-                yield zresult
-            else:
-                yield result
+        for result in it:
+            yield result
 
-    def _scan_regions(self, regions, nreport, scan_rc):
-        genome = self.genome
-        motif_file = self.motifs
-        motif_digest = self.checksum.get(motif_file, None)
+    def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
+        # Simple case, only one threshold
+        if np.all(self.threshold.nunique(axis=0) == 1):
+            return self.threshold.iloc[0].to_dict()
 
-        # determine which regions are not in the cache
-        scan_regions = regions
-        if self.use_cache:
-            scan_regions = []
-            for region in regions:
-                key = str((region, genome, motif_digest, nreport, scan_rc))
-                ret = self.cache.get(key)
-                if ret == NO_VALUE:
-                    scan_regions.append(region)
-
-        # scan the regions that are not in the cache
-        if len(scan_regions) > 0:
-
-            g = Genome(genome)
-
-            motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
-            scan_func = partial(
-                scan_region_mult,
-                genome=g,
-                motifs=motifs,
-                nreport=nreport,
-                scan_rc=scan_rc,
-            )
+        if motifs is None:
+            motifs = read_motifs(self.motifs)
+        seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]
 
-            for region, ret in self._scan_jobs(scan_func, scan_regions):
-                # return values or store values in cache
-                if self.use_cache:
-                    # store values in cache
-                    key = str(
-                        (
-                            region,
-                            genome,
-                            motif_digest,
-                            nreport,
-                            scan_rc,
-                            self.threshold_str,
-                        )
-                    )
-                    self.cache.set(key, ret)
-                else:
-                    # return values
-                    yield ret
+        gc_bin_count = Counter(seq_gc_bins)
 
-        if self.use_cache:
-            # return results from cache
-            for region in regions:
-                key = str(
-                    (region, genome, motif_digest, nreport, scan_rc, self.threshold_str)
-                )
-                ret = self.cache.get(key)
-                if ret == NO_VALUE or ret is None:
-                    raise Exception(
-                        "cache is not big enough to hold all "
-                        "results, try increasing the cache size "
-                        "or disable cache"
-                    )
-                yield ret
+        print(self.threshold)
+
+        _treshold = self.threshold
+        if zscore:
+            grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0)
+            _threshold = pd.DataFrame(
+                np.vstack(grouped.values),
+                index=_treshold.index,
+                columns=_treshold.columns,
+            )
+
+        min_frac = min(gc_bin_count.values())
+        dfs = [
+            _threshold.loc[gc_bin].sample(
+                int(count / min_frac * 1000), replace=True, random_state=42
+            )
+            for gc_bin, count in gc_bin_count.items()
+        ]
+        print(dfs)
+        fpr_df = pd.concat(dfs)
+        print(fpr_df.shape)
+        t = fpr_df.quantile(0.99, interpolation="higher")
+        print(motifs)
+        print(t)
+        maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index)
+        t[t >= maxt] = None
+        # print(t)
+        return t.replace({np.nan: None}).to_dict()
 
     def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):
         scan_func = partial(
             scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc
         )
-
         for ret in self._scan_jobs(scan_func, seqs):
             yield ret[1]
 
-    def _scan_sequences(self, seqs, nreport, scan_rc):
-
-        motif_file = self.motifs
-        motif_digest = self.checksum.get(motif_file, None)
-
-        scan_seqs = seqs
-        if self.use_cache:
-            # determine which sequences are not in the cache
-            hashes = dict([(s.upper(), xxhash.xxh64(s.upper()).digest()) for s in seqs])
-            scan_seqs = []
-
-            for seq, seq_hash in hashes.items():
-                key = str(
-                    (seq_hash, motif_digest, nreport, scan_rc, self.threshold_str)
-                )
-                ret = self.cache.get(key)
-                if ret == NO_VALUE or ret is None:
-                    scan_seqs.append(seq.upper())
-
-        # scan the sequences that are not in the cache
-        if len(scan_seqs) > 0:
-            motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
-            scan_func = partial(
-                scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc
-            )
-
-            for seq, ret in self._scan_jobs(scan_func, scan_seqs):
-                if self.use_cache:
-                    h = hashes[seq]
-                    key = str((h, motif_digest, nreport, scan_rc, self.threshold_str))
-                    self.cache.set(key, ret)
-                else:
-                    yield ret
-
-        if self.use_cache:
-            # return results from cache
-            for seq in seqs:
-                key = str(
-                    (
-                        hashes[seq.upper()],
-                        motif_digest,
-                        nreport,
-                        scan_rc,
-                        self.threshold_str,
-                    )
-                )
-                ret = self.cache.get(key)
-                if ret == NO_VALUE or ret is None:
-                    raise Exception(
-                        "cache is not big enough to hold all "
-                        "results, try increasing the cache size "
-                        "or disable cache"
-                    )
-
-                yield ret
+    def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False):
+        thresholds = self.get_gc_thresholds(seqs, zscore=zscore)
+        motifs = [(m, thresholds[m.id]) for m in read_motifs(self.motifs)]
+        motifs_meanstd = None
+        if zscore:
+            motifs_meanstd = self.meanstd
+        scan_func = partial(
+            scan_seq_mult,
+            motifs=motifs,
+            nreport=nreport,
+            scan_rc=scan_rc,
+            motifs_meanstd=motifs_meanstd,
+            zscore=zscore,
+        )
+        for seq, ret in self._scan_jobs(scan_func, seqs):
+            yield ret
 
     def _scan_jobs(self, scan_func, scan_seqs):
         batchsize = 1000
+
         if self.ncpus > 1:
             for i in range((len(scan_seqs) - 1) // batchsize + 1):
                 batch = scan_seqs[i * batchsize : (i + 1) * batchsize]
                 chunksize = len(batch) // self.ncpus + 1
                 jobs = []
                 for j in range((len(batch) - 1) // chunksize + 1):
-                    job = self.pool.apply_async(
-                        scan_func, (batch[j * chunksize : (j + 1) * chunksize],)
-                    )
+                    batch_seqs = batch[j * chunksize : (j + 1) * chunksize]
+                    seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs]
+                    job = self.pool.apply_async(scan_func, (batch_seqs, seq_gc_bins))
                     jobs.append(job)
 
                 for k, job in enumerate(jobs):
@@ -1116,7 +1067,8 @@ def _scan_jobs(self, scan_func, scan_seqs):
                         yield region, ret
         else:
             for i in range((len(scan_seqs) - 1) // batchsize + 1):
-                for _j, ret in enumerate(
-                    scan_func(scan_seqs[i * batchsize : (i + 1) * batchsize])
-                ):
+                batch_seqs = scan_seqs[i * batchsize : (i + 1) * batchsize]
+                seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs]
+
+                for _j, ret in enumerate(scan_func(batch_seqs, seq_gc_bins)):
                     yield scan_seqs[i], ret

From 588d0864f9e8844cfcc717bef7337f3ca5a98a52 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 12:31:17 +0200
Subject: [PATCH 33/85] update report

---
 gimmemotifs/report.py | 201 ++++++++++++++++++++++++++++++------------
 1 file changed, 143 insertions(+), 58 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 5bde42e0..9c15ad72 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -51,6 +51,9 @@ def _wrap_html_str(x):
         min_pos, max_pos = m.start(), m.end()
 
     positions = [m.start() for m in re.compile(" ").finditer(x)]
+    if len(positions) == 0:
+        return x
+
     positions = [p for p in positions if min_pos < p < max_pos]
 
     pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0]
@@ -58,6 +61,31 @@ def _wrap_html_str(x):
     return x
 
 
+def relative_luminance(rgba):
+    """
+    Calculate relative luminance of a color, following the W3C definition
+    (https://www.w3.org/WAI/GL/wiki/Relative_luminance)
+    Parameters
+    ----------
+    rgba : rgb or rgba tuple (channel values expected in the range 0 to 1)
+    Returns
+    -------
+    float
+        The relative luminance as a value from 0 to 1
+    """
+    r, g, b = (
+        # W3C: ((x + 0.055) / 1.055) ** 2.4 -- "**" binds tighter than "/"
+        x / 12.92 if x <= 0.03928 else ((x + 0.055) / 1.055) ** 2.4 for x in rgba[:3]
+    )
+    return 0.2126 * r + 0.7152 * g + 0.0722 * b
+
+
+def contrasting_text_color(color, text_color_threshold=0.408):
+    """Return a light text color for dark backgrounds, black otherwise."""
+    is_dark = relative_luminance(color) < text_color_threshold
+    return "#f1f1f1" if is_dark else "#000000"
+
+
 class ExtraStyler(Styler):
     """
     Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS.
@@ -320,15 +348,15 @@ def align(self, subset=None, location="center", axis=0):
         self.apply(self._align, subset=subset, location=location, axis=axis)
         return self
 
-    def _background_gradient(self, s, m, M, cmap='PuBu', low=0, high=0):
+    def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0):
         rng = M - m
-        norm = colors.Normalize(m - (rng * low),
-                            M + (rng * high))
+        norm = colors.Normalize(m - (rng * low), M + (rng * high))
         normed = norm(s.values)
-        c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
-        return ['background-color: %s' % color for color in c]
-
-
+        c = plt.cm.get_cmap(cmap)(normed)
+        return [
+            f"background-color: {colors.rgb2hex(color)}; color: {contrasting_text_color(color)}"
+            for color in c
+        ]
 
     def _circle(
         self,
@@ -345,7 +373,7 @@ def _circle(
     ):
         subset = pd.IndexSlice[:, :] if subset is None else subset
         subslice = _non_reducing_slice(subset)
-        
+
         if color:
             palette = sns.color_palette([color])
             # print(palette)
@@ -362,7 +390,6 @@ def _circle(
                 self.data.loc[subslice].select_dtypes(exclude=["object"]).columns,
             ]
 
-
         self.circle_styles = self.circle_styles or []
         circle_id = len(self.circle_styles) + 1
 
@@ -377,7 +404,6 @@ def _circle(
             ("vertical-align", "middle"),
         ]
 
-
         self.circle_styles.append({"name": f"circle{circle_id}", "props": props})
         self.palette_styles = self.palette_styles or []
         for i, color in enumerate(palette.as_hex()):
@@ -397,7 +423,11 @@ def _circle(
             )
 
         if len(palette) > 1:
-            vmax = self.data.loc[subslice].max().max() * 1.01 if vmax is None else vmax * 1.01
+            vmax = (
+                self.data.loc[subslice].max().max() * 1.01
+                if vmax is None
+                else vmax * 1.01
+            )
             text = self.display_data.loc[subslice].astype(str) if show_text else ""
             self.display_data.loc[subslice] = (
                 f"<div class='circle{circle_id} color{circle_id}_"
@@ -528,14 +558,34 @@ def emojify(self, subset=None):
         return self
 
     def scaled_background_gradient(
-        self, subset=None, cmap="RdBu_r", low=0, high=0, center_zero=False, 
-        vmin=None, vmax=None
+        self,
+        subset=None,
+        cmap="RdBu_r",
+        low=0,
+        high=0,
+        center_zero=False,
+        vmin=None,
+        vmax=None,
     ):
         subset = pd.IndexSlice[:, :] if subset is None else subset
         subset = _non_reducing_slice(subset)
 
-        vmax = self.data.loc[subset].max().max() if vmax is None else vmax
-        vmin = self.data.loc[subset].min().min() if vmin is None else vmin
+        vmax = (
+            self.data.loc[subset]
+            .replace({np.inf: np.nan, -np.inf: np.nan})
+            .max(skipna=True)
+            .max()
+            if vmax is None
+            else vmax
+        )
+        vmin = (
+            self.data.loc[subset]
+            .replace({np.inf: np.nan, -np.inf: np.nan})
+            .min(skipna=True)
+            .min()
+            if vmin is None
+            else vmin
+        )
 
         if center_zero:
             vmax = max(abs(vmax), abs(vmin))
@@ -543,18 +593,19 @@ def scaled_background_gradient(
 
         r = self
         for col in self.data.loc[subset].columns:
-            r = r.apply(self._background_gradient,
+            r = r.apply(
+                self._background_gradient,
                 subset=pd.IndexSlice[subset[0], col],
                 cmap=cmap,
                 m=vmin,
                 M=vmax,
                 low=low,
-                high=high)
+                high=high,
+            )
 
         return r
 
 
-
 def get_roc_values(motif, fg_file, bg_file, genome):
     """Calculate ROC AUC values for ROC plots."""
     try:
@@ -783,7 +834,13 @@ def format_factors(motif, max_length=5):
     fmt_d = "<span style='color:black'>{}</span>"
     fmt_i = "<span style='color:#666666'>{}</span>"
 
-    direct = sorted(list(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
+    direct = sorted(
+        list(
+            set(
+                [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]]
+            )
+        )
+    )
     indirect = sorted(
         list(
             set(
@@ -805,7 +862,11 @@ def format_factors(motif, max_length=5):
                 show_factors.append(f)
             if len(show_factors) >= max_length:
                 break
-    show_factors = sorted(show_factors)
+
+    if "de novo" in show_factors:
+        show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"])
+    else:
+        show_factors = sorted(show_factors)
 
     factor_str = ",".join(
         [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]
@@ -895,7 +956,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
         .set_precision(2)
         .convert_to_image(subset=["logo"], height=30,)
         .scaled_background_gradient(
-            subset=value_cols, center_zero=True, min=1/1.75, max=1/1.75
+            subset=value_cols, center_zero=True, min=1 / 1.75, max=1 / 1.75
         )
         .border(subset=list(value_cols[:1]), location="left")
         .border(part="columns", location="bottom")
@@ -910,7 +971,11 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
             df_styled.wrap(subset=list(corr_cols))
             .align(subset=list(corr_cols), location="center")
             .scaled_background_gradient(
-                subset=corr_cols, cmap="PuOr_r", center_zero=True, min=1/1.75, max=1/1.75
+                subset=corr_cols,
+                cmap="PuOr_r",
+                center_zero=True,
+                min=1 / 1.75,
+                max=1 / 1.75,
             )
         )
 
@@ -942,29 +1007,29 @@ def roc_html_report(
 ):
     df = pd.read_table(infile, index_col=0)
     df.rename_axis(None, inplace=True)
-    df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]
+
+    motifs = read_motifs(pfmfile, as_dict=True)
+    if use_motifs is not None:
+        motifs = {k: v for k, v in motifs.items() if k in use_motifs}
+    idx = list(motifs.keys())
+    df = df.loc[idx]
+
+    df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1])
+    df.insert(3, "-log10 P-value", -np.log10(df["corrected P-value"]))
+    df = df[df["corrected P-value"] <= threshold]
 
     cols = [
+        "factors",
         "logo",
-        "# matches",
-        "# matches background",
-        "P-value",
-        "log10 P-value",
-        "corrected P-value",
+        "% matches input",
+        "%matches background",
+        "-log10 P-value",
         "ROC AUC",
         "PR AUC",
         "Enr. at 1% FPR",
         "Recall at 10% FDR",
     ]
 
-    motifs = read_motifs(pfmfile, as_dict=True)
-    if use_motifs is not None:
-        motifs = {k:v for k,v in motifs.items() if k in use_motifs}
-    idx = list(motifs.keys())
-    df = df.loc[idx]
-
-    df = df[df["corrected P-value"] <= threshold]
-
     if link_matches:
         df["# matches"] = (
             "<a href=motif_scan_results/"
@@ -978,44 +1043,64 @@ def roc_html_report(
     df.insert(
         0,
         "logo",
-        motif_to_img_series(df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"),
+        motif_to_img_series(
+            df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"
+        ),
     )
     # Add factors that can bind to the motif
-    df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs))
+    df.insert(
+        0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs)
+    )
 
     rename_columns = {"factors": FACTOR_TOOLTIP}
-    
+
+    df = df[cols]
+
     bar_cols = [
-        "log10 P-value",
+        "% matches input",
+        "%matches background",
+        "-log10 P-value",
         "ROC AUC",
         "PR AUC",
         "Enr. at 1% FPR",
         "Recall at 10% FDR",
     ]
-    template_dir = MotifConfig().get_template_dir()
-    js = open(
-        os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
-    ).read()
-    css = open(
-        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
-        encoding="utf-8",
-    ).read()
-    
-    df = df.reset_index().sort_values("ROC AUC", ascending=False)
+
+    df["% matches input"] = df["% matches input"].astype(int)
+    df["%matches background"] = df["%matches background"].astype(int)
+    rename_columns = {"factors": FACTOR_TOOLTIP}
+    df = df.sort_values("ROC AUC", ascending=False)
     with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
-        f.write("<head>\n")
-        f.write("<style>{}</style>\n".format(css))
-        f.write("</head>\n")
-        f.write("<body>\n")
         if df.shape[0] > 0:
             f.write(
                 ExtraStyler(df)
-                .bar(bar_cols)
+                .convert_to_image(subset=["logo"], height=30,)
+                .add_circle(
+                    subset=["% matches input", "%matches background"],
+                    vmax=100,
+                    cmap="Purples",
+                )
+                .scaled_background_gradient(
+                    "-log10 P-value", vmin=0, high=0.3, cmap="Reds"
+                )
+                .scaled_background_gradient(
+                    "ROC AUC", vmin=0.5, vmax=1, high=0.3, cmap="Reds"
+                )
+                .scaled_background_gradient(
+                    "PR AUC", vmin=0, vmax=1, high=0.3, cmap="Reds"
+                )
+                .scaled_background_gradient(
+                    "Enr. at 1% FPR", vmin=1, high=0.3, cmap="Reds"
+                )
+                .scaled_background_gradient(
+                    "Recall at 10% FDR", vmin=0, vmax=1, high=0.7, cmap="Reds"
+                )
                 .set_precision(2)
-                .set_table_attributes("data-sortable")
-                .hide_index()
+                .set_table_attributes('class="sortable-theme-slick" data-sortable')
+                .wrap(subset=cols)
+                .align(subset=bar_cols, location="center")
+                .rename(columns=rename_columns)
                 .render()
             )
         else:
             f.write("<body>No enriched motifs found.</body>")
-        

From 7f04bceb31d49a90f544652d738806921fa5de34 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 13:20:26 +0200
Subject: [PATCH 34/85] update genomepy dependency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 5d790bf1..e25ef055 100644
--- a/setup.py
+++ b/setup.py
@@ -131,7 +131,7 @@ def run(self):
         "diskcache",
         "xxhash",
         "configparser",
-        "genomepy >= 0.7.2",
+        "genomepy >= 0.8.3",
         "tqdm",
         "pillow",
         "logomaker",

From 1d0dc74359b83c5c697a10f5a82b9f577c43512e Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 14:55:21 +0200
Subject: [PATCH 35/85] new aggregation method

---
 gimmemotifs/cli.py                |  13 ++++
 gimmemotifs/commands/maelstrom.py |   2 +
 gimmemotifs/rank.py               | 123 +++++++++++++++++++++++++++---
 gimmemotifs/report.py             |  11 +--
 4 files changed, 133 insertions(+), 16 deletions(-)

diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index 4878e96b..84b89961 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -341,6 +341,19 @@ def cli(sys_args):
         default=None,
         metavar="NAMES",
     )
+    p.add_argument(
+        "-a",
+        "--aggregation",
+        dest="aggregation",
+        help=(
+            'How to combine motifs from individual methods. Default is "int_stouffer", '
+            "for inverse normal transform of ranks, followed by Stouffer's method to combine "
+            'z-scores. Alternatively, specify "stuart" for log-transformed rank aggregation '
+            "p-values."
+        ),
+        default="int_stouffer",
+        metavar="method",
+    )
     p.add_argument(
         "-N",
         "--nthreads",
diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py
index f8573160..4cae068d 100755
--- a/gimmemotifs/commands/maelstrom.py
+++ b/gimmemotifs/commands/maelstrom.py
@@ -20,6 +20,7 @@ def maelstrom(args):
     zscore = args.zscore
     center = args.center
     gc = args.gc
+    aggregation = args.aggregation
 
     if not os.path.exists(infile):
         raise ValueError("file {} does not exist".format(infile))
@@ -37,4 +38,5 @@ def maelstrom(args):
         zscore=zscore,
         gc=gc,
         center=center,
+        aggregation=aggregation,
     )
diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py
index 15076a86..a11ad71e 100644
--- a/gimmemotifs/rank.py
+++ b/gimmemotifs/rank.py
@@ -9,6 +9,7 @@
 import subprocess as sp
 import pandas as pd
 import numpy as np
+from scipy.stats import rankdata, norm
 
 try:
     from scipy.special import factorial
@@ -77,23 +78,77 @@ def qStuart(r):
     return factorial(N) * v[N]
 
 
-def rankagg(df, method="stuart"):
-    """Return aggregated ranks.
+def _rank_int(series, c=3.0 / 8, stochastic=True):
+    # Based on code by Edward Mountjoy
+    # See: https://github.com/edm1/rank-based-INT
+    """Perform rank-based inverse normal transformation on a pandas Series.
+
+    If stochastic is True, ties are given a rank randomly (a fixed seed makes
+    this reproducible); otherwise ties share their average rank. NaN values
+    are left out of the ranking and are NaN in the result.
+
+    Args:
+        series (pandas.Series):      Series of values to transform
+        c (Optional[float]):         Constant parameter (Blom's constant)
+        stochastic (Optional[bool]): Whether to randomise the rank of ties
+
+    Returns:
+        pandas.Series of transformed values, aligned to the input index
+    """
+    # Check input
+    assert isinstance(series, pd.Series)
+    assert isinstance(c, float)
+    assert isinstance(stochastic, bool)
+
+    # Fixed seed for reproducible tie-breaking (this reseeds NumPy's global RNG)
+    np.random.seed(123)
+
+    # Take original series indexes
+    orig_idx = series.index
+
+    # Drop NaNs
+    series = series.loc[~pd.isnull(series)]
+
+    # Get ranks
+    if stochastic:
+        # Shuffle by index: "ordinal" ranks ties by their position in the
+        # series, so shuffling first gives ties a random rank
+        series = series.loc[np.random.permutation(series.index)]
+        rank = rankdata(series, method="ordinal")
+    else:
+        # Get rank, ties are averaged
+        rank = rankdata(series, method="average")
+
+    # Convert numpy array back to series, then ranks to normal quantiles
+    rank = pd.Series(rank, index=series.index)
+    transformed = rank.apply(_rank_to_normal, c=c, n=len(rank))
+
+    # reindex, not []: labels dropped as NaN must come back as NaN, not KeyError
+    return transformed.reindex(orig_idx)
+
 
+def _rank_to_normal(rank, c, n):
+    """Map a rank out of n to its normal quantile, using offset constant c."""
+    quantile = (rank - c) / (n - 2 * c + 1)
+    return norm.ppf(quantile)
+
+
+def _rankagg_int(df):
+    """Combine rank-based INT z-scores of all columns with Stouffer's method."""
+    z = df.apply(_rank_int)
+    # Stouffer: sum of the z-scores divided by sqrt(number of columns)
+    combined = (z.sum(1) / np.sqrt(z.shape[1])).to_frame()
+    combined.columns = ["z-score"]
+    return combined
+
+
+def _rankagg_stuart(df):
+    """
     Implementation is ported from the RobustRankAggreg R package
 
     References:
         Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709
         Stuart et al., 2003,  DOI: 10.1126/science.1087447
-
-    Parameters
-    ----------
-    df : pandas.DataFrame
-        DataFrame with values to be ranked and aggregated
-
-    Returns
-    -------
-    pandas.DataFrame with aggregated ranks
     """
     rmat = pd.DataFrame(index=df.iloc[:, 0])
 
@@ -105,3 +160,49 @@ def rankagg(df, method="stuart"):
     rmat = rmat.apply(sorted, 1, result_type="expand")
     p = rmat.apply(qStuart, 1)
     return pd.DataFrame({"score": p}, index=rmat.index)
+
+
+def rankagg(df, method="int_stouffer", include_reverse=True, log_transform=True):
+    """Return aggregated ranks.
+
+    Stuart implementation is ported from the RobustRankAggreg R package
+
+    References:
+        Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709
+        Stuart et al., 2003,  DOI: 10.1126/science.1087447
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        DataFrame with values to be ranked and aggregated
+    method : str, optional
+        Either "int_stouffer" or "stuart" (case-insensitive). "int_stouffer"
+        combines z-scores from an inverse normal transform of ranks using
+        Stouffer's method; "stuart" returns -log10 of the aggregation score.
+    include_reverse : bool, optional
+        "stuart" only: also aggregate the ascending sort order and add the
+        log10 of its score to the result.
+    log_transform : bool, optional
+        Accepted, but not used by this function.
+
+    Returns
+    -------
+    pandas.DataFrame with aggregated ranks
+    """
+    method = method.lower()
+    if method not in ("stuart", "int_stouffer"):
+        raise ValueError("unknown method for rank aggregation")
+    if method == "int_stouffer":
+        return _rankagg_int(df)
+
+    # "stuart": rows are shuffled before every sort, so ties get a random order
+    directions = (False, True) if include_reverse else (False,)
+    ranked = {asc: pd.DataFrame() for asc in directions}
+    for col in df.columns:
+        for asc in directions:
+            shuffled = df.sample(frac=1)
+            ranked[asc][col] = shuffled.sort_values(col, ascending=asc).index.values
+    score = -np.log10(_rankagg_stuart(ranked[False]))
+    if include_reverse:
+        score += np.log10(_rankagg_stuart(ranked[True]))
+    return score
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 9c15ad72..766349e7 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -926,7 +926,7 @@ def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="l
     return pd.Series(data=img_series, index=index)
 
 
-def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
+def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
 
     # Read the maelstrom text report
     df = pd.read_table(infile, index_col=0)
@@ -950,13 +950,15 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
     df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
 
     rename_columns = {"factors": FACTOR_TOOLTIP}
+    if "% with motif" in df.columns:
+        df["% with motif"] = df["% with motif"].astype(int)
 
     df_styled = (
         ExtraStyler(df)
         .set_precision(2)
         .convert_to_image(subset=["logo"], height=30,)
         .scaled_background_gradient(
-            subset=value_cols, center_zero=True, min=1 / 1.75, max=1 / 1.75
+            subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75
         )
         .border(subset=list(value_cols[:1]), location="left")
         .border(part="columns", location="bottom")
@@ -974,13 +976,12 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4):
                 subset=corr_cols,
                 cmap="PuOr_r",
                 center_zero=True,
-                min=1 / 1.75,
-                max=1 / 1.75,
+                low=1 / 1.75,
+                high=1 / 1.75,
             )
         )
 
     if "% with motif" in df.columns:
-        df["% with motif"] = df["% with motif"].astype(int)
         df_styled = (
             df_styled.add_circle(
                 subset=["% with motif"], cmap="Purples", vmax=100, size=40

From e979f10067854397be31762a2ce975704b967c5b Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 14:55:45 +0200
Subject: [PATCH 36/85] style

---
 gimmemotifs/commands/motifs.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index e72e7c5d..b2b136a6 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -197,7 +197,9 @@ def motifs(args):
                         motif_stats[str(motif)]["matches_at_fpr"][0],
                         motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100,
                         motif_stats[str(motif)]["matches_at_fpr"][1],
-                        motif_stats[str(motif)]["matches_at_fpr"][1] / n_background * 100,
+                        motif_stats[str(motif)]["matches_at_fpr"][1]
+                        / n_background
+                        * 100,
                         motif_stats[str(motif)]["phyper_at_fpr"],
                         log_pvalue,
                         motif_stats[str(motif)]["roc_auc"],

From f6541ec83a4001fc53ed9c40eabb6fea7a52cb1c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 14:55:53 +0200
Subject: [PATCH 37/85] new aggregation method

---
 gimmemotifs/maelstrom.py | 41 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 4719fabb..8cf32ecf 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -173,35 +173,21 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):
         plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300)
 
 
-def _rank_agg_column(exps, dfs, e):
-    tmp_dfs = [pd.DataFrame(), pd.DataFrame()]
-
-    for i, sort_order in enumerate([False, True]):
-        for method, scoring, _ in exps:
-            k = "{}.{}".format(method, scoring)
-            if k in dfs:
-                v = dfs[k]
-                # Sample rows before sorting to shuffle
-                # Otherwise all ties will not have a random order due to inherent
-                # ordering of the motif dataframe
-                tmp_dfs[i][k] = (
-                    v.sample(frac=1).sort_values(e, ascending=sort_order).index.values
-                )
-            
-    return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1]))
-
-
-def df_rank_aggregation(df, dfs, exps):
+def df_rank_aggregation(df, dfs, exps, method="int_stouffer"):
     df_p = pd.DataFrame(index=list(dfs.values())[0].index)
     names = list(dfs.values())[0].columns
+    dfs = [
+        pd.concat([v[col].rename(k) for k, v in dfs.items()], axis=1)
+        for col in names
+    ]
     pool = Pool(16)
-    func = partial(_rank_agg_column, exps, dfs)
-    ret = pool.map(func, names)
+    func = partial(rankagg, method=method)
+    ret = pool.map(func, dfs)
     pool.close()
     pool.join()
 
-    for e, result in zip(names, ret):
-        df_p[e] = result
+    for name, result in zip(names, ret):
+        df_p[name] = result
 
     if df.shape[1] != 1:
         df_p = df_p[df.columns]
@@ -223,6 +209,7 @@ def run_maelstrom(
     zscore=True,
     gc=True,
     center=False,
+    aggregation="int_stouffer",
 ):
     """Run maelstrom on an input table.
 
@@ -270,6 +257,12 @@ def run_maelstrom(
 
     center : bool, optional
         Mean-center the input table.
+    
+    aggregation : str, optional
+        How to combine scores of the predictors. The default is "int_stouffer", for
+        inverse normal transform followed by Stouffer's method to combine z-scores.
+        Alternatively, "stuart" performs rank aggregation and reports the -log10 of
+        the rank aggregation p-value.
     """
     logger.info("Starting maelstrom")
     if infile.endswith("feather"):
@@ -432,7 +425,7 @@ def run_maelstrom(
 
     if len(methods) > 1:
         logger.info("Rank aggregation")
-        df_p = df_rank_aggregation(df, dfs, exps)
+        df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)
 
         if df.shape[1] > 1:
             # Add correlation between motif score and signal

From 1f8b465219c2eb9d62138d1b68c0aad94827c06c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 15:13:42 +0200
Subject: [PATCH 38/85] add quantile normalization to coverage_table

---
 conda_env.dev.txt      |  1 +
 conda_env.osx.txt      |  1 +
 conda_env.test.txt     |  1 +
 conda_env.txt          |  1 +
 scripts/coverage_table | 28 +++++++++++++++++-----------
 setup.py               |  1 +
 6 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/conda_env.dev.txt b/conda_env.dev.txt
index cb3f62e3..8dc6f56e 100644
--- a/conda_env.dev.txt
+++ b/conda_env.dev.txt
@@ -21,6 +21,7 @@ pysam
 python
 python-xxhash
 pyyaml >=3.10
+qnorm
 scikit-learn >=0.23
 scipy >=1.4.1
 seaborn
diff --git a/conda_env.osx.txt b/conda_env.osx.txt
index 9c931dd4..dd1a2c98 100644
--- a/conda_env.osx.txt
+++ b/conda_env.osx.txt
@@ -21,6 +21,7 @@ pysam
 python
 python-xxhash
 pyyaml >=3.10
+qnorm
 scikit-learn
 scipy
 seaborn
diff --git a/conda_env.test.txt b/conda_env.test.txt
index a61ea95d..af2f478c 100644
--- a/conda_env.test.txt
+++ b/conda_env.test.txt
@@ -22,6 +22,7 @@ pybedtools
 python >=3.8
 python-xxhash
 pyyaml >=3.10
+qnorm
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
diff --git a/conda_env.txt b/conda_env.txt
index 441b7f03..b569daff 100644
--- a/conda_env.txt
+++ b/conda_env.txt
@@ -22,6 +22,7 @@ pysam
 python >=3
 python-xxhash
 pyyaml >=3.10
+qnorm
 scikit-learn >=0.18
 scipy <1.3.0
 seaborn
diff --git a/scripts/coverage_table b/scripts/coverage_table
index 829e67df..14f3128f 100644
--- a/scripts/coverage_table
+++ b/scripts/coverage_table
@@ -9,6 +9,7 @@ import pysam
 import numpy as np
 import pandas as pd
 from sklearn.preprocessing import scale
+import qnorm
 
 from gimmemotifs import __version__
 
@@ -27,7 +28,7 @@ def make_table(
     datafiles,
     window,
     log_transform=True,
-    scale_table=True,
+    normalization="none",
     top=0,
     topmethod="var",
     rmdup=True,
@@ -100,9 +101,14 @@ def make_table(
     if log_transform:
         print("Log transform", file=sys.stderr)
         df = np.log1p(df)
-    if scale_table:
-        print("Scale", file=sys.stderr)
+    if normalization == "scale":
+        print("Normalization by scaling", file=sys.stderr)
         df[:] = scale(df, axis=0)
+    elif normalization == "quantile":
+        print("Normalization by quantile normalization", file=sys.stderr)
+        df = qnorm.quantile_normalize(df)
+    else:
+        print("No normalization", file=sys.stderr)
 
     if top > 0:
         if topmethod == "var":
@@ -166,12 +172,12 @@ if __name__ == "__main__":
         action="store_true",
     )
     parser.add_argument(
-        "-s",
-        "--scale",
-        dest="scale_table",
-        help="Scale per datafile",
-        default=False,
-        action="store_true",
+        "-n",
+        "--normalization",
+        dest="normalization",
+        help="Normalization: none, quantile or scale",
+        default="none",
+        metavar="METHOD",
     )
     parser.add_argument(
         "-t", "--top", dest="top", help="Select regions.", default=0, type=int
@@ -214,7 +220,7 @@ if __name__ == "__main__":
         datafiles,
         args.window,
         log_transform=args.log_transform,
-        scale_table=args.scale_table,
+        normalization=args.normalization,
         top=args.top,
         topmethod=args.topmethod,
         rmdup=args.rmdup,
@@ -232,7 +238,7 @@ output.write("# Window: {}\n".format(args.window))
 output.write("# Duplicates removed: {}\n".format(yesno[args.rmdup]))
 output.write("# MAPQ 0 removed: {}\n".format(yesno[args.rmrepeats]))
 output.write("# Log transformed: {}\n".format(yesno[args.log_transform]))
-output.write("# Scaled: {}\n".format(yesno[args.scale_table]))
+output.write("# Normalization: {}\n".format(args.normalization))
 if args.top > 0:
     output.write("# Top {} regions selected by {}\n".format(args.top, args.topmethod))
 df.to_csv(output, sep="\t", float_format="%0.5f")
diff --git a/setup.py b/setup.py
index e25ef055..b5dc2217 100644
--- a/setup.py
+++ b/setup.py
@@ -135,5 +135,6 @@ def run(self):
         "tqdm",
         "pillow",
         "logomaker",
+        "qnorm",
     ],
 )

From 0ff3773ac5b7a43ee92cdea17912bdff73b15410 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 15:13:57 +0200
Subject: [PATCH 39/85] extra informative message

---
 gimmemotifs/maelstrom.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 8cf32ecf..1be9fd2c 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -277,7 +277,13 @@ def run_maelstrom(
             logger.info(
                 "Input is not mean-centered, setting the mean of all rows to 0."
             )
-            logger.info("Use --nocenter to change this behavior")
+            logger.info("Use --nocenter if you know what you're doing and want to change this behavior.")
+            logger.info(
+                "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to "
+                "first transform your data, for instance using log2(), and to normalize "
+                "between samples. To create a table suitable for maelstrom you can use the "
+                "coverage_table script included with GimmeMotifs."
+            )
             df = df.sub(df.mean(axis=1), axis=0)
         else:
             logger.info("Input is not mean-centered, but --nocenter was specified.")

From 4e02768c1391808d6c2cab803ee53eec5c339832 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 16:26:29 +0200
Subject: [PATCH 40/85] add SVR regressor, replace Lasso regressor

---
 gimmemotifs/moap.py | 173 +++++++++++++++++++++++---------------------
 1 file changed, 90 insertions(+), 83 deletions(-)

diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index fdcabe98..4b69661b 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -34,8 +34,9 @@ def warn(*args, **kwargs):
 from sklearn.model_selection import GridSearchCV
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.multiclass import OneVsRestClassifier
-from sklearn.linear_model import MultiTaskLasso, BayesianRidge
+from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge
 from sklearn.preprocessing import scale, LabelEncoder
+from sklearn.svm import SVR
 
 import xgboost
 
@@ -541,22 +542,16 @@ def fit(self, df_X, df_y):
         logger.info("Done")
 
 
-@register_predictor("Lasso")
-class LassoMoap(Moap):
-    def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
-        """Predict motif activities using Lasso MultiTask regression
+@register_predictor("MultiTaskLasso")
+class MultiTaskLassoMoap(Moap):
+    def __init__(self, scale=True, ncpus=None):
+        """Predict motif activities using MultiTaskLasso.
 
         Parameters
         ----------
         scale : boolean, optional, default True
             If ``True``, the motif scores will be scaled
-            before classification
-
-        kfolds : integer, optional, default 5
-            number of kfolds for parameter search
-
-        alpha_stepsize : float, optional, default 1.0
-            stepsize for use in alpha gridsearch
+            before classification.
 
         ncpus : int, optional
             Number of threads. Default is the number specified in the config.
@@ -564,101 +559,113 @@ def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
         Attributes
         ----------
         act_ : DataFrame, shape (n_motifs, n_clusters)
-            fitted motif activities
-
-        sig_ : DataFrame, shape (n_motifs,)
-            boolean values, if coefficients are higher/lower than
-            the 1%t from random permutation
+            Coefficients of the regression model.
         """
 
-        self.kfolds = kfolds
-        self.act_description = "activity values: coefficients from " "fitted model"
+        self.act_description = "activity values: coefficients of the" "regression model"
 
-        self.scale = scale
         if ncpus is None:
             ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
         self.ncpus = ncpus
-
-        # initialize attributes
+        self.scale = scale
         self.act_ = None
-        self.sig_ = None
-
-        mtk = MultiTaskLasso()
-        parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]}
-        self.clf = GridSearchCV(
-            mtk, parameters, cv=kfolds, n_jobs=self.ncpus, scoring="r2"
-        )
         self.pref_table = "score"
         self.supported_tables = ["score", "count"]
         self.ptype = "regression"
 
-    def fit(self, df_X, df_y, permute=False):
-        logger.info("Fitting Lasso")
+    def fit(self, df_X, df_y):
+        logger.info("Fitting MultiTaskLasso")
+
         if not df_y.shape[0] == df_X.shape[0]:
             raise ValueError("number of regions is not equal")
 
         if self.scale:
+            logger.debug("Scaling motif scores")
             # Scale motif scores
-            df_X[:] = scale(df_X, axis=0)
+            df_X.loc[:,:] = scale(df_X, axis=0)
+
+        # logger.debug("Scaling y")
 
-        idx = list(range(df_y.shape[0]))
-        y = df_y.iloc[idx]
-        X = df_X.loc[y.index].values
-        y = y.values
-
-        # fit coefficients
-        coefs = self._get_coefs(X, y)
-        self.act_ = pd.DataFrame(coefs.T)
-
-        # convert labels back to original names
-        self.act_.columns = df_y.columns
-        self.act_.index = df_X.columns
-
-        if permute:
-            # Permutations
-            logger.info("permutations\n")
-            random_dfs = []
-            for _ in range(10):
-                y_random = y[np.random.permutation(range(y.shape[0]))]
-                coefs = self._get_coefs(X, y_random)
-                random_dfs.append(pd.DataFrame(coefs.T))
-            random_df = pd.concat(random_dfs)
-
-            # Select cutoff based on percentile
-            high_cutoffs = random_df.quantile(0.99)
-            low_cutoffs = random_df.quantile(0.01)
-
-            # Set significance
-            self.sig_ = pd.DataFrame(index=df_X.columns)
-            self.sig_["sig"] = False
-
-            for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs):
-                self.sig_["sig"].loc[self.act_[col] >= c_high] = True
-                self.sig_["sig"].loc[self.act_[col] <= c_low] = True
+        # Normalize across samples and features
+        # y = df_y.apply(scale, 1).apply(scale, 0)
+        y = df_y
 
+        X = df_X.loc[y.index]
+
+        model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus)
+        logger.debug("Fitting model")
+        coefs = []
+        model.fit(df_X, df_y)
         logger.info("Done")
 
-    def _get_coefs(self, X, y):
-        logger.info("set alpha through cross-validation\n")
-        # Determine best parameters based on CV
-        self.clf.fit(X, y)
+        self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T
 
-        logger.debug(
-            "average score ({} fold CV): {}".format(self.kfolds, self.clf.best_score_)
-        )
+    def predict(self, df_X):
+        return df_X.dot(self.act_.loc[df_X.columns])
+
+@register_predictor("SVR")
+class SVRMoap(Moap):
+    def __init__(self, scale=True, ncpus=None):
+        """Predict motif activities using Support Vector Regression.
+
+        Parameters
+        ----------
+        scale : boolean, optional, default True
+            If ``True``, the motif scores will be scaled
+            before classification.
+
+        ncpus : int, optional
+            Number of threads. Default is the number specified in the config.
+
+        Attributes
+        ----------
+        act_ : DataFrame, shape (n_motifs, n_clusters)
+            SVR weights.
+        """
+
+        self.act_description = "activity values: SVR weights"
+
+        if ncpus is None:
+            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
+        self.ncpus = ncpus
+        self.scale = scale
+        self.act_ = None
+        self.pref_table = "score"
+        self.supported_tables = ["score", "count"]
+        self.ptype = "regression"
+
+    def fit(self, df_X, df_y):
+        logger.info("Fitting SVR")
+
+        if not df_y.shape[0] == df_X.shape[0]:
+            raise ValueError("number of regions is not equal")
+
+        if self.scale:
+            logger.debug("Scaling motif scores")
+            # Scale motif scores
+            df_X.loc[:,:] = scale(df_X, axis=0)
+
+        # logger.debug("Scaling y")
 
-        logger.info("Estimate coefficients using bootstrapping\n")
+        # Normalize across samples and features
+        # y = df_y.apply(scale, 1).apply(scale, 0)
+        y = df_y
+        self.columns = df_y.columns
+        X = df_X.loc[y.index]
 
-        n_samples = 0.75 * X.shape[0]
-        max_samples = X.shape[0]
-        m = self.clf.best_estimator_
+        clf = SVR(kernel="linear")
+        self.model = MultiOutputRegressor(clf, n_jobs=1)
+        logger.debug("Fitting model")
         coefs = []
-        for _ in range(10):
-            idx = np.random.randint(0, n_samples, max_samples)
-            m.fit(X[idx], y[idx])
-            coefs.append(m.coef_)
-        coefs = np.array(coefs).mean(axis=0)
-        return coefs
+        self.model.fit(df_X, df_y)
+        logger.info("Done")
+
+        self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T
+
+    def predict(self, df_X):
+        #print(self.model.predict(df_X) )
+        
+        return pd.DataFrame(self.model.predict(df_X), index=df_X.index, columns=self.columns)
 
 
 def moap(

From a1920053553d3eb6b2832eabb8bf1290734b65bf Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 16:27:09 +0200
Subject: [PATCH 41/85] tests

---
 gimmemotifs/scanner.py    | 29 +++++++++++++++++++----------
 test/data/rank/ranked.txt | 10 +++++-----
 test/test_rank.py         | 10 ++++++----
 3 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index cbb05961..3a2de020 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -678,6 +678,19 @@ def set_meanstd(self, gc=False):
                         self.meanstd[gcbin][motif.id] = mean, std
 
         lock.release()
+        for gc_bin in bins:
+            if gc_bin not in self.meanstd:
+                valid_bins = []
+                for b in self.gc_bins:
+                    bstr = "{:.2f}-{:.2f}".format(b[0], b[1])
+                    if bstr in self.meanstd:
+                        valid_bins.append(((b[0] + b[1]) / 2, bstr))
+
+                v = float(gc_bin.split("-")[1])
+                _, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0]
+                logger.warn(f"Using {bstr}")
+                self.meanstd[gc_bin] = self.meanstd[bstr]
+
 
     def set_background(
         self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None
@@ -994,15 +1007,13 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
 
         gc_bin_count = Counter(seq_gc_bins)
 
-        print(self.threshold)
-
-        _treshold = self.threshold
+        _threshold = self.threshold
         if zscore:
             grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0)
             _threshold = pd.DataFrame(
                 np.vstack(grouped.values),
-                index=_treshold.index,
-                columns=_treshold.columns,
+                index=_threshold.index,
+                columns=_threshold.columns,
             )
 
         min_frac = min(gc_bin_count.values())
@@ -1012,12 +1023,9 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
             )
             for gc_bin, count in gc_bin_count.items()
         ]
-        print(dfs)
-        fpr_df = pd.concat(dfs)
-        print(fpr_df.shape)
+    
+        fpr_df = pd.concat(dfs)        
         t = fpr_df.quantile(0.99, interpolation="higher")
-        print(motifs)
-        print(t)
         maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index)
         t[t >= maxt] = None
         # print(t)
@@ -1036,6 +1044,7 @@ def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False):
         motifs_meanstd = None
         if zscore:
             motifs_meanstd = self.meanstd
+
         scan_func = partial(
             scan_seq_mult,
             motifs=motifs,
diff --git a/test/data/rank/ranked.txt b/test/data/rank/ranked.txt
index f5f36f40..bb52cd79 100644
--- a/test/data/rank/ranked.txt
+++ b/test/data/rank/ranked.txt
@@ -1,6 +1,6 @@
 	a	b	c	d	e
-1	bZIP	bZIP	SOX	T-box	POU
-2	AP2	AP2	AP2	AP2	AP2
-3	T-box	SOX	T-box	SOX	SOX
-4	SOX	T-box	POU	POU	T-box
-5	POU	POU	bZIP	bZIP	bZIP
+bZIP	5	5	1	1	1
+AP2	4	4	4	4	4
+T-box	3	2	3	5	2
+SOX	2	3	5	3	3
+POU	1	1	2	2	4
diff --git a/test/test_rank.py b/test/test_rank.py
index 5cb533e9..cca52132 100644
--- a/test/test_rank.py
+++ b/test/test_rank.py
@@ -2,7 +2,7 @@
 import tempfile
 import os
 import pandas as pd
-from gimmemotifs.rank import rankagg
+from gimmemotifs.rank import rankagg, _rankagg_stuart
 
 
 class TestRank(unittest.TestCase):
@@ -17,13 +17,15 @@ def setUp(self):
     def test1_rankagg(self):
         """ Test rank aggregation """
         df = pd.read_csv(self.fname, index_col=0, sep="\t")
-        result = rankagg(df)
-        self.assertEqual("AP2", result.sort_values("score").index[0])
+        result = rankagg(df, method="stuart")
+        self.assertEqual("AP2", result.sort_values("score").index[-1])
+        result = rankagg(df, method="int_stouffer")
+        self.assertEqual("AP2", result.sort_values("z-score").index[-1])
 
     def test2_rankagg(self):
         """ Test Python implementation of rank aggregation """
         df = pd.read_csv(self.rank_in, index_col=0, sep="\t")
-        result = rankagg(df)["score"].values
+        result = _rankagg_stuart(df)["score"].values
         ref = pd.read_csv(self.rank_out, index_col=0, sep="\t")["score"].values
         for v1, v2 in zip(ref, result):
             self.assertAlmostEqual(v1, v2)

From eaf191af0f72fbbdbb8965ffd238d8cd1bbe79cb Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 23:27:51 +0200
Subject: [PATCH 42/85] black

---
 gimmemotifs/report.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 766349e7..53d781dc 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -51,11 +51,12 @@ def _wrap_html_str(x):
         min_pos, max_pos = m.start(), m.end()
 
     positions = [m.start() for m in re.compile(" ").finditer(x)]
-    if len(positions) == 0:
-        return x
 
     positions = [p for p in positions if min_pos < p < max_pos]
 
+    if len(positions) == 0:
+        return x
+
     pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0]
     x = x[:pos] + "<br/>" + x[pos + 1 :]
     return x

From 3c4ac02bc9ee19a2326ed3fcc1b06fe0a0fc5a3f Mon Sep 17 00:00:00 2001
From: simonvh <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 23:28:53 +0200
Subject: [PATCH 43/85] fix test

---
 gimmemotifs/scanner.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index 3a2de020..e8768248 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -391,12 +391,11 @@ def scan_sequence(
                 result = [row for row in result if row[0] >= cutoff]
             else:
                 result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc)
-                if cutoff <= motif.pwm_min_score() and len(result) == 0:
-                    result = [[motif.pwm_min_score(), 0, 1]] * nreport
+            if cutoff <= motif.pwm_min_score() and len(result) == 0:
+                result = [[motif.pwm_min_score(), 0, 1]] * nreport
 
             ret.append(result)
 
-    # return results
     return ret
 
 
@@ -678,7 +677,9 @@ def set_meanstd(self, gc=False):
                         self.meanstd[gcbin][motif.id] = mean, std
 
         lock.release()
-        for gc_bin in bins:
+
+        for gc_bin in self.gc_bins:
+            gc_bin = "{:.2f}-{:.2f}".format(*gc_bin)
             if gc_bin not in self.meanstd:
                 valid_bins = []
                 for b in self.gc_bins:
@@ -691,7 +692,6 @@ def set_meanstd(self, gc=False):
                 logger.warn(f"Using {bstr}")
                 self.meanstd[gc_bin] = self.meanstd[bstr]
 
-
     def set_background(
         self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None
     ):
@@ -1009,7 +1009,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
 
         _threshold = self.threshold
         if zscore:
-            grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0)
+            grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0)
             _threshold = pd.DataFrame(
                 np.vstack(grouped.values),
                 index=_threshold.index,
@@ -1023,12 +1023,11 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
             )
             for gc_bin, count in gc_bin_count.items()
         ]
-    
-        fpr_df = pd.concat(dfs)        
+
+        fpr_df = pd.concat(dfs)
         t = fpr_df.quantile(0.99, interpolation="higher")
         maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index)
         t[t >= maxt] = None
-        # print(t)
         return t.replace({np.nan: None}).to_dict()
 
     def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):

From bc76a10af82aa1f54f15ef6180ab8e1aa20ff799 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 23:36:52 +0200
Subject: [PATCH 44/85] fix test

---
 gimmemotifs/scanner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index 3a2de020..16744193 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -1009,7 +1009,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
 
         _threshold = self.threshold
         if zscore:
-            grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0)
+            grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0)
             _threshold = pd.DataFrame(
                 np.vstack(grouped.values),
                 index=_threshold.index,

From b1d658a419fe795c3d320c08840ec1476558d4e9 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 16 Jul 2020 23:57:18 +0200
Subject: [PATCH 45/85] style

---
 gimmemotifs/maelstrom.py | 10 ++++++----
 gimmemotifs/moap.py      | 19 ++++++++++---------
 gimmemotifs/rank.py      |  4 ++--
 gimmemotifs/scanner.py   |  7 +------
 4 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 1be9fd2c..be7d94cf 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -257,11 +257,11 @@ def run_maelstrom(
 
     center : bool, optional
         Mean-center the input table.
-    
+
     aggregation: str, optional
-        How to combine scores of the predictors. The default is "int_stouffer", for 
+        How to combine scores of the predictors. The default is "int_stouffer", for
         inverse normal transform followed by Stouffer's methods to combine z-scores.
-        Alternatively, "stuart" performs rank aggregation and reports the -log10 of 
+        Alternatively, "stuart" performs rank aggregation and reports the -log10 of
         the rank aggregation p-value.
     """
     logger.info("Starting maelstrom")
@@ -277,7 +277,9 @@ def run_maelstrom(
             logger.info(
                 "Input is not mean-centered, setting the mean of all rows to 0."
             )
-            logger.info("Use --nocenter if you know what you're doing and want to change this behavior.")
+            logger.info(
+                "Use --nocenter if you know what you're doing and want to change this behavior."
+            )
             logger.info(
                 "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to "
                 "first transform your data, for instance using log2(), and to normalize "
diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 4b69661b..55a4bb50 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -31,10 +31,10 @@ def warn(*args, **kwargs):
 from tqdm.auto import tqdm
 
 # scikit-learn
-from sklearn.model_selection import GridSearchCV
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge
+from sklearn.multioutput import MultiOutputRegressor
 from sklearn.preprocessing import scale, LabelEncoder
 from sklearn.svm import SVR
 
@@ -582,7 +582,7 @@ def fit(self, df_X, df_y):
         if self.scale:
             logger.debug("Scaling motif scores")
             # Scale motif scores
-            df_X.loc[:,:] = scale(df_X, axis=0)
+            df_X.loc[:, :] = scale(df_X, axis=0)
 
         # logger.debug("Scaling y")
 
@@ -594,7 +594,6 @@ def fit(self, df_X, df_y):
 
         model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus)
         logger.debug("Fitting model")
-        coefs = []
         model.fit(df_X, df_y)
         logger.info("Done")
 
@@ -603,6 +602,7 @@ def fit(self, df_X, df_y):
     def predict(self, df_X):
         return df_X.dot(self.act_.loc[df_X.columns])
 
+
 @register_predictor("SVR")
 class SVRMoap(Moap):
     def __init__(self, scale=True, ncpus=None):
@@ -643,7 +643,7 @@ def fit(self, df_X, df_y):
         if self.scale:
             logger.debug("Scaling motif scores")
             # Scale motif scores
-            df_X.loc[:,:] = scale(df_X, axis=0)
+            df_X.loc[:, :] = scale(df_X, axis=0)
 
         # logger.debug("Scaling y")
 
@@ -656,16 +656,17 @@ def fit(self, df_X, df_y):
         clf = SVR(kernel="linear")
         self.model = MultiOutputRegressor(clf, n_jobs=1)
         logger.debug("Fitting model")
-        coefs = []
         self.model.fit(df_X, df_y)
         logger.info("Done")
 
-        self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T
+        self.act_ = pd.DataFrame(self.model.coef_, columns=X.columns, index=y.columns).T
 
     def predict(self, df_X):
-        #print(self.model.predict(df_X) )
-        
-        return pd.DataFrame(self.model.predict(df_X), index=df_X.index, columns=self.columns)
+        # print(self.model.predict(df_X) )
+
+        return pd.DataFrame(
+            self.model.predict(df_X), index=df_X.index, columns=self.columns
+        )
 
 
 def moap(
diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py
index a11ad71e..1a8cfde6 100644
--- a/gimmemotifs/rank.py
+++ b/gimmemotifs/rank.py
@@ -88,7 +88,7 @@ def _rank_int(series, c=3.0 / 8, stochastic=True):
             param1 (pandas.Series):   Series of values to transform
             param2 (Optional[float]): Constand parameter (Bloms constant)
             param3 (Optional[bool]):  Whether to randomise rank of ties
-        
+
         Returns:
             pandas.Series
     """
@@ -108,7 +108,7 @@ def _rank_int(series, c=3.0 / 8, stochastic=True):
     series = series.loc[~pd.isnull(series)]
 
     # Get ranks
-    if stochastic == True:
+    if stochastic:
         # Shuffle by index
         series = series.loc[np.random.permutation(series.index)]
         # Get rank, ties are determined by their position in the series (hence
diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index e8768248..337fdcb4 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -18,7 +18,6 @@
 from genomepy import Genome
 from diskcache import Cache
 import numpy as np
-from scipy.stats import scoreatpercentile
 from sklearn.preprocessing import scale
 import pandas as pd
 
@@ -48,7 +47,6 @@ def _pickle_method(m):
 # only used when using cache, should not be a requirement
 try:
     from dogpile.cache import make_region
-    from dogpile.cache.api import NO_VALUE
     import xxhash
 except ImportError:
     pass
@@ -819,13 +817,10 @@ def set_threshold(self, fpr=None, threshold=None, gc=False):
         if not self.motifs:
             raise ValueError("please run set_motifs() first")
 
-        thresholds = {}
         motifs = read_motifs(self.motifs)
         gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins]
 
         if threshold is not None:
-            data = []
-
             d = parse_threshold_values(self.motifs, threshold)
             self._threshold = pd.DataFrame(d, index=[0])
             self._threshold = self._threshold.join(
@@ -1052,7 +1047,7 @@ def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False):
             motifs_meanstd=motifs_meanstd,
             zscore=zscore,
         )
-        for seq, ret in self._scan_jobs(scan_func, seqs):
+        for _, ret in self._scan_jobs(scan_func, seqs):
             yield ret
 
     def _scan_jobs(self, scan_func, scan_seqs):

From 1e9e22955dac5aa94fd9aa4c9a6b55554f42d548 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Fri, 17 Jul 2020 09:54:34 +0200
Subject: [PATCH 46/85] fix automatic adjusting of input size. fixes #128 #129

---
 gimmemotifs/commands/motifs.py | 35 +++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index b2b136a6..4e89ed0c 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -39,6 +39,19 @@ def motifs(args):
     if not os.path.exists(scan_dir):
         os.makedirs(scan_dir)
 
+    sample = args.sample
+    if args.size and args.size > 0:
+        file_type = determine_file_type(args.sample)
+        if file_type == "fasta":
+            logger.warn("size parameter will be ignored for FASTA input")
+        else:
+            outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
+            if file_type == "narrowpeak":
+                narrowpeak_to_bed(args.sample, outfile, size=args.size)
+            if file_type == "bed":
+                write_equalsize_bedfile(args.sample, args.size, outfile)
+            sample = outfile
+
     genome = args.genome
     if genome is None:
         args.zscore = False
@@ -71,7 +84,7 @@ def motifs(args):
             bg,
             fmt="fasta",
             genome=genome,
-            inputfile=args.sample,
+            inputfile=sample,
             size=size,
             number=10000,
         )
@@ -84,7 +97,7 @@ def motifs(args):
 
     if args.denovo:
         gimme_motifs(
-            args.sample,
+            sample,
             args.outdir,
             params={
                 "tools": args.tools,
@@ -147,15 +160,15 @@ def motifs(args):
     )
 
     logger.info("creating motif scan tables")
-    ftype = determine_file_type(args.sample)
-    sample = args.sample
-    delete_sample = False
-    if ftype == "narrowpeak":
-        f = NamedTemporaryFile(delete=False)
-        logger.debug("Using {} as temporary BED file".format(f.name))
-        narrowpeak_to_bed(args.sample, f.name, size=args.size)
-        sample = f.name
-        delete_sample = True
+    # ftype = determine_file_type(args.sample)
+    # sample = args.sample
+    # delete_sample = False
+    # if ftype == "narrowpeak":
+    #    f = NamedTemporaryFile(delete=False)
+    #    logger.debug("Using {} as temporary BED file".format(f.name))
+    #    narrowpeak_to_bed(args.sample, f.name, size=args.size)
+    #    sample = f.name
+    #    delete_sample = True
 
     # Create a table with the best score per motif for all motifs.
     # This has three reasons:

From 380d2a9256d31284b4f957f8383ea0e957a05470 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 10:32:05 +0200
Subject: [PATCH 47/85] Hopefully fix memory issue

---
 gimmemotifs/scanner.py | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index 337fdcb4..bf4c2170 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -1012,18 +1012,27 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
             )
 
         min_frac = min(gc_bin_count.values())
-        dfs = [
-            _threshold.loc[gc_bin].sample(
-                int(count / min_frac * 1000), replace=True, random_state=42
-            )
-            for gc_bin, count in gc_bin_count.items()
-        ]
+        t = {}
+        maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns)
+        # We do this in a loop as the DataFrame will get too big to fit in memory
+        # when the difference between the number of sequences per gc_bin is very
+        # high.
+        for motif in _threshold.columns:
+            dfs = [
+                _threshold.loc[gc_bin, motif].sample(
+                    int(count / min_frac * 1000), replace=True, random_state=42
+                )
+                for gc_bin, count in gc_bin_count.items()
+            ]
+
+            fpr_df = pd.concat(dfs)
+            val = fpr_df.quantile(0.99, interpolation="higher")
+            if val < maxt.loc[motif]:
+                t[motif] = val
+            else:
+                t[motif] = None
 
-        fpr_df = pd.concat(dfs)
-        t = fpr_df.quantile(0.99, interpolation="higher")
-        maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index)
-        t[t >= maxt] = None
-        return t.replace({np.nan: None}).to_dict()
+        return t
 
     def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):
         scan_func = partial(

From 6bb850869ac6774492863306e5807b1f2aa5dd45 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 11:03:49 +0200
Subject: [PATCH 48/85] correct missing import

---
 gimmemotifs/commands/motifs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index 4e89ed0c..dfbab38e 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -23,7 +23,7 @@
 from gimmemotifs.stats import calc_stats_iterator
 from gimmemotifs.report import roc_html_report
 from gimmemotifs.scanner import scan_to_file
-from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed
+from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed, write_equalsize_bedfile
 
 
 logger = logging.getLogger("gimme.motifs")

From 0d0aa4b973ccc75d2a1bb0752a0fcd8933e152e6 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 11:04:00 +0200
Subject: [PATCH 49/85] fix coef

---
 gimmemotifs/moap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 55a4bb50..258069a0 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -659,7 +659,7 @@ def fit(self, df_X, df_y):
         self.model.fit(df_X, df_y)
         logger.info("Done")
 
-        self.act_ = pd.DataFrame(self.model.coef_, columns=X.columns, index=y.columns).T
+        self.act_ = pd.DataFrame({c: e.coef_[0] for c, e in zip(df_y.columns, self.model.estimators_)}, index=X.columns)
 
     def predict(self, df_X):
         # print(self.model.predict(df_X) )

From 010fed716fe3ea96d63b05b27b6a52a29cb370f3 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 14:44:33 +0200
Subject: [PATCH 50/85] filter redundant motifs for maelstrom

---
 gimmemotifs/cli.py                | 16 ++++++++
 gimmemotifs/commands/maelstrom.py |  4 ++
 gimmemotifs/commands/motifs.py    |  3 --
 gimmemotifs/maelstrom.py          | 68 ++++++++++++++++++++++++++-----
 gimmemotifs/motif.py              |  6 +++
 gimmemotifs/report.py             | 17 +++++---
 6 files changed, 96 insertions(+), 18 deletions(-)

diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index 84b89961..2f778384 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -326,6 +326,22 @@ def cli(sys_args):
         default=default_pfm_file,
         metavar="pfmfile",
     )
+    p.add_argument(
+        "--no-filter",
+        dest="filter_redundant",
+        help="Don't remove redundant motifs.",
+        default=True,
+        action="store_false",
+    )
+    p.add_argument(
+        "-F",
+        "--filter_cutoff",
+        dest="filter_cutoff",
+        help="Cutoff to select non-redundant motifs. Default is 0.8.",
+        default=0.8,
+        type=float,
+        metavar="FLOAT",
+    )
     p.add_argument(
         "--nocenter",
         dest="center",
diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py
index 4cae068d..04d8cd1c 100755
--- a/gimmemotifs/commands/maelstrom.py
+++ b/gimmemotifs/commands/maelstrom.py
@@ -15,6 +15,8 @@ def maelstrom(args):
     genome = args.genome
     outdir = args.outdir
     pfmfile = args.pfmfile
+    filter_redundant = args.filter_redundant
+    filter_cutoff = args.filter_cutoff
     methods = args.methods
     ncpus = args.ncpus
     zscore = args.zscore
@@ -33,6 +35,8 @@ def maelstrom(args):
         genome,
         outdir,
         pfmfile,
+        filter_redundant=filter_redundant,
+        filter_cutoff=filter_cutoff,
         methods=methods,
         ncpus=ncpus,
         zscore=zscore,
diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index dfbab38e..6b126a0b 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -253,9 +253,6 @@ def motifs(args):
                 gcnorm=True,
             )
 
-    if delete_sample:
-        os.unlink(sample)
-
     if args.report:
         logger.info("creating statistics report")
         if args.outdir:
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index be7d94cf..a9d50afe 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -26,6 +26,8 @@
 from scipy.cluster import hierarchy
 from scipy.spatial.distance import pdist
 from scipy.cluster.hierarchy import linkage, dendrogram
+from sklearn.cluster import FeatureAgglomeration
+# from scipy.spatial.distance import correlation
 
 # Plotting
 import matplotlib.pyplot as plt
@@ -97,7 +99,7 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):
     mapfile = pfmfile.replace(".pwm", ".motif2factors.txt")
     if os.path.exists(mapfile):
 
-        m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0)
+        m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#")
         m2f["factors"] = m2f["factors"].str[:50]
     else:
         motifs = [m.id for m in read_motifs(pfmfile)]
@@ -200,6 +202,8 @@ def run_maelstrom(
     genome,
     outdir,
     pfmfile=None,
+    filter_redundant=True,
+    filter_cutoff=0.8,
     plot=True,
     cluster=False,
     score_table=None,
@@ -229,6 +233,12 @@ def run_maelstrom(
     pfmfile : str, optional
         Specify a PFM file for scanning.
 
+    filter_redundant : bool, optional
+        Create a non-redundant set of motifs based on correlation of motif scores in the input data.
+
+    filter_cutoff : float, optional
+        Cutoff to use for non-redundant motif selection. Default is 0.8.
+
     plot : bool, optional
         Create heatmaps.
 
@@ -355,6 +365,51 @@ def run_maelstrom(
         else:
             logger.info("Scores, using: %s", score_table)
 
+    counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
+    scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")
+
+    if filter_redundant:
+        logger.info("Selecting non-redundant motifs")
+
+        fa = FeatureAgglomeration(distance_threshold=filter_cutoff, n_clusters=None, affinity="correlation", linkage="complete", compute_full_tree=True)
+        fa.fit(scores)
+        X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
+        X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif")
+        selected_motifs = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")["motif"].values
+        nr_motif = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")[["label", "motif"]].set_index("label")
+        X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label")
+        motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif")
+
+        scores = scores[selected_motifs]
+        counts = counts[selected_motifs]
+        score_table = os.path.join(outdir, "motif.nr.score.txt.gz")
+        scores.to_csv(score_table, sep="\t", compression="gzip")
+        count_table = os.path.join(outdir, "motif.nr.count.txt.gz")
+        counts.to_csv(count_table, sep="\t", compression="gzip")
+
+        m2f = pd.read_table(os.path.join(outdir, mapfile), comment="#")
+        m2f = m2f.join(motif_map, on="Motif")
+        m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N"
+        m2f["Motif"] = m2f["motif_nr"]
+        m2f = m2f.drop(columns=["motif_nr"])
+
+        motifs = read_motifs(pfmfile)
+        pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm")
+        with open(pfmfile, "w") as f:
+            for motif in motifs:
+                f.write(f"{motif.to_pfm()}\n")
+        mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
+        with open(mapfile, "w") as f:
+            f.write("# Note: this mapping is specifically created for this non-redundant set of motifs.\n")
+            f.write("# It also includes factors for motifs that were similar, but this can be\n")
+            f.write("# specific to this analysis.\n")
+
+        with open(mapfile, "a") as f:
+            m2f.to_csv(f, index=False, sep="\t")
+        logger.info(f"Selected {len(selected_motifs)} motifs")
+        logger.info(f"Motifs: {pfmfile}")
+        logger.info(f"Factor mappings: {mapfile}")
+
     if cluster:
         cluster = False
         for method in methods:
@@ -401,18 +456,14 @@ def run_maelstrom(
 
     for method, scoring, fname in exps:
         try:
-            if scoring == "count" and count_table is not None:
+            if scoring == "count":
                 moap_with_table(
                     fname, count_table, outdir, method, scoring, ncpus=ncpus
                 )
-            elif scoring == "score" and score_table is not None:
+            elif scoring == "score":
                 moap_with_table(
                     fname, score_table, outdir, method, scoring, ncpus=ncpus
                 )
-            else:
-                moap_with_bg(
-                    fname, genome, outdir, method, scoring, pfmfile=pfmfile, ncpus=ncpus
-                )
 
         except Exception as e:
             logger.warn("Method %s with scoring %s failed", method, scoring)
@@ -428,9 +479,6 @@ def run_maelstrom(
         except FileNotFoundError:
             logger.warn("Activity file for {} not found!\n".format(t))
 
-    counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t")
-    scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t")
-
     if len(methods) > 1:
         logger.info("Rank aggregation")
         df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)
diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index 2be35178..a0f1608c 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -1419,6 +1419,8 @@ def _add_factors_from_handle(motifs, handle):
     m2f_direct = {}
     m2f_indirect = {}
     for line in open(map_file):
+        if line.startswith("#"):
+            continue
         try:
             motif, *factor_info = line.strip().split("\t")
             if len(factor_info) == 1:
@@ -1431,7 +1433,11 @@ def _add_factors_from_handle(motifs, handle):
         except Exception:
             pass
 
+    m2f = pd.read_csv(map_file, sep="\t", comment="#", index_col=0)
+
     for motif in motifs:
+        if motif.id in m2f.index:
+            motif.factor_info = m2f.loc[motif.id]
         if motif.id in m2f_direct:
             motif.factors[DIRECT_NAME] = m2f_direct[motif.id]
         if motif.id in m2f_indirect:
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 53d781dc..beb798b0 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -11,6 +11,7 @@
 import re
 import shutil
 import logging
+from collections import Counter
 
 import jinja2
 import numpy as np
@@ -835,12 +836,18 @@ def format_factors(motif, max_length=5):
     fmt_d = "<span style='color:black'>{}</span>"
     fmt_i = "<span style='color:#666666'>{}</span>"
 
+    if hasattr(motif, "factor_info"):
+        fcount = Counter([x.upper() for x in motif.factor_info["Factor"]])
+    else:
+        fcount = Counter(motif.factors[DIRECT_NAME] + motif.factors[INDIRECT_NAME])
+
     direct = sorted(
         list(
             set(
                 [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]]
             )
-        )
+        ),
+        key=lambda x: fcount[x], reverse=True
     )
     indirect = sorted(
         list(
@@ -851,23 +858,23 @@ def format_factors(motif, max_length=5):
                     if x.upper() not in direct
                 ]
             )
-        )
+        ), key=lambda x: fcount[x], reverse=True
     )
 
     if len(direct) > max_length:
         show_factors = direct[:max_length]
     else:
         show_factors = direct[:]
-        for f in indirect:
+        for f in sorted(indirect, key=lambda x: fcount[x], reverse=True):
             if f not in show_factors:
                 show_factors.append(f)
             if len(show_factors) >= max_length:
                 break
 
     if "de novo" in show_factors:
-        show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"])
+        show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"], key=lambda x: fcount[x], reverse=True)
     else:
-        show_factors = sorted(show_factors)
+        show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True)
 
     factor_str = ",".join(
         [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]

From 47d2dfeba709b0f1178aebbd0812fe324351696c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 15:08:51 +0200
Subject: [PATCH 51/85] extra help

---
 gimmemotifs/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index 2f778384..ad07d663 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -337,7 +337,7 @@ def cli(sys_args):
         "-F",
         "--filter_cutoff",
         dest="filter_cutoff",
-        help="Cutoff to select non-redundant motifs. Default is 0.8.",
+        help="Cutoff to select non-redundant motifs. Default is 0.8, increase this value to get fewer motifs.",
         default=0.8,
         type=float,
         metavar="FLOAT",

From df82b6160964069fda53361daae0da8c9d04180a Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 15:28:48 +0200
Subject: [PATCH 52/85] fix help

---
 gimmemotifs/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py
index ad07d663..b24e3652 100644
--- a/gimmemotifs/cli.py
+++ b/gimmemotifs/cli.py
@@ -389,7 +389,7 @@ def cli(sys_args):
     p.add_argument(
         "--nogc",
         dest="gc",
-        help="Don't use GC% bins",
+        help="Don't use GC%% bins",
         action="store_false",
         default=True,
     )

From 0a86356665a20062f31d010917a44b0f81978310 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 19:57:04 +0200
Subject: [PATCH 53/85] print small numbers as <1

---
 gimmemotifs/report.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index beb798b0..b01e2f54 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -360,6 +360,19 @@ def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0):
             for color in c
         ]
 
+    def to_precision_str(self, subset=None, precision=0, include_zero=True):
+        subset = pd.IndexSlice[:, :] if subset is None else subset
+        subset = _non_reducing_slice(subset)
+
+        def precision_str(x, precision=precision):
+            if (include_zero or x > 0) and x <= 10 ** -precision:
+                return f"<{10**-precision}"
+            else:
+                return f"{{0:.{precision}f}}".format(x)
+
+        self.display_data.loc[subset] = self.data.loc[subset].applymap(precision_str)
+        return self
+
     def _circle(
         self,
         subset=None,
@@ -997,6 +1010,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
             .wrap(subset=["% with motif"])
             .align(subset=["% with motif"], location="center")
             .border(subset=["% with motif"], location="left")
+            .to_precision_str(subset=["% with motif"])
         )
 
     df_styled = df_styled.render()
@@ -1109,6 +1123,7 @@ def roc_html_report(
                 .wrap(subset=cols)
                 .align(subset=bar_cols, location="center")
                 .rename(columns=rename_columns)
+                .to_precision_str(subset=["% matches input", "%matches background"])
                 .render()
             )
         else:

From 01207e48a0773ddb2e5be914ae886181cdf5a94b Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 21:35:50 +0200
Subject: [PATCH 54/85] update moaps

---
 gimmemotifs/commands/motifs.py |  6 +++++-
 gimmemotifs/maelstrom.py       | 33 +++++++++++++++++++++++++++------
 gimmemotifs/moap.py            | 27 ++++++++++++++++++++++-----
 3 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index 6b126a0b..c4489d43 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -23,7 +23,11 @@
 from gimmemotifs.stats import calc_stats_iterator
 from gimmemotifs.report import roc_html_report
 from gimmemotifs.scanner import scan_to_file
-from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed, write_equalsize_bedfile
+from gimmemotifs.utils import (
+    determine_file_type,
+    narrowpeak_to_bed,
+    write_equalsize_bedfile,
+)
 
 
 logger = logging.getLogger("gimme.motifs")
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index a9d50afe..6195960d 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -27,6 +27,7 @@
 from scipy.spatial.distance import pdist
 from scipy.cluster.hierarchy import linkage, dendrogram
 from sklearn.cluster import FeatureAgglomeration
+
 # from scipy.spatial.distance import correlation
 
 # Plotting
@@ -99,7 +100,9 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None):
     mapfile = pfmfile.replace(".pwm", ".motif2factors.txt")
     if os.path.exists(mapfile):
 
-        m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#")
+        m2f = pd.read_csv(
+            mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#"
+        )
         m2f["factors"] = m2f["factors"].str[:50]
     else:
         motifs = [m.id for m in read_motifs(pfmfile)]
@@ -371,12 +374,26 @@ def run_maelstrom(
     if filter_redundant:
         logger.info("Selecting non-redundant motifs")
 
-        fa = FeatureAgglomeration(distance_threshold=filter_cutoff, n_clusters=None, affinity="correlation", linkage="complete", compute_full_tree=True)
+        fa = FeatureAgglomeration(
+            distance_threshold=filter_cutoff,
+            n_clusters=None,
+            affinity="correlation",
+            linkage="complete",
+            compute_full_tree=True,
+        )
         fa.fit(scores)
         X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_})
         X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif")
-        selected_motifs = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")["motif"].values
-        nr_motif = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")[["label", "motif"]].set_index("label")
+        selected_motifs = (
+            X_cluster.sort_values("var")
+            .drop_duplicates(subset=["label"], keep="last")["motif"]
+            .values
+        )
+        nr_motif = (
+            X_cluster.sort_values("var")
+            .drop_duplicates(subset=["label"], keep="last")[["label", "motif"]]
+            .set_index("label")
+        )
         X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label")
         motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif")
 
@@ -400,8 +417,12 @@ def run_maelstrom(
                 f.write(f"{motif.to_pfm()}\n")
         mapfile = pfmfile.replace(".pfm", ".motif2factors.txt")
         with open(mapfile, "w") as f:
-            f.write("# Note: this mapping is specifically created for this non-redundant set of motifs.\n")
-            f.write("# It also includes factors for motifs that were similar, but this can be\n")
+            f.write(
+                "# Note: this mapping is specifically created for this non-redundant set of motifs.\n"
+            )
+            f.write(
+                "# It also includes factors for motifs that were similar, but this can be\n"
+            )
             f.write("# specific to this analysis.\n")
 
         with open(mapfile, "a") as f:
diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py
index 258069a0..d4053bc4 100644
--- a/gimmemotifs/moap.py
+++ b/gimmemotifs/moap.py
@@ -36,7 +36,9 @@ def warn(*args, **kwargs):
 from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.preprocessing import scale, LabelEncoder
-from sklearn.svm import SVR
+from sklearn.svm import LinearSVR
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
 
 import xgboost
 
@@ -592,12 +594,24 @@ def fit(self, df_X, df_y):
 
         X = df_X.loc[y.index]
 
-        model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus)
+        model = Pipeline(
+            [
+                ("scale", StandardScaler()),
+                (
+                    "reg",
+                    MultiTaskLassoCV(
+                        fit_intercept=False, n_alphas=20, n_jobs=self.ncpus
+                    ),
+                ),
+            ]
+        )
         logger.debug("Fitting model")
         model.fit(df_X, df_y)
         logger.info("Done")
 
-        self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T
+        self.act_ = pd.DataFrame(
+            model.steps[1][1].coef_, index=y.columns, columns=X.columns
+        ).T
 
     def predict(self, df_X):
         return df_X.dot(self.act_.loc[df_X.columns])
@@ -653,13 +667,16 @@ def fit(self, df_X, df_y):
         self.columns = df_y.columns
         X = df_X.loc[y.index]
 
-        clf = SVR(kernel="linear")
+        clf = LinearSVR()
         self.model = MultiOutputRegressor(clf, n_jobs=1)
         logger.debug("Fitting model")
         self.model.fit(df_X, df_y)
         logger.info("Done")
 
-        self.act_ = pd.DataFrame({c: e.coef_[0] for c, e in zip(df_y.columns, self.model.estimators_)}, index=X.columns)
+        self.act_ = pd.DataFrame(
+            {c: e.coef_ for c, e in zip(df_y.columns, self.model.estimators_)},
+            index=X.columns,
+        )
 
     def predict(self, df_X):
         # print(self.model.predict(df_X) )

From 5bb73a11cef8103d2baec71416ea0871fb1e7e5f Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 22 Jul 2020 21:36:03 +0200
Subject: [PATCH 55/85] update report

---
 gimmemotifs/report.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index b01e2f54..20af73a8 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -860,7 +860,8 @@ def format_factors(motif, max_length=5):
                 [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]]
             )
         ),
-        key=lambda x: fcount[x], reverse=True
+        key=lambda x: fcount[x],
+        reverse=True,
     )
     indirect = sorted(
         list(
@@ -871,7 +872,9 @@ def format_factors(motif, max_length=5):
                     if x.upper() not in direct
                 ]
             )
-        ), key=lambda x: fcount[x], reverse=True
+        ),
+        key=lambda x: fcount[x],
+        reverse=True,
     )
 
     if len(direct) > max_length:
@@ -885,7 +888,11 @@ def format_factors(motif, max_length=5):
                 break
 
     if "de novo" in show_factors:
-        show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"], key=lambda x: fcount[x], reverse=True)
+        show_factors = ["de novo"] + sorted(
+            [f for f in show_factors if f != "de novo"],
+            key=lambda x: fcount[x],
+            reverse=True,
+        )
     else:
         show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True)
 

From e3270038344361ca0abde6da0cd8b36c63f787eb Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 08:17:41 +0200
Subject: [PATCH 56/85] update test

---
 test/test_maelstrom.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py
index 4edda61b..66f602e2 100644
--- a/test/test_maelstrom.py
+++ b/test/test_maelstrom.py
@@ -26,13 +26,30 @@ def test1_maelstrom(self):
             self.clusters,
             "mm10",
             self.outdir,
+            filter_redundant=False,
             score_table=self.score_table,
             count_table=self.count_table,
             plot=False,
         )
         df = pd.read_table(self.outfile, index_col=0, comment="#")
         print(df.shape)
+
         self.assertEquals((623, 5), df.shape)
+        
+        # Filter redundant motifs
+        run_maelstrom(
+            self.clusters,
+            "mm10",
+            self.outdir,
+            filter_redundant=True,
+            score_table=self.score_table,
+            count_table=self.count_table,
+            plot=False,
+        )
+        df = pd.read_table(self.outfile, index_col=0, comment="#")
+        print(df.shape)
+        self.assertEquals((156, 5), df.shape)
+
 
         for fname in glob(os.path.join(self.outdir, "activity*")):
             os.unlink(fname)

From 6165925a0b88b2031cc32d69a648e2d9ebf74447 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 08:45:09 +0200
Subject: [PATCH 57/85] updated for readthedocs

---
 docs/requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2ba060ac..9429d2a2 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -12,5 +12,3 @@ six
 future
 statsmodels
 tqdm
-xgboost >=0.71
-sklearn-contrib-lightning==0.4.0

From 857f13cb80c73e3aee47d26d61f365dc42104168 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 08:56:41 +0200
Subject: [PATCH 58/85] readthedocs

---
 .rtd-environment.yml | 30 ++++++++++++++++++++++++++++++
 readthedocs.yml      | 15 +++++----------
 2 files changed, 35 insertions(+), 10 deletions(-)
 create mode 100644 .rtd-environment.yml

diff --git a/.rtd-environment.yml b/.rtd-environment.yml
new file mode 100644
index 00000000..dd64911b
--- /dev/null
+++ b/.rtd-environment.yml
@@ -0,0 +1,30 @@
+name: gimmemotifs
+channels:
+  - defaults
+  - bioconda
+  - conda-forge
+dependencies:
+  - configparser
+  - diskcache
+  - feather-format
+  - genomepy >=0.8.3
+  - jinja2
+  - logomaker
+  - matplotlib-base >=3.1.2
+  - ncurses
+  - numpy
+  - pandas >=1.0.3
+  - pillow
+  - pyarrow >=0.16.0
+  - pybedtools
+  - python >=3
+  - python-xxhash
+  - pyyaml >=3.10
+  - qnorm
+  - scikit-learn >=0.18
+  - scipy >=1.3.0
+  - seaborn
+  - statsmodels
+  - tqdm >=4.27.0
+  - xdg
+  - xgboost >=0.71
diff --git a/readthedocs.yml b/readthedocs.yml
index 9a6a856f..63c8c19b 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -5,14 +5,9 @@ version: 2
 sphinx:
   configuration: docs/conf.py
 
+conda:
+  file: .rtd-environment.yml
+ 
 python:
-   version: 3.7
-   install:
-      - requirements: docs/requirements.txt
-      - method: pip
-        path: .
-        extra_requirements:
-            - docs
-      - method: setuptools
-        path: another/package
-   system_packages: true
+  version: 3.7
+  setup_py_install: true

From 82cbb17ea2d5bb1e72cbdef1fb64f682b911f542 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 09:06:02 +0200
Subject: [PATCH 59/85] readthedocs

---
 readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readthedocs.yml b/readthedocs.yml
index 63c8c19b..1e54352b 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -6,7 +6,7 @@ sphinx:
   configuration: docs/conf.py
 
 conda:
-  file: .rtd-environment.yml
+  environment: .rtd-environment.yml
  
 python:
   version: 3.7

From 579e9859ab6566d6fb61048e9c6bcde7305da251 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 09:15:02 +0200
Subject: [PATCH 60/85] readthedocs

---
 readthedocs.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/readthedocs.yml b/readthedocs.yml
index 1e54352b..f9d483db 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -10,4 +10,6 @@ conda:
  
 python:
   version: 3.7
-  setup_py_install: true
+  install:
+    - method: pip
+      path: .

From bcd2d8f657700be8c32c23a5459c9ae7e35b0024 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 09:41:41 +0200
Subject: [PATCH 61/85] update readthedocs

---
 .rtd-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.rtd-environment.yml b/.rtd-environment.yml
index dd64911b..5fafc7d3 100644
--- a/.rtd-environment.yml
+++ b/.rtd-environment.yml
@@ -21,6 +21,7 @@ dependencies:
   - python-xxhash
   - pyyaml >=3.10
   - qnorm
+  - represent
   - scikit-learn >=0.18
   - scipy >=1.3.0
   - seaborn

From ab0432986ea471ba50be149f4e09ba92ed58c34e Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 10:51:57 +0200
Subject: [PATCH 62/85] update MaelstromResult to deal with new format

---
 gimmemotifs/maelstrom.py | 79 ++++++++++++++++++++++++++++------------
 gimmemotifs/report.py    | 61 +++++++++++++++++--------------
 2 files changed, 90 insertions(+), 50 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 6195960d..fc5f936b 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -41,7 +41,7 @@
 from gimmemotifs.moap import moap, Moap, scan_to_table
 from gimmemotifs.rank import rankagg
 from gimmemotifs.motif import read_motifs
-from gimmemotifs.report import maelstrom_html_report
+from gimmemotifs.report import maelstrom_html_report, format_factors
 from gimmemotifs.utils import join_max, pfmfile_location
 
 from multiprocessing import Pool
@@ -563,7 +563,10 @@ def __init__(self, outdir):
             raise FileNotFoundError("No such directory: " + outdir)
 
         # Load motifs
-        fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
+        fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m"))
+        print(fnames)
+        if len(fnames) == 0:
+            fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
         if len(fnames) > 0:
             pfmfile = fnames[0]
             with open(pfmfile) as fin:
@@ -582,6 +585,17 @@ def __init__(self, outdir):
         self.result = pd.read_table(
             os.path.join(outdir, "final.out.txt"), comment="#", index_col=0
         )
+        self.correlation = self.result.loc[
+            :, self.result.columns.str.contains("correlation")
+        ]
+        self.percent_match = self.result.loc[
+            :, self.result.columns.str.contains("% with motif")
+        ]
+        self.result = self.result.loc[
+            :,
+            ~self.result.columns.str.contains("correlation")
+            & ~self.result.columns.str.contains("% with motif"),
+        ]
 
         # Read motif results
         self.scores = pd.read_table(
@@ -610,10 +624,11 @@ def plot_heatmap(
         min_freq=0.01,
         threshold=2,
         name=True,
-        indirect=False,
+        indirect=True,
         figsize=None,
-        max_len=50,
+        max_number_factors=5,
         aspect=1,
+        cmap="RdBu_r",
         **kwargs,
     ):
         """Plot clustered heatmap of predicted motif activity.
@@ -622,7 +637,7 @@ def plot_heatmap(
         ----------
         kind : str, optional
             Which data type to use for plotting. Default is 'final', which will
-            plot the result of the rang aggregation. Other options are 'freq'
+            plot the result of the rank aggregation. Other options are 'freq'
             for the motif frequencies, or any of the individual activities such
             as 'rf.score'.
 
@@ -636,19 +651,22 @@ def plot_heatmap(
             Use factor names instead of motif names for plotting.
 
         indirect : bool, optional
-            Include indirect factors. Default is False.
+            Include indirect factors (computationally predicted or non-curated). Default is True.
 
-        max_len : int, optional
-            Truncate the list of factors to this maximum length.
+        max_number_factors : int, optional
+            Truncate the list of factors to this maximum size.
 
         figsize : tuple, optional
             Tuple of figure size (width, height).
 
         aspect : int, optional
             Aspect ratio for tweaking the plot.
+  
+        cmap : str, optional
+            Color palette to use, RdBu_r by default.
 
         kwargs : other keyword arguments
-            All other keyword arguments are passed to sns.clustermap
+            All other keyword arguments are passed to sns.heatmap
 
         Returns
         -------
@@ -663,13 +681,17 @@ def plot_heatmap(
             filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq)
 
         idx = self.result.loc[filt].index
+
+        if idx.shape[0] == 0:
+            logger.warning("Empty matrix, try lowering the threshold")
+            return
+
         if idx.shape[0] >= 100:
             logger.warning("The filtered matrix has more than 100 rows.")
             logger.warning(
                 "It might be worthwhile to increase the threshold for visualization"
             )
 
-        cmap = "RdBu_r"
         if kind == "final":
             data = self.result
         elif kind == "freq":
@@ -687,18 +709,26 @@ def plot_heatmap(
         else:
             raise ValueError("Unknown dtype")
 
-        # print(data.head())
-        # plt.figure(
         m = data.loc[idx]
-        vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99))
-        vmin = -vmax
+
+        if "vmax" in kwargs:
+            vmax = kwargs.pop("vmax")
+        else:
+            vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99))
+
+        if "vmin" in kwargs:
+            vmin = kwargs.pop("vmin")
+        else:
+            vmin = -vmax
+
         if name:
             m["factors"] = [
-                join_max(
-                    _get_factor_list(self.motifs[n], indirect),
-                    max_len,
-                    ",",
-                    suffix=",(...)",
+                format_factors(
+                    self.motifs[n],
+                    max_length=max_number_factors,
+                    html=False,
+                    include_indirect=indirect,
+                    extra_str=",..",
                 )
                 for n in m.index
             ]
@@ -706,7 +736,8 @@ def plot_heatmap(
         h, w = m.shape
 
         if figsize is None:
-            figsize = (3 + m.shape[1] / 4, 1 + m.shape[0] / 3)
+            figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3)
+
         fig = plt.figure(figsize=figsize)
         npixels = 30
         g = GridSpec(
@@ -714,8 +745,8 @@ def plot_heatmap(
         )
         ax1 = fig.add_subplot(g[0, :])
         ax2 = fig.add_subplot(g[1, :])
-        ax2.set_title("Significance (-log10(p-value))")
-        dm = pdist(m, metric="euclidean")
+        ax2.set_title("aggregated z-score")
+        dm = pdist(m, metric="correlation")
         hc = linkage(dm, method="ward")
         leaves = dendrogram(hc, no_plot=True)["leaves"]
         cg = sns.heatmap(
@@ -727,10 +758,12 @@ def plot_heatmap(
             linewidths=1,
             vmin=vmin,
             vmax=vmax,
+            **kwargs,
         )
+        plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90)
         plt.tight_layout()
         # cg.ax_col_dendrogram.set_visible(False)
-        # plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
+        # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
         return cg
 
     def plot_scores(self, motifs, name=True, max_len=50):
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 20af73a8..dd65aefd 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -845,9 +845,12 @@ def create_denovo_motif_report(
     )
 
 
-def format_factors(motif, max_length=5):
-    fmt_d = "<span style='color:black'>{}</span>"
-    fmt_i = "<span style='color:#666666'>{}</span>"
+def format_factors(motif, max_length=5, html=True, include_indirect=True, extra_str=", (...)"):
+    if html:
+        fmt_d = "<span style='color:black'>{}</span>"
+        fmt_i = "<span style='color:#666666'>{}</span>"
+    else:
+        fmt_d = fmt_i = "{}"
 
     if hasattr(motif, "factor_info"):
         fcount = Counter([x.upper() for x in motif.factor_info["Factor"]])
@@ -863,19 +866,22 @@ def format_factors(motif, max_length=5):
         key=lambda x: fcount[x],
         reverse=True,
     )
-    indirect = sorted(
-        list(
-            set(
-                [
-                    x.upper()
-                    for x in motif.factors[INDIRECT_NAME]
-                    if x.upper() not in direct
-                ]
-            )
-        ),
-        key=lambda x: fcount[x],
-        reverse=True,
-    )
+
+    indirect = []
+    if include_indirect:
+        indirect = sorted(
+            list(
+                set(
+                    [
+                        x.upper()
+                        for x in motif.factors[INDIRECT_NAME]
+                        if x.upper() not in direct
+                    ]
+                )
+            ),
+            key=lambda x: fcount[x],
+            reverse=True,
+        )
 
     if len(direct) > max_length:
         show_factors = direct[:max_length]
@@ -901,17 +907,18 @@ def format_factors(motif, max_length=5):
     )
 
     if len(direct + indirect) > max_length:
-        factor_str += ", (...)"
-
-    tooltip = ""
-    if len(direct) > 0:
-        tooltip += "direct: " + ",".join(sorted(direct))
-    if len(indirect) > 0:
-        if tooltip != "":
-            tooltip += "&#10;"
-        tooltip += "predicted: " + ",".join(sorted(indirect))
-
-    factor_str = '<div title="' + tooltip + '">' + factor_str + "</div>"
+        factor_str += extra_str
+
+    if html:
+        tooltip = ""
+        if len(direct) > 0:
+            tooltip += "direct: " + ",".join(sorted(direct))
+        if len(indirect) > 0:
+            if tooltip != "":
+                tooltip += "&#10;"
+            tooltip += "predicted: " + ",".join(sorted(indirect))
+
+        factor_str = '<div title="' + tooltip + '">' + factor_str + "</div>"
 
     return factor_str
 

From a5ae647c4355c9ecbbc3c65feadf9949dd20d04e Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 10:55:22 +0200
Subject: [PATCH 63/85] updated docs

---
 docs/examples.rst  |  8 ++++----
 docs/reference.rst | 38 ++++++++++++++++++++++++++++++--------
 docs/tutorials.rst | 12 ++++++------
 readthedocs.yml    | 11 +++++++----
 4 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/docs/examples.rst b/docs/examples.rst
index 33fd06c7..9eedc21a 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -41,10 +41,10 @@ Compare motifs between data sets
     $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.out/
 
 The output scores of ``gimme maelstrom`` represent the combined result of multiple methods. 
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using rank aggregation. 
-The score that is shown is the -log10(p-value), where the p-value (from the rank aggregation) is corrected for multiple testing. 
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif or a higher motif score is associated with higher signal in a specific sample.
 
 Create sequence logos
 ---------------------
diff --git a/docs/reference.rst b/docs/reference.rst
index f29e2044..2ba5b10e 100644
--- a/docs/reference.rst
+++ b/docs/reference.rst
@@ -362,11 +362,26 @@ This command can be used to identify differential motifs between two or more dat
 ::
 
     -h, --help            show this help message and exit
-    -p PFMFILE, --pfmfile PFMFILE
+    -p pfmfile, --pfmfile pfmfile
                           PFM file with motifs (default:
-                          gimme.vertebrate.v5.0.pwm)
+                          gimme.vertebrate.v5.0.pfm)
+    --no-filter           Don't remove redundant motifs.
+    -F FLOAT, --filter_cutoff FLOAT
+                          Cutoff to select non-redundant motifs. Default is 0.8,
+                          increase this value to get fewer motifs.
+    --nocenter            Don't mean-center the rows by default
     -m NAMES, --methods NAMES
                           Run with specific methods
+    -a method, --aggregation method
+                          How to combine motifs from individual methods. Default
+                          is "int_stouffer", for inverse normal transform of
+                          ranks, followed by Stouffer's method to combine
+                          z-scores. Alternatively, specify "stuart" for log-
+                          transformed rank aggregation p-values.
+    -N INT, --nthreads INT
+                          Number of threads (default 12)
+    --rawscore            Don't z-score normalize motif scores
+    --nogc                Don't use GC% bins
 
 **Input file formats**
 
@@ -407,16 +422,23 @@ The second option looks like this:
 
 This is a tab-separated table, with a header describing the experiments. In case of sequencing data, such 
 as ChIP-seq, ATAC-seq or DNaseI seq, we recommend to use **log-transformed** read counts which are
-**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after the log-transformatiion step, 
-for instance by quantile normalization or scaling.
+**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after
+the log-transformation step, for instance by quantile normalization or scaling.
+By default, ``gimme maelstrom`` will mean-center the input; disable this with ``--nocenter``.
 
 The second input format generally gives better results than the first one and would be the recommended format.
 
 The output scores of ``gimme maelstrom`` represent the combined result of multiple methods. 
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using the rank aggregation method from `Kolde, 2012 <https://www.ncbi.nlm.nih.gov/pubmed/22247279>`_. 
-The score that is shown is the -log10(p-value), where the p-value comes from the rank aggregation.
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+This z-score represents the combined result of multiple methods.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif or a higher motif score is associated with higher signal in a specific sample.
+
+By default, ``gimme maelstrom`` selects a non-redundant set of motifs by clustering the motifs based on scores in the set of
+input sequences. You can disable this by using the ``--no-filter`` argument. You can tweak the number of selected motifs by
+changing the ``--filter_cutoff`` parameter. By default this is set to ``0.8``. Increase this value to select fewer motifs,
+decrease it to select more motifs. Keep in mind that you may start to lose biologically relevant motifs if you set this too high.
 
 .. _`gimme_scan`:
 
diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 088f8710..1439dcac 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -330,11 +330,11 @@ There output directory contains several files:
 The two motif files, ``motif.count.txt.gz`` and ``motif.score.txt.gz`` contain the motif scan results. 
 The ``activity.*.out.txt`` files are tables with the results of the individual methods. 
 The main result is ``final.out.txt``, which integrates all individual methods in a final score. 
-This score represents the combined result of multiple methods.
-The individual results from different methods are ranked from high-scoring motif to low-scoring motif
-and then aggregated using the rank aggregation method from `Kolde, 2012 <https://www.ncbi.nlm.nih.gov/pubmed/22247279>`_.
-The score that is shown is the -log10(p-value).
-This procedure is then repeated with the ranking reversed. These are shown as negative values.
+This z-score represents the combined result of multiple methods.
+The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted
+to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using
+Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that the presence of
+the motif or a higher motif score is associated with higher signal in a specific sample.
 
 The file ``gimme.maelstrom.report.html`` contains a graphical summary of this file that can be opened in your web browser.
 
@@ -359,7 +359,7 @@ This will show a heatmap like this:
 .. image:: images/heatmap.png
 
 We see that the expected motifs for different cell types are identified. GATA/TAL1 for Erythrocytes, CEBP for monocytes, LEF/TCF for T cells (ie. Wnt signaling), SPIB and PAX5 for B cells and so on. 
-Keep in mind that this shows only the most relevant motifs (-log10 p-value cutoff of 6), there are more relevant motifs. 
+Keep in mind that this shows only the most relevant motifs (z-score threshold of 6); there are more relevant motifs.
 This example was run only on 1,000 variable enhancer. A file with more regions, ``hg19.blood.most_variable.10k.txt`` for this example, will usually yield better results.
 
 The Jupyter notebook example `maelstrom.ipynb <https://github.com/vanheeringen-lab/gimmemotifs/blob/master/docs/notebooks/maelstrom.ipynb>`_ shows a more extensive example on how to work with maelstrom results in Python.
diff --git a/readthedocs.yml b/readthedocs.yml
index f9d483db..e458c84e 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -1,15 +1,18 @@
 # Required
 version: 2
 
-# Build documentation in the docs/ directory with Sphinx
+conda:
+  environment: .rtd-environment.yml
+
+# Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py
 
-conda:
-  environment: .rtd-environment.yml
+build:
+  image: latest
  
 python:
   version: 3.7
   install:
-    - method: pip
+    - method: setuptools
       path: .

From c448ee8c4fd20bd98fe6461046304957d18a975b Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 11:51:29 +0200
Subject: [PATCH 64/85] trd

---
 .rtd-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.rtd-environment.yml b/.rtd-environment.yml
index 5fafc7d3..2f23c0d2 100644
--- a/.rtd-environment.yml
+++ b/.rtd-environment.yml
@@ -29,3 +29,4 @@ dependencies:
   - tqdm >=4.27.0
   - xdg
   - xgboost >=0.71
+  - sphinx_bootstrap_theme

From bb1f82a803967e8b174dc0c472dac7d1bf4cfe77 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 13:46:16 +0200
Subject: [PATCH 65/85] rtd

---
 .rtd-environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.rtd-environment.yml b/.rtd-environment.yml
index 2f23c0d2..cbcad0d7 100644
--- a/.rtd-environment.yml
+++ b/.rtd-environment.yml
@@ -30,3 +30,4 @@ dependencies:
   - xdg
   - xgboost >=0.71
   - sphinx_bootstrap_theme
+  - numpydoc

From 5c49f59953050664ba4bddc3b8065787bf707fa3 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 17:01:20 +0200
Subject: [PATCH 66/85] move format_factors to motif

---
 gimmemotifs/maelstrom.py |  7 ++--
 gimmemotifs/motif.py     | 78 ++++++++++++++++++++++++++++++++++++++
 gimmemotifs/report.py    | 81 +---------------------------------------
 3 files changed, 82 insertions(+), 84 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index fc5f936b..d5847214 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -41,7 +41,7 @@
 from gimmemotifs.moap import moap, Moap, scan_to_table
 from gimmemotifs.rank import rankagg
 from gimmemotifs.motif import read_motifs
-from gimmemotifs.report import maelstrom_html_report, format_factors
+from gimmemotifs.report import maelstrom_html_report
 from gimmemotifs.utils import join_max, pfmfile_location
 
 from multiprocessing import Pool
@@ -564,7 +564,7 @@ def __init__(self, outdir):
 
         # Load motifs
         fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m"))
-        print(fnames)
+        
         if len(fnames) == 0:
             fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
         if len(fnames) > 0:
@@ -723,8 +723,7 @@ def plot_heatmap(
 
         if name:
             m["factors"] = [
-                format_factors(
-                    self.motifs[n],
+                self.motifs[n].format_factors(
                     max_length=max_number_factors,
                     html=False,
                     include_indirect=indirect,
diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index a0f1608c..e4eb7f6d 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -10,6 +10,7 @@
 import sys
 import random
 from math import log, sqrt
+from collections import Counter
 from warnings import warn
 
 from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME
@@ -1303,6 +1304,83 @@ def wiggle_pwm(self):
 
         return self.wiggled_pwm
 
+    def format_factors(self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"):
+        if html:
+            fmt_d = "<span style='color:black'>{}</span>"
+            fmt_i = "<span style='color:#666666'>{}</span>"
+        else:
+            fmt_d = fmt_i = "{}"
+
+        if hasattr(self, "factor_info"):
+            fcount = Counter([x.upper() for x in self.factor_info["Factor"]])
+        else:
+            fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME])
+
+        direct = sorted(
+            list(
+                set(
+                    [x.upper() if x != "de novo" else x for x in self.factors[DIRECT_NAME]]
+                )
+            ),
+            key=lambda x: fcount[x],
+            reverse=True,
+        )
+
+        indirect = []
+        if include_indirect:
+            indirect = sorted(
+                list(
+                    set(
+                        [
+                            x.upper()
+                            for x in self.factors[INDIRECT_NAME]
+                            if x.upper() not in direct
+                        ]
+                    )
+                ),
+                key=lambda x: fcount[x],
+                reverse=True,
+            )
+
+        if len(direct) > max_length:
+            show_factors = direct[:max_length]
+        else:
+            show_factors = direct[:]
+            for f in sorted(indirect, key=lambda x: fcount[x], reverse=True):
+                if f not in show_factors:
+                    show_factors.append(f)
+                if len(show_factors) >= max_length:
+                    break
+
+        if "de novo" in show_factors:
+            show_factors = ["de novo"] + sorted(
+                [f for f in show_factors if f != "de novo"],
+                key=lambda x: fcount[x],
+                reverse=True,
+            )
+        else:
+            show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True)
+
+        factor_str = ",".join(
+            [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]
+        )
+
+        if len(direct + indirect) > max_length:
+            factor_str += extra_str
+
+        if html:
+            tooltip = ""
+            if len(direct) > 0:
+                tooltip += "direct: " + ",".join(sorted(direct))
+            if len(indirect) > 0:
+                if tooltip != "":
+                    tooltip += "&#10;"
+                tooltip += "predicted: " + ",".join(sorted(indirect))
+
+            factor_str = '<div title="' + tooltip + '">' + factor_str + "</div>"
+
+        return factor_str
+
 
 def default_motifs():
     """Return list of Motif instances from default motif database."""
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index dd65aefd..aa0b3a9a 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -11,7 +11,6 @@
 import re
 import shutil
 import logging
-from collections import Counter
 
 import jinja2
 import numpy as np
@@ -845,84 +844,6 @@ def create_denovo_motif_report(
     )
 
 
-def format_factors(motif, max_length=5, html=True, include_indirect=True, extra_str=", (...)"):
-    if html:
-        fmt_d = "<span style='color:black'>{}</span>"
-        fmt_i = "<span style='color:#666666'>{}</span>"
-    else:
-        fmt_d = fmt_i = "{}"
-
-    if hasattr(motif, "factor_info"):
-        fcount = Counter([x.upper() for x in motif.factor_info["Factor"]])
-    else:
-        fcount = Counter(motif.factors[DIRECT_NAME] + motif.factors[INDIRECT_NAME])
-
-    direct = sorted(
-        list(
-            set(
-                [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]]
-            )
-        ),
-        key=lambda x: fcount[x],
-        reverse=True,
-    )
-
-    indirect = []
-    if include_indirect:
-        indirect = sorted(
-            list(
-                set(
-                    [
-                        x.upper()
-                        for x in motif.factors[INDIRECT_NAME]
-                        if x.upper() not in direct
-                    ]
-                )
-            ),
-            key=lambda x: fcount[x],
-            reverse=True,
-        )
-
-    if len(direct) > max_length:
-        show_factors = direct[:max_length]
-    else:
-        show_factors = direct[:]
-        for f in sorted(indirect, key=lambda x: fcount[x], reverse=True):
-            if f not in show_factors:
-                show_factors.append(f)
-            if len(show_factors) >= max_length:
-                break
-
-    if "de novo" in show_factors:
-        show_factors = ["de novo"] + sorted(
-            [f for f in show_factors if f != "de novo"],
-            key=lambda x: fcount[x],
-            reverse=True,
-        )
-    else:
-        show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True)
-
-    factor_str = ",".join(
-        [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors]
-    )
-
-    if len(direct + indirect) > max_length:
-        factor_str += extra_str
-
-    if html:
-        tooltip = ""
-        if len(direct) > 0:
-            tooltip += "direct: " + ",".join(sorted(direct))
-        if len(indirect) > 0:
-            if tooltip != "":
-                tooltip += "&#10;"
-            tooltip += "predicted: " + ",".join(sorted(indirect))
-
-        factor_str = '<div title="' + tooltip + '">' + factor_str + "</div>"
-
-    return factor_str
-
-
 def motif_to_factor_series(series, pfmfile=None, motifs=None):
     if motifs is None:
         motifs = read_motifs(pfmfile, as_dict=True)
@@ -932,7 +853,7 @@ def motif_to_factor_series(series, pfmfile=None, motifs=None):
     else:
         index = series.index
 
-    factors = [format_factors(motifs[motif]) for motif in series]
+    factors = [motifs[motif].format_factors(html=True) for motif in series]
     return pd.Series(data=factors, index=index)
 
 

From 951459fa2689084d46ebfdfb35e98bf4746a3b45 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 17:02:12 +0200
Subject: [PATCH 67/85] fix slow threshold determination

---
 gimmemotifs/scanner.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index bf4c2170..d2726993 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -931,8 +931,11 @@ def best_match(self, seqs, scan_rc=True, zscore=False, gc=False):
             yield [m[0] for m in matches]
 
     def get_seq_bin(self, seq):
-        useq = seq.upper()
-        gc = round((useq.count("G") + useq.count("C")) / len(useq), 2)
+        if len(str(seq)) == 0:
+            gc = 0
+        else:
+            useq = seq.upper()
+            gc = round((useq.count("G") + useq.count("C")) / len(useq), 2)
         if gc == 0:
             gc = 0.01
         for b_start, b_end in self.gc_bins:
@@ -1011,29 +1014,23 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
                 columns=_threshold.columns,
             )
 
-        min_frac = min(gc_bin_count.values())
+        nseqs = int(20000 / np.sum(list(gc_bin_count.values())))
         t = {}
         maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns)
         # We do this in a loop as the DataFrame will get too big to fit in memory
         # when the difference between the number of sequences per gc_bin is very
         # high.
-        for motif in _threshold.columns:
-            dfs = [
-                _threshold.loc[gc_bin, motif].sample(
-                    int(count / min_frac * 1000), replace=True, random_state=42
-                )
-                for gc_bin, count in gc_bin_count.items()
-            ]
-
-            fpr_df = pd.concat(dfs)
-            val = fpr_df.quantile(0.99, interpolation="higher")
+        _threshold = _threshold.reset_index()
+        idx = np.hstack([_threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(nseqs * count, re
+place=True, random_state=42).index.values for gc_bin, count in gc_bin_count.items()])
+        for motif in _threshold.columns[1:]:
+            val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher")
             if val < maxt.loc[motif]:
                 t[motif] = val
             else:
                 t[motif] = None
-
         return t
-
+        
     def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):
         scan_func = partial(
             scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc

From 227941e078bb904e38ee94ef5a5861d6eb166fa5 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 17:04:31 +0200
Subject: [PATCH 68/85] style

---
 gimmemotifs/conversion.py | 103 ++++++++++++++++++++++++++++++++++++++
 gimmemotifs/maelstrom.py  |   2 +-
 gimmemotifs/motif.py      |   9 +++-
 gimmemotifs/scanner.py    |  12 +++--
 4 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 gimmemotifs/conversion.py

diff --git a/gimmemotifs/conversion.py b/gimmemotifs/conversion.py
new file mode 100644
index 00000000..ed2a1487
--- /dev/null
+++ b/gimmemotifs/conversion.py
@@ -0,0 +1,103 @@
+# import mygene
+import pandas as pd
+import pybedtools
+from genomepy import Genome
+import sys
+from gimmemotifs.fasta import Fasta
+
+# mg = mygene.MyGeneInfo()
+
+# xli = ["gata3"]
+
+# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all")
+
+# for hit in out:
+#     print(hit)
+# #     if "genomic_pos" in hit:
+# #         print("{}:{}-{}\t{}".format(
+# #             hit["genomic_pos"]["chr"],
+# #             hit["genomic_pos"]["start"],
+# #             hit["genomic_pos"]["end"],
+# #             hit["query"],
+# #             ))
+
+# sys.exit()
+from functools import singledispatch
+
+
+@singledispatch
+def scan(obj):
+    # default implementation
+    raise NotImplementedError(f"Not implemented for {type(obj)}")
+
+
+@scan.register(pd.DataFrame)
+def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"):
+    if not set(columns).issubset(df.columns):
+        raise ValueError(f"Expected columns {columns}")
+
+    if len(columns) == 3:
+        # Assume this is chromosome start, end
+        g = Genome(genome)
+        seqs = list(
+            (
+                df[columns[0]]
+                + ":"
+                + df[columns[1]].astype(str)
+                + "-"
+                + df[columns[2]].astype(str)
+            ).values
+        )
+        return g.track2fasta(seqs)
+    elif len(columns) == 1:
+        # Assume this is some kind of gene_id
+        return df[columns[0]].values
+
+
+# @scan.register(pybedtools.BedTool)
+# @profile
+def _scan_bedtool(bed, genome="hg38"):
+    g = Genome(genome)
+    intervals = [g[f.chrom][f.start : f.stop] for f in bed]
+    return intervals
+
+
+# @profile
+def _scan_bedtool2(bed, genome="hg38"):
+    g = Genome(genome)
+    return Fasta(bed.sequence(fi=g.filename).seqfn).seqs
+
+
+import requests
+
+rest_url = "https://rest.ensembl.org/info/species"
+r = requests.get(rest_url, headers={"Content-Type": "application/json"})
+
+if not r.ok:
+    r.raise_for_status()
+
+json = r.json()
+
+print(json)
+
+
+# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822,
+# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]})
+# b = pybedtools.BedTool("5k.bed")
+# # for f in b:
+# #     print(f)
+# #     break
+# seqs = _scan_bedtool(b, genome="Spur_3.1")
+# seqs = _scan_bedtool2(b, genome="Spur_3.1")
+
+# g = Genome("hg19")
+# print(g["chr1"][1000000:1000100])
+
+# FASTA file
+# BED file
+# region file
+# Gene file
+#   - promoter (all species)
+#   - closest accessible region (human)
+#   - sum / mean / max of regions within distance of promoter
+#   - sum / mean / max of regions within weighted distance of promoter
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index d5847214..d764a1c9 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -564,7 +564,7 @@ def __init__(self, outdir):
 
         # Load motifs
         fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m"))
-        
+
         if len(fnames) == 0:
             fnames = glob.glob(os.path.join(outdir, "*.p[fw]m"))
         if len(fnames) > 0:
diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py
index e4eb7f6d..253bc410 100644
--- a/gimmemotifs/motif.py
+++ b/gimmemotifs/motif.py
@@ -1304,7 +1304,9 @@ def wiggle_pwm(self):
 
         return self.wiggled_pwm
 
-    def format_factors(self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"):
+    def format_factors(
+        self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"
+    ):
         if html:
             fmt_d = "<span style='color:black'>{}</span>"
             fmt_i = "<span style='color:#666666'>{}</span>"
@@ -1319,7 +1321,10 @@ def format_factors(self, max_length=5, html=False, include_indirect=True, extra_
         direct = sorted(
             list(
                 set(
-                    [x.upper() if x != "de novo" else x for x in self.factors[DIRECT_NAME]]
+                    [
+                        x.upper() if x != "de novo" else x
+                        for x in self.factors[DIRECT_NAME]
+                    ]
                 )
             ),
             key=lambda x: fcount[x],
diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index d2726993..cfccc415 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -1021,8 +1021,14 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
         # when the difference between the number of sequences per gc_bin is very
         # high.
         _threshold = _threshold.reset_index()
-        idx = np.hstack([_threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(nseqs * count, re
-place=True, random_state=42).index.values for gc_bin, count in gc_bin_count.items()])
+        idx = np.hstack(
+            [
+                _threshold[_threshold[_threshold.columns[0]] == gc_bin]
+                .sample(nseqs * count, replace=True, random_state=42)
+                .index.values
+                for gc_bin, count in gc_bin_count.items()
+            ]
+        )
         for motif in _threshold.columns[1:]:
             val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher")
             if val < maxt.loc[motif]:
@@ -1030,7 +1036,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
             else:
                 t[motif] = None
         return t
-        
+
     def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc):
         scan_func = partial(
             scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc

From bd321e633e72111b11548ec2da15b9f66b54ab13 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 19:24:16 +0200
Subject: [PATCH 69/85] typo

---
 docs/tutorials.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/tutorials.rst b/docs/tutorials.rst
index 1439dcac..9aae5889 100644
--- a/docs/tutorials.rst
+++ b/docs/tutorials.rst
@@ -320,7 +320,7 @@ There is also a larger file, that contains more regions ``hg19.blood.most_variab
 
     $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.blood.1k.out
 
-There output directory contains several files:
+The output directory contains several files:
 
 ::
    
@@ -342,7 +342,6 @@ The file ``gimme.maelstrom.report.html`` contains a graphical summary of this fi
 
 You can sort on the different columns by clicking on them.
 
-
 The following Python snippet will create a heatmap of the results.
 
 .. code-block:: python

From 2b4d5e6e58f64e7091cd00bdc5922f9ca801cba9 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Thu, 23 Jul 2020 19:27:37 +0200
Subject: [PATCH 70/85] flakefix

---
 gimmemotifs/conversion.py | 103 --------------------------------------
 gimmemotifs/maelstrom.py  |   2 +-
 gimmemotifs/report.py     |   2 +-
 3 files changed, 2 insertions(+), 105 deletions(-)
 delete mode 100644 gimmemotifs/conversion.py

diff --git a/gimmemotifs/conversion.py b/gimmemotifs/conversion.py
deleted file mode 100644
index ed2a1487..00000000
--- a/gimmemotifs/conversion.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# import mygene
-import pandas as pd
-import pybedtools
-from genomepy import Genome
-import sys
-from gimmemotifs.fasta import Fasta
-
-# mg = mygene.MyGeneInfo()
-
-# xli = ["gata3"]
-
-# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all")
-
-# for hit in out:
-#     print(hit)
-# #     if "genomic_pos" in hit:
-# #         print("{}:{}-{}\t{}".format(
-# #             hit["genomic_pos"]["chr"],
-# #             hit["genomic_pos"]["start"],
-# #             hit["genomic_pos"]["end"],
-# #             hit["query"],
-# #             ))
-
-# sys.exit()
-from functools import singledispatch
-
-
-@singledispatch
-def scan(obj):
-    # default implementation
-    raise NotImplementedError(f"Not implemented for {type(obj)}")
-
-
-@scan.register(pd.DataFrame)
-def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"):
-    if not set(columns).issubset(df.columns):
-        raise ValueError(f"Expected columns {columns}")
-
-    if len(columns) == 3:
-        # Assume this is chromosome start, end
-        g = Genome(genome)
-        seqs = list(
-            (
-                df[columns[0]]
-                + ":"
-                + df[columns[1]].astype(str)
-                + "-"
-                + df[columns[2]].astype(str)
-            ).values
-        )
-        return g.track2fasta(seqs)
-    elif len(columns) == 1:
-        # Assume this is some kind of gene_id
-        return df[columns[0]].values
-
-
-# @scan.register(pybedtools.BedTool)
-# @profile
-def _scan_bedtool(bed, genome="hg38"):
-    g = Genome(genome)
-    intervals = [g[f.chrom][f.start : f.stop] for f in bed]
-    return intervals
-
-
-# @profile
-def _scan_bedtool2(bed, genome="hg38"):
-    g = Genome(genome)
-    return Fasta(bed.sequence(fi=g.filename).seqfn).seqs
-
-
-import requests
-
-rest_url = "https://rest.ensembl.org/info/species"
-r = requests.get(rest_url, headers={"Content-Type": "application/json"})
-
-if not r.ok:
-    r.raise_for_status()
-
-json = r.json()
-
-print(json)
-
-
-# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822,
-# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]})
-# b = pybedtools.BedTool("5k.bed")
-# # for f in b:
-# #     print(f)
-# #     break
-# seqs = _scan_bedtool(b, genome="Spur_3.1")
-# seqs = _scan_bedtool2(b, genome="Spur_3.1")
-
-# g = Genome("hg19")
-# print(g["chr1"][1000000:1000100])
-
-# FASTA file
-# BED file
-# region file
-# Gene file
-#   - promoter (all species)
-#   - closest accessible region (human)
-#   - sum / mean / max of regions within distance of promoter
-#   - sum / mean / max of regions within weighted distance of promoter
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index d764a1c9..d214ffa8 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -661,7 +661,7 @@ def plot_heatmap(
 
         aspect : int, optional
             Aspect ratio for tweaking the plot.
-  
+
         cmap : str, optional
             Color paletter to use, RdBu_r by default.
 
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index aa0b3a9a..bb5caa6f 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -30,7 +30,7 @@
 from gimmemotifs.comparison import MotifComparer
 from gimmemotifs.fasta import Fasta
 from gimmemotifs.motif import read_motifs
-from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME
+from gimmemotifs.config import MotifConfig
 from gimmemotifs.plot import roc_plot
 from gimmemotifs.stats import calc_stats, add_star, write_stats
 from gimmemotifs import __version__

From 21075494c1ecaaf628526a0f0f71b25cb0da949a Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Fri, 24 Jul 2020 08:08:22 +0200
Subject: [PATCH 71/85] update report

---
 gimmemotifs/maelstrom.py | 25 ++++++++++++-------------
 gimmemotifs/report.py    | 14 ++++++++------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index d214ffa8..95f68c72 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -197,6 +197,10 @@ def df_rank_aggregation(df, dfs, exps, method="int_stouffer"):
     if df.shape[1] != 1:
         df_p = df_p[df.columns]
 
+    if method == "int_stouffer":
+        df_p.columns = ["z-score " + c for c in df_p.columns]
+    else:
+        df_p.columns = ["activity " + c for c in df_p.columns]
     return df_p
 
 
@@ -504,19 +508,16 @@ def run_maelstrom(
         logger.info("Rank aggregation")
         df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)
 
+        # Add percentage of input sequences with motif
+        df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
+
         if df.shape[1] > 1:
             # Add correlation between motif score and signal
             logger.info("Correlation")
-            cols = df_p.columns
-            for col in cols[::-1]:
-                df_p.insert(0, f"correlation {col}", 0)
+            for col in df.columns:
+                df_p[f"corr {col}"] = 0
                 for motif in df_p.index:
-                    df_p.loc[motif, f"correlation {col}"] = pearsonr(
-                        df[col], scores[motif]
-                    )[0]
-
-        # Add percentage of input sequences with motif
-        df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100)
+                    df_p.loc[motif, f"corr {col}"] = pearsonr(df[col], scores[motif])[0]
 
         df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t")
     # df_p = df_p.join(m2f)
@@ -585,15 +586,13 @@ def __init__(self, outdir):
         self.result = pd.read_table(
             os.path.join(outdir, "final.out.txt"), comment="#", index_col=0
         )
-        self.correlation = self.result.loc[
-            :, self.result.columns.str.contains("correlation")
-        ]
+        self.correlation = self.result.loc[:, self.result.columns.str.contains("corr")]
         self.percent_match = self.result.loc[
             :, self.result.columns.str.contains("% with motif")
         ]
         self.result = self.result.loc[
             :,
-            ~self.result.columns.str.contains("correlation")
+            ~self.result.columns.str.contains("corr")
             & ~self.result.columns.str.contains("% with motif"),
         ]
 
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index bb5caa6f..381c887a 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -403,6 +403,7 @@ def _circle(
                 self.data.loc[subslice].index,
                 self.data.loc[subslice].select_dtypes(exclude=["object"]).columns,
             ]
+        idx = self._current_index(subslice)
 
         self.circle_styles = self.circle_styles or []
         circle_id = len(self.circle_styles) + 1
@@ -442,8 +443,8 @@ def _circle(
                 if vmax is None
                 else vmax * 1.01
             )
-            text = self.display_data.loc[subslice].astype(str) if show_text else ""
-            self.display_data.loc[subslice] = (
+            text = self.display_data.iloc[idx].astype(str) if show_text else ""
+            self.display_data.iloc[idx] = (
                 f"<div class='circle{circle_id} color{circle_id}_"
                 + (self.data.loc[subslice] / (vmax / len(palette)))
                 .astype(int)
@@ -453,8 +454,8 @@ def _circle(
                 + "</div>"
             )
         else:
-            text = self.display_data.loc[subslice].astype(str) if show_text else ""
-            self.display_data.loc[subslice] = (
+            text = self.display_data.iloc[idx].astype(str) if show_text else ""
+            self.display_data.iloc[idx] = (
                 f"<div class='circle{circle_id} color{circle_id}_0'>" + text + "</div>"
             )
 
@@ -889,10 +890,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
 
     # Columns with maelstrom rank aggregation value
     value_cols = df.columns[
-        ~df.columns.str.contains("correlation") & ~df.columns.isin(["% with motif"])
+        ~df.columns.str.contains("corr") & ~df.columns.isin(["% with motif"])
     ]
     # Columns with correlation values
-    corr_cols = df.columns[df.columns.str.contains("correlation")]
+    corr_cols = df.columns[df.columns.str.contains("corr")]
 
     df = df[np.any(abs(df[value_cols]) >= threshold, 1)]
 
@@ -921,6 +922,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
         .set_table_attributes('class="sortable-theme-slick" data-sortable')
         .align(subset=list(value_cols), location="center")
         .set_font("Nunito Sans")
+        .wrap()
         .rename(columns=rename_columns)
     )
 

From 55550992490ca8a6a0a46a27316b74f252371c1c Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 27 Jul 2020 11:12:05 +0200
Subject: [PATCH 72/85] fix '% with motif' column

---
 gimmemotifs/report.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 381c887a..87bc8398 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -922,7 +922,6 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
         .set_table_attributes('class="sortable-theme-slick" data-sortable')
         .align(subset=list(value_cols), location="center")
         .set_font("Nunito Sans")
-        .wrap()
         .rename(columns=rename_columns)
     )
 
@@ -950,7 +949,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
             .to_precision_str(subset=["% with motif"])
         )
 
-    df_styled = df_styled.render()
+    df_styled = df_styled.wrap().render()
 
     with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
         f.write(df_styled)

From b8cb1059c9e3178a96bf377b911d8a7c3a9ee002 Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Fri, 21 Aug 2020 11:57:46 +0200
Subject: [PATCH 73/85] fix another issue with numeric chrom names

---
 scripts/combine_peaks | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/combine_peaks b/scripts/combine_peaks
index 155d6e83..3d7c0e49 100644
--- a/scripts/combine_peaks
+++ b/scripts/combine_peaks
@@ -46,7 +46,7 @@ def read_peak_file_to_df(fname):
             "qval",
             "peak",
         ]
-        df = pd.read_table(fname, names=header)
+        df = pd.read_table(fname, names=header, dtype={"chrom": "str"})
         df["chrom"] = df["chrom"].astype(str)
 
         # get the summit
@@ -57,7 +57,7 @@ def read_peak_file_to_df(fname):
         df["value"] = df["qval"]
         df = df[summit_header]
     elif ftype == "bed":
-        df = pd.read_table(fname, names=summit_header)
+        df = pd.read_table(fname, names=summit_header, dtype={"chrom": "str"})
         if ((df["end"] - df["start"]) != 1).sum() != 0:
             raise ValueError(f"{fname} does not contain summits.")
     else:

From baf48207a7126c79d98adaaad78865af9a637ee0 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 7 Sep 2020 19:40:58 +0200
Subject: [PATCH 74/85] initial support for multispecies maelstrom

---
 gimmemotifs/utils.py | 61 ++++++++++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index 27d7bed3..23747a88 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -23,7 +23,7 @@
 from scipy import special
 import numpy as np
 import pybedtools
-from genomepy import Genome
+from genomepy import Genome, list_installed_genomes
 
 
 # gimme imports
@@ -470,7 +470,7 @@ def get_seqs_type(seqs):
         - region file
         - BED file
     """
-    region_p = re.compile(r"^(.+):(\d+)-(\d+)$")
+    region_p = re.compile(r"^([^\s:]+\@)?(.+):(\d+)-(\d+)$")
     if isinstance(seqs, Fasta):
         return "fasta"
     elif isinstance(seqs, list) or isinstance(seqs, np.ndarray):
@@ -496,24 +496,51 @@ def get_seqs_type(seqs):
         raise ValueError("unknown type {}".format(type(seqs).__name__))
 
 
-def as_fasta(seqs, genome=None):
-    ftype = get_seqs_type(seqs)
+def as_fasta(input_seqs, genome=None):
+    ftype = get_seqs_type(input_seqs)
     if ftype == "fasta":
-        return seqs
+        return input_seqs
     elif ftype == "fastafile":
-        return Fasta(seqs)
+        return Fasta(input_seqs)
     else:
-        if genome is None:
-            raise ValueError("need genome to convert to FASTA")
-
-        tmpfa = NamedTemporaryFile()
-        if isinstance(genome, str):
-            genome = Genome(genome)
-
-        if isinstance(seqs, np.ndarray):
-            seqs = list(seqs)
-        genome.track2fasta(seqs, tmpfa.name)
-        return Fasta(tmpfa.name)
+        if isinstance(input_seqs, np.ndarray):
+            seqs = list(input_seqs)
+
+        genomic_regions = {}
+        if "@" in input_seqs[0]:
+            available = list_installed_genomes()
+            for seq in input_seqs:
+                genome, region = seq.split("@")
+                if genome not in genomic_regions:
+                    if genome not in available:
+                        raise ValueError(f"genome {genome} is not installed!")
+                    genomic_regions[genome] = []
+                genomic_regions[genome].append(region)
+        else:
+            if genome is None:
+                raise ValueError("need genome to convert to FASTA")
+            genomic_regions[genome] = input_seqs
+
+        tmpfa = NamedTemporaryFile(mode="w")
+        for genome, regions in genomic_regions.items():
+
+            if isinstance(genome, str):
+                genome = Genome(genome)
+
+            tmpfa2 = NamedTemporaryFile()
+            genome.track2fasta(regions, tmpfa2.name)
+
+            fa = Fasta(tmpfa2.name)
+            for name, seq in fa.items():
+                print(f">{genome.name}@{name}\n{fa._format_seq(seq)}", file=tmpfa)
+        tmpfa.flush()
+
+        # Open tempfile and restore original sequence order
+        fa = Fasta(tmpfa.name)
+        seqs = [fa[region] for region in input_seqs]
+        fa.ids = input_seqs[:]
+        fa.seqs = seqs[:]
+        return fa
 
 
 def file_checksum(fname):

From 4385c11794ca1737af02aad3b7f95b5f63249855 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 9 Sep 2020 11:29:38 +0200
Subject: [PATCH 75/85] % with motif per category in maelstrom report

---
 data/templates/sortable/sortable.min.js |  2 +-
 gimmemotifs/maelstrom.py                |  8 +++++++-
 gimmemotifs/report.py                   | 26 +++++++++++++------------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/data/templates/sortable/sortable.min.js b/data/templates/sortable/sortable.min.js
index b968cf01..11d419dd 100644
--- a/data/templates/sortable/sortable.min.js
+++ b/data/templates/sortable/sortable.min.js
@@ -1,2 +1,2 @@
 /*! sortable.js 0.8.0 */
-(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(-?[£$¤]?[\d,.e\-]+%?|inf)$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new 
CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this);
+(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(.+\>)?<?(-?[£$¤]?[\d,.e\-]+%?|inf)(\<.+)?$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new 
CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this);
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index 95f68c72..f257bdf7 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -509,7 +509,13 @@ def run_maelstrom(
         df_p = df_rank_aggregation(df, dfs, exps, method=aggregation)
 
         # Add percentage of input sequences with motif
-        df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
+        if df.shape[1] > 1:
+            df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100
+        else:
+            bla = counts.join(df).groupby(df.columns[0]).mean() * 100
+            bla = bla.T
+            bla = bla.rename(columns={col: f"{col} % with motif" for col in bla.columns})
+            df_p = df_p.join(bla)
 
         if df.shape[1] > 1:
             # Add correlation between motif score and signal
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 87bc8398..7a77d51d 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -890,7 +890,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
 
     # Columns with maelstrom rank aggregation value
     value_cols = df.columns[
-        ~df.columns.str.contains("corr") & ~df.columns.isin(["% with motif"])
+        ~df.columns.str.contains("corr") & ~df.columns.str.contains("% with motif")
     ]
     # Columns with correlation values
     corr_cols = df.columns[df.columns.str.contains("corr")]
@@ -907,8 +907,9 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
     df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
 
     rename_columns = {"factors": FACTOR_TOOLTIP}
-    if "% with motif" in df.columns:
-        df["% with motif"] = df["% with motif"].astype(int)
+    for col in df.columns:
+        if "% with motif" in col:
+            df[col] = df[col].astype(int)
 
     df_styled = (
         ExtraStyler(df)
@@ -938,16 +939,17 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
             )
         )
 
-    if "% with motif" in df.columns:
-        df_styled = (
-            df_styled.add_circle(
-                subset=["% with motif"], cmap="Purples", vmax=100, size=40
+    for col in df.columns:
+        if "% with motif" in col:
+            df_styled = (
+                df_styled.add_circle(
+                    subset=[col], cmap="Purples", vmax=100, size=40
+                )
+                .wrap(subset=[col])
+                .align(subset=[col], location="center")
+                .border(subset=[col], location="left")
+                .to_precision_str(subset=[col])
             )
-            .wrap(subset=["% with motif"])
-            .align(subset=["% with motif"], location="center")
-            .border(subset=["% with motif"], location="left")
-            .to_precision_str(subset=["% with motif"])
-        )
 
     df_styled = df_styled.wrap().render()
 

From 88783d9232fb600217ab8a218359ce19242dd781 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Wed, 9 Sep 2020 17:03:03 +0200
Subject: [PATCH 76/85] * Safe motif names (fixes #135) * Multiple species in
 maelstrom now supported (#141)

---
 gimmemotifs/commands/motifs.py |   4 +-
 gimmemotifs/fasta.py           |   6 +-
 gimmemotifs/maelstrom.py       |   4 +-
 gimmemotifs/report.py          |   8 +-
 gimmemotifs/utils.py           | 228 +++++++++++++++++++++++++++------
 5 files changed, 201 insertions(+), 49 deletions(-)

diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py
index c4489d43..f5b72737 100755
--- a/gimmemotifs/commands/motifs.py
+++ b/gimmemotifs/commands/motifs.py
@@ -7,6 +7,7 @@
 """Command line function 'roc'."""
 from __future__ import print_function
 import os
+import re
 import sys
 import shutil
 import logging
@@ -246,10 +247,11 @@ def motifs(args):
         with NamedTemporaryFile(mode="w") as f:
             print(motif_dict[motif].to_pwm(), file=f)
             f.flush()
+            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
             scan_to_file(
                 sample,
                 f.name,
-                filepath_or_buffer=os.path.join(scan_dir, f"{motif}.matches.bed"),
+                filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"),
                 bed=True,
                 fpr=0.01,
                 genome=args.genome,
diff --git a/gimmemotifs/fasta.py b/gimmemotifs/fasta.py
index 6cc1d5d4..781d1cf0 100644
--- a/gimmemotifs/fasta.py
+++ b/gimmemotifs/fasta.py
@@ -12,7 +12,7 @@
 
 
 class Fasta(object):
-    def __init__(self, fname=None, split_whitespace=False):
+    def __init__(self, fname=None, split_whitespace=False, fdict=None):
         """ Instantiate fasta object. Optional Fasta-formatted file as argument"""
         self.ids = []
         self.seqs = []
@@ -35,6 +35,10 @@ def __init__(self, fname=None, split_whitespace=False):
                     if p.match(sequence):
                         raise IOError("Not a valid FASTA file")
                     self.seqs.append(sequence)
+        elif fdict is not None:
+            for name, seq in fdict.items():
+                self.ids.append(name)
+                self.seqs.append(seq)
 
     def hardmask(self):
         """ Mask all lowercase nucleotides with N's """
diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py
index f257bdf7..fbe46b03 100644
--- a/gimmemotifs/maelstrom.py
+++ b/gimmemotifs/maelstrom.py
@@ -514,7 +514,9 @@ def run_maelstrom(
         else:
             bla = counts.join(df).groupby(df.columns[0]).mean() * 100
             bla = bla.T
-            bla = bla.rename(columns={col: f"{col} % with motif" for col in bla.columns})
+            bla = bla.rename(
+                columns={col: f"{col} % with motif" for col in bla.columns}
+            )
             df_p = df_p.join(bla)
 
         if df.shape[1] > 1:
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index 7a77d51d..d0cdcfa8 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -871,7 +871,7 @@ def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="l
     for motif in series:
         if motif not in motifs:
             raise ValueError(f"Motif {motif} does not occur in motif database")
-        fname = subdir + "/{}.png".format(re.sub("[()/]", "_", motif))
+        fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif))
         if not os.path.exists(fname):
             motifs[motif].plot_logo(fname=os.path.join(outdir, fname))
         img_series.append(fname)
@@ -942,9 +942,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
     for col in df.columns:
         if "% with motif" in col:
             df_styled = (
-                df_styled.add_circle(
-                    subset=[col], cmap="Purples", vmax=100, size=40
-                )
+                df_styled.add_circle(subset=[col], cmap="Purples", vmax=100, size=40)
                 .wrap(subset=[col])
                 .align(subset=[col], location="center")
                 .border(subset=[col], location="left")
@@ -994,7 +992,7 @@ def roc_html_report(
     if link_matches:
         df["# matches"] = (
             "<a href=motif_scan_results/"
-            + df.index.to_series()
+            + df.index.to_series().str.replace(r"[^a-zA-Z0-9\-]+", "_")
             + ".matches.bed>"
             + df["# matches"].astype(str)
             + "</a>"
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index 23747a88..fd2ca25b 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -15,15 +15,19 @@
 import random
 import tempfile
 import requests
+from io import TextIOWrapper
+from functools import singledispatch
 from subprocess import Popen
 from tempfile import NamedTemporaryFile
 from shutil import copyfile
 
 # External imports
+import pyfaidx
 from scipy import special
 import numpy as np
 import pybedtools
-from genomepy import Genome, list_installed_genomes
+from genomepy import Genome
+from Bio.SeqIO.FastaIO import SimpleFastaParser
 
 
 # gimme imports
@@ -496,51 +500,193 @@ def get_seqs_type(seqs):
         raise ValueError("unknown type {}".format(type(seqs).__name__))
 
 
-def as_fasta(input_seqs, genome=None):
-    ftype = get_seqs_type(input_seqs)
-    if ftype == "fasta":
-        return input_seqs
-    elif ftype == "fastafile":
-        return Fasta(input_seqs)
-    else:
-        if isinstance(input_seqs, np.ndarray):
-            seqs = list(input_seqs)
-
-        genomic_regions = {}
-        if "@" in input_seqs[0]:
-            available = list_installed_genomes()
-            for seq in input_seqs:
-                genome, region = seq.split("@")
-                if genome not in genomic_regions:
-                    if genome not in available:
-                        raise ValueError(f"genome {genome} is not installed!")
-                    genomic_regions[genome] = []
-                genomic_regions[genome].append(region)
-        else:
-            if genome is None:
-                raise ValueError("need genome to convert to FASTA")
-            genomic_regions[genome] = input_seqs
+# Regular expression to check for a region in genome@chrom:start-end format
+region_p = re.compile(r"^[^@]+@([^\s]+):(\d+)-(\d+)$")
 
-        tmpfa = NamedTemporaryFile(mode="w")
-        for genome, regions in genomic_regions.items():
 
-            if isinstance(genome, str):
-                genome = Genome(genome)
+def _check_minsize(fa, minsize):
+    """
+    Raise ValueError if there is any sequence that is shorter than minsize.
+    If minsize is None the size will not be checked.
+    """
+    if minsize is None:
+        return fa
 
-            tmpfa2 = NamedTemporaryFile()
-            genome.track2fasta(regions, tmpfa2.name)
+    for name, seq in fa.items():
+        if len(seq) < minsize:
+            raise ValueError(f"sequence {name} is shorter than {minsize}")
 
-            fa = Fasta(tmpfa2.name)
-            for name, seq in fa.items():
-                print(f">{genome.name}@{name}\n{fa._format_seq(seq)}", file=tmpfa)
-        tmpfa.flush()
+    return fa
 
-        # Open tempfile and restore original sequence order
-        fa = Fasta(tmpfa.name)
-        seqs = [fa[region] for region in input_seqs]
-        fa.ids = input_seqs[:]
-        fa.seqs = seqs[:]
-        return fa
+
+def _genomepy_convert(to_convert, genome, minsize=None):
+    """
+    Convert a variety of inputs using track2fasta().
+    """
+    if genome is None:
+        raise ValueError("input file is not a FASTA file, need a genome!")
+
+    g = Genome(genome)
+    tmpfile = NamedTemporaryFile()
+    g.track2fasta(to_convert, tmpfile.name)
+
+    fa = as_seqdict(tmpfile.name)
+    return _check_minsize(fa, minsize)
+
+
+def _as_seqdict_genome_regions(regions, minsize=None):
+    """
+    Accepts list of regions where the genome is encoded in the region,
+    using the genome@chrom:start-end format.
+    """
+    genomic_regions = {}
+    for region in regions:
+        genome, region = region.split("@")
+        if genome not in genomic_regions:
+            Genome(genome)
+            genomic_regions[genome] = []
+        genomic_regions[genome].append(region)
+
+    tmpfa = NamedTemporaryFile(mode="w", delete=False)
+    for genome, g_regions in genomic_regions.items():
+        g = Genome(genome)
+
+        fa = g.track2fasta(g_regions)
+
+        for seq in fa:
+            seq.name = f"{genome}@{seq.name}"
+            print(seq.__repr__(), file=tmpfa)
+
+    tmpfa.flush()
+
+    # Open tempfile and restore original sequence order
+    fa = as_seqdict(tmpfa.name)
+    fa = {region: fa[region] for region in regions}
+    return _check_minsize(fa, minsize)
+
+
+@singledispatch
+def as_seqdict(to_convert, genome=None, minsize=None):
+    """
+    Convert input to a dictionary with name as key and sequence as value.
+
+    If the input contains genomic coordinates, the genome needs to be
+    specified. If minsize is specified all sequences will be checked if they
+    are not shorter than minsize. If regions (or a region file) are used as
+    the input, the genome can optionally be specified in the region using the
+    following format: genome@chrom:start-end.
+
+    Current supported input types include:
+    * FASTA, BED and region files.
+    * List or numpy.ndarray of regions.
+    * pyfaidx.Fasta object.
+    * pybedtools.BedTool object.
+
+    Parameters
+    ----------
+    to_convert : list, str, pyfaidx.Fasta or pybedtools.BedTool
+        Input to convert to FASTA-like dictionary
+
+    genome : str, optional
+        Genomepy genome name.
+
+    minsize : int or None, optional
+        If specified, check if all sequences have at least size minsize.
+
+    Returns
+    -------
+        dict with sequence names as key and sequences as value.
+    """
+    raise NotImplementedError(f"Not implement for {type(to_convert)}")
+
+
+@as_seqdict.register(list)
+def _as_seqdict_list(to_convert, genome=None, minsize=None):
+    """
+    Accepts list of regions as input.
+    """
+    if region_p.match(to_convert[0]):
+        return _as_seqdict_genome_regions(to_convert, minsize)
+
+    return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(TextIOWrapper)
+def _as_seqdict_file_object(to_convert, genome=None, minsize=None):
+    """
+    Accepts file object as input, should be a FASTA file.
+    """
+    fa = {x: y for x, y in SimpleFastaParser(to_convert)}
+    return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(str)
+def _as_seqdict_filename(to_convert, genome=None, minsize=None):
+    """
+    Accepts filename as input.
+    """
+    if not os.path.exists(to_convert):
+        raise ValueError("Assuming filename, but it does not exist")
+
+    f = open(to_convert)
+    fa = as_seqdict(f)
+
+    if any(fa):
+        return _check_minsize(fa, minsize)
+
+    with open(to_convert) as f:
+        line = ""
+        while True:
+            line = f.readline()
+            if line == "":
+                break
+            if not line.startswith("#"):
+                break
+
+        if line == "":
+            raise IOError(f"empty file {to_convert}")
+
+        if region_p.match(line.strip()):
+            regions = [l.strip() for l in [line] + f.readlines()]
+            return _as_seqdict_genome_regions(regions, minsize=None)
+
+    # Biopython parser resulted in empty dict
+    # Assuming it's a BED or region file
+    return _genomepy_convert(to_convert, genome, minsize)
+
+
+@as_seqdict.register(pyfaidx.Fasta)
+def _as_seqdict_pyfaidx(to_convert, genome=None, minsize=None):
+    """
+    Accepts pyfaidx.Fasta object as input.
+    """
+    fa = {k: str(v) for k, v in to_convert.items()}
+    return _check_minsize(fa, minsize)
+
+
+@as_seqdict.register(pybedtools.BedTool)
+def _as_seqdict_bedtool(to_convert, genome=None, minsize=None):
+    """
+    Accepts pybedtools.BedTool as input.
+    """
+    return _genomepy_convert(
+        ["{}:{}-{}".format(*f[:3]) for f in to_convert], genome, minsize
+    )
+
+
+@as_seqdict.register(np.ndarray)
+def _as_seqdict_array(to_convert, genome=None, minsize=None):
+    """
+    Accepts numpy.ndarray with regions as input.
+    """
+    return as_seqdict(list(to_convert), genome, minsize)
+
+
+def as_fasta(to_convert, genome=None, minsize=None):
+    if isinstance(to_convert, Fasta):
+        return to_convert
+
+    return Fasta(fdict=as_seqdict(to_convert, genome, minsize))
 
 
 def file_checksum(fname):

From 3c96fda3e6cbd4516065144c0bc033f76c9888e5 Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Fri, 25 Sep 2020 15:33:31 +0200
Subject: [PATCH 77/85] "easier" imports

---
 gimmemotifs/__init__.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py
index 3d63dd7f..c7dafa84 100644
--- a/gimmemotifs/__init__.py
+++ b/gimmemotifs/__init__.py
@@ -52,3 +52,23 @@ def filter(self, record):
 
 __version__ = get_versions()["version"]
 del get_versions
+
+# easier import of gimme (config and cli left out)
+from . import background
+from . import cluster
+from . import comparison
+from . import denovo
+from . import fasta
+from . import maelstrom
+from . import moap
+from . import motif
+from . import plot
+from . import prediction
+from . import rank
+from . import report
+from . import rocmetrics
+from . import scanner
+from . import shutils
+from . import stats
+from . import utils
+from . import validation

From 389549ecddbdb4980742fd1f88944c35a3a45a1a Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Fri, 25 Sep 2020 16:05:11 +0200
Subject: [PATCH 78/85] fix flake8

---
 gimmemotifs/__init__.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py
index c7dafa84..c34b617e 100644
--- a/gimmemotifs/__init__.py
+++ b/gimmemotifs/__init__.py
@@ -54,21 +54,21 @@ def filter(self, record):
 del get_versions
 
 # easier import of gimme (config and cli left out)
-from . import background
-from . import cluster
-from . import comparison
-from . import denovo
-from . import fasta
-from . import maelstrom
-from . import moap
-from . import motif
-from . import plot
-from . import prediction
-from . import rank
-from . import report
-from . import rocmetrics
-from . import scanner
-from . import shutils
-from . import stats
-from . import utils
-from . import validation
+from . import background  # noqa: F401
+from . import cluster     # noqa: F401
+from . import comparison  # noqa: F401
+from . import denovo      # noqa: F401
+from . import fasta       # noqa: F401
+from . import maelstrom   # noqa: F401
+from . import moap        # noqa: F401
+from . import motif       # noqa: F401
+from . import plot        # noqa: F401
+from . import prediction  # noqa: F401
+from . import rank        # noqa: F401
+from . import report      # noqa: F401
+from . import rocmetrics  # noqa: F401
+from . import scanner     # noqa: F401
+from . import shutils     # noqa: F401
+from . import stats       # noqa: F401
+from . import utils       # noqa: F401
+from . import validation  # noqa: F401

From 5367259e75e1436a00690e92577431be29656431 Mon Sep 17 00:00:00 2001
From: Maarten-vd-Sande <maartenvandersande@hotmail.com>
Date: Fri, 25 Sep 2020 16:15:07 +0200
Subject: [PATCH 79/85] ignore indentation for black

---
 gimmemotifs/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py
index c34b617e..fcd514c2 100644
--- a/gimmemotifs/__init__.py
+++ b/gimmemotifs/__init__.py
@@ -53,6 +53,7 @@ def filter(self, record):
 __version__ = get_versions()["version"]
 del get_versions
 
+# fmt: off
 # easier import of gimme (config and cli left out)
 from . import background  # noqa: F401
 from . import cluster     # noqa: F401
@@ -72,3 +73,4 @@ def filter(self, record):
 from . import stats       # noqa: F401
 from . import utils       # noqa: F401
 from . import validation  # noqa: F401
+# fmt: on

From a8181d8ed97bc3e49a067058f2da2e18e7d570b0 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 28 Sep 2020 15:00:17 +0200
Subject: [PATCH 80/85] new black version

---
 gimmemotifs/comparison.py        |   6 +-
 gimmemotifs/conversion.tryout.py | 103 +++++++++++++++++++++++++++++++
 gimmemotifs/plot.py              |   3 +-
 gimmemotifs/rank.py              |  20 +++---
 gimmemotifs/report.py            |  10 ++-
 gimmemotifs/scanner.py           |   3 +-
 gimmemotifs/utils.py             |   9 ++-
 7 files changed, 132 insertions(+), 22 deletions(-)
 create mode 100644 gimmemotifs/conversion.tryout.py

diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py
index f964205d..911cd9d6 100644
--- a/gimmemotifs/comparison.py
+++ b/gimmemotifs/comparison.py
@@ -950,7 +950,11 @@ def select_nonredundant_motifs(
     y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))
 
     X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.4, random_state=2, shuffle=True,
+        X,
+        y,
+        test_size=0.4,
+        random_state=2,
+        shuffle=True,
     )
 
     X_bla = X_train[keep]
diff --git a/gimmemotifs/conversion.tryout.py b/gimmemotifs/conversion.tryout.py
new file mode 100644
index 00000000..ed2a1487
--- /dev/null
+++ b/gimmemotifs/conversion.tryout.py
@@ -0,0 +1,103 @@
+# import mygene
+import pandas as pd
+import pybedtools
+from genomepy import Genome
+import sys
+from gimmemotifs.fasta import Fasta
+
+# mg = mygene.MyGeneInfo()
+
+# xli = ["gata3"]
+
+# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all")
+
+# for hit in out:
+#     print(hit)
+# #     if "genomic_pos" in hit:
+# #         print("{}:{}-{}\t{}".format(
+# #             hit["genomic_pos"]["chr"],
+# #             hit["genomic_pos"]["start"],
+# #             hit["genomic_pos"]["end"],
+# #             hit["query"],
+# #             ))
+
+# sys.exit()
+from functools import singledispatch
+
+
+@singledispatch
+def scan(obj):
+    # default implementation
+    raise NotImplementedError(f"Not implemented for {type(obj)}")
+
+
+@scan.register(pd.DataFrame)
+def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"):
+    if not set(columns).issubset(df.columns):
+        raise ValueError(f"Expected columns {columns}")
+
+    if len(columns) == 3:
+        # Assume this is chromosome start, end
+        g = Genome(genome)
+        seqs = list(
+            (
+                df[columns[0]]
+                + ":"
+                + df[columns[1]].astype(str)
+                + "-"
+                + df[columns[2]].astype(str)
+            ).values
+        )
+        return g.track2fasta(seqs)
+    elif len(columns) == 1:
+        # Assume this is some kind of gene_id
+        return df[columns[0]].values
+
+
+# @scan.register(pybedtools.BedTool)
+# @profile
+def _scan_bedtool(bed, genome="hg38"):
+    g = Genome(genome)
+    intervals = [g[f.chrom][f.start : f.stop] for f in bed]
+    return intervals
+
+
+# @profile
+def _scan_bedtool2(bed, genome="hg38"):
+    g = Genome(genome)
+    return Fasta(bed.sequence(fi=g.filename).seqfn).seqs
+
+
+import requests
+
+rest_url = "https://rest.ensembl.org/info/species"
+r = requests.get(rest_url, headers={"Content-Type": "application/json"})
+
+if not r.ok:
+    r.raise_for_status()
+
+json = r.json()
+
+print(json)
+
+
+# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822,
+# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]})
+# b = pybedtools.BedTool("5k.bed")
+# # for f in b:
+# #     print(f)
+# #     break
+# seqs = _scan_bedtool(b, genome="Spur_3.1")
+# seqs = _scan_bedtool2(b, genome="Spur_3.1")
+
+# g = Genome("hg19")
+# print(g["chr1"][1000000:1000100])
+
+# FASTA file
+# BED file
+# region file
+# Gene file
+#   - promoter (all species)
+#   - closest accessible region (human)
+#   - sum / mean / max of regions within distance of promoter
+#   - sum / mean / max of regions within weighted distance of promoter
diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py
index 6703b6e4..d27f9d21 100644
--- a/gimmemotifs/plot.py
+++ b/gimmemotifs/plot.py
@@ -32,8 +32,7 @@
 
 
 def axes_off(ax):
-    """Get rid of all axis ticks, lines, etc.
-    """
+    """Get rid of all axis ticks, lines, etc."""
     ax.set_frame_on(False)
     ax.axes.get_yaxis().set_visible(False)
     ax.axes.get_xaxis().set_visible(False)
diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py
index 1a8cfde6..5ae37690 100644
--- a/gimmemotifs/rank.py
+++ b/gimmemotifs/rank.py
@@ -81,16 +81,16 @@ def qStuart(r):
 def _rank_int(series, c=3.0 / 8, stochastic=True):
     # Based on code by Edward Mountjoy
     # See: https://github.com/edm1/rank-based-INT
-    """ Perform rank-based inverse normal transformation on pandas series.
-        If stochastic is True ties are given rank randomly, otherwise ties will
-        share the same value. NaN values are ignored.
-        Args:
-            param1 (pandas.Series):   Series of values to transform
-            param2 (Optional[float]): Constand parameter (Bloms constant)
-            param3 (Optional[bool]):  Whether to randomise rank of ties
-
-        Returns:
-            pandas.Series
+    """Perform rank-based inverse normal transformation on pandas series.
+    If stochastic is True ties are given rank randomly, otherwise ties will
+    share the same value. NaN values are ignored.
+    Args:
+        param1 (pandas.Series):   Series of values to transform
+        param2 (Optional[float]): Constant parameter (Blom's constant)
+        param3 (Optional[bool]):  Whether to randomise rank of ties
+
+    Returns:
+        pandas.Series
     """
 
     # Check input
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index d0cdcfa8..cb03e713 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -914,7 +914,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
     df_styled = (
         ExtraStyler(df)
         .set_precision(2)
-        .convert_to_image(subset=["logo"], height=30,)
+        .convert_to_image(
+            subset=["logo"],
+            height=30,
+        )
         .scaled_background_gradient(
             subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75
         )
@@ -1033,7 +1036,10 @@ def roc_html_report(
         if df.shape[0] > 0:
             f.write(
                 ExtraStyler(df)
-                .convert_to_image(subset=["logo"], height=30,)
+                .convert_to_image(
+                    subset=["logo"],
+                    height=30,
+                )
                 .add_circle(
                     subset=["% matches input", "%matches background"],
                     vmax=100,
diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py
index cfccc415..8397fb8f 100644
--- a/gimmemotifs/scanner.py
+++ b/gimmemotifs/scanner.py
@@ -250,8 +250,7 @@ def scan_to_file(
     zscore=True,
     gcnorm=True,
 ):
-    """Scan an inputfile with motifs.
-    """
+    """Scan an inputfile with motifs."""
     should_close = False
     if filepath_or_buffer is None:
         fo = sys.stdout
diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index fd2ca25b..b6a9a336 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -50,8 +50,7 @@ def rc(seq):
 
 
 def narrowpeak_to_bed(inputfile, bedfile, size=0):
-    """Convert narrowPeak file to BED file.
-    """
+    """Convert narrowPeak file to BED file."""
     p = re.compile(r"^(#|track|browser)")
     warn_no_summit = True
     with open(bedfile, "w") as f_out:
@@ -133,7 +132,7 @@ def phyper_single(k, good, bad, N):
 
 
 def phyper(k, good, bad, N):
-    """ Current hypergeometric implementation in scipy is broken,
+    """Current hypergeometric implementation in scipy is broken,
     so here's the correct version.
     """
     pvalues = [phyper_single(x, good, bad, N) for x in range(k + 1, N + 1)]
@@ -294,8 +293,8 @@ def motif_localization(fastafile, motif, size, outfile, cutoff=0.9):
 
 
 def parse_cutoff(motifs, cutoff, default=0.9):
-    """ Provide either a file with one cutoff per motif or a single cutoff
-        returns a hash with motif id as key and cutoff as value
+    """Provide either a file with one cutoff per motif or a single cutoff
+    returns a hash with motif id as key and cutoff as value
     """
 
     cutoffs = {}

From fcbf34d0b509ddac35a45d435db045a07f02c4f2 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 28 Sep 2020 15:08:25 +0200
Subject: [PATCH 81/85] fix tests

---
 gimmemotifs/utils.py   | 8 ++++++--
 test/test_maelstrom.py | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index b6a9a336..99c3a281 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -525,7 +525,11 @@ def _genomepy_convert(to_convert, genome, minsize=None):
     if genome is None:
         raise ValueError("input file is not a FASTA file, need a genome!")
 
-    g = Genome(genome)
+    if isinstance(genome, Genome):
+        g = genome
+    else:
+        g = Genome(genome)
+    
     tmpfile = NamedTemporaryFile()
     g.track2fasta(to_convert, tmpfile.name)
 
@@ -646,7 +650,7 @@ def _as_seqdict_filename(to_convert, genome=None, minsize=None):
             raise IOError(f"empty file {to_convert}")
 
         if region_p.match(line.strip()):
-            regions = [l.strip() for l in [line] + f.readlines()]
+            regions = [myline.strip() for myline in [line] + f.readlines()]
             return _as_seqdict_genome_regions(regions, minsize=None)
 
     # Biopython parser resulted in empty dict
diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py
index 66f602e2..53bffc77 100644
--- a/test/test_maelstrom.py
+++ b/test/test_maelstrom.py
@@ -34,7 +34,7 @@ def test1_maelstrom(self):
         df = pd.read_table(self.outfile, index_col=0, comment="#")
         print(df.shape)
 
-        self.assertEquals((623, 5), df.shape)
+        self.assertEquals((623, 8), df.shape)
         
         # Filter redundant motifs
         run_maelstrom(
@@ -48,7 +48,7 @@ def test1_maelstrom(self):
         )
         df = pd.read_table(self.outfile, index_col=0, comment="#")
         print(df.shape)
-        self.assertEquals((156, 5), df.shape)
+        self.assertEquals((156, 8), df.shape)
 
 
         for fname in glob(os.path.join(self.outdir, "activity*")):

From 9cb4f6fafd1bb63b730fe4d9042465c4f7bf15e4 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 28 Sep 2020 15:22:52 +0200
Subject: [PATCH 82/85] style

---
 gimmemotifs/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py
index 99c3a281..a1b3c71b 100644
--- a/gimmemotifs/utils.py
+++ b/gimmemotifs/utils.py
@@ -529,7 +529,7 @@ def _genomepy_convert(to_convert, genome, minsize=None):
         g = genome
     else:
         g = Genome(genome)
-    
+
     tmpfile = NamedTemporaryFile()
     g.track2fasta(to_convert, tmpfile.name)
 

From fb638a581253612de1e11c6a7123d05c156651bc Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 28 Sep 2020 15:23:11 +0200
Subject: [PATCH 83/85] fix unnecessary conversion

---
 gimmemotifs/report.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py
index cb03e713..e41951dc 100644
--- a/gimmemotifs/report.py
+++ b/gimmemotifs/report.py
@@ -907,9 +907,6 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3):
     df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile))
 
     rename_columns = {"factors": FACTOR_TOOLTIP}
-    for col in df.columns:
-        if "% with motif" in col:
-            df[col] = df[col].astype(int)
 
     df_styled = (
         ExtraStyler(df)

From a84ab48e5cc1b4f9987e7c43f62316da48e3cc4f Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Mon, 28 Sep 2020 15:24:38 +0200
Subject: [PATCH 84/85] remove

---
 gimmemotifs/conversion.tryout.py | 103 -------------------------------
 1 file changed, 103 deletions(-)
 delete mode 100644 gimmemotifs/conversion.tryout.py

diff --git a/gimmemotifs/conversion.tryout.py b/gimmemotifs/conversion.tryout.py
deleted file mode 100644
index ed2a1487..00000000
--- a/gimmemotifs/conversion.tryout.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# import mygene
-import pandas as pd
-import pybedtools
-from genomepy import Genome
-import sys
-from gimmemotifs.fasta import Fasta
-
-# mg = mygene.MyGeneInfo()
-
-# xli = ["gata3"]
-
-# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all")
-
-# for hit in out:
-#     print(hit)
-# #     if "genomic_pos" in hit:
-# #         print("{}:{}-{}\t{}".format(
-# #             hit["genomic_pos"]["chr"],
-# #             hit["genomic_pos"]["start"],
-# #             hit["genomic_pos"]["end"],
-# #             hit["query"],
-# #             ))
-
-# sys.exit()
-from functools import singledispatch
-
-
-@singledispatch
-def scan(obj):
-    # default implementation
-    raise NotImplementedError(f"Not implemented for {type(obj)}")
-
-
-@scan.register(pd.DataFrame)
-def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"):
-    if not set(columns).issubset(df.columns):
-        raise ValueError(f"Expected columns {columns}")
-
-    if len(columns) == 3:
-        # Assume this is chromosome start, end
-        g = Genome(genome)
-        seqs = list(
-            (
-                df[columns[0]]
-                + ":"
-                + df[columns[1]].astype(str)
-                + "-"
-                + df[columns[2]].astype(str)
-            ).values
-        )
-        return g.track2fasta(seqs)
-    elif len(columns) == 1:
-        # Assume this is some kind of gene_id
-        return df[columns[0]].values
-
-
-# @scan.register(pybedtools.BedTool)
-# @profile
-def _scan_bedtool(bed, genome="hg38"):
-    g = Genome(genome)
-    intervals = [g[f.chrom][f.start : f.stop] for f in bed]
-    return intervals
-
-
-# @profile
-def _scan_bedtool2(bed, genome="hg38"):
-    g = Genome(genome)
-    return Fasta(bed.sequence(fi=g.filename).seqfn).seqs
-
-
-import requests
-
-rest_url = "https://rest.ensembl.org/info/species"
-r = requests.get(rest_url, headers={"Content-Type": "application/json"})
-
-if not r.ok:
-    r.raise_for_status()
-
-json = r.json()
-
-print(json)
-
-
-# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822,
-# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]})
-# b = pybedtools.BedTool("5k.bed")
-# # for f in b:
-# #     print(f)
-# #     break
-# seqs = _scan_bedtool(b, genome="Spur_3.1")
-# seqs = _scan_bedtool2(b, genome="Spur_3.1")
-
-# g = Genome("hg19")
-# print(g["chr1"][1000000:1000100])
-
-# FASTA file
-# BED file
-# region file
-# Gene file
-#   - promoter (all species)
-#   - closest accessible region (human)
-#   - sum / mean / max of regions within distance of promoter
-#   - sum / mean / max of regions within weighted distance of promoter

From faa31a3872d9d5004a46b1a197e8008df80d3e23 Mon Sep 17 00:00:00 2001
From: Simon van Heeringen <simon.vanheeringen@gmail.com>
Date: Tue, 29 Sep 2020 15:40:25 +0200
Subject: [PATCH 85/85] Update CHANGELOG

---
 CHANGELOG.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0638e19..a109ed18 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,33 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+## [0.15.0] - 2020-09-29
+
+### Added
+
+- Added additional columns to `gimme maelstrom` output for better interpretation (correlation of motif to signal and % of regions with motif).
+- Added support for multi-species input in `genome@chrom:start-end` format.
+- `gimme maelstrom` warns if data is not row-centered and will center by default.
+- `gimme maelstrom` selects a set of non-redundant (or less redundant) motifs by default.
+- Added SVR regressor for `gimme maelstrom`.
+- Added quantile normalization to `coverage_table`.
+
+### Removed
+
+- Removed the lightning classifiers and regressors as the package is no longer actively maintained.
+
+### Changed
+
+- Visually improved HTML output.
+- Score of `maelstrom` is now an aggregate z-score based on combining z-scores from individual methods using Stouffer's method. The z-scores of individual methods are generated using the inverse normal transform.
+- Reorganized some classes and functions.
+
+### Fixed
+
+- Fixed minor issues with sorting columns in HTML output.
+- `gimme motifs` doesn't crash when no motifs are found.
+- Fixed error with Ensembl chromosome names in `combine_peaks`.
+
 ## [0.14.4] - 2020-04-02
 
 ### Fixed