From 5b23b2ec7cce1cdb864302896078ea38e9cfed83 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Fri, 17 Apr 2020 16:30:14 +0200 Subject: [PATCH 01/85] fix for Ensembl genomes --- gimmemotifs/background.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index e308f556..32deb2df 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -360,6 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100): cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)] df.columns = cols + + # Make really sure that column 'chrom' is a string + df.dropna(subset=['chrom'], inplace=True) + df['chrom'] = df['chrom'].apply(str).astype("string") + df.reset_index()[cols].to_feather(fname) From a5265906b440fa0290dda7a334169cb8179629a3 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Fri, 17 Apr 2020 17:12:20 +0200 Subject: [PATCH 02/85] style --- gimmemotifs/background.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index 32deb2df..9ecf0318 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -360,11 +360,11 @@ def create_gc_bin_index(genome, fname, min_bin_size=100): cols += ["w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)] df.columns = cols - + # Make really sure that column 'chrom' is a string - df.dropna(subset=['chrom'], inplace=True) - df['chrom'] = df['chrom'].apply(str).astype("string") - + df.dropna(subset=["chrom"], inplace=True) + df["chrom"] = df["chrom"].apply(str).astype("string") + df.reset_index()[cols].to_feather(fname) From 40684d54b5d8fb114f69152d23c3e9e4d1fa76ab Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 27 May 2020 08:33:39 +0200 Subject: [PATCH 03/85] fix #118 --- gimmemotifs/denovo.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gimmemotifs/denovo.py b/gimmemotifs/denovo.py index 
2d1913ac..2cd1225f 100644 --- a/gimmemotifs/denovo.py +++ b/gimmemotifs/denovo.py @@ -669,11 +669,14 @@ def gimme_motifs( sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True) final_motifs, stats = rename_motifs(sorted_motifs, result.stats) - with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f: - for m in final_motifs: - f.write("{}\n".format(m.to_pwm())) + motifs_found = len(final_motifs) > 0 - if create_report: + if motifs_found: + with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f: + for m in final_motifs: + f.write("{}\n".format(m.to_pwm())) + + if motifs_found and create_report: bg = dict([(b, os.path.join(tmpdir, "bg.{}.fa".format(b))) for b in background]) create_denovo_motif_report( @@ -700,7 +703,7 @@ def gimme_motifs( logger.info("finished") logger.info("output dir: %s", outdir) - if cluster: + if motifs_found and cluster: logger.info("de novo report: %s", os.path.join(outdir, "gimme.denovo.html")) return final_motifs From b3aec2300afac7d5fd264ddbba03a17c4a6a9784 Mon Sep 17 00:00:00 2001 From: simonvh Date: Wed, 27 May 2020 08:37:28 +0200 Subject: [PATCH 04/85] MEME fix in docker --- gimmemotifs/tools/meme.py | 1 + gimmemotifs/tools/memew.py | 1 + 2 files changed, 2 insertions(+) diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py index 0319e8ad..a6a4d639 100644 --- a/gimmemotifs/tools/meme.py +++ b/gimmemotifs/tools/meme.py @@ -58,6 +58,7 @@ def _run_program(self, bin, fastafile, params=None): number = default_params["number"] cmd = [ + "OMPI_MCA_plm_rsh_agent=sh", # Fix to run in Docker bin, fastafile, "-text", diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py index c8d17e45..8cdeac7c 100644 --- a/gimmemotifs/tools/memew.py +++ b/gimmemotifs/tools/memew.py @@ -57,6 +57,7 @@ def _run_program(self, bin, fastafile, params=None): number = default_params["number"] cmd = [ + "OMPI_MCA_plm_rsh_agent=sh", bin, fastafile, "-text", From cd04aaf3a8a074dab4886d0d4e29c69188e7ac68 Mon Sep 17 
00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 14:56:34 +0200 Subject: [PATCH 05/85] remove six --- gimmemotifs/commands/__init__.py | 6 +----- gimmemotifs/motif.py | 5 ++--- gimmemotifs/scanner.py | 3 +-- gimmemotifs/utils.py | 3 +-- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/gimmemotifs/commands/__init__.py b/gimmemotifs/commands/__init__.py index c20bb546..f944936c 100644 --- a/gimmemotifs/commands/__init__.py +++ b/gimmemotifs/commands/__init__.py @@ -1,13 +1,9 @@ import pkgutil -import six import os dirname = os.path.split(__file__)[0] -if six.PY3: - level = 0 -else: - level = -1 +level = 0 # Dynamically load all commands for _importer, cmdname, _ in pkgutil.iter_modules([dirname]): diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index 3065a6cb..cb8aba8b 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -11,7 +11,6 @@ import random from math import log, sqrt from warnings import warn -import six from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME from gimmemotifs.c_metrics import pfmscan @@ -1393,7 +1392,7 @@ def parse_motifs(motifs): motifs : list List of Motif instances. 
""" - if isinstance(motifs, six.string_types): + if isinstance(motifs, str): with open(motifs) as f: if motifs.endswith("pwm") or motifs.endswith("pfm"): motifs = read_motifs(f, fmt="pwm") @@ -1518,7 +1517,7 @@ def read_motifs(infile=None, fmt="pfm", as_dict=False): if fmt == "pwm": fmt = "pfm" - if infile is None or isinstance(infile, six.string_types): + if infile is None or isinstance(infile, str): infile = pfmfile_location(infile) with open(infile) as f: motifs = _read_motifs_from_filehandle(f, fmt) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index d45d3906..337c912d 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -5,7 +5,6 @@ from tempfile import mkdtemp, NamedTemporaryFile import logging import multiprocessing as mp -import six # "hidden" features, in development try: @@ -341,7 +340,7 @@ def scan_to_best_match( if genome: s.set_genome(genome) - if isinstance(motifs, six.string_types): + if isinstance(motifs, str): motifs = read_motifs(motifs) logger.debug("scanning %s...", fname) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 17bd6fb9..ad4b87b8 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -15,7 +15,6 @@ import logging import mmap import random -import six import tempfile import requests from subprocess import Popen @@ -88,7 +87,7 @@ def pfmfile_location(infile): "database specified in the config file." 
) - if isinstance(infile, six.string_types): + if isinstance(infile, str): if not os.path.exists(infile): motif_dir = config.get_motif_dir() checkfile = os.path.join(motif_dir, infile) From 1d78626d4ccaf7e18966f1092a3e09db7de5c8e1 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 15:01:19 +0200 Subject: [PATCH 06/85] black --- conda_env.dev.txt | 2 - conda_env.osx.txt | 2 - conda_env.test.txt | 37 ++++++ conda_env.txt | 16 +-- gimmemotifs/background.py | 2 - gimmemotifs/comparison.py | 2 - gimmemotifs/moap.py | 1 - gimmemotifs/plot.py | 1 - gimmemotifs/utils.py | 2 - requirements.txt | 2 - setup.py | 2 - versioneer.py | 267 +++++++++++++++++++++++--------------- 12 files changed, 207 insertions(+), 129 deletions(-) create mode 100644 conda_env.test.txt diff --git a/conda_env.dev.txt b/conda_env.dev.txt index 31297494..9704440a 100644 --- a/conda_env.dev.txt +++ b/conda_env.dev.txt @@ -3,7 +3,6 @@ configparser dinamo diskcache feather-format -future gadem genomepy >=0.6.1 ghostscript @@ -27,7 +26,6 @@ pyyaml >=3.10 scikit-learn >=0.18 scipy >=1.3.0 seaborn -six sklearn-contrib-lightning statsmodels tqdm >=4.27.0 diff --git a/conda_env.osx.txt b/conda_env.osx.txt index dc2adacb..8ee94491 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -2,7 +2,6 @@ bedtools configparser diskcache feather-format -future gadem genomepy >=0.6.1 ghostscript @@ -24,7 +23,6 @@ pyyaml >=3.10 scikit-learn >=0.18 scipy <1.3.0 seaborn -six sklearn-contrib-lightning statsmodels tqdm >=4.27.0 diff --git a/conda_env.test.txt b/conda_env.test.txt new file mode 100644 index 00000000..81760361 --- /dev/null +++ b/conda_env.test.txt @@ -0,0 +1,37 @@ +bedtools +configparser +dinamo +diskcache +feather-format +gadem +genomepy >=0.6.1 +ghostscript +homer +icu=58 +ipywidgets # Necessary for progress bar in Jupyter notebook +jinja2 +logomaker +matplotlib >=2.0 +meme >=5 +ncurses +numpy +pillow +prosampler +pyarrow +pybedtools +python >=3.8 +python-xxhash +pyyaml >=3.10 +scikit-learn 
>=0.18 +scipy <1.3.0 +seaborn +sklearn-contrib-lightning +statsmodels +tqdm >=4.27.0 +trawler +ucsc-bigbedtobed +ucsc-genepredtobed +weeder +xdg +xgboost >=0.71 +xxmotif diff --git a/conda_env.txt b/conda_env.txt index 5767b058..3c9ea758 100644 --- a/conda_env.txt +++ b/conda_env.txt @@ -1,39 +1,35 @@ -bedtools configparser dinamo diskcache feather-format -future gadem -genomepy >=0.6.1 +genomepy >=0.8.3 ghostscript homer -icu=58 ipywidgets # Necessary for progress bar in Jupyter notebook jinja2 logomaker -matplotlib >=2.0 -meme >=5 +matplotlib-base >=3.1.2 +meme >=5.1.1 ncurses numpy +pandas >=1.0.3 pillow prosampler -pyarrow +pyarrow >=0.16.0 pybedtools pysam -python +python >=3 python-xxhash pyyaml >=3.10 scikit-learn >=0.18 scipy <1.3.0 seaborn -six sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler ucsc-bigbedtobed -ucsc-genepredtobed weeder xdg xgboost >=0.71 diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index 9ecf0318..60c96d53 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -11,8 +11,6 @@ similar genomic distribution as the input. """ -from __future__ import division - # Python imports import gzip import os diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py index 94c0b808..40565869 100644 --- a/gimmemotifs/comparison.py +++ b/gimmemotifs/comparison.py @@ -6,8 +6,6 @@ """ Module to compare DNA sequence motifs (positional frequency matrices) """ -from __future__ import print_function - # Python imports import sys import os diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 45b87356..16d0517b 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -4,7 +4,6 @@ # the terms of the MIT License, see the file COPYING included with this # distribution. 
""" Module for motif activity prediction """ -from __future__ import print_function def warn(*args, **kwargs): diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py index d161c138..436f48ed 100644 --- a/gimmemotifs/plot.py +++ b/gimmemotifs/plot.py @@ -4,7 +4,6 @@ # the terms of the MIT License, see the file COPYING included with this # distribution. """ Various plotting functions """ -from __future__ import print_function from PIL import Image import seaborn as sns from mpl_toolkits.axes_grid1 import ImageGrid diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index ad4b87b8..c65b58af 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -5,8 +5,6 @@ # distribution. """ Odds and ends that for which I didn't (yet) find another place """ -from __future__ import print_function - # Python imports import os import re diff --git a/requirements.txt b/requirements.txt index 604878ff..2ff4f9e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,4 @@ pysam xgboost diskcache xxhash -six -future pillow diff --git a/setup.py b/setup.py index 5f9fd60f..aa961c94 100644 --- a/setup.py +++ b/setup.py @@ -132,8 +132,6 @@ def run(self): "diskcache", "xxhash", "configparser", - "six", - "future", "genomepy >= 0.7.2", "tqdm", "pillow", diff --git a/versioneer.py b/versioneer.py index 64fea1c8..cce201c7 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1,4 +1,3 @@ - # Version: 0.18 """The Versioneer - like a rocketeer, but for versions. @@ -276,7 +275,6 @@ """ -from __future__ import print_function try: import configparser except ImportError: @@ -308,11 +306,13 @@ def get_root(): setup_py = os.path.join(root, "setup.py") versioneer_py = os.path.join(root, "versioneer.py") if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ("Versioneer was unable to run the project root directory. 
" - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND').") + err = ( + "Versioneer was unable to run the project root directory. " + "Versioneer requires setup.py to be executed from " + "its immediate directory (like 'python setup.py COMMAND'), " + "or in a way that lets it use sys.argv[0] to find the root " + "(like 'python path/to/setup.py COMMAND')." + ) raise VersioneerBadRootError(err) try: # Certain runtime workflows (setup.py install/develop in a setuptools @@ -325,8 +325,10 @@ def get_root(): me_dir = os.path.normcase(os.path.splitext(me)[0]) vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) if me_dir != vsr_dir: - print("Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py)) + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(me), versioneer_py) + ) except NameError: pass return root @@ -348,6 +350,7 @@ def get(parser, name): if parser.has_option("versioneer", name): return parser.get("versioneer", name) return None + cfg = VersioneerConfig() cfg.VCS = VCS cfg.style = get(parser, "style") or "" @@ -372,17 +375,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Decorator to mark a method as the handler for a particular VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -390,10 +394,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # 
remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -418,7 +425,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, return stdout, p.returncode -LONG_VERSION_PY['git'] = ''' +LONG_VERSION_PY[ + "git" +] = ''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -993,7 +1002,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -1002,7 +1011,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -1010,19 +1019,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -1037,8 +1053,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -1046,10 +1061,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -1072,17 +1096,16 @@ def git_pieces_from_vcs(tag_prefix, root, 
verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -1091,10 +1114,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -1105,13 +1130,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -1167,16 +1192,22 @@ def 
versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -1205,11 +1236,13 @@ def versions_from_file(filename): contents = f.read() except EnvironmentError: raise NotThisMethod("unable to read _version.py") - mo = re.search(r"version_json = '''\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: - mo = re.search(r"version_json = '''\r\n(.*)''' # END VERSION_JSON", - contents, re.M | re.S) + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S + ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) @@ -1218,8 +1251,7 @@ def versions_from_file(filename): def write_to_version_file(filename, versions): """Write the given version number to the given _version.py file.""" os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, - indent=1, separators=(",", ": ")) + contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) @@ -1251,8 +1283,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - 
pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -1366,11 +1397,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -1390,9 +1423,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } class VersioneerBadRootError(Exception): @@ -1415,8 +1452,9 @@ def get_versions(verbose=False): handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or cfg.verbose - assert cfg.versionfile_source is not None, \ - "please set versioneer.versionfile_source" + assert ( + cfg.versionfile_source is not None + ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) @@ -1470,9 +1508,13 @@ def get_versions(verbose=False): if verbose: print("unable to compute version") - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, "error": "unable to compute version", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } def get_version(): @@ -1521,6 +1563,7 
@@ def run(self): print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) + cmds["version"] = cmd_version # we override "build_py" in both distutils and setuptools @@ -1553,14 +1596,15 @@ def run(self): # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, - cfg.versionfile_build) + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) + cmds["build_py"] = cmd_build_py if "cx_Freeze" in sys.modules: # cx_freeze enabled? from cx_Freeze.dist import build_exe as _build_exe + # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ @@ -1581,17 +1625,21 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["build_exe"] = cmd_build_exe del cmds["build_py"] - if 'py2exe' in sys.modules: # py2exe enabled? + if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.distutils_buildexe import py2exe as _py2exe # py3 except ImportError: @@ -1610,13 +1658,17 @@ def run(self): os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % - {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + cmds["py2exe"] = cmd_py2exe # we override different "sdist" commands for both environments @@ -1643,8 +1695,10 @@ def make_release_tree(self, base_dir, files): # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, - self._versioneer_generated_versions) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + cmds["sdist"] = cmd_sdist return cmds @@ -1699,11 +1753,13 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except (EnvironmentError, configparser.NoSectionError, - configparser.NoOptionError) as e: + except ( + EnvironmentError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", - file=sys.stderr) + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) @@ -1712,15 +1768,18 @@ def do_setup(): print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] - f.write(LONG % {"DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - 
"PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), - "__init__.py") + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: with open(ipy, "r") as f: @@ -1762,8 +1821,10 @@ def do_setup(): else: print(" 'versioneer.py' already in MANIFEST.in") if cfg.versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - cfg.versionfile_source) + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) with open(manifest_in, "a") as f: f.write("include %s\n" % cfg.versionfile_source) else: From c23c50fa3e9853b39ef1b0bbef4696f180c5bee9 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 19:50:18 +0200 Subject: [PATCH 07/85] remove lightning --- conda_env.dev.txt | 11 +- conda_env.osx.txt | 1 - conda_env.test.txt | 1 - conda_env.txt | 1 - gimmemotifs/moap.py | 258 +------------------------------------------- requirements.txt | 1 - setup.py | 1 - 7 files changed, 6 insertions(+), 268 deletions(-) diff --git a/conda_env.dev.txt b/conda_env.dev.txt index 9704440a..cb3f62e3 100644 --- a/conda_env.dev.txt +++ b/conda_env.dev.txt @@ -4,10 +4,8 @@ dinamo diskcache feather-format gadem -genomepy >=0.6.1 -ghostscript +genomepy >=0.8.3 homer -icu=58 ipywidgets # Necessary for progress bar in Jupyter notebook jinja2 logomaker @@ -17,16 +15,15 @@ ncurses numpy prosampler pillow -pyarrow +pyarrow >=0.16.0 pybedtools pysam python python-xxhash pyyaml >=3.10 -scikit-learn >=0.18 -scipy >=1.3.0 +scikit-learn >=0.23 +scipy >=1.4.1 seaborn -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler diff --git a/conda_env.osx.txt b/conda_env.osx.txt index 
8ee94491..8eb820e6 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -23,7 +23,6 @@ pyyaml >=3.10 scikit-learn >=0.18 scipy <1.3.0 seaborn -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler diff --git a/conda_env.test.txt b/conda_env.test.txt index 81760361..a61ea95d 100644 --- a/conda_env.test.txt +++ b/conda_env.test.txt @@ -25,7 +25,6 @@ pyyaml >=3.10 scikit-learn >=0.18 scipy <1.3.0 seaborn -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler diff --git a/conda_env.txt b/conda_env.txt index 3c9ea758..441b7f03 100644 --- a/conda_env.txt +++ b/conda_env.txt @@ -25,7 +25,6 @@ pyyaml >=3.10 scikit-learn >=0.18 scipy <1.3.0 seaborn -sklearn-contrib-lightning statsmodels tqdm >=4.27.0 trawler diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 16d0517b..08efe91e 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -37,8 +37,6 @@ def warn(*args, **kwargs): from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import MultiTaskLasso, BayesianRidge from sklearn.preprocessing import scale, LabelEncoder -from lightning.classification import CDClassifier -from lightning.regression import CDRegressor import xgboost @@ -355,252 +353,6 @@ def fit(self, df_X, df_y): logger.info("Done") -@register_predictor("LightningRegressor") -class LightningRegressionMoap(Moap): - def __init__(self, scale=True, cv=3, ncpus=None): - """Predict motif activities using lightning CDRegressor - - Parameters - ---------- - scale : boolean, optional, default True - If ``True``, the motif scores will be scaled - before classification - - cv : int, optional, default 3 - Cross-validation k-fold parameter. - - ncpus : int, optional - Number of threads. Default is the number specified in the config. 
- - Attributes - ---------- - act_ : DataFrame, shape (n_motifs, n_clusters) - fitted coefficients - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation - """ - - self.act_description = "activity values: coefficients from " "fitted model" - - if ncpus is None: - ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) - self.ncpus = ncpus - self.kfolds = cv - self.scale = scale - - self.act_ = None - self.pref_table = "score" - self.supported_tables = ["score", "count"] - self.ptype = "regression" - - def fit(self, df_X, df_y, batch_size=50, shuffle=True, tmpdir=None): - logger.info("Fitting LightningRegression") - - if self.scale: - # Scale motif scores - df_X[:] = scale(df_X, axis=0) - - # Normalize across samples and features - # y = df_y.apply(scale, 1).apply(scale, 0) - y = df_y - X = df_X.loc[y.index] - - if not y.shape[0] == X.shape[0]: - raise ValueError("number of regions is not equal") - - # Define model - cd = CDRegressor(penalty="l1/l2", C=1.0) - parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 2)]} - clf = GridSearchCV(cd, parameters, n_jobs=self.ncpus) - - if shuffle: - idx = list(y.sample(y.shape[1], axis=1, random_state=42).columns) - else: - idx = list(y.columns) - - if tmpdir: - if not os.path.exists(tmpdir): - os.mkdir(tmpdir) - - coefs = pd.DataFrame(index=X.columns) - start_i = 0 - if tmpdir: - for i in range(0, len(idx), batch_size): - fname = os.path.join(tmpdir, "{}.feather".format(i)) - if os.path.exists(fname) and os.path.exists(fname + ".done"): - - tmp = pd.read_feather(fname) - tmp = tmp.set_index(tmp.columns[0]) - coefs = coefs.join(tmp) - else: - logger.info("Resuming at batch {}".format(i)) - start_i = i - break - - for i in tqdm(range(start_i, len(idx), batch_size)): - split_y = y[idx[i : i + batch_size]] - - # Fit model - clf.fit(X.values, split_y.values) - tmp = pd.DataFrame( - clf.best_estimator_.coef_.T, index=X.columns, 
columns=split_y.columns - ) - if tmpdir: - fname = os.path.join(tmpdir, "{}.feather".format(i)) - tmp.reset_index().rename(columns=str).to_feather(fname) - # Make sure we don't read corrupted files - open(fname + ".done", "a").close() - # Get coefficients - coefs = coefs.join(tmp) - - # Get coefficients - self.act_ = coefs[y.columns] - - logger.info("Done") - - -@register_predictor("LightningClassification") -class LightningClassificationMoap(Moap): - def __init__(self, scale=True, permute=False, ncpus=None): - """Predict motif activities using lightning CDClassifier - - Parameters - ---------- - scale : boolean, optional, default True - If ``True``, the motif scores will be scaled - before classification - - ncpus : int, optional - Number of threads. Default is the number specified in the config. - - Attributes - ---------- - act_ : DataFrame, shape (n_motifs, n_clusters) - fitted coefficients - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation - """ - - self.act_description = "activity values: coefficients from " "fitted model" - - # self.cdc = CDClassifier(random_state=args.seed) - self.cdc = CDClassifier() - - self.parameters = { - "penalty": ["l1/l2"], - "loss": ["squared_hinge"], - "multiclass": [True], - "max_iter": [20], - "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)], - "C": [0.001, 0.01, 0.1, 0.5, 1.0], - "tol": [1e-3], - } - - self.kfolds = 10 - - if ncpus is None: - ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) - - self.clf = GridSearchCV(self.cdc, self.parameters, cv=self.kfolds, n_jobs=ncpus) - - self.scale = scale - self.permute = permute - - self.act_ = None - self.sig_ = None - self.pref_table = "score" - self.supported_tables = ["score", "count"] - self.ptype = "classification" - - def fit(self, df_X, df_y): - logger.info("Fitting LightningClassification") - - if not df_y.shape[0] == df_X.shape[0]: - raise ValueError("number of regions is not 
equal") - if df_y.shape[1] != 1: - raise ValueError("y needs to have 1 label column") - - if self.scale: - # Scale motif scores - df_X[:] = scale(df_X, axis=0) - - idx = list(range(df_y.shape[0])) - - y = df_y.iloc[idx] - X = df_X.loc[y.index].values - y = y.values.flatten() - - # Convert (putative) string labels - label = LabelEncoder() - y = label.fit_transform(y) - - # Split data - X_train, X_test, y_train, y_test = train_test_split(X, y) - - logger.debug("Setting parameters through cross-validation") - # Determine best parameters based on CV - self.clf.fit(X_train, y_train) - - logger.debug( - "Average score ({} fold CV): {}".format( - self.kfolds, self.clf.score(X_test, y_test) - ) - ) - - logger.debug("Estimate coefficients using bootstrapping") - - # Estimate coefficients using bootstrappig - # b = BaggingClassifier(self.clf.best_estimator_, - # max_samples=0.75, n_jobs=-1, random_state=state) - b = BaggingClassifier(self.clf.best_estimator_, max_samples=0.75, n_jobs=-1) - b.fit(X, y) - - # Get mean coefficients - coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0) - - # Create dataframe of predicted coefficients - if len(label.classes_) == 2: - self.act_ = pd.DataFrame(np.hstack((-coeffs.T, coeffs.T))) - else: - self.act_ = pd.DataFrame(coeffs.T) - - # Convert labels back to original names - self.act_.columns = label.inverse_transform(range(len(label.classes_))) - self.act_.index = df_X.columns - - if self.permute: - # Permutations - logger.debug("Permutations") - random_dfs = [] - for _ in range(10): - y_random = np.random.permutation(y) - b.fit(X, y_random) - coeffs = np.array([e.coef_ for e in b.estimators_]).mean(axis=0) - - if len(label.classes_) == 2: - random_dfs.append(pd.DataFrame(np.hstack((-coeffs.T, coeffs.T)))) - else: - random_dfs.append(pd.DataFrame(coeffs.T)) - random_df = pd.concat(random_dfs) - - # Select cutoff based on percentile - high_cutoffs = random_df.quantile(0.99) - low_cutoffs = random_df.quantile(0.01) - - # Set 
significance - self.sig_ = pd.DataFrame(index=df_X.columns) - self.sig_["sig"] = False - - for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs): - self.sig_["sig"].loc[self.act_[col] >= c_high] = True - self.sig_["sig"].loc[self.act_[col] <= c_low] = True - logger.info("Done") - - @register_predictor("MWU") class MWUMoap(Moap): def __init__(self, *args, **kwargs): @@ -934,7 +686,7 @@ def moap( method : str, optional Motif activity method to use. Any of 'hypergeom', 'lasso', - 'lightningclassification', 'lightningregressor', 'bayesianridge', + 'bayesianridge', 'rf', 'xgboost'. Default is 'hypergeom'. scoring: str, optional @@ -1057,13 +809,7 @@ def moap( motifs = motifs.loc[df.index] - if method == "lightningregressor": - outdir = os.path.dirname(outfile) - tmpname = os.path.join(outdir, ".lightning.tmp") - clf.fit(motifs, df, tmpdir=tmpname) - shutil.rmtree(tmpname) - else: - clf.fit(motifs, df) + clf.fit(motifs, df) if outfile: with open(outfile, "w") as f: diff --git a/requirements.txt b/requirements.txt index 2ff4f9e7..ec991b68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ pyyaml >= 3.10 pybedtools statsmodels scikit-learn -sklearn-contrib-lightning seaborn pysam xgboost diff --git a/setup.py b/setup.py index aa961c94..5d790bf1 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,6 @@ def run(self): "pybedtools", "statsmodels", "scikit-learn", - "sklearn-contrib-lightning", "seaborn", "pysam", "xgboost >= 0.71", From 241d22ec739a8e3934a3ce200567d8fa878fc186 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 19:51:33 +0200 Subject: [PATCH 08/85] style --- gimmemotifs/moap.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 08efe91e..fdcabe98 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -17,7 +17,6 @@ def warn(*args, **kwargs): import os import sys -import shutil try: from itertools import izip @@ -32,8 +31,8 @@ def 
warn(*args, **kwargs): from tqdm.auto import tqdm # scikit-learn -from sklearn.model_selection import train_test_split, GridSearchCV -from sklearn.ensemble import BaggingClassifier, RandomForestClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.ensemble import RandomForestClassifier from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import MultiTaskLasso, BayesianRidge from sklearn.preprocessing import scale, LabelEncoder From 3887c9df7cbd67753c262dd07d5dec7e93133c43 Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 21:27:44 +0200 Subject: [PATCH 09/85] remove lightning tests --- test/test_moap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_moap.py b/test/test_moap.py index 775b3534..c6c13be3 100644 --- a/test/test_moap.py +++ b/test/test_moap.py @@ -20,7 +20,7 @@ def setUp(self): def test1_moap(self): """ Test motif activity prediction """ - for method in ["mwu", "rf", "lightningclassification"]: + for method in ["mwu", "rf"]: df = moap( self.clusters, method=method, @@ -41,7 +41,7 @@ def test1_moap(self): def test2_moap(self): """ Test motif activity prediction for two clusters """ - for method in ["mwu", "rf", "lightningclassification"]: + for method in ["mwu", "rf"]: df = moap( self.clusters2, method=method, From d1fd3b99443412b13b8ae9f4d5e2b88cf4ec298d Mon Sep 17 00:00:00 2001 From: simonvh Date: Mon, 8 Jun 2020 21:28:14 +0200 Subject: [PATCH 10/85] black --- gimmemotifs/tools/meme.py | 9 ++++++--- gimmemotifs/tools/memew.py | 9 +++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/gimmemotifs/tools/meme.py b/gimmemotifs/tools/meme.py index a6a4d639..3bb69d60 100644 --- a/gimmemotifs/tools/meme.py +++ b/gimmemotifs/tools/meme.py @@ -1,5 +1,6 @@ from .motifprogram import MotifProgram import io +import os import re from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile @@ -58,7 +59,6 @@ def _run_program(self, bin, fastafile, 
params=None): number = default_params["number"] cmd = [ - "OMPI_MCA_plm_rsh_agent=sh", # Fix to run in Docker bin, fastafile, "-text", @@ -76,8 +76,11 @@ def _run_program(self, bin, fastafile, params=None): if not default_params["single"]: cmd.append(strand) - # sys.stderr.write(" ".join(cmd) + "\n") - p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE) + # Fix to run in Docker + env = os.environ.copy() + env["OMPI_MCA_plm_rsh_agent"] = "sh" + + p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env) stdout, stderr = p.communicate() motifs = [] diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py index 8cdeac7c..643d737c 100644 --- a/gimmemotifs/tools/memew.py +++ b/gimmemotifs/tools/memew.py @@ -1,6 +1,8 @@ from .motifprogram import MotifProgram import io +import os import re +import sys from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile @@ -57,7 +59,6 @@ def _run_program(self, bin, fastafile, params=None): number = default_params["number"] cmd = [ - "OMPI_MCA_plm_rsh_agent=sh", bin, fastafile, "-text", @@ -77,8 +78,12 @@ def _run_program(self, bin, fastafile, params=None): if not default_params["single"]: cmd.append(strand) + # Fix to run in Docker + env = os.environ.copy() + env["OMPI_MCA_plm_rsh_agent"] = "sh" + # sys.stderr.write(" ".join(cmd) + "\n") - p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE) + p = Popen(cmd, bufsize=1, stderr=PIPE, stdout=PIPE, env=env) stdout, stderr = p.communicate() motifs = [] From b3e6f211de351a26f5b048c1d84e115d3945298d Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 09:06:50 +0200 Subject: [PATCH 11/85] fix test for genomepy>=0.8.3 --- test/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 87f7b9da..c4f72c2f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -13,7 +13,7 @@ class TestUtils(unittest.TestCase): """ A test class to test utils functions """ def setUp(self): - 
self.genome_dir = "test/data/genome_index" + self.genomes_dir = "test/data/genome_index" self.datadir = "test/data/utils" def test1_phyper(self): @@ -31,7 +31,7 @@ def test2_as_fasta(self): """ convert bed, regions, etc to Fasta """ tmpdir = mkdtemp() - g = Genome("genome", genome_dir=self.genome_dir) + g = Genome("genome", genomes_dir=self.genomes_dir) fafile = os.path.join(self.datadir, "test.fa") fa = Fasta(fafile) From bb99e8e2619671489bf3081b084ac3dd83fbf8d0 Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 09:12:14 +0200 Subject: [PATCH 12/85] fix flake errors --- gimmemotifs/background.py | 20 +++++++++++--------- gimmemotifs/commands/motifs.py | 2 +- gimmemotifs/comparison.py | 10 ++++++---- gimmemotifs/plot.py | 2 +- gimmemotifs/tools/memew.py | 1 - gimmemotifs/tools/weeder.py | 2 +- gimmemotifs/utils.py | 4 ++-- 7 files changed, 22 insertions(+), 19 deletions(-) diff --git a/gimmemotifs/background.py b/gimmemotifs/background.py index 60c96d53..46d33233 100644 --- a/gimmemotifs/background.py +++ b/gimmemotifs/background.py @@ -249,15 +249,17 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None): for _i in range(k - 1): new_init = [] for x in init: - for l in alphabet: - new_init.append(x + l) + for letter in alphabet: + new_init.append(x + letter) init = new_init[:] - self.trans = dict([(word, dict([(l, 0.0) for l in alphabet])) for word in init]) + self.trans = dict( + [(word, dict([(letter, 0.0) for letter in alphabet])) for word in init] + ) new_init = [] for x in init: - for l in alphabet: - new_init.append(x + l) + for letter in alphabet: + new_init.append(x + letter) kmercount = dict([(word, 0) for word in new_init]) lettercount = dict([(word[:k], 0) for word in new_init]) @@ -284,9 +286,9 @@ def _initialize_matrices(self, seqs, k=1, alphabet=None): for k, v in lettercount.items(): self.init[k] = v / total - def _generate_sequence(self, l): + def _generate_sequence(self, length): sequence = 
list(self._weighted_random(list(self.init.items()))) - for _ in range(l - self.k): + for _ in range(length - self.k): sequence.append( self._weighted_random( list(self.trans["".join(sequence[-self.k :])].items()) @@ -294,10 +296,10 @@ def _generate_sequence(self, l): ) return "".join(sequence) - def _weighted_random(self, l): + def _weighted_random(self, weighted_list): n = random.uniform(0, 1) item = None - for item, weight in l: # noqa: B007 + for item, weight in weighted_list: # noqa: B007 if n < weight: break else: diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index 429fca70..a1681c8d 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -151,7 +151,7 @@ def motifs(args): delete_sample = False if ftype == "narrowpeak": f = NamedTemporaryFile(delete=False) - logger.debug("Using %s as temporary BED file".format(f.name)) + logger.debug("Using {} as temporary BED file".format(f.name)) narrowpeak_to_bed(args.sample, f.name, size=args.size) sample = f.name delete_sample = True diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py index 40565869..f964205d 100644 --- a/gimmemotifs/comparison.py +++ b/gimmemotifs/comparison.py @@ -888,12 +888,14 @@ def generate_score_dist(self, motifs, match, metric, combine): f = open(score_file, "w") all_scores = {} - for l in [len(motif) for motif in motifs]: - all_scores[l] = {} + for motif_len in [len(motif) for motif in motifs]: + all_scores[motif_len] = {} sorted_motifs = {} - for l in all_scores.keys(): - sorted_motifs[l] = [motif for motif in motifs if len(motif) == l] + for motif_len in all_scores.keys(): + sorted_motifs[motif_len] = [ + motif for motif in motifs if len(motif) == motif_len + ] for l1 in all_scores.keys(): for l2 in all_scores.keys(): diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py index 436f48ed..6703b6e4 100644 --- a/gimmemotifs/plot.py +++ b/gimmemotifs/plot.py @@ -354,7 +354,7 @@ def _get_motif_tree(tree, data, circle=True, 
vmin=None, vmax=None): m = 25 / data.values.max() for node in t.traverse("levelorder"): - val = data[[l.name for l in node.get_leaves()]].values.mean() + val = data[[leaf.name for leaf in node.get_leaves()]].values.mean() style = NodeStyle() style["size"] = 0 diff --git a/gimmemotifs/tools/memew.py b/gimmemotifs/tools/memew.py index 643d737c..5f39ddef 100644 --- a/gimmemotifs/tools/memew.py +++ b/gimmemotifs/tools/memew.py @@ -2,7 +2,6 @@ import io import os import re -import sys from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile diff --git a/gimmemotifs/tools/weeder.py b/gimmemotifs/tools/weeder.py index 0361c2d2..0ad4334c 100644 --- a/gimmemotifs/tools/weeder.py +++ b/gimmemotifs/tools/weeder.py @@ -76,7 +76,7 @@ def _run_program(self, bin, fastafile, params=None): shutil.copy(fastafile, name) fastafile = name - cmd = "{} -f {} -O".format(self.cmd, fastafile, weeder_organism) + cmd = "{} -f {} -O {}".format(self.cmd, fastafile, weeder_organism) if params["single"]: cmd += " -ss" diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index c65b58af..27d7bed3 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -536,11 +536,11 @@ def file_checksum(fname): return checksum -def join_max(a, l, sep="", suffix=""): +def join_max(a, length, sep="", suffix=""): lengths = [len(x) for x in a] total = 0 for i, size in enumerate(lengths + [0]): - if total > (l - len(suffix)): + if total > (length - len(suffix)): return sep.join(a[: i - 1]) + suffix if i > 0: total += 1 From cb63c3ce687ff70137a2aaf8f709de081b4e7c21 Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 09:47:58 +0200 Subject: [PATCH 13/85] update test --- test/test_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_tools.py b/test/test_tools.py index d6dba2db..9903ce54 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -35,6 +35,7 @@ def test_tool(tool_name): "trawler", # unpredictable, sometimes doesn't find the motif "weeder", # doesn't 
work at the moment "posmo", # motif doesn't predictably look like AP1 + "dreme", # current dreme in bioconda is broken ]: return From 8cbb1c318a74abdf7d20020f605c1ce37744fe8b Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 10:09:44 +0200 Subject: [PATCH 14/85] check if openmp problem is resolved --- conda_env.osx.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/conda_env.osx.txt b/conda_env.osx.txt index 8eb820e6..b4944b03 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -8,6 +8,7 @@ ghostscript homer jinja2 logomaker +llvm-openmp matplotlib >=2.0 meme >=5 ncurses From 40c24555e95a0cc39686ce71aae9bd938386551e Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 10:28:08 +0200 Subject: [PATCH 15/85] test osx fix --- conda_env.osx.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda_env.osx.txt b/conda_env.osx.txt index b4944b03..dc597576 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -31,7 +31,7 @@ ucsc-bigbedtobed ucsc-genepredtobed weeder xdg -xgboost >=0.71 +py-xgboost=0.90 xxmotif # development-specific From a42f71bb6fde00c804dfb71d101339e36187e482 Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 11:00:13 +0200 Subject: [PATCH 16/85] update travis config --- .travis.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index b452bd84..c239bba1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,8 +21,11 @@ before_install: fi - chmod +x miniconda.sh - ./miniconda.sh -b -p $HOME/miniconda -f - - export PATH=$HOME/miniconda/bin:$PATH - - conda config --set always_yes yes + - source "$HOME/miniconda/etc/profile.d/conda.sh" + - hash -r + - conda config --set always_yes yes --set changeps1 no + - conda update -q conda + - conda info -a - if [ "$TRAVIS_OS_NAME" == "osx" ]; then ulimit -S -n 4096; ulimit -a; fi install: @@ -34,7 +37,7 @@ install: else conda env create -q -f conda_env.osx.txt -n gimme; fi - - source activate gimme + - conda 
activate gimme - python setup.py build && pip install -e . before_script: From e6ab1875f1a1707fefd17b5d319b07ff83f30cb9 Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 11:51:12 +0200 Subject: [PATCH 17/85] struggling with osx --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c239bba1..c8a17716 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,6 +38,7 @@ install: conda env create -q -f conda_env.osx.txt -n gimme; fi - conda activate gimme + - conda list - python setup.py build && pip install -e . before_script: From fc8b26974f2d5839b461ce39ff82b00b01da03fb Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 12:17:07 +0200 Subject: [PATCH 18/85] osx --- conda_env.osx.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conda_env.osx.txt b/conda_env.osx.txt index dc597576..84842a0e 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -21,8 +21,8 @@ pysam python python-xxhash pyyaml >=3.10 -scikit-learn >=0.18 -scipy <1.3.0 +scikit-learn >=0.23 +scipy seaborn statsmodels tqdm >=4.27.0 @@ -31,7 +31,7 @@ ucsc-bigbedtobed ucsc-genepredtobed weeder xdg -py-xgboost=0.90 +xgboost=0.72 xxmotif # development-specific From 353729d35879eb6b27e087ec14ca6fd74617ccb8 Mon Sep 17 00:00:00 2001 From: simonvh Date: Tue, 9 Jun 2020 13:34:15 +0200 Subject: [PATCH 19/85] test again --- conda_env.osx.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda_env.osx.txt b/conda_env.osx.txt index 84842a0e..9c931dd4 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -3,7 +3,7 @@ configparser diskcache feather-format gadem -genomepy >=0.6.1 +genomepy >=0.8.3 ghostscript homer jinja2 @@ -21,7 +21,7 @@ pysam python python-xxhash pyyaml >=3.10 -scikit-learn >=0.23 +scikit-learn scipy seaborn statsmodels From 8065900f21458eae6b7fb60a3c02b3a05738c65e Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Wed, 24 Jun 2020 10:04:32 +0200 Subject: [PATCH 20/85] coverage_table 
configurable nr of threads --- scripts/coverage_table | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/coverage_table b/scripts/coverage_table index 64385b65..4997304b 100644 --- a/scripts/coverage_table +++ b/scripts/coverage_table @@ -32,6 +32,7 @@ def make_table( topmethod="var", rmdup=True, rmrepeats=True, + ncpus=12 ): for x in datafiles: if not os.path.isfile(x): @@ -49,7 +50,7 @@ def make_table( data = {} try: # Load data in parallel - pool = multiprocessing.Pool(processes=12) + pool = multiprocessing.Pool(processes=ncpus) jobs = [] for datafile in datafiles: jobs.append( @@ -196,7 +197,14 @@ if __name__ == "__main__": action="store_false", default=True, ) - + parser.add_argument( + "--nthreads", + dest="ncpus", + help="Number of threads", + metavar="INT", + type=int, + default=12, + ) args = parser.parse_args() peakfile = args.peakfile datafiles = args.datafiles @@ -210,6 +218,7 @@ if __name__ == "__main__": topmethod=args.topmethod, rmdup=args.rmdup, rmrepeats=args.rmrepeats, + ncpus=args.ncpus ) yesno = {True: "yes", False: "no"} From 76e82d1a484489d896332ae430e2d270879e3396 Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Wed, 24 Jun 2020 10:06:43 +0200 Subject: [PATCH 21/85] minor style --- scripts/coverage_table | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/coverage_table b/scripts/coverage_table index 4997304b..829e67df 100644 --- a/scripts/coverage_table +++ b/scripts/coverage_table @@ -205,6 +205,7 @@ if __name__ == "__main__": type=int, default=12, ) + args = parser.parse_args() peakfile = args.peakfile datafiles = args.datafiles From 71e8f48e29c555128130d04a6ecc5bd142a08bc5 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 11:57:40 +0200 Subject: [PATCH 22/85] center maelstrom input by default; improved maelstrom report --- .../sortable/sortable-theme-slick.css | 3 +- data/templates/table.tpl | 215 +++++++ gimmemotifs/cli.py | 7 + gimmemotifs/commands/maelstrom.py | 2 
+ gimmemotifs/maelstrom.py | 36 +- gimmemotifs/report.py | 553 +++++++++++++++--- 6 files changed, 719 insertions(+), 97 deletions(-) create mode 100644 data/templates/table.tpl diff --git a/data/templates/sortable/sortable-theme-slick.css b/data/templates/sortable/sortable-theme-slick.css index df20f5a8..a699c304 100644 --- a/data/templates/sortable/sortable-theme-slick.css +++ b/data/templates/sortable/sortable-theme-slick.css @@ -1,6 +1,7 @@ /* line 2, ../sass/_sortable.sass */ table[data-sortable] { - font-size: 80%; + font-family: 'Nunito Sans'; + font-size: 90%; border-collapse: collapse; border-spacing: 0; } diff --git a/data/templates/table.tpl b/data/templates/table.tpl new file mode 100644 index 00000000..bf836b55 --- /dev/null +++ b/data/templates/table.tpl @@ -0,0 +1,215 @@ +{# Update the template_structure.html document too #} +{%- block before_style -%}{%- endblock before_style -%} +{% block style %} + + +{%- endblock style %} +{%- block before_table %}{% endblock before_table %} +{%- block table %} + +{%- block caption %} +{%- if caption -%} + +{%- endif -%} +{%- endblock caption %} +{%- block thead %} + + {%- block before_head_rows %}{% endblock %} + {%- for r in head %} + {%- block head_tr scoped %} + + {%- for c in r %} + {%- if c.is_visible != False %} + <{{ c.type }} class="{{c.class}}" {{ c.attributes|join(" ") }}>{{c.value}} + {%- endif %} + {%- endfor %} + + {%- endblock head_tr %} + {%- endfor %} + {%- block after_head_rows %}{% endblock %} + +{%- endblock thead %} +{%- block tbody %} + + {% block before_rows %}{% endblock before_rows %} + {% for r in body %} + {% block tr scoped %} + + {% for c in r %} + {% if c.is_visible != False %} + <{{ c.type }} {% if c.id is defined -%} id="T_{{ uuid }}{{ c.id }}" {%- endif %} class="{{ c.class }}" {{ c.attributes|join(" ") }}>{{ c.display_value }} + {% endif %} + {%- endfor %} + + {% endblock tr %} + {%- endfor %} + {%- block after_rows %}{%- endblock after_rows %} + +{%- endblock tbody %} +
{{caption}}
+{%- endblock table %} +{%- block after_table %}{% endblock after_table %} + diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index bc0e602f..4878e96b 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -326,6 +326,13 @@ def cli(sys_args): default=default_pfm_file, metavar="pfmfile", ) + p.add_argument( + "--nocenter", + dest="center", + help="Don't mean-center the rows by default", + default=True, + action="store_false", + ) p.add_argument( "-m", "--methods", diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py index b7d8e43f..f8573160 100755 --- a/gimmemotifs/commands/maelstrom.py +++ b/gimmemotifs/commands/maelstrom.py @@ -18,6 +18,7 @@ def maelstrom(args): methods = args.methods ncpus = args.ncpus zscore = args.zscore + center = args.center gc = args.gc if not os.path.exists(infile): @@ -35,4 +36,5 @@ def maelstrom(args): ncpus=ncpus, zscore=zscore, gc=gc, + center=center, ) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index c4bf21ab..6d24058c 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -22,6 +22,7 @@ import numpy as np import pandas as pd from sklearn.preprocessing import scale +from scipy.stats import pearsonr from scipy.cluster import hierarchy from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, dendrogram @@ -220,6 +221,7 @@ def run_maelstrom( ncpus=None, zscore=True, gc=True, + center=False, ): """Run maelstrom on an input table. @@ -264,6 +266,9 @@ def run_maelstrom( gc : bool, optional Use GC% bins to normalize motif scores. + + center : bool, optional + Mean-center the input table. 
""" logger.info("Starting maelstrom") if infile.endswith("feather"): @@ -272,6 +277,20 @@ def run_maelstrom( else: df = pd.read_table(infile, index_col=0, comment="#") + # Check if the input is mean-centered + if df.shape[1] > 1 and not np.allclose(df.mean(1), 0): + if center: + logger.info( + "Input is not mean-centered, setting the mean of all rows to 0." + ) + logger.info("Use --nocenter to change this behavior") + df = df.sub(df.mean(axis=1), axis=0) + else: + logger.info("Input is not mean-centered, but --nocenter was specified.") + logger.info( + "Leaving the data as-is, but make sure this is what your really want." + ) + # Check for duplicates if df.index.duplicated(keep=False).any(): logger.warning("Input file contains duplicate regions!") @@ -407,16 +426,31 @@ def run_maelstrom( except FileNotFoundError: logger.warn("Activity file for {} not found!\n".format(t)) + counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t") + scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t") + if len(methods) > 1: logger.info("Rank aggregation") df_p = df_rank_aggregation(df, dfs, exps) + + # Add correlation between motif score and signal + logger.info("Correlation") + cols = df_p.columns + for col in cols[::-1]: + df_p.insert(0, f"correlation {col}", 0) + for motif in df_p.index: + df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0] + + # Add percentage of input sequences with motif + df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100) + df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t") # df_p = df_p.join(m2f) # Write motif frequency table if df.shape[1] == 1: - mcount = df.join(pd.read_table(count_table, index_col=0, comment="#")) + mcount = df.join(counts) m_group = mcount.groupby(df.columns[0]) freq = m_group.sum() / m_group.count() freq.to_csv(os.path.join(outdir, "motif.freq.txt"), sep="\t") diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 239eb441..e7f63285 
100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -16,6 +16,9 @@ import numpy as np import pandas as pd from statsmodels.stats.multitest import multipletests +from pandas.core.indexing import _non_reducing_slice +from pandas.io.formats.style import Styler +import seaborn as sns from gimmemotifs.comparison import MotifComparer from gimmemotifs.fasta import Fasta @@ -28,6 +31,327 @@ logger = logging.getLogger("gimme.report") +FACTOR_TOOLTIP = "
factors
(direct or predicted)
" + + +def _wrap_html_str(x): + if " " not in x: + return x + + min_pos, max_pos = 0, len(x) + if ">" in x and "[^<>]*<").search(x) + min_pos, max_pos = m.start(), m.end() + + positions = [m.start() for m in re.compile(" ").finditer(x)] + positions = [p for p in positions if min_pos < p < max_pos] + + pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0] + x = x[:pos] + "
" + x[pos + 1 :] + return x + + +class ExtraStyler(Styler): + loader = jinja2.ChoiceLoader( + [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader] + ) + env = jinja2.Environment(loader=loader) + template = env.get_template("table.tpl") + + def __init__(self, *args, **kwargs): + self._data_todo = [] + self.circle_styles = None + self.palette_styles = None + self.col_heading_style = { + "name": "col_heading", + "props": [("border-bottom", "1px solid #e0e0e0")], + } + super(ExtraStyler, self).__init__(*args, **kwargs) + self.display_data = self.data.copy() + + # self.template = + + self._font = "Nunito Sans" + + @property + def font(self): + return self._font + + @font.setter + def font(self, font_name): + self._font = font_name + + def set_font(self, font_name): + self.font = font_name + return self + + def _current_index(self, subset, axis=0): + selected = self.data.loc[subset] + if axis == 0 or axis == "columns": + return self.data.columns.get_indexer(selected.columns) + if axis == 1 or axis == "index": + return self.data.index.get_indexer(selected.index) + + raise ValueError(f"unknown axis {axis}") + + def _translate(self): + self._compute_data() + d = super()._translate() + circle_styles = self.circle_styles or [] + palette_styles = self.palette_styles or [] + col_heading_style = self.col_heading_style or [] + d.update( + { + "font": self.font, + "circle_styles": circle_styles, + "palette_styles": palette_styles, + "col_heading_style": col_heading_style, + } + ) + return d + + def _compute_data(self): + r = self + for func, args, kwargs in self._data_todo: + r = func(self)(*args, **kwargs) + r.data = r.display_data + return r + + def _tooltip(self, tip, subset=None, part=None): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + if part is None: + part = "data" + + if part == "data": + self.display_data.loc[subset] = ( + "
" + + self.display_data.loc[subset].astype(str) + + "
" + ) + elif part == "columns": + idx = self._current_index(subset, axis="columns") + rename = dict( + zip( + self.display_data.columns[idx], + "
" + + self.display_data.columns[idx].astype(str) + + "
", + ) + ) + self.display_data.rename(columns=rename, inplace=True) + elif part == "index": + idx = self._current_index(subset, axis="index") + rename = dict( + zip( + self.display_data.index[idx], + "
" + + self.display_data.index[idx].astype(str) + + "
", + ) + ) + self.display_data.rename(index=rename, inplace=True) + else: + raise ValueError(f"unknown value for part: {part}") + return self + + def _wrap_iterable(self, it): + return [_wrap_html_str(val) for val in it] + + def _wrap(self, subset=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + if axis in [0, "columns"]: + idx = self._current_index(subset, axis="columns") + rename = dict( + zip( + self.display_data.columns[idx], + self._wrap_iterable(self.display_data.columns[idx]), + ) + ) + self.display_data.rename(columns=rename, inplace=True) + elif axis in [1, "index"]: + idx = self._current_index(subset, axis="index") + rename = dict( + zip( + self.display_data.index[idx], + self._wrap_iterable(self.display_data.index[idx]), + ) + ) + self.display_data.rename(index=rename, inplace=True) + else: + raise ValueError(f"unknown value for axis: {axis}") + return self + + def _convert_to_image(self, subset=None, height=30): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + self.display_data.loc[subset] = ( + f'
' + ) + return self + + def _border(self, idx, location="left"): + return [f"border-{location}: 2px solid #444;" for val in idx] + + def border(self, subset=None, location="left", part="data"): + if part == "data": + self.apply(self._border, subset=subset, location=location) + else: + self.col_heading_style["props"].append( + (f"border-{location}", "2px solid #444") + ) + return self + + def _center_align(self, idx): + return ["text-align:center;" for val in idx] + + def center_align(self, subset=None, axis=0): + self.apply(self._center_align, subset=subset, axis=axis) + return self + + def scaled_background_gradient( + self, subset=None, cmap="RdBu_r", scale_factor=1, center_zero=True + ): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + absmax = np.max( + ( + abs(self.data.loc[subset].max().max()), + abs(self.data.loc[subset].min().min()), + ) + ) + target = absmax * scale_factor + r = self + for col in self.data.loc[subset].columns: + smin = self.data[col].min() + smax = self.data[col].max() + diff = smax - smin + + if center_zero: + # Make sure center of palette is at 0 + low = abs((-target - smin) / diff) + high = (target - smax) / diff + else: + high = 1 / scale_factor + low = 1 / scale_factor + + r = r.background_gradient(cmap=cmap, low=low, high=high, subset=[col]) + return r + + def _circle( + self, + subset=None, + show_text=True, + color=None, + palette=None, + vmin=None, + vmax=None, + scale=False, + size=25, + min_size=5, + morph=False, + ): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + # Make sure we don't select text columns + subslice = pd.IndexSlice[ + self.data.loc[subset].index, + self.data.loc[subset].select_dtypes(exclude=["object"]).columns, + ] + + self.circle_styles = self.circle_styles or [] + circle_id = len(self.circle_styles) + 1 + + props = [ + ("height", f"{size}px"), + ("width", f"{size}px"), + ("border-radius", "50%"), + 
("color", "#000"), + ("line-height", f"{size}px"), + ("display", "inline-block"), + ("text-align", "center"), + ("vertical-align", "middle"), + ] + + if color: + palette = sns.color_palette([color]) + # print(palette) + elif palette is None: + palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10) + else: + # if isinstance(palette, str): + palette = sns.color_palette(palette) + + self.circle_styles.append({"name": f"circle{circle_id}", "props": props}) + self.palette_styles = self.palette_styles or [] + for i, color in enumerate(palette.as_hex()): + props = [("background-color", color)] + if scale: + circle_size = min_size + ((size - min_size) / len(palette) * (i + 1)) + props += [ + ("height", f"{circle_size}px"), + ("width", f"{circle_size}px"), + ("line-height", f"{circle_size}px"), + ("text-align", "center"), + ] + if morph: + props += [("border-radius", f"{50 - int(50 / len(palette)) * i}%")] + self.palette_styles.append( + {"name": f"color{circle_id}_{i}", "props": props} + ) + + vmax = vmax or self.data.loc[subslice].max().max() * 1.01 + text = self.display_data.loc[subslice].astype(str) if show_text else "" + self.display_data.loc[subslice] = ( + f"
" + + text + + "
" + ) + + return self + + def add_circle(self, **kwargs): + self._data_todo.append( + (lambda instance: getattr(instance, "_circle"), (), kwargs) + ) + return self + + def wrap(self, **kwargs): + self._data_todo.append( + (lambda instance: getattr(instance, "_wrap"), (), kwargs) + ) + return self + + def add_tooltip(self, tip, **kwargs): + self._data_todo.append( + (lambda instance: getattr(instance, "_tooltip"), (tip,), kwargs) + ) + return self + + def convert_to_image(self, **kwargs): + self._data_todo.append( + (lambda instance: getattr(instance, "_convert_to_image"), (), kwargs) + ) + return self + + def rename(self, columns=None, index=None): + self.display_data = self.display_data.rename(columns=columns, index=index) + return self + def get_roc_values(motif, fg_file, bg_file, genome): """Calculate ROC AUC values for ROC plots.""" @@ -167,8 +491,8 @@ class ReportMotif(object): + os.path.basename(roc_img_file % (motif.id, bg)) + ".png" } - rm.bg[bg][u"roc_img_link"] = { - u"href": "images/" + rm.bg[bg]["roc_img_link"] = { + "href": "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png" } @@ -253,93 +577,142 @@ def create_denovo_motif_report( ) +def format_factors(motif, max_length=5): + fmt_d = "{}" + fmt_i = "{}" + + direct = sorted(list(set([x.upper() for x in motif.factors[DIRECT_NAME]]))) + indirect = sorted( + list( + set( + [ + x.upper() + for x in motif.factors[INDIRECT_NAME] + if x.upper() not in direct + ] + ) + ) + ) + + if len(direct) > max_length: + show_factors = direct[:max_length] + else: + show_factors = direct[:] + for f in indirect: + if f not in show_factors: + show_factors.append(f) + if len(show_factors) >= max_length: + break + show_factors = sorted(show_factors) + + factor_str = ",".join( + [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] + ) + + if len(direct + indirect) > max_length: + factor_str += ", (...)" + + tooltip = "" + if len(direct) > 0: + tooltip += "direct: " + 
",".join(sorted(direct)) + if len(indirect) > 0: + if tooltip != "": + tooltip += " " + tooltip += "predicted: " + ",".join(sorted(indirect)) + + factor_str = '
' + factor_str + "
" + + return factor_str + + +def motif_to_factor_series(series, pfmfile=None, motifs=None): + if motifs is None: + motifs = read_motifs(pfmfile, as_dict=True) + + if isinstance(series, pd.Index): + index = series + else: + index = series.index + + factors = [format_factors(motifs[motif]) for motif in series] + return pd.Series(data=factors, index=index) + + +def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"): + if motifs is None: + motifs = read_motifs(pfmfile, as_dict=True) + + if not os.path.exists(outdir): + os.makedirs(outdir) + if not os.path.exists(os.path.join(outdir, subdir)): + os.makedirs(os.path.join(outdir, subdir)) + + img_series = [] + for motif in series: + if motif not in motifs: + raise ValueError(f"Motif {motif} does not occur in motif database") + fname = subdir + "/{}.png".format(re.sub("[()/]", "_", motif)) + if not os.path.exists(fname): + motifs[motif].plot_logo(fname=os.path.join(outdir, fname)) + img_series.append(fname) + + if isinstance(series, pd.Index): + index = series + else: + index = series.index + return pd.Series(data=img_series, index=index) + + def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): + + # Read the maelstrom text report df = pd.read_table(infile, index_col=0) - df = df[np.any(abs(df) >= threshold, 1)] - motifs = read_motifs(pfmfile) + # Columns with maelstrom rank aggregation value + value_cols = df.columns[ + ~df.columns.str.contains("correlation") & ~df.columns.isin(["% with motif"]) + ] + # Columns with correlation values + corr_cols = df.columns[df.columns.str.contains("correlation")] - df.rename_axis(None, inplace=True) - cols = df.columns + df = df[np.any(abs(df[value_cols]) >= threshold, 1)] - motifs = read_motifs(pfmfile) - idx = [motif.id for motif in motifs] - direct = [ - ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]]))) - for motif in motifs - ] - indirect = [ - ",".join(sorted(set([x.upper() for x in 
motif.factors[INDIRECT_NAME]]))) - for motif in motifs - ] - m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx) - - factor_cols = [DIRECT_NAME, INDIRECT_NAME] - if True: - for factor_col in factor_cols: - f = m2f[factor_col].str.len() > 30 - m2f[factor_col] = ( - '
' - + m2f[factor_col].str.slice(0, 30) - ) - m2f.loc[f, factor_col] += "(...)" - m2f[factor_col] += "
" - df = df.join(m2f) + # Add motif logo's + df.insert( + 0, + "logo", + motif_to_img_series(df.index, pfmfile=pfmfile, outdir=outdir, subdir="logos"), + ) + # Add factors that can bind to the motif + df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) - df["logo"] = [ - ''.format(re.sub("[()/]", "_", x)) - for x in list(df.index) - ] + df["% with motif"] = df["% with motif"].astype(int) - if not os.path.exists(outdir + "/logos"): - os.makedirs(outdir + "/logos") - for motif in motifs: - if motif.id in df.index: - motif.plot_logo( - fname=outdir + "/logos/{}.png".format(re.sub("[()/]", "_", motif.id)) - ) + rename_columns = {"factors": FACTOR_TOOLTIP} - template_dir = MotifConfig().get_template_dir() - js = open( - os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8" - ).read() - css = open( - os.path.join(template_dir, "sortable/sortable-theme-slick.css"), - encoding="utf-8", - ).read() - df = df[factor_cols + ["logo"] + list(cols)] - - df_styled = df.style - absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min()))) - target = absmax * 1.75 - - for col in cols: - smin = df[col].min() - smax = df[col].max() - diff = smax - smin - low = abs((-target - smin) / diff) - high = (target - smax) / diff - df_styled = df_styled.background_gradient( - cmap="RdBu_r", low=low, high=high, subset=[col] + df_styled = ( + ExtraStyler(df) + .set_precision(2) + .convert_to_image(subset=["logo"], height=30,) + .scaled_background_gradient( + subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75 ) - - df_styled = df_styled.set_precision(3) - df_styled = df_styled.set_table_attributes("data-sortable") - df_styled = df_styled.render() - df_styled = df_styled.replace( - "data-sortable", 'class="sortable-theme-slick" data-sortable' + .scaled_background_gradient( + subset=value_cols, center_zero=True, scale_factor=1.75 + ) + .border(subset=list(value_cols[:1]) + ["% with motif"], location="left") + 
.border(part="columns", location="bottom") + .add_circle(subset=["% with motif"], palette="Purples", vmax=100, size=40) + .set_table_attributes('class="sortable-theme-slick" data-sortable') + .center_align(subset=list(value_cols) + list(corr_cols) + ["% with motif"]) + .wrap(subset=["% with motif"] + list(corr_cols)) + .set_font("Nunito Sans") + .rename(columns=rename_columns,) + .render() ) with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f: - f.write("\n") - f.write("\n".format(css)) - f.write("\n") - f.write("\n") f.write(df_styled) - f.write("\n".format(js)) - f.write("\n") def roc_html_report( @@ -374,24 +747,10 @@ def roc_html_report( idx = [motif.id for motif in motifs] df = df.loc[idx] - direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs] - indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs] - m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx) - - factor_cols = [DIRECT_NAME, INDIRECT_NAME] - if True: - for factor_col in factor_cols: - f = m2f[factor_col].str.len() > 30 - m2f[factor_col] = ( - '
' - + m2f[factor_col].str.slice(0, 30) - ) - m2f.loc[f, factor_col] += "(...)" - m2f[factor_col] += "
" - df = df.join(m2f) - cols = factor_cols + cols + + # Add factors that can bind to the motif + df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) + cols = ["factors"] + cols df = df[df["corrected P-value"] <= threshold] @@ -410,6 +769,8 @@ def roc_html_report( ] df = df[cols] + + df = df.rename(columns={"factors": FACTOR_TOOLTIP}) if not os.path.exists(outdir + "/logos"): os.makedirs(outdir + "/logos") for motif in motifs: @@ -441,10 +802,12 @@ def roc_html_report( f.write("\n") if df.shape[0] > 0: f.write( - df.sort_values("ROC AUC", ascending=False) + df.reset_index() + .sort_values("ROC AUC", ascending=False) .style.bar(bar_cols) .set_precision(3) .set_table_attributes("data-sortable") + .hide_index() .render() .replace("data-sortable", 'class="sortable-theme-slick" data-sortable') ) From 85c0e8cf06cbde3c02a04bd16685015f1379b74a Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 12:03:09 +0200 Subject: [PATCH 23/85] don't add correlation if input is cluster-based --- gimmemotifs/maelstrom.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 6d24058c..0c78eae8 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -428,22 +428,23 @@ def run_maelstrom( counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t") scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t") - + if len(methods) > 1: logger.info("Rank aggregation") df_p = df_rank_aggregation(df, dfs, exps) - - # Add correlation between motif score and signal - logger.info("Correlation") - cols = df_p.columns - for col in cols[::-1]: - df_p.insert(0, f"correlation {col}", 0) - for motif in df_p.index: - df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0] - + + if df.shape[1] > 1: + # Add correlation between motif score and signal + logger.info("Correlation") + cols = df_p.columns + for col in 
cols[::-1]: + df_p.insert(0, f"correlation {col}", 0) + for motif in df_p.index: + df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0] + # Add percentage of input sequences with motif df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100) - + df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t") # df_p = df_p.join(m2f) From 1e4b5a598dc57c452ec3c446b3a646c385effab3 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 12:03:35 +0200 Subject: [PATCH 24/85] black --- gimmemotifs/maelstrom.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 0c78eae8..97ecc69f 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -440,7 +440,9 @@ def run_maelstrom( for col in cols[::-1]: df_p.insert(0, f"correlation {col}", 0) for motif in df_p.index: - df_p.loc[motif, f"correlation {col}"] = pearsonr(df[col], scores[motif])[0] + df_p.loc[motif, f"correlation {col}"] = pearsonr( + df[col], scores[motif] + )[0] # Add percentage of input sequences with motif df_p.insert(0, "% with motif", counts[df_p.index].sum(0) / df.shape[0] * 100) @@ -541,7 +543,7 @@ def plot_heatmap( figsize=None, max_len=50, aspect=1, - **kwargs + **kwargs, ): """Plot clustered heatmap of predicted motif activity. From 0f36a0ccd42910ba74b68d11184b7df4a0bd7c4b Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 13:22:12 +0200 Subject: [PATCH 25/85] support old maelstrom output --- gimmemotifs/report.py | 128 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 13 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index e7f63285..9b21cddb 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -52,6 +52,10 @@ def _wrap_html_str(x): class ExtraStyler(Styler): + """ + Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS. 
+ """ + loader = jinja2.ChoiceLoader( [jinja2.FileSystemLoader(MotifConfig().get_template_dir()), Styler.loader] ) @@ -82,6 +86,27 @@ def font(self, font_name): self._font = font_name def set_font(self, font_name): + """ + Set the font that will be used. + + Parameters + ---------- + font_name : str + Should be a font name available though the Google Font API. + + Returns + ------- + self : ExtraStyler + + Notes + ----- + ``font_name`` can contain spaces, eg. "Nunito Sans". + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) + >>> ExtraStyler(df).font("Roboto) + """ self.font = font_name return self @@ -205,12 +230,57 @@ def _convert_to_image(self, subset=None, height=30): def _border(self, idx, location="left"): return [f"border-{location}: 2px solid #444;" for val in idx] - def border(self, subset=None, location="left", part="data"): + def border( + self, + subset=None, + location="bottom", + part="data", + width="2px", + style="solid", + color="#444", + ): + """ + Add a border to data cells, columns or index. + + Parameters + ---------- + subset : IndexSlice, optional + An argument to ``DataFrame.loc`` that restricts which elements + ``border`` is applied to. If ``part`` is "columns" or "index" + subset should be present in either the columns or the index. + + location : str, optional + Location of the border, default is "bottom". Can be "top", "bottom", + "right" or "left". + + part : str, optional + If ``part`` is "data", the border will be applied to the data cells. + Set part to "index" or to "column" to add a border to the index or + header, respectively. + + width : str, int or float, optional + Valid CSS value for border width. + + style : str, optional + Valid CSS value for border style. + + color : str, optional + Valid CSS value for border color. 
+ + Returns + ------- + self : ExtraStyler + + Examples + -------- + >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) + >>> ExtraStyler(df).border(part="columns) + """ if part == "data": self.apply(self._border, subset=subset, location=location) else: self.col_heading_style["props"].append( - (f"border-{location}", "2px solid #444") + (f"border-{location}", f"{width} {style} {color}") ) return self @@ -218,6 +288,24 @@ def _center_align(self, idx): return ["text-align:center;" for val in idx] def center_align(self, subset=None, axis=0): + """ + Center align text. + + Parameters + ---------- + subset : IndexSlice, optional + An argument to ``DataFrame.loc`` that restricts which elements + ``center_align`` is applied to. + + axis : {0 or 'index', 1 or 'columns', None}, default 0 + Apply to each column (``axis=0`` or ``'index'``), to each row + (``axis=1`` or ``'columns'``), or to the entire DataFrame at once + with ``axis=None``. + + Returns + ------- + self : ExtraStyler + """ self.apply(self._center_align, subset=subset, axis=axis) return self @@ -686,31 +774,45 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): # Add factors that can bind to the motif df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) - df["% with motif"] = df["% with motif"].astype(int) - rename_columns = {"factors": FACTOR_TOOLTIP} df_styled = ( ExtraStyler(df) .set_precision(2) .convert_to_image(subset=["logo"], height=30,) - .scaled_background_gradient( - subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75 - ) .scaled_background_gradient( subset=value_cols, center_zero=True, scale_factor=1.75 ) - .border(subset=list(value_cols[:1]) + ["% with motif"], location="left") + .border(subset=list(value_cols[:1]), location="left") .border(part="columns", location="bottom") - .add_circle(subset=["% with motif"], palette="Purples", vmax=100, size=40) .set_table_attributes('class="sortable-theme-slick" data-sortable') - 
.center_align(subset=list(value_cols) + list(corr_cols) + ["% with motif"]) - .wrap(subset=["% with motif"] + list(corr_cols)) + .center_align(subset=list(value_cols)) .set_font("Nunito Sans") - .rename(columns=rename_columns,) - .render() + .rename(columns=rename_columns) ) + if len(corr_cols) > 0: + df_styled = ( + df_styled.wrap(subset=list(corr_cols)) + .center_align(subset=list(corr_cols)) + .scaled_background_gradient( + subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75 + ) + ) + + if "% with motif" in df.columns: + df["% with motif"] = df["% with motif"].astype(int) + df_styled = ( + df_styled.add_circle( + subset=["% with motif"], palette="Purples", vmax=100, size=40 + ) + .wrap(subset=["% with motif"]) + .center_align(subset=["% with motif"]) + .border(subset=["% with motif"], location="left") + ) + + df_styled = df_styled.render() + with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f: f.write(df_styled) From f8b7bc5c5f8054c5f1b39fafd305da9a89c58a75 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 14:59:53 +0200 Subject: [PATCH 26/85] Fix B009 warning --- gimmemotifs/report.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 9b21cddb..e429f643 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -414,25 +414,25 @@ def _circle( def add_circle(self, **kwargs): self._data_todo.append( - (lambda instance: getattr(instance, "_circle"), (), kwargs) + (lambda instance: instance._circle, (), kwargs) ) return self def wrap(self, **kwargs): self._data_todo.append( - (lambda instance: getattr(instance, "_wrap"), (), kwargs) + (lambda instance: instance._wrap, (), kwargs) ) return self def add_tooltip(self, tip, **kwargs): self._data_todo.append( - (lambda instance: getattr(instance, "_tooltip"), (tip,), kwargs) + (lambda instance: instance._tooltip, (tip,), kwargs) ) return self def convert_to_image(self, 
**kwargs): self._data_todo.append( - (lambda instance: getattr(instance, "_convert_to_image"), (), kwargs) + (lambda instance: instance._convert_to_image, (), kwargs) ) return self From e72e411ac7290adaa30e33caf32e4465c14d0a37 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 7 Jul 2020 15:01:35 +0200 Subject: [PATCH 27/85] fix test --- test/test_maelstrom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py index 269e5b92..4edda61b 100644 --- a/test/test_maelstrom.py +++ b/test/test_maelstrom.py @@ -32,7 +32,7 @@ def test1_maelstrom(self): ) df = pd.read_table(self.outfile, index_col=0, comment="#") print(df.shape) - self.assertEquals((623, 4), df.shape) + self.assertEquals((623, 5), df.shape) for fname in glob(os.path.join(self.outdir, "activity*")): os.unlink(fname) From d8eae732cb4eacb3dec6669386f79e23c5cb095c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 8 Jul 2020 10:29:41 +0200 Subject: [PATCH 28/85] emojify --- gimmemotifs/report.py | 199 +++++++++++++++++++++++++++++++----------- 1 file changed, 150 insertions(+), 49 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index e429f643..8fdc27d8 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -20,6 +20,11 @@ from pandas.io.formats.style import Styler import seaborn as sns +try: + import emoji +except ImportError: + pass + from gimmemotifs.comparison import MotifComparer from gimmemotifs.fasta import Fasta from gimmemotifs.motif import read_motifs @@ -88,36 +93,37 @@ def font(self, font_name): def set_font(self, font_name): """ Set the font that will be used. - + Parameters ---------- font_name : str Should be a font name available though the Google Font API. - + Returns ------- self : ExtraStyler - + Notes ----- ``font_name`` can contain spaces, eg. "Nunito Sans". 
- + Examples -------- >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) - >>> ExtraStyler(df).font("Roboto) + >>> ExtraStyler(df).font("Roboto) """ self.font = font_name return self - def _current_index(self, subset, axis=0): + def _current_index(self, subset): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) selected = self.data.loc[subset] - if axis == 0 or axis == "columns": - return self.data.columns.get_indexer(selected.columns) - if axis == 1 or axis == "index": - return self.data.index.get_indexer(selected.index) - - raise ValueError(f"unknown axis {axis}") + idx_slice = pd.IndexSlice[ + self.data.index.get_indexer(selected.index), + self.data.columns.get_indexer(selected.columns), + ] + return idx_slice def _translate(self): self._compute_data() @@ -158,7 +164,7 @@ def _tooltip(self, tip, subset=None, part=None): + "" ) elif part == "columns": - idx = self._current_index(subset, axis="columns") + idx = self._current_index(subset)[1] rename = dict( zip( self.display_data.columns[idx], @@ -171,7 +177,7 @@ def _tooltip(self, tip, subset=None, part=None): ) self.display_data.rename(columns=rename, inplace=True) elif part == "index": - idx = self._current_index(subset, axis="index") + idx = self._current_index(subset)[0] rename = dict( zip( self.display_data.index[idx], @@ -195,7 +201,7 @@ def _wrap(self, subset=None, axis=0): subset = _non_reducing_slice(subset) if axis in [0, "columns"]: - idx = self._current_index(subset, axis="columns") + idx = self._current_index(subset)[1] rename = dict( zip( self.display_data.columns[idx], @@ -204,7 +210,7 @@ def _wrap(self, subset=None, axis=0): ) self.display_data.rename(columns=rename, inplace=True) elif axis in [1, "index"]: - idx = self._current_index(subset, axis="index") + idx = self._current_index(subset)[0] rename = dict( zip( self.display_data.index[idx], @@ -248,14 +254,14 @@ def border( An argument to ``DataFrame.loc`` that restricts which 
elements ``border`` is applied to. If ``part`` is "columns" or "index" subset should be present in either the columns or the index. - + location : str, optional Location of the border, default is "bottom". Can be "top", "bottom", "right" or "left". part : str, optional If ``part`` is "data", the border will be applied to the data cells. - Set part to "index" or to "column" to add a border to the index or + Set part to "index" or to "column" to add a border to the index or header, respectively. width : str, int or float, optional @@ -270,7 +276,7 @@ def border( Returns ------- self : ExtraStyler - + Examples -------- >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) @@ -284,29 +290,32 @@ def border( ) return self - def _center_align(self, idx): - return ["text-align:center;" for val in idx] + def _align(self, idx, location="center"): + return [f"text-align:{location};" for val in idx] - def center_align(self, subset=None, axis=0): + def align(self, subset=None, location="center", axis=0): """ - Center align text. + Align text. Parameters ---------- subset : IndexSlice, optional An argument to ``DataFrame.loc`` that restricts which elements - ``center_align`` is applied to. + ``center_align`` is applied to. + + location : str, optional + "center", "left" or "right" axis : {0 or 'index', 1 or 'columns', None}, default 0 Apply to each column (``axis=0`` or ``'index'``), to each row (``axis=1`` or ``'columns'``), or to the entire DataFrame at once with ``axis=None``. 
- + Returns ------- self : ExtraStyler """ - self.apply(self._center_align, subset=subset, axis=axis) + self.apply(self._align, subset=subset, location=location, axis=axis) return self def scaled_background_gradient( @@ -352,12 +361,13 @@ def _circle( morph=False, ): subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) + subslice = _non_reducing_slice(subset) # Make sure we don't select text columns - subslice = pd.IndexSlice[ - self.data.loc[subset].index, - self.data.loc[subset].select_dtypes(exclude=["object"]).columns, - ] + if scale or morph: + subslice = pd.IndexSlice[ + self.data.loc[subset].index, + self.data.loc[subset].select_dtypes(exclude=["object"]).columns, + ] self.circle_styles = self.circle_styles or [] circle_id = len(self.circle_styles) + 1 @@ -400,34 +410,36 @@ def _circle( {"name": f"color{circle_id}_{i}", "props": props} ) - vmax = vmax or self.data.loc[subslice].max().max() * 1.01 - text = self.display_data.loc[subslice].astype(str) if show_text else "" - self.display_data.loc[subslice] = ( - f"
" - + text - + "
" - ) + if scale or morph: + vmax = vmax or self.data.loc[subslice].max().max() * 1.01 + text = self.display_data.loc[subslice].astype(str) if show_text else "" + self.display_data.loc[subslice] = ( + f"
" + + text + + "
" + ) + else: + text = self.display_data.loc[subslice].astype(str) if show_text else "" + self.display_data.loc[subslice] = ( + f"
" + text + "
" + ) return self def add_circle(self, **kwargs): - self._data_todo.append( - (lambda instance: instance._circle, (), kwargs) - ) + self._data_todo.append((lambda instance: instance._circle, (), kwargs)) return self def wrap(self, **kwargs): - self._data_todo.append( - (lambda instance: instance._wrap, (), kwargs) - ) + self._data_todo.append((lambda instance: instance._wrap, (), kwargs)) return self def add_tooltip(self, tip, **kwargs): - self._data_todo.append( - (lambda instance: instance._tooltip, (tip,), kwargs) - ) + self._data_todo.append((lambda instance: instance._tooltip, (tip,), kwargs)) return self def convert_to_image(self, **kwargs): @@ -440,6 +452,95 @@ def rename(self, columns=None, index=None): self.display_data = self.display_data.rename(columns=columns, index=index) return self + def _emoji_score(self, series, emoji_str=None, bins=None): + if emoji_str is None: + emoji_str = ":star:" + if bins is None: + bins = 3 + + if isinstance(bins, int): + labels = range(1, bins + 1) + else: + labels = range(1, len(bins)) + + return [ + emoji.emojize(emoji_str * val, use_aliases=True) + for val in pd.cut(series, bins=bins, labels=labels) + ] + + def _emoji_scale(self, series, emojis=None, bins=None): + emoji_dict = { + "thumbs": [":thumbsdown:", ":thumbsup:"], + "check": [":cross_mark:", ":white_check_mark:"], + "smiley": [ + ":crying_face:", + ":slightly_frowning_face:", + ":neutral_face:", + ":slightly_smiling_face:", + ":grin:", + ], + "black_square": [ + ":black_small_square:", + ":black_medium_small_square:", + ":black_medium_square:", + ":black_large_square:", + ], + "white_square": [ + ":white_small_square:", + ":white_medium_small_square:", + ":white_medium_square:", + ":white_large_square:", + ], + } + + if emojis is None: + emojis = "smiley" + + if emojis in emoji_dict: + labels = emoji_dict[emojis] + if bins is None: + bins = len(labels) + + return [ + emoji.emojize(val, use_aliases=True) + for val in pd.cut(series, bins=bins, labels=labels) + ] + 
+ def emoji_scale(self, subset=None, emojis=None, bins=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + + result = self.display_data.iloc[idx].apply( + self._emoji_scale, axis=axis, result_type="expand", args=(emojis, bins) + ) + self.display_data.iloc[idx] = result.values + + return self.align(subset=subset, location="center", axis=axis) + + def emoji_score(self, subset=None, emoji_str=None, bins=None, axis=0): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + result = self.display_data.iloc[idx].apply( + self._emoji_score, axis=axis, result_type="expand", args=(emoji_str, bins) + ) + self.display_data.iloc[idx] = result.values + + return self.align(subset=subset, location="left", axis=axis) + + def emojify(self, subset=None): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + idx = self._current_index(subset=subset) + result = self.display_data.iloc[idx].applymap(emoji.emojize) + self.display_data.iloc[idx] = result.values + + return self + def get_roc_values(motif, fg_file, bg_file, genome): """Calculate ROC AUC values for ROC plots.""" From 2eac4c4fcc8e4f4924e4c1eda672ca798c91c66d Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Tue, 14 Jul 2020 15:51:07 +0200 Subject: [PATCH 29/85] fix issue with chrom names as int --- scripts/combine_peaks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/combine_peaks b/scripts/combine_peaks index 155d6e83..07d41040 100644 --- a/scripts/combine_peaks +++ b/scripts/combine_peaks @@ -116,7 +116,7 @@ def combine_peaks(peaks, genome, window, scale_value): # store summit location + associated value in col4 df_all["col4"] = ( - df_all["chrom"] + df_all["chrom"].astype(str) + ";" + df_all["start"].astype(str) + ";" From 
bea0a24c003832a4eff8ec153dfc8c6e1565483e Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 00:05:52 +0200 Subject: [PATCH 30/85] correctly read motif file --- gimmemotifs/motif.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index cb8aba8b..2be35178 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -1393,13 +1393,7 @@ def parse_motifs(motifs): List of Motif instances. """ if isinstance(motifs, str): - with open(motifs) as f: - if motifs.endswith("pwm") or motifs.endswith("pfm"): - motifs = read_motifs(f, fmt="pwm") - elif motifs.endswith("transfac"): - motifs = read_motifs(f, fmt="transfac") - else: - motifs = read_motifs(f) + return read_motifs(motifs) elif isinstance(motifs, Motif): motifs = [motifs] else: From e87801aef7516a545e2fe04f8f72e887add3ebf8 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 08:43:04 +0200 Subject: [PATCH 31/85] update report --- gimmemotifs/commands/motifs.py | 12 ++- gimmemotifs/maelstrom.py | 1 + gimmemotifs/report.py | 159 +++++++++++++++++---------------- 3 files changed, 90 insertions(+), 82 deletions(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index a1681c8d..e72e7c5d 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -13,6 +13,7 @@ from tempfile import NamedTemporaryFile import numpy as np +import pandas as pd from gimmemotifs.background import create_background_file from gimmemotifs.comparison import MotifComparer, select_nonredundant_motifs @@ -142,7 +143,7 @@ def motifs(args): # Print the metrics f_out.write( - "Motif\t# matches\t# matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n" + "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. 
at 1% FPR\tRecall at 10% FDR\n" ) logger.info("creating motif scan tables") @@ -174,6 +175,9 @@ def motifs(args): gcnorm=True, ) + n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0] + n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0] + logger.info("calculating stats") for motif_stats in calc_stats_iterator( motifs=pfmfile, @@ -188,10 +192,12 @@ def motifs(args): if motif_stats[str(motif)]["phyper_at_fpr"] > 0: log_pvalue = -np.log10(motif_stats[str(motif)]["phyper_at_fpr"]) f_out.write( - "{}\t{:d}\t{:d}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format( + "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format( motif.id, motif_stats[str(motif)]["matches_at_fpr"][0], + motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100, motif_stats[str(motif)]["matches_at_fpr"][1], + motif_stats[str(motif)]["matches_at_fpr"][1] / n_background * 100, motif_stats[str(motif)]["phyper_at_fpr"], log_pvalue, motif_stats[str(motif)]["roc_auc"], @@ -203,7 +209,7 @@ def motifs(args): f_out.close() # Select a set of "non-redundant" motifs. - # Using Recursive Feature Elemination, a set of motifs is selected that + # Using Recursive Feature Elimination, a set of motifs is selected that # best explains the peaks in comparison to the background sequences. 
nr_motifs = select_nonredundant_motifs( args.outdir + "/gimme.roc.report.txt", diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 97ecc69f..4719fabb 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -187,6 +187,7 @@ def _rank_agg_column(exps, dfs, e): tmp_dfs[i][k] = ( v.sample(frac=1).sort_values(e, ascending=sort_order).index.values ) + return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1])) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 8fdc27d8..5bde42e0 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -19,6 +19,8 @@ from pandas.core.indexing import _non_reducing_slice from pandas.io.formats.style import Styler import seaborn as sns +from matplotlib import colors +import matplotlib.pyplot as plt try: import emoji @@ -318,41 +320,22 @@ def align(self, subset=None, location="center", axis=0): self.apply(self._align, subset=subset, location=location, axis=axis) return self - def scaled_background_gradient( - self, subset=None, cmap="RdBu_r", scale_factor=1, center_zero=True - ): - subset = pd.IndexSlice[:, :] if subset is None else subset - subset = _non_reducing_slice(subset) - absmax = np.max( - ( - abs(self.data.loc[subset].max().max()), - abs(self.data.loc[subset].min().min()), - ) - ) - target = absmax * scale_factor - r = self - for col in self.data.loc[subset].columns: - smin = self.data[col].min() - smax = self.data[col].max() - diff = smax - smin - - if center_zero: - # Make sure center of palette is at 0 - low = abs((-target - smin) / diff) - high = (target - smax) / diff - else: - high = 1 / scale_factor - low = 1 / scale_factor + def _background_gradient(self, s, m, M, cmap='PuBu', low=0, high=0): + rng = M - m + norm = colors.Normalize(m - (rng * low), + M + (rng * high)) + normed = norm(s.values) + c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)] + return ['background-color: %s' % color for color in c] + - r = r.background_gradient(cmap=cmap, 
low=low, high=high, subset=[col]) - return r def _circle( self, subset=None, show_text=True, color=None, - palette=None, + cmap=None, vmin=None, vmax=None, scale=False, @@ -362,13 +345,24 @@ def _circle( ): subset = pd.IndexSlice[:, :] if subset is None else subset subslice = _non_reducing_slice(subset) + + if color: + palette = sns.color_palette([color]) + # print(palette) + elif cmap is None: + palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10) + else: + # if isinstance(palette, str): + palette = sns.color_palette(cmap) + # Make sure we don't select text columns - if scale or morph: + if len(palette) > 1: subslice = pd.IndexSlice[ - self.data.loc[subset].index, - self.data.loc[subset].select_dtypes(exclude=["object"]).columns, + self.data.loc[subslice].index, + self.data.loc[subslice].select_dtypes(exclude=["object"]).columns, ] + self.circle_styles = self.circle_styles or [] circle_id = len(self.circle_styles) + 1 @@ -383,14 +377,6 @@ def _circle( ("vertical-align", "middle"), ] - if color: - palette = sns.color_palette([color]) - # print(palette) - elif palette is None: - palette = sns.light_palette((210, 90, 60), input="husl", n_colors=10) - else: - # if isinstance(palette, str): - palette = sns.color_palette(palette) self.circle_styles.append({"name": f"circle{circle_id}", "props": props}) self.palette_styles = self.palette_styles or [] @@ -410,8 +396,8 @@ def _circle( {"name": f"color{circle_id}_{i}", "props": props} ) - if scale or morph: - vmax = vmax or self.data.loc[subslice].max().max() * 1.01 + if len(palette) > 1: + vmax = self.data.loc[subslice].max().max() * 1.01 if vmax is None else vmax * 1.01 text = self.display_data.loc[subslice].astype(str) if show_text else "" self.display_data.loc[subslice] = ( f"
0: df_styled = ( df_styled.wrap(subset=list(corr_cols)) - .center_align(subset=list(corr_cols)) + .align(subset=list(corr_cols), location="center") .scaled_background_gradient( - subset=corr_cols, cmap="PuOr_r", center_zero=True, scale_factor=1.75 + subset=corr_cols, cmap="PuOr_r", center_zero=True, min=1/1.75, max=1/1.75 ) ) @@ -905,10 +918,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): df["% with motif"] = df["% with motif"].astype(int) df_styled = ( df_styled.add_circle( - subset=["% with motif"], palette="Purples", vmax=100, size=40 + subset=["% with motif"], cmap="Purples", vmax=100, size=40 ) .wrap(subset=["% with motif"]) - .center_align(subset=["% with motif"]) + .align(subset=["% with motif"], location="center") .border(subset=["% with motif"], location="left") ) @@ -944,17 +957,12 @@ def roc_html_report( "Recall at 10% FDR", ] - motifs = read_motifs(pfmfile) + motifs = read_motifs(pfmfile, as_dict=True) if use_motifs is not None: - motifs = [m for m in motifs if m.id in use_motifs] - - idx = [motif.id for motif in motifs] + motifs = {k:v for k,v in motifs.items() if k in use_motifs} + idx = list(motifs.keys()) df = df.loc[idx] - # Add factors that can bind to the motif - df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) - cols = ["factors"] + cols - df = df[df["corrected P-value"] <= threshold] if link_matches: @@ -966,23 +974,17 @@ def roc_html_report( + "" ) - df["logo"] = [ - ''.format(re.sub(r"[^-_\w]+", "_", x)) - for x in list(df.index) - ] - - df = df[cols] - - df = df.rename(columns={"factors": FACTOR_TOOLTIP}) - if not os.path.exists(outdir + "/logos"): - os.makedirs(outdir + "/logos") - for motif in motifs: - if motif.id in df.index: - motif.plot_logo( - fname=outdir - + "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id)) - ) + # Add motif logo's + df.insert( + 0, + "logo", + motif_to_img_series(df.index, pfmfile=pfmfile, motifs=motifs, outdir=outdir, subdir="logos"), + ) + # Add 
factors that can bind to the motif + df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile, motifs=motifs)) + rename_columns = {"factors": FACTOR_TOOLTIP} + bar_cols = [ "log10 P-value", "ROC AUC", @@ -998,6 +1000,8 @@ def roc_html_report( os.path.join(template_dir, "sortable/sortable-theme-slick.css"), encoding="utf-8", ).read() + + df = df.reset_index().sort_values("ROC AUC", ascending=False) with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f: f.write("\n") f.write("\n".format(css)) @@ -1005,16 +1009,13 @@ def roc_html_report( f.write("\n") if df.shape[0] > 0: f.write( - df.reset_index() - .sort_values("ROC AUC", ascending=False) - .style.bar(bar_cols) - .set_precision(3) + ExtraStyler(df) + .bar(bar_cols) + .set_precision(2) .set_table_attributes("data-sortable") .hide_index() .render() - .replace("data-sortable", 'class="sortable-theme-slick" data-sortable') ) else: - f.write("No enriched motifs found.") - f.write("\n".format(js)) - f.write("\n") + f.write("No enriched motifs found.") + From f6e1374c8346d6b86bc2c07fbddce41f30ef049c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 12:30:53 +0200 Subject: [PATCH 32/85] fix GC%-normalized z-score cutoff --- gimmemotifs/scanner.py | 380 ++++++++++++++++++----------------------- 1 file changed, 166 insertions(+), 214 deletions(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index 337c912d..cbb05961 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -1,6 +1,7 @@ import os import re import sys +from collections import Counter from functools import partial from tempfile import mkdtemp, NamedTemporaryFile import logging @@ -18,6 +19,8 @@ from diskcache import Cache import numpy as np from scipy.stats import scoreatpercentile +from sklearn.preprocessing import scale +import pandas as pd from gimmemotifs import __version__ from gimmemotifs.background import RandomGenomicFasta, gc_bin_bedfile @@ -369,7 +372,9 @@ def 
parse_threshold_values(motif_file, cutoff): return threshold -def scan_sequence(seq, motifs, nreport, scan_rc): +def scan_sequence( + seq, seq_gc_bin, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False +): ret = [] # scan for motifs @@ -377,36 +382,38 @@ def scan_sequence(seq, motifs, nreport, scan_rc): if cutoff is None: ret.append([]) else: - result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc) - if cutoff <= motif.pwm_min_score() and len(result) == 0: - result = [[motif.pwm_min_score(), 0, 1]] * nreport + if zscore: + m_mean, m_std = motifs_meanstd[seq_gc_bin][motif.id] + result = pwmscan( + seq, motif.logodds, motif.pwm_min_score(), nreport, scan_rc + ) + result = [[(row[0] - m_mean) / m_std, row[1], row[2]] for row in result] + result = [row for row in result if row[0] >= cutoff] + else: + result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc) + if cutoff <= motif.pwm_min_score() and len(result) == 0: + result = [[motif.pwm_min_score(), 0, 1]] * nreport + ret.append(result) # return results return ret -def scan_region(region, genome, motifs, nreport, scan_rc): - - # retrieve sequence - chrom, start, end = re.split(r"[:-]", region) - seq = genome[chrom][int(start) : int(end)].seq.upper() - - return scan_sequence(seq, motifs, nreport, scan_rc) - - -def scan_seq_mult(seqs, motifs, nreport, scan_rc): - ret = [] - for seq in seqs: - result = scan_sequence(seq.upper(), motifs, nreport, scan_rc) - ret.append(result) - return ret - - -def scan_region_mult(regions, genome, motifs, nreport, scan_rc): +def scan_seq_mult( + seqs, seq_gc_bins, motifs, nreport, scan_rc, motifs_meanstd=None, zscore=False +): ret = [] - for region in regions: - result = scan_region(region, genome, motifs, nreport, scan_rc) + for seq, seq_gc_bin in zip(seqs, seq_gc_bins): + result = scan_sequence( + seq.upper(), + seq_gc_bin, + motifs, + nreport, + scan_rc, + motifs_meanstd=motifs_meanstd, + zscore=zscore, + ) ret.append(result) return ret @@ -528,7 +535,7 @@ class 
Scanner(object): def __init__(self, ncpus=None): self.config = MotifConfig() - self.threshold = None + self._threshold = None self.genome = None self.background = None self.meanstd = {} @@ -603,19 +610,15 @@ def _meanstd_from_seqs(self, motifs, seqs): def _threshold_from_seqs(self, motifs, seqs, fpr): scan_motifs = [(m, m.pwm_min_score()) for m in motifs] - table = [] - for x in self._scan_sequences_with_motif(scan_motifs, seqs, 1, True): - table.append([row[0][0] for row in x]) + seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs] + for gc_bin, result in zip( + seq_gc_bins, self._scan_sequences_with_motif(scan_motifs, seqs, 1, True) + ): + table.append([gc_bin] + [row[0][0] for row in result]) - for (motif, _), scores in zip(scan_motifs, np.array(table).transpose()): - if len(scores) > 0: - opt_score = scoreatpercentile(scores, 100 - (100 * fpr)) - yield motif, opt_score # cutoff - else: - raise ValueError( - "Could not determine threshold for motif {}".format(motif) - ) + df = pd.DataFrame(table, columns=["gc_bin"] + [m.id for m in motifs]) + return df def set_meanstd(self, gc=False): if not self.background: @@ -677,7 +680,7 @@ def set_meanstd(self, gc=False): lock.release() def set_background( - self, fname=None, genome=None, size=200, nseq=10000, gc=False, gc_bins=None + self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None ): """Set the background to use for FPR and z-score calculations. 
@@ -704,6 +707,16 @@ def set_background( size = int(size) + if gc_bins is None: + if gc: + gc_bins = [(0.0, 0.2), (0.8, 1)] + for b in np.arange(0.2, 0.799, 0.05): + gc_bins.append((b, b + 0.05)) + else: + gc_bins = [(0, 1)] + if nseq is None: + nseq = max(10000, len(gc_bins) * 1000) + if genome and fname: raise ValueError("Need either genome or filename for background.") @@ -735,12 +748,6 @@ def set_background( if not fa: if gc: - - if gc_bins is None: - gc_bins = [(0.0, 0.2), (0.8, 1)] - for b in np.arange(0.2, 0.799, 0.05): - gc_bins.append((b, b + 0.05)) - with NamedTemporaryFile() as tmp: logger.info("using {} sequences".format(nseq)) gc_bin_bedfile( @@ -756,6 +763,12 @@ def set_background( if gc_bins: self.gc_bins = gc_bins + @property + def threshold(self): + if self._threshold is None: + self.set_threshold() + return self._threshold + def set_threshold(self, fpr=None, threshold=None, gc=False): """Set motif scanning threshold based on background sequences. @@ -774,6 +787,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): if threshold and fpr: raise ValueError("Need either fpr or threshold.") + if threshold is None and fpr is None: + if self.genome: + fpr = 0.01 + logger.info(f"Using default FPR of {fpr}") + else: + threshold = 0.95 + logger.info( + f"Genome not specified, using default threshold of {threshold}." 
+ ) + logger.info("This is likely not ideal.") + if fpr: fpr = float(fpr) if not (0.0 < fpr < 1.0): @@ -784,9 +808,17 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): thresholds = {} motifs = read_motifs(self.motifs) + gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins] if threshold is not None: - self.threshold = parse_threshold_values(self.motifs, threshold) + data = [] + + d = parse_threshold_values(self.motifs, threshold) + self._threshold = pd.DataFrame(d, index=[0]) + self._threshold = self._threshold.join( + pd.DataFrame(gc_bins, index=[0] * len(gc_bins), columns=["gc_bin"]) + ) + self._threshold = self._threshold.set_index("gc_bin") return if not self.background: @@ -800,36 +832,41 @@ def set_threshold(self, fpr=None, threshold=None, gc=False): lock.acquire() with Cache(CACHE_DIR) as cache: scan_motifs = [] + self._threshold = None for motif in motifs: - k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr) - - threshold = cache.get(k) - if threshold is None: + k = "{}|{}|{:.4f}|{}".format( + motif.hash(), self.background_hash, fpr, ",".join(sorted(gc_bins)) + ) + vals = cache.get(k) + if vals is None: scan_motifs.append(motif) else: - if np.isclose(threshold, motif.pwm_max_score()): - thresholds[motif.id] = None - elif np.isclose(threshold, motif.pwm_min_score()): - thresholds[motif.id] = 0.0 + if self._threshold is None: + self._threshold = vals.to_frame() else: - thresholds[motif.id] = threshold + self._threshold[motif.id] = vals if len(scan_motifs) > 0: logger.info("determining FPR-based threshold") - for motif, threshold in self._threshold_from_seqs( - scan_motifs, seqs, fpr - ): - k = "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr) - cache.set(k, threshold) - if np.isclose(threshold, motif.pwm_max_score()): - thresholds[motif.id] = None - elif np.isclose(threshold, motif.pwm_min_score()): - thresholds[motif.id] = 0.0 - else: - thresholds[motif.id] = threshold + df = 
self._threshold_from_seqs(scan_motifs, seqs, fpr).set_index( + "gc_bin" + ) + if self._threshold is None: + self._threshold = df + else: + self._threshold = pd.concat((self._threshold, df), axis=1) + for motif in scan_motifs: + k = "{}|{}|{:.4f}|{}".format( + motif.hash(), + self.background_hash, + fpr, + ",".join(sorted(gc_bins)), + ) + cache.set(k, df[motif.id]) lock.release() - self.threshold_str = "{}_{}_{}".format(fpr, threshold, self.background_hash) - self.threshold = thresholds + self.threshold_str = "{}_{}_{}_{}".format( + fpr, threshold, self.background_hash, ",".join(sorted(gc_bins)) + ) def set_genome(self, genome): """ @@ -930,16 +967,9 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False): """ Scan a set of regions or sequences. """ - - if not self.threshold: - logger.info( - "Using default threshold of 0.95. " "This is likely not optimal!" - ) - self.set_threshold(threshold=0.95) - seqs = as_fasta(seqs, genome=self.genome) - it = self._scan_sequences(seqs.seqs, nreport, scan_rc) + it = self._scan_sequences(seqs.seqs, nreport, scan_rc, zscore=zscore) if zscore: if gc: @@ -949,165 +979,86 @@ def scan(self, seqs, nreport=100, scan_rc=True, zscore=False, gc=False): if len(self.meanstd) != 1: self.set_meanstd(gc=gc) - gc_seqs = [self.get_seq_bin(seq) for seq in seqs.seqs] - logger.debug("Scanning") - for result, gc_seq in zip(it, gc_seqs): - if zscore: - zresult = [] - for i, mrow in enumerate(result): - try: - m_mean, m_std = self.get_motif_mean_std( - gc_seq, self.motif_ids[i] - ) - except Exception: - print(self.meanstd) - print(gc_seq, self.motif_ids[i]) - raise - mrow = [((x[0] - m_mean) / m_std, x[1], x[2]) for x in mrow] - zresult.append(mrow) - yield zresult - else: - yield result + for result in it: + yield result - def _scan_regions(self, regions, nreport, scan_rc): - genome = self.genome - motif_file = self.motifs - motif_digest = self.checksum.get(motif_file, None) + def get_gc_thresholds(self, seqs, motifs=None, 
zscore=False): + # Simple case, only one threshold + if np.all(self.threshold.nunique(axis=0) == 1): + return self.threshold.iloc[0].to_dict() - # determine which regions are not in the cache - scan_regions = regions - if self.use_cache: - scan_regions = [] - for region in regions: - key = str((region, genome, motif_digest, nreport, scan_rc)) - ret = self.cache.get(key) - if ret == NO_VALUE: - scan_regions.append(region) - - # scan the regions that are not in the cache - if len(scan_regions) > 0: - - g = Genome(genome) - - motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)] - scan_func = partial( - scan_region_mult, - genome=g, - motifs=motifs, - nreport=nreport, - scan_rc=scan_rc, - ) + if motifs is None: + motifs = read_motifs(self.motifs) + seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs] - for region, ret in self._scan_jobs(scan_func, scan_regions): - # return values or store values in cache - if self.use_cache: - # store values in cache - key = str( - ( - region, - genome, - motif_digest, - nreport, - scan_rc, - self.threshold_str, - ) - ) - self.cache.set(key, ret) - else: - # return values - yield ret + gc_bin_count = Counter(seq_gc_bins) - if self.use_cache: - # return results from cache - for region in regions: - key = str( - (region, genome, motif_digest, nreport, scan_rc, self.threshold_str) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - raise Exception( - "cache is not big enough to hold all " - "results, try increasing the cache size " - "or disable cache" - ) - yield ret + print(self.threshold) + + _treshold = self.threshold + if zscore: + grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0) + _threshold = pd.DataFrame( + np.vstack(grouped.values), + index=_treshold.index, + columns=_treshold.columns, + ) + + min_frac = min(gc_bin_count.values()) + dfs = [ + _threshold.loc[gc_bin].sample( + int(count / min_frac * 1000), replace=True, random_state=42 + ) + for gc_bin, count in 
gc_bin_count.items() + ] + print(dfs) + fpr_df = pd.concat(dfs) + print(fpr_df.shape) + t = fpr_df.quantile(0.99, interpolation="higher") + print(motifs) + print(t) + maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index) + t[t >= maxt] = None + # print(t) + return t.replace({np.nan: None}).to_dict() def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): scan_func = partial( scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc ) - for ret in self._scan_jobs(scan_func, seqs): yield ret[1] - def _scan_sequences(self, seqs, nreport, scan_rc): - - motif_file = self.motifs - motif_digest = self.checksum.get(motif_file, None) - - scan_seqs = seqs - if self.use_cache: - # determine which sequences are not in the cache - hashes = dict([(s.upper(), xxhash.xxh64(s.upper()).digest()) for s in seqs]) - scan_seqs = [] - - for seq, seq_hash in hashes.items(): - key = str( - (seq_hash, motif_digest, nreport, scan_rc, self.threshold_str) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - scan_seqs.append(seq.upper()) - - # scan the sequences that are not in the cache - if len(scan_seqs) > 0: - motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)] - scan_func = partial( - scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc - ) - - for seq, ret in self._scan_jobs(scan_func, scan_seqs): - if self.use_cache: - h = hashes[seq] - key = str((h, motif_digest, nreport, scan_rc, self.threshold_str)) - self.cache.set(key, ret) - else: - yield ret - - if self.use_cache: - # return results from cache - for seq in seqs: - key = str( - ( - hashes[seq.upper()], - motif_digest, - nreport, - scan_rc, - self.threshold_str, - ) - ) - ret = self.cache.get(key) - if ret == NO_VALUE or ret is None: - raise Exception( - "cache is not big enough to hold all " - "results, try increasing the cache size " - "or disable cache" - ) - - yield ret + def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False): + thresholds 
= self.get_gc_thresholds(seqs, zscore=zscore) + motifs = [(m, thresholds[m.id]) for m in read_motifs(self.motifs)] + motifs_meanstd = None + if zscore: + motifs_meanstd = self.meanstd + scan_func = partial( + scan_seq_mult, + motifs=motifs, + nreport=nreport, + scan_rc=scan_rc, + motifs_meanstd=motifs_meanstd, + zscore=zscore, + ) + for seq, ret in self._scan_jobs(scan_func, seqs): + yield ret def _scan_jobs(self, scan_func, scan_seqs): batchsize = 1000 + if self.ncpus > 1: for i in range((len(scan_seqs) - 1) // batchsize + 1): batch = scan_seqs[i * batchsize : (i + 1) * batchsize] chunksize = len(batch) // self.ncpus + 1 jobs = [] for j in range((len(batch) - 1) // chunksize + 1): - job = self.pool.apply_async( - scan_func, (batch[j * chunksize : (j + 1) * chunksize],) - ) + batch_seqs = batch[j * chunksize : (j + 1) * chunksize] + seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs] + job = self.pool.apply_async(scan_func, (batch_seqs, seq_gc_bins)) jobs.append(job) for k, job in enumerate(jobs): @@ -1116,7 +1067,8 @@ def _scan_jobs(self, scan_func, scan_seqs): yield region, ret else: for i in range((len(scan_seqs) - 1) // batchsize + 1): - for _j, ret in enumerate( - scan_func(scan_seqs[i * batchsize : (i + 1) * batchsize]) - ): + batch_seqs = scan_seqs[i * batchsize : (i + 1) * batchsize] + seq_gc_bins = [self.get_seq_bin(seq) for seq in batch_seqs] + + for _j, ret in enumerate(scan_func(batch_seqs, seq_gc_bins)): yield scan_seqs[i], ret From 588d0864f9e8844cfcc717bef7337f3ca5a98a52 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 12:31:17 +0200 Subject: [PATCH 33/85] update report --- gimmemotifs/report.py | 201 ++++++++++++++++++++++++++++++------------ 1 file changed, 143 insertions(+), 58 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 5bde42e0..9c15ad72 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -51,6 +51,9 @@ def _wrap_html_str(x): min_pos, max_pos = m.start(), m.end() 
positions = [m.start() for m in re.compile(" ").finditer(x)] + if len(positions) == 0: + return x + positions = [p for p in positions if min_pos < p < max_pos] pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0] @@ -58,6 +61,31 @@ def _wrap_html_str(x): return x +def relative_luminance(rgba): + """ + Calculate relative luminance of a color. + The calculation adheres to the W3C standards + (https://www.w3.org/WAI/GL/wiki/Relative_luminance) + Parameters + ---------- + color : rgb or rgba tuple + Returns + ------- + float + The relative luminance as a value from 0 to 1 + """ + r, g, b = ( + x / 12.92 if x <= 0.03928 else ((x + 0.055) / 1.055 ** 2.4) for x in rgba[:3] + ) + return 0.2126 * r + 0.7152 * g + 0.0722 * b + + +def contrasting_text_color(color, text_color_threshold=0.408): + dark = relative_luminance(color) < text_color_threshold + text_color = "#f1f1f1" if dark else "#000000" + return text_color + + class ExtraStyler(Styler): """ Extra styles for a DataFrame or Series based on pandas.styler using HTML and CSS. 
@@ -320,15 +348,15 @@ def align(self, subset=None, location="center", axis=0): self.apply(self._align, subset=subset, location=location, axis=axis) return self - def _background_gradient(self, s, m, M, cmap='PuBu', low=0, high=0): + def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0): rng = M - m - norm = colors.Normalize(m - (rng * low), - M + (rng * high)) + norm = colors.Normalize(m - (rng * low), M + (rng * high)) normed = norm(s.values) - c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)] - return ['background-color: %s' % color for color in c] - - + c = plt.cm.get_cmap(cmap)(normed) + return [ + f"background-color: {colors.rgb2hex(color)}; color: {contrasting_text_color(color)}" + for color in c + ] def _circle( self, @@ -345,7 +373,7 @@ def _circle( ): subset = pd.IndexSlice[:, :] if subset is None else subset subslice = _non_reducing_slice(subset) - + if color: palette = sns.color_palette([color]) # print(palette) @@ -362,7 +390,6 @@ def _circle( self.data.loc[subslice].select_dtypes(exclude=["object"]).columns, ] - self.circle_styles = self.circle_styles or [] circle_id = len(self.circle_styles) + 1 @@ -377,7 +404,6 @@ def _circle( ("vertical-align", "middle"), ] - self.circle_styles.append({"name": f"circle{circle_id}", "props": props}) self.palette_styles = self.palette_styles or [] for i, color in enumerate(palette.as_hex()): @@ -397,7 +423,11 @@ def _circle( ) if len(palette) > 1: - vmax = self.data.loc[subslice].max().max() * 1.01 if vmax is None else vmax * 1.01 + vmax = ( + self.data.loc[subslice].max().max() * 1.01 + if vmax is None + else vmax * 1.01 + ) text = self.display_data.loc[subslice].astype(str) if show_text else "" self.display_data.loc[subslice] = ( f"
{}" fmt_i = "{}" - direct = sorted(list(set([x.upper() for x in motif.factors[DIRECT_NAME]]))) + direct = sorted( + list( + set( + [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]] + ) + ) + ) indirect = sorted( list( set( @@ -805,7 +862,11 @@ def format_factors(motif, max_length=5): show_factors.append(f) if len(show_factors) >= max_length: break - show_factors = sorted(show_factors) + + if "de novo" in show_factors: + show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"]) + else: + show_factors = sorted(show_factors) factor_str = ",".join( [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] @@ -895,7 +956,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): .set_precision(2) .convert_to_image(subset=["logo"], height=30,) .scaled_background_gradient( - subset=value_cols, center_zero=True, min=1/1.75, max=1/1.75 + subset=value_cols, center_zero=True, min=1 / 1.75, max=1 / 1.75 ) .border(subset=list(value_cols[:1]), location="left") .border(part="columns", location="bottom") @@ -910,7 +971,11 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): df_styled.wrap(subset=list(corr_cols)) .align(subset=list(corr_cols), location="center") .scaled_background_gradient( - subset=corr_cols, cmap="PuOr_r", center_zero=True, min=1/1.75, max=1/1.75 + subset=corr_cols, + cmap="PuOr_r", + center_zero=True, + min=1 / 1.75, + max=1 / 1.75, ) ) @@ -942,29 +1007,29 @@ def roc_html_report( ): df = pd.read_table(infile, index_col=0) df.rename_axis(None, inplace=True) - df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1] + + motifs = read_motifs(pfmfile, as_dict=True) + if use_motifs is not None: + motifs = {k: v for k, v in motifs.items() if k in use_motifs} + idx = list(motifs.keys()) + df = df.loc[idx] + + df.insert(2, "corrected P-value", multipletests(df["P-value"], method="fdr_bh")[1]) + df.insert(3, "-log10 P-value", -np.log10(df["corrected 
P-value"])) + df = df[df["corrected P-value"] <= threshold] cols = [ + "factors", "logo", - "# matches", - "# matches background", - "P-value", - "log10 P-value", - "corrected P-value", + "% matches input", + "%matches background", + "-log10 P-value", "ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR", ] - motifs = read_motifs(pfmfile, as_dict=True) - if use_motifs is not None: - motifs = {k:v for k,v in motifs.items() if k in use_motifs} - idx = list(motifs.keys()) - df = df.loc[idx] - - df = df[df["corrected P-value"] <= threshold] - if link_matches: df["# matches"] = ( "\n") - f.write("\n".format(css)) - f.write("\n") - f.write("\n") if df.shape[0] > 0: f.write( ExtraStyler(df) - .bar(bar_cols) + .convert_to_image(subset=["logo"], height=30,) + .add_circle( + subset=["% matches input", "%matches background"], + vmax=100, + cmap="Purples", + ) + .scaled_background_gradient( + "-log10 P-value", vmin=0, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "ROC AUC", vmin=0.5, vmax=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "PR AUC", vmin=0, vmax=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "Enr. 
at 1% FPR", vmin=1, high=0.3, cmap="Reds" + ) + .scaled_background_gradient( + "Recall at 10% FDR", vmin=0, vmax=1, high=0.7, cmap="Reds" + ) .set_precision(2) - .set_table_attributes("data-sortable") - .hide_index() + .set_table_attributes('class="sortable-theme-slick" data-sortable') + .wrap(subset=cols) + .align(subset=bar_cols, location="center") + .rename(columns=rename_columns) .render() ) else: f.write("No enriched motifs found.") - From 7f04bceb31d49a90f544652d738806921fa5de34 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 13:20:26 +0200 Subject: [PATCH 34/85] update genomepy dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5d790bf1..e25ef055 100644 --- a/setup.py +++ b/setup.py @@ -131,7 +131,7 @@ def run(self): "diskcache", "xxhash", "configparser", - "genomepy >= 0.7.2", + "genomepy >= 0.8.3", "tqdm", "pillow", "logomaker", From 1d0dc74359b83c5c697a10f5a82b9f577c43512e Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 14:55:21 +0200 Subject: [PATCH 35/85] new aggregation method --- gimmemotifs/cli.py | 13 ++++ gimmemotifs/commands/maelstrom.py | 2 + gimmemotifs/rank.py | 123 +++++++++++++++++++++++++++--- gimmemotifs/report.py | 11 +-- 4 files changed, 133 insertions(+), 16 deletions(-) diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index 4878e96b..84b89961 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -341,6 +341,19 @@ def cli(sys_args): default=None, metavar="NAMES", ) + p.add_argument( + "-a", + "--aggregation", + dest="aggregation", + help=( + 'How to combine motifs from individual methods. Default is "int_stouffer", ' + "for inverse normal transform of ranks, followed by Stouffer's method to combine " + 'z-scores. Alternatively, specify "stuart" for log-transformed rank aggregation ' + "p-values." 
+ ), + default="int_stouffer", + metavar="method", + ) p.add_argument( "-N", "--nthreads", diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py index f8573160..4cae068d 100755 --- a/gimmemotifs/commands/maelstrom.py +++ b/gimmemotifs/commands/maelstrom.py @@ -20,6 +20,7 @@ def maelstrom(args): zscore = args.zscore center = args.center gc = args.gc + aggregation = args.aggregation if not os.path.exists(infile): raise ValueError("file {} does not exist".format(infile)) @@ -37,4 +38,5 @@ def maelstrom(args): zscore=zscore, gc=gc, center=center, + aggregation=aggregation, ) diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py index 15076a86..a11ad71e 100644 --- a/gimmemotifs/rank.py +++ b/gimmemotifs/rank.py @@ -9,6 +9,7 @@ import subprocess as sp import pandas as pd import numpy as np +from scipy.stats import rankdata, norm try: from scipy.special import factorial @@ -77,23 +78,77 @@ def qStuart(r): return factorial(N) * v[N] -def rankagg(df, method="stuart"): - """Return aggregated ranks. +def _rank_int(series, c=3.0 / 8, stochastic=True): + # Based on code by Edward Mountjoy + # See: https://github.com/edm1/rank-based-INT + """ Perform rank-based inverse normal transformation on pandas series. + If stochastic is True ties are given rank randomly, otherwise ties will + share the same value. NaN values are ignored. 
+ Args: + param1 (pandas.Series): Series of values to transform + param2 (Optional[float]): Constand parameter (Bloms constant) + param3 (Optional[bool]): Whether to randomise rank of ties + + Returns: + pandas.Series + """ + + # Check input + assert isinstance(series, pd.Series) + assert isinstance(c, float) + assert isinstance(stochastic, bool) + + # Set seed + np.random.seed(123) + + # Take original series indexes + orig_idx = series.index + + # Drop NaNs + series = series.loc[~pd.isnull(series)] + + # Get ranks + if stochastic == True: + # Shuffle by index + series = series.loc[np.random.permutation(series.index)] + # Get rank, ties are determined by their position in the series (hence + # why we randomised the series) + rank = rankdata(series, method="ordinal") + else: + # Get rank, ties are averaged + rank = rankdata(series, method="average") + + # Convert numpy array back to series + rank = pd.Series(rank, index=series.index) + + # Convert rank to normal distribution + transformed = rank.apply(_rank_to_normal, c=c, n=len(rank)) + + return transformed[orig_idx] + +def _rank_to_normal(rank, c, n): + # Standard quantile function + x = (rank - c) / (n - 2 * c + 1) + return norm.ppf(x) + + +def _rankagg_int(df): + # Convert values to ranks + df_int = df.apply(_rank_int) + # Combine z-score using Stouffer's method + df_int = (df_int.sum(1) / np.sqrt(df_int.shape[1])).to_frame() + df_int.columns = ["z-score"] + return df_int + + +def _rankagg_stuart(df): + """ Implementation is ported from the RobustRankAggreg R package References: Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709 Stuart et al., 2003, DOI: 10.1126/science.1087447 - - Parameters - ---------- - df : pandas.DataFrame - DataFrame with values to be ranked and aggregated - - Returns - ------- - pandas.DataFrame with aggregated ranks """ rmat = pd.DataFrame(index=df.iloc[:, 0]) @@ -105,3 +160,49 @@ def rankagg(df, method="stuart"): rmat = rmat.apply(sorted, 1, result_type="expand") p = 
rmat.apply(qStuart, 1) return pd.DataFrame({"score": p}, index=rmat.index) + + +def rankagg(df, method="int_stouffer", include_reverse=True, log_transform=True): + """Return aggregated ranks. + + Stuart implementation is ported from the RobustRankAggreg R package + + References: + Kolde et al., 2012, DOI: 10.1093/bioinformatics/btr709 + Stuart et al., 2003, DOI: 10.1126/science.1087447 + + Parameters + ---------- + df : pandas.DataFrame + DataFrame with values to be ranked and aggregated + method : str, optional + Either "int_stouffer" or "stuart". The "int_stouffer" method is based on combining z-scores + from a inverse normal transform of ranks using Stouffer's method. + + Returns + ------- + pandas.DataFrame with aggregated ranks + """ + method = method.lower() + if method not in ["stuart", "int_stouffer"]: + raise ValueError("unknown method for rank aggregation") + + if method == "stuart": + df_asc = pd.DataFrame() + df_desc = pd.DataFrame() + for col in df.columns: + df_asc[col] = ( + df.sample(frac=1).sort_values(col, ascending=False).index.values + ) + if include_reverse: + df_desc[col] = ( + df.sample(frac=1).sort_values(col, ascending=True).index.values + ) + + df_result = -np.log10(_rankagg_stuart(df_asc)) + if include_reverse: + df_result += np.log10(_rankagg_stuart(df_desc)) + + return df_result + if method == "int_stouffer": + return _rankagg_int(df) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 9c15ad72..766349e7 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -926,7 +926,7 @@ def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="l return pd.Series(data=img_series, index=index) -def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): +def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): # Read the maelstrom text report df = pd.read_table(infile, index_col=0) @@ -950,13 +950,15 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): 
df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) rename_columns = {"factors": FACTOR_TOOLTIP} + if "% with motif" in df.columns: + df["% with motif"] = df["% with motif"].astype(int) df_styled = ( ExtraStyler(df) .set_precision(2) .convert_to_image(subset=["logo"], height=30,) .scaled_background_gradient( - subset=value_cols, center_zero=True, min=1 / 1.75, max=1 / 1.75 + subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75 ) .border(subset=list(value_cols[:1]), location="left") .border(part="columns", location="bottom") @@ -974,13 +976,12 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=4): subset=corr_cols, cmap="PuOr_r", center_zero=True, - min=1 / 1.75, - max=1 / 1.75, + low=1 / 1.75, + high=1 / 1.75, ) ) if "% with motif" in df.columns: - df["% with motif"] = df["% with motif"].astype(int) df_styled = ( df_styled.add_circle( subset=["% with motif"], cmap="Purples", vmax=100, size=40 From e979f10067854397be31762a2ce975704b967c5b Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 14:55:45 +0200 Subject: [PATCH 36/85] style --- gimmemotifs/commands/motifs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index e72e7c5d..b2b136a6 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -197,7 +197,9 @@ def motifs(args): motif_stats[str(motif)]["matches_at_fpr"][0], motif_stats[str(motif)]["matches_at_fpr"][0] / n_input * 100, motif_stats[str(motif)]["matches_at_fpr"][1], - motif_stats[str(motif)]["matches_at_fpr"][1] / n_background * 100, + motif_stats[str(motif)]["matches_at_fpr"][1] + / n_background + * 100, motif_stats[str(motif)]["phyper_at_fpr"], log_pvalue, motif_stats[str(motif)]["roc_auc"], From f6541ec83a4001fc53ed9c40eabb6fea7a52cb1c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 14:55:53 +0200 Subject: [PATCH 37/85] new aggregation 
method --- gimmemotifs/maelstrom.py | 41 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 4719fabb..8cf32ecf 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -173,35 +173,21 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300) -def _rank_agg_column(exps, dfs, e): - tmp_dfs = [pd.DataFrame(), pd.DataFrame()] - - for i, sort_order in enumerate([False, True]): - for method, scoring, _ in exps: - k = "{}.{}".format(method, scoring) - if k in dfs: - v = dfs[k] - # Sample rows before sorting to shuffle - # Otherwise all ties will not have a random order due to inherent - # ordering of the motif dataframe - tmp_dfs[i][k] = ( - v.sample(frac=1).sort_values(e, ascending=sort_order).index.values - ) - - return -np.log10(rankagg(tmp_dfs[0])) + np.log10(rankagg(tmp_dfs[1])) - - -def df_rank_aggregation(df, dfs, exps): +def df_rank_aggregation(df, dfs, exps, method="int_stouffer"): df_p = pd.DataFrame(index=list(dfs.values())[0].index) names = list(dfs.values())[0].columns + dfs = [ + pd.concat([v[col].rename(k, inplace=True) for k, v in dfs.items()], axis=1) + for col in names + ] pool = Pool(16) - func = partial(_rank_agg_column, exps, dfs) - ret = pool.map(func, names) + func = partial(rankagg, method=method) + ret = pool.map(func, dfs) pool.close() pool.join() - for e, result in zip(names, ret): - df_p[e] = result + for name, result in zip(names, ret): + df_p[name] = result if df.shape[1] != 1: df_p = df_p[df.columns] @@ -223,6 +209,7 @@ def run_maelstrom( zscore=True, gc=True, center=False, + aggregation="int_stouffer", ): """Run maelstrom on an input table. @@ -270,6 +257,12 @@ def run_maelstrom( center : bool, optional Mean-center the input table. + + aggregation: str, optional + How to combine scores of the predictors. 
The default is "int_stouffer", for + inverse normal transform followed by Stouffer's methods to combine z-scores. + Alternatively, "stuart" performs rank aggregation and reports the -log10 of + the rank aggregation p-value. """ logger.info("Starting maelstrom") if infile.endswith("feather"): @@ -432,7 +425,7 @@ def run_maelstrom( if len(methods) > 1: logger.info("Rank aggregation") - df_p = df_rank_aggregation(df, dfs, exps) + df_p = df_rank_aggregation(df, dfs, exps, method=aggregation) if df.shape[1] > 1: # Add correlation between motif score and signal From 1f8b465219c2eb9d62138d1b68c0aad94827c06c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 15:13:42 +0200 Subject: [PATCH 38/85] add quantile normalization to coverage_table --- conda_env.dev.txt | 1 + conda_env.osx.txt | 1 + conda_env.test.txt | 1 + conda_env.txt | 1 + scripts/coverage_table | 28 +++++++++++++++++----------- setup.py | 1 + 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/conda_env.dev.txt b/conda_env.dev.txt index cb3f62e3..8dc6f56e 100644 --- a/conda_env.dev.txt +++ b/conda_env.dev.txt @@ -21,6 +21,7 @@ pysam python python-xxhash pyyaml >=3.10 +qnorm scikit-learn >=0.23 scipy >=1.4.1 seaborn diff --git a/conda_env.osx.txt b/conda_env.osx.txt index 9c931dd4..dd1a2c98 100644 --- a/conda_env.osx.txt +++ b/conda_env.osx.txt @@ -21,6 +21,7 @@ pysam python python-xxhash pyyaml >=3.10 +qnorm scikit-learn scipy seaborn diff --git a/conda_env.test.txt b/conda_env.test.txt index a61ea95d..af2f478c 100644 --- a/conda_env.test.txt +++ b/conda_env.test.txt @@ -22,6 +22,7 @@ pybedtools python >=3.8 python-xxhash pyyaml >=3.10 +qnorm scikit-learn >=0.18 scipy <1.3.0 seaborn diff --git a/conda_env.txt b/conda_env.txt index 441b7f03..b569daff 100644 --- a/conda_env.txt +++ b/conda_env.txt @@ -22,6 +22,7 @@ pysam python >=3 python-xxhash pyyaml >=3.10 +qnorm scikit-learn >=0.18 scipy <1.3.0 seaborn diff --git a/scripts/coverage_table b/scripts/coverage_table index 
829e67df..14f3128f 100644 --- a/scripts/coverage_table +++ b/scripts/coverage_table @@ -9,6 +9,7 @@ import pysam import numpy as np import pandas as pd from sklearn.preprocessing import scale +import qnorm from gimmemotifs import __version__ @@ -27,7 +28,7 @@ def make_table( datafiles, window, log_transform=True, - scale_table=True, + normalization="none", top=0, topmethod="var", rmdup=True, @@ -100,9 +101,14 @@ def make_table( if log_transform: print("Log transform", file=sys.stderr) df = np.log1p(df) - if scale_table: - print("Scale", file=sys.stderr) + if normalization == "scale": + print("Normalization by scaling", file=sys.stderr) df[:] = scale(df, axis=0) + if normalization == "quantile": + print("Normalization by quantile normalization", file=sys.stderr) + df = qnorm.quantile_normalize(df) + else: + print("No normalization", file=sys.stderr) if top > 0: if topmethod == "var": @@ -166,12 +172,12 @@ if __name__ == "__main__": action="store_true", ) parser.add_argument( - "-s", - "--scale", - dest="scale_table", - help="Scale per datafile", - default=False, - action="store_true", + "-n", + "--normalization", + dest="normalization", + help="Normalization: none, quantile or scale", + default="none", + metavar="METHOD", ) parser.add_argument( "-t", "--top", dest="top", help="Select regions.", default=0, type=int @@ -214,7 +220,7 @@ if __name__ == "__main__": datafiles, args.window, log_transform=args.log_transform, - scale_table=args.scale_table, + normalization=args.normalization, top=args.top, topmethod=args.topmethod, rmdup=args.rmdup, @@ -232,7 +238,7 @@ output.write("# Window: {}\n".format(args.window)) output.write("# Duplicates removed: {}\n".format(yesno[args.rmdup])) output.write("# MAPQ 0 removed: {}\n".format(yesno[args.rmrepeats])) output.write("# Log transformed: {}\n".format(yesno[args.log_transform])) -output.write("# Scaled: {}\n".format(yesno[args.scale_table])) +output.write("# Normalization: {}\n".format(args.normalization)) if args.top > 0: 
output.write("# Top {} regions selected by {}\n".format(args.top, args.topmethod)) df.to_csv(output, sep="\t", float_format="%0.5f") diff --git a/setup.py b/setup.py index e25ef055..b5dc2217 100644 --- a/setup.py +++ b/setup.py @@ -135,5 +135,6 @@ def run(self): "tqdm", "pillow", "logomaker", + "qnorm", ], ) From 0ff3773ac5b7a43ee92cdea17912bdff73b15410 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 15:13:57 +0200 Subject: [PATCH 39/85] extra informative message --- gimmemotifs/maelstrom.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 8cf32ecf..1be9fd2c 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -277,7 +277,13 @@ def run_maelstrom( logger.info( "Input is not mean-centered, setting the mean of all rows to 0." ) - logger.info("Use --nocenter to change this behavior") + logger.info("Use --nocenter if you know what you're doing and want to change this behavior.") + logger.info( + "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to " + "first transform your data, for instance using log2(), and to normalize " + "between samples. To create a table suitable for maelstrom you can use the " + "coverage_table script included with GimmeMotifs." 
+ ) df = df.sub(df.mean(axis=1), axis=0) else: logger.info("Input is not mean-centered, but --nocenter was specified.") From 4e02768c1391808d6c2cab803ee53eec5c339832 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 16:26:29 +0200 Subject: [PATCH 40/85] add SVR regressor, replace Lasso regressor --- gimmemotifs/moap.py | 173 +++++++++++++++++++++++--------------------- 1 file changed, 90 insertions(+), 83 deletions(-) diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index fdcabe98..4b69661b 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -34,8 +34,9 @@ def warn(*args, **kwargs): from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn.multiclass import OneVsRestClassifier -from sklearn.linear_model import MultiTaskLasso, BayesianRidge +from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge from sklearn.preprocessing import scale, LabelEncoder +from sklearn.svm import SVR import xgboost @@ -541,22 +542,16 @@ def fit(self, df_X, df_y): logger.info("Done") -@register_predictor("Lasso") -class LassoMoap(Moap): - def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None): - """Predict motif activities using Lasso MultiTask regression +@register_predictor("MultiTaskLasso") +class MultiTaskLassoMoap(Moap): + def __init__(self, scale=True, ncpus=None): + """Predict motif activities using MultiTaskLasso. Parameters ---------- scale : boolean, optional, default True If ``True``, the motif scores will be scaled - before classification - - kfolds : integer, optional, default 5 - number of kfolds for parameter search - - alpha_stepsize : float, optional, default 1.0 - stepsize for use in alpha gridsearch + before classification. ncpus : int, optional Number of threads. Default is the number specified in the config. 
@@ -564,101 +559,113 @@ def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None): Attributes ---------- act_ : DataFrame, shape (n_motifs, n_clusters) - fitted motif activities - - sig_ : DataFrame, shape (n_motifs,) - boolean values, if coefficients are higher/lower than - the 1%t from random permutation + Coefficients of the regression model. """ - self.kfolds = kfolds - self.act_description = "activity values: coefficients from " "fitted model" + self.act_description = "activity values: coefficients of the" "regression model" - self.scale = scale if ncpus is None: ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) self.ncpus = ncpus - - # initialize attributes + self.scale = scale self.act_ = None - self.sig_ = None - - mtk = MultiTaskLasso() - parameters = {"alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]} - self.clf = GridSearchCV( - mtk, parameters, cv=kfolds, n_jobs=self.ncpus, scoring="r2" - ) self.pref_table = "score" self.supported_tables = ["score", "count"] self.ptype = "regression" - def fit(self, df_X, df_y, permute=False): - logger.info("Fitting Lasso") + def fit(self, df_X, df_y): + logger.info("Fitting MultiTaskLasso") + if not df_y.shape[0] == df_X.shape[0]: raise ValueError("number of regions is not equal") if self.scale: + logger.debug("Scaling motif scores") # Scale motif scores - df_X[:] = scale(df_X, axis=0) + df_X.loc[:,:] = scale(df_X, axis=0) + + # logger.debug("Scaling y") - idx = list(range(df_y.shape[0])) - y = df_y.iloc[idx] - X = df_X.loc[y.index].values - y = y.values - - # fit coefficients - coefs = self._get_coefs(X, y) - self.act_ = pd.DataFrame(coefs.T) - - # convert labels back to original names - self.act_.columns = df_y.columns - self.act_.index = df_X.columns - - if permute: - # Permutations - logger.info("permutations\n") - random_dfs = [] - for _ in range(10): - y_random = y[np.random.permutation(range(y.shape[0]))] - coefs = self._get_coefs(X, y_random) - 
random_dfs.append(pd.DataFrame(coefs.T)) - random_df = pd.concat(random_dfs) - - # Select cutoff based on percentile - high_cutoffs = random_df.quantile(0.99) - low_cutoffs = random_df.quantile(0.01) - - # Set significance - self.sig_ = pd.DataFrame(index=df_X.columns) - self.sig_["sig"] = False - - for col, c_high, c_low in zip(self.act_.columns, high_cutoffs, low_cutoffs): - self.sig_["sig"].loc[self.act_[col] >= c_high] = True - self.sig_["sig"].loc[self.act_[col] <= c_low] = True + # Normalize across samples and features + # y = df_y.apply(scale, 1).apply(scale, 0) + y = df_y + X = df_X.loc[y.index] + + model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus) + logger.debug("Fitting model") + coefs = [] + model.fit(df_X, df_y) logger.info("Done") - def _get_coefs(self, X, y): - logger.info("set alpha through cross-validation\n") - # Determine best parameters based on CV - self.clf.fit(X, y) + self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T - logger.debug( - "average score ({} fold CV): {}".format(self.kfolds, self.clf.best_score_) - ) + def predict(self, df_X): + return df_X.dot(self.act_.loc[df_X.columns]) + +@register_predictor("SVR") +class SVRMoap(Moap): + def __init__(self, scale=True, ncpus=None): + """Predict motif activities using Support Vector Regression. + + Parameters + ---------- + scale : boolean, optional, default True + If ``True``, the motif scores will be scaled + before classification. + + ncpus : int, optional + Number of threads. Default is the number specified in the config. + + Attributes + ---------- + act_ : DataFrame, shape (n_motifs, n_clusters) + SVR weights. 
+ """ + + self.act_description = "activity values: SVR weights" + + if ncpus is None: + ncpus = int(MotifConfig().get_default_params().get("ncpus", 2)) + self.ncpus = ncpus + self.scale = scale + self.act_ = None + self.pref_table = "score" + self.supported_tables = ["score", "count"] + self.ptype = "regression" + + def fit(self, df_X, df_y): + logger.info("Fitting SVR") + + if not df_y.shape[0] == df_X.shape[0]: + raise ValueError("number of regions is not equal") + + if self.scale: + logger.debug("Scaling motif scores") + # Scale motif scores + df_X.loc[:,:] = scale(df_X, axis=0) + + # logger.debug("Scaling y") - logger.info("Estimate coefficients using bootstrapping\n") + # Normalize across samples and features + # y = df_y.apply(scale, 1).apply(scale, 0) + y = df_y + self.columns = df_y.columns + X = df_X.loc[y.index] - n_samples = 0.75 * X.shape[0] - max_samples = X.shape[0] - m = self.clf.best_estimator_ + clf = SVR(kernel="linear") + self.model = MultiOutputRegressor(clf, n_jobs=1) + logger.debug("Fitting model") coefs = [] - for _ in range(10): - idx = np.random.randint(0, n_samples, max_samples) - m.fit(X[idx], y[idx]) - coefs.append(m.coef_) - coefs = np.array(coefs).mean(axis=0) - return coefs + self.model.fit(df_X, df_y) + logger.info("Done") + + self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T + + def predict(self, df_X): + #print(self.model.predict(df_X) ) + + return pd.DataFrame(self.model.predict(df_X), index=df_X.index, columns=self.columns) def moap( From a1920053553d3eb6b2832eabb8bf1290734b65bf Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 16:27:09 +0200 Subject: [PATCH 41/85] tests --- gimmemotifs/scanner.py | 29 +++++++++++++++++++---------- test/data/rank/ranked.txt | 10 +++++----- test/test_rank.py | 10 ++++++---- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index cbb05961..3a2de020 100644 --- a/gimmemotifs/scanner.py +++ 
b/gimmemotifs/scanner.py @@ -678,6 +678,19 @@ def set_meanstd(self, gc=False): self.meanstd[gcbin][motif.id] = mean, std lock.release() + for gc_bin in bins: + if gc_bin not in self.meanstd: + valid_bins = [] + for b in self.gc_bins: + bstr = "{:.2f}-{:.2f}".format(b[0], b[1]) + if bstr in self.meanstd: + valid_bins.append(((b[0] + b[1]) / 2, bstr)) + + v = float(gc_bin.split("-")[1]) + _, bstr = sorted(valid_bins, key=lambda x: abs(x[0] - v))[0] + logger.warn(f"Using {bstr}") + self.meanstd[gc_bin] = self.meanstd[bstr] + def set_background( self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None @@ -994,15 +1007,13 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): gc_bin_count = Counter(seq_gc_bins) - print(self.threshold) - - _treshold = self.threshold + _threshold = self.threshold if zscore: grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0) _threshold = pd.DataFrame( np.vstack(grouped.values), - index=_treshold.index, - columns=_treshold.columns, + index=_threshold.index, + columns=_threshold.columns, ) min_frac = min(gc_bin_count.values()) @@ -1012,12 +1023,9 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): ) for gc_bin, count in gc_bin_count.items() ] - print(dfs) - fpr_df = pd.concat(dfs) - print(fpr_df.shape) + + fpr_df = pd.concat(dfs) t = fpr_df.quantile(0.99, interpolation="higher") - print(motifs) - print(t) maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index) t[t >= maxt] = None # print(t) @@ -1036,6 +1044,7 @@ def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False): motifs_meanstd = None if zscore: motifs_meanstd = self.meanstd + scan_func = partial( scan_seq_mult, motifs=motifs, diff --git a/test/data/rank/ranked.txt b/test/data/rank/ranked.txt index f5f36f40..bb52cd79 100644 --- a/test/data/rank/ranked.txt +++ b/test/data/rank/ranked.txt @@ -1,6 +1,6 @@ a b c d e -1 bZIP bZIP SOX T-box POU -2 AP2 AP2 AP2 AP2 AP2 -3 T-box SOX T-box SOX SOX -4 SOX T-box POU POU 
T-box -5 POU POU bZIP bZIP bZIP +bZIP 5 5 1 1 1 +AP2 4 4 4 4 4 +T-box 3 2 3 5 2 +SOX 2 3 5 3 3 +POU 1 1 2 2 4 diff --git a/test/test_rank.py b/test/test_rank.py index 5cb533e9..cca52132 100644 --- a/test/test_rank.py +++ b/test/test_rank.py @@ -2,7 +2,7 @@ import tempfile import os import pandas as pd -from gimmemotifs.rank import rankagg +from gimmemotifs.rank import rankagg, _rankagg_stuart class TestRank(unittest.TestCase): @@ -17,13 +17,15 @@ def setUp(self): def test1_rankagg(self): """ Test rank aggregation """ df = pd.read_csv(self.fname, index_col=0, sep="\t") - result = rankagg(df) - self.assertEqual("AP2", result.sort_values("score").index[0]) + result = rankagg(df, method="stuart") + self.assertEqual("AP2", result.sort_values("score").index[-1]) + result = rankagg(df, method="int_stouffer") + self.assertEqual("AP2", result.sort_values("z-score").index[-1]) def test2_rankagg(self): """ Test Python implementation of rank aggregation """ df = pd.read_csv(self.rank_in, index_col=0, sep="\t") - result = rankagg(df)["score"].values + result = _rankagg_stuart(df)["score"].values ref = pd.read_csv(self.rank_out, index_col=0, sep="\t")["score"].values for v1, v2 in zip(ref, result): self.assertAlmostEqual(v1, v2) From eaf191af0f72fbbdbb8965ffd238d8cd1bbe79cb Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 16 Jul 2020 23:27:51 +0200 Subject: [PATCH 42/85] black --- gimmemotifs/report.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 766349e7..53d781dc 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -51,11 +51,12 @@ def _wrap_html_str(x): min_pos, max_pos = m.start(), m.end() positions = [m.start() for m in re.compile(" ").finditer(x)] - if len(positions) == 0: - return x positions = [p for p in positions if min_pos < p < max_pos] + if len(positions) == 0: + return x + pos = sorted(positions, key=lambda p: abs(p - len(x) / 2))[0] x = x[:pos] + "
" + x[pos + 1 :] return x From 3c4ac02bc9ee19a2326ed3fcc1b06fe0a0fc5a3f Mon Sep 17 00:00:00 2001 From: simonvh Date: Thu, 16 Jul 2020 23:28:53 +0200 Subject: [PATCH 43/85] fix test --- gimmemotifs/scanner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index 3a2de020..e8768248 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -391,12 +391,11 @@ def scan_sequence( result = [row for row in result if row[0] >= cutoff] else: result = pwmscan(seq, motif.logodds, cutoff, nreport, scan_rc) - if cutoff <= motif.pwm_min_score() and len(result) == 0: - result = [[motif.pwm_min_score(), 0, 1]] * nreport + if cutoff <= motif.pwm_min_score() and len(result) == 0: + result = [[motif.pwm_min_score(), 0, 1]] * nreport ret.append(result) - # return results return ret @@ -678,7 +677,9 @@ def set_meanstd(self, gc=False): self.meanstd[gcbin][motif.id] = mean, std lock.release() - for gc_bin in bins: + + for gc_bin in self.gc_bins: + gc_bin = "{:.2f}-{:.2f}".format(*gc_bin) if gc_bin not in self.meanstd: valid_bins = [] for b in self.gc_bins: @@ -691,7 +692,6 @@ def set_meanstd(self, gc=False): logger.warn(f"Using {bstr}") self.meanstd[gc_bin] = self.meanstd[bstr] - def set_background( self, fname=None, genome=None, size=200, nseq=None, gc=False, gc_bins=None ): @@ -1009,7 +1009,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): _threshold = self.threshold if zscore: - grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0) + grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0) _threshold = pd.DataFrame( np.vstack(grouped.values), index=_threshold.index, @@ -1023,12 +1023,11 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): ) for gc_bin, count in gc_bin_count.items() ] - - fpr_df = pd.concat(dfs) + + fpr_df = pd.concat(dfs) t = fpr_df.quantile(0.99, interpolation="higher") maxt = pd.Series([m.pwm_max_score() for m in motifs], 
index=t.index) t[t >= maxt] = None - # print(t) return t.replace({np.nan: None}).to_dict() def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): From bc76a10af82aa1f54f15ef6180ab8e1aa20ff799 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 23:36:52 +0200 Subject: [PATCH 44/85] fix test --- gimmemotifs/scanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index 3a2de020..16744193 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -1009,7 +1009,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): _threshold = self.threshold if zscore: - grouped = _treshold.groupby(_treshold.index).apply(scale, axis=0) + grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0) _threshold = pd.DataFrame( np.vstack(grouped.values), index=_threshold.index, From b1d658a419fe795c3d320c08840ec1476558d4e9 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 16 Jul 2020 23:57:18 +0200 Subject: [PATCH 45/85] style --- gimmemotifs/maelstrom.py | 10 ++++++---- gimmemotifs/moap.py | 19 ++++++++++--------- gimmemotifs/rank.py | 4 ++-- gimmemotifs/scanner.py | 7 +------ 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 1be9fd2c..be7d94cf 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -257,11 +257,11 @@ def run_maelstrom( center : bool, optional Mean-center the input table. - + aggregation: str, optional - How to combine scores of the predictors. The default is "int_stouffer", for + How to combine scores of the predictors. The default is "int_stouffer", for inverse normal transform followed by Stouffer's methods to combine z-scores. - Alternatively, "stuart" performs rank aggregation and reports the -log10 of + Alternatively, "stuart" performs rank aggregation and reports the -log10 of the rank aggregation p-value. 
""" logger.info("Starting maelstrom") @@ -277,7 +277,9 @@ def run_maelstrom( logger.info( "Input is not mean-centered, setting the mean of all rows to 0." ) - logger.info("Use --nocenter if you know what you're doing and want to change this behavior.") + logger.info( + "Use --nocenter if you know what you're doing and want to change this behavior." + ) logger.info( "Note that if you use count data (ChIP-seq, ATAC-seq) we recommend to " "first transform your data, for instance using log2(), and to normalize " diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 4b69661b..55a4bb50 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -31,10 +31,10 @@ def warn(*args, **kwargs): from tqdm.auto import tqdm # scikit-learn -from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn.multiclass import OneVsRestClassifier from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge +from sklearn.multioutput import MultiOutputRegressor from sklearn.preprocessing import scale, LabelEncoder from sklearn.svm import SVR @@ -582,7 +582,7 @@ def fit(self, df_X, df_y): if self.scale: logger.debug("Scaling motif scores") # Scale motif scores - df_X.loc[:,:] = scale(df_X, axis=0) + df_X.loc[:, :] = scale(df_X, axis=0) # logger.debug("Scaling y") @@ -594,7 +594,6 @@ def fit(self, df_X, df_y): model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus) logger.debug("Fitting model") - coefs = [] model.fit(df_X, df_y) logger.info("Done") @@ -603,6 +602,7 @@ def fit(self, df_X, df_y): def predict(self, df_X): return df_X.dot(self.act_.loc[df_X.columns]) + @register_predictor("SVR") class SVRMoap(Moap): def __init__(self, scale=True, ncpus=None): @@ -643,7 +643,7 @@ def fit(self, df_X, df_y): if self.scale: logger.debug("Scaling motif scores") # Scale motif scores - df_X.loc[:,:] = scale(df_X, axis=0) + df_X.loc[:, :] = scale(df_X, axis=0) # logger.debug("Scaling y") @@ -656,16 +656,17 @@ def 
fit(self, df_X, df_y): clf = SVR(kernel="linear") self.model = MultiOutputRegressor(clf, n_jobs=1) logger.debug("Fitting model") - coefs = [] self.model.fit(df_X, df_y) logger.info("Done") - self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T + self.act_ = pd.DataFrame(self.model.coef_, columns=X.columns, index=y.columns).T def predict(self, df_X): - #print(self.model.predict(df_X) ) - - return pd.DataFrame(self.model.predict(df_X), index=df_X.index, columns=self.columns) + # print(self.model.predict(df_X) ) + + return pd.DataFrame( + self.model.predict(df_X), index=df_X.index, columns=self.columns + ) def moap( diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py index a11ad71e..1a8cfde6 100644 --- a/gimmemotifs/rank.py +++ b/gimmemotifs/rank.py @@ -88,7 +88,7 @@ def _rank_int(series, c=3.0 / 8, stochastic=True): param1 (pandas.Series): Series of values to transform param2 (Optional[float]): Constand parameter (Bloms constant) param3 (Optional[bool]): Whether to randomise rank of ties - + Returns: pandas.Series """ @@ -108,7 +108,7 @@ def _rank_int(series, c=3.0 / 8, stochastic=True): series = series.loc[~pd.isnull(series)] # Get ranks - if stochastic == True: + if stochastic: # Shuffle by index series = series.loc[np.random.permutation(series.index)] # Get rank, ties are determined by their position in the series (hence diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index e8768248..337fdcb4 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -18,7 +18,6 @@ from genomepy import Genome from diskcache import Cache import numpy as np -from scipy.stats import scoreatpercentile from sklearn.preprocessing import scale import pandas as pd @@ -48,7 +47,6 @@ def _pickle_method(m): # only used when using cache, should not be a requirement try: from dogpile.cache import make_region - from dogpile.cache.api import NO_VALUE import xxhash except ImportError: pass @@ -819,13 +817,10 @@ def set_threshold(self, fpr=None, 
threshold=None, gc=False): if not self.motifs: raise ValueError("please run set_motifs() first") - thresholds = {} motifs = read_motifs(self.motifs) gc_bins = ["{:.2f}-{:.2f}".format(*gc_bin) for gc_bin in self.gc_bins] if threshold is not None: - data = [] - d = parse_threshold_values(self.motifs, threshold) self._threshold = pd.DataFrame(d, index=[0]) self._threshold = self._threshold.join( @@ -1052,7 +1047,7 @@ def _scan_sequences(self, seqs, nreport, scan_rc, zscore=False): motifs_meanstd=motifs_meanstd, zscore=zscore, ) - for seq, ret in self._scan_jobs(scan_func, seqs): + for _, ret in self._scan_jobs(scan_func, seqs): yield ret def _scan_jobs(self, scan_func, scan_seqs): From 1e9e22955dac5aa94fd9aa4c9a6b55554f42d548 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Fri, 17 Jul 2020 09:54:34 +0200 Subject: [PATCH 46/85] fix automatic adjusting of input size. fixes #128 #129 --- gimmemotifs/commands/motifs.py | 35 +++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index b2b136a6..4e89ed0c 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -39,6 +39,19 @@ def motifs(args): if not os.path.exists(scan_dir): os.makedirs(scan_dir) + sample = args.sample + if args.size and args.size > 0: + file_type = determine_file_type(args.sample) + if file_type == "fasta": + logger.warn("size parameter will be ignored for FASTA input") + else: + outfile = os.path.join(args.outdir, f"input.w{args.size}.bed") + if file_type == "narrowpeak": + narrowpeak_to_bed(args.sample, outfile, size=args.size) + if file_type == "bed": + write_equalsize_bedfile(args.sample, args.size, outfile) + sample = outfile + genome = args.genome if genome is None: args.zscore = False @@ -71,7 +84,7 @@ def motifs(args): bg, fmt="fasta", genome=genome, - inputfile=args.sample, + inputfile=sample, size=size, number=10000, ) @@ -84,7 +97,7 @@ def motifs(args): 
if args.denovo: gimme_motifs( - args.sample, + sample, args.outdir, params={ "tools": args.tools, @@ -147,15 +160,15 @@ def motifs(args): ) logger.info("creating motif scan tables") - ftype = determine_file_type(args.sample) - sample = args.sample - delete_sample = False - if ftype == "narrowpeak": - f = NamedTemporaryFile(delete=False) - logger.debug("Using {} as temporary BED file".format(f.name)) - narrowpeak_to_bed(args.sample, f.name, size=args.size) - sample = f.name - delete_sample = True + # ftype = determine_file_type(args.sample) + # sample = args.sample + # delete_sample = False + # if ftype == "narrowpeak": + # f = NamedTemporaryFile(delete=False) + # logger.debug("Using {} as temporary BED file".format(f.name)) + # narrowpeak_to_bed(args.sample, f.name, size=args.size) + # sample = f.name + # delete_sample = True # Create a table with the best score per motif for all motifs. # This has three reasons: From 380d2a9256d31284b4f957f8383ea0e957a05470 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 10:32:05 +0200 Subject: [PATCH 47/85] Hopefully fix memory issue --- gimmemotifs/scanner.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index 337fdcb4..bf4c2170 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -1012,18 +1012,27 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): ) min_frac = min(gc_bin_count.values()) - dfs = [ - _threshold.loc[gc_bin].sample( - int(count / min_frac * 1000), replace=True, random_state=42 - ) - for gc_bin, count in gc_bin_count.items() - ] + t = {} + maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns) + # We do this in a loop as the DataFrame will get too big to fit in memory + # when the difference between the number of sequences per gc_bin is very + # high. 
+ for motif in _threshold.columns: + dfs = [ + _threshold.loc[gc_bin, motif].sample( + int(count / min_frac * 1000), replace=True, random_state=42 + ) + for gc_bin, count in gc_bin_count.items() + ] + + fpr_df = pd.concat(dfs) + val = fpr_df.quantile(0.99, interpolation="higher") + if val < maxt.loc[motif]: + t[motif] = val + else: + t[motif] = None - fpr_df = pd.concat(dfs) - t = fpr_df.quantile(0.99, interpolation="higher") - maxt = pd.Series([m.pwm_max_score() for m in motifs], index=t.index) - t[t >= maxt] = None - return t.replace({np.nan: None}).to_dict() + return t def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): scan_func = partial( From 6bb850869ac6774492863306e5807b1f2aa5dd45 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 11:03:49 +0200 Subject: [PATCH 48/85] correct missing import --- gimmemotifs/commands/motifs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index 4e89ed0c..dfbab38e 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -23,7 +23,7 @@ from gimmemotifs.stats import calc_stats_iterator from gimmemotifs.report import roc_html_report from gimmemotifs.scanner import scan_to_file -from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed +from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed, write_equalsize_bedfile logger = logging.getLogger("gimme.motifs") From 0d0aa4b973ccc75d2a1bb0752a0fcd8933e152e6 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 11:04:00 +0200 Subject: [PATCH 49/85] fix coef --- gimmemotifs/moap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 55a4bb50..258069a0 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -659,7 +659,7 @@ def fit(self, df_X, df_y): self.model.fit(df_X, df_y) logger.info("Done") - self.act_ = 
pd.DataFrame(self.model.coef_, columns=X.columns, index=y.columns).T + self.act_ = pd.DataFrame({c: e.coef_[0] for c, e in zip(df_y.columns, self.model.estimators_)}, index=X.columns) def predict(self, df_X): # print(self.model.predict(df_X) ) From 010fed716fe3ea96d63b05b27b6a52a29cb370f3 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 14:44:33 +0200 Subject: [PATCH 50/85] filter redudant motifs for maelstrom --- gimmemotifs/cli.py | 16 ++++++++ gimmemotifs/commands/maelstrom.py | 4 ++ gimmemotifs/commands/motifs.py | 3 -- gimmemotifs/maelstrom.py | 68 ++++++++++++++++++++++++++----- gimmemotifs/motif.py | 6 +++ gimmemotifs/report.py | 17 +++++--- 6 files changed, 96 insertions(+), 18 deletions(-) diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index 84b89961..2f778384 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -326,6 +326,22 @@ def cli(sys_args): default=default_pfm_file, metavar="pfmfile", ) + p.add_argument( + "--no-filter", + dest="filter_redundant", + help="Don't remove redundant motifs.", + default=True, + action="store_false", + ) + p.add_argument( + "-F", + "--filter_cutoff", + dest="filter_cutoff", + help="Cutoff to select non-redundant motifs. 
Default is 0.8.", + default=0.8, + type=float, + metavar="FLOAT", + ) p.add_argument( "--nocenter", dest="center", diff --git a/gimmemotifs/commands/maelstrom.py b/gimmemotifs/commands/maelstrom.py index 4cae068d..04d8cd1c 100755 --- a/gimmemotifs/commands/maelstrom.py +++ b/gimmemotifs/commands/maelstrom.py @@ -15,6 +15,8 @@ def maelstrom(args): genome = args.genome outdir = args.outdir pfmfile = args.pfmfile + filter_redundant = args.filter_redundant + filter_cutoff = args.filter_cutoff methods = args.methods ncpus = args.ncpus zscore = args.zscore @@ -33,6 +35,8 @@ def maelstrom(args): genome, outdir, pfmfile, + filter_redundant=filter_redundant, + filter_cutoff=filter_cutoff, methods=methods, ncpus=ncpus, zscore=zscore, diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index dfbab38e..6b126a0b 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -253,9 +253,6 @@ def motifs(args): gcnorm=True, ) - if delete_sample: - os.unlink(sample) - if args.report: logger.info("creating statistics report") if args.outdir: diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index be7d94cf..a9d50afe 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -26,6 +26,8 @@ from scipy.cluster import hierarchy from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, dendrogram +from sklearn.cluster import FeatureAgglomeration +# from scipy.spatial.distance import correlation # Plotting import matplotlib.pyplot as plt @@ -97,7 +99,7 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): mapfile = pfmfile.replace(".pwm", ".motif2factors.txt") if os.path.exists(mapfile): - m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0) + m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#") m2f["factors"] = m2f["factors"].str[:50] else: motifs = [m.id for m in read_motifs(pfmfile)] @@ -200,6 +202,8 @@ def 
run_maelstrom( genome, outdir, pfmfile=None, + filter_redundant=True, + filter_cutoff=0.8, plot=True, cluster=False, score_table=None, @@ -229,6 +233,12 @@ def run_maelstrom( pfmfile : str, optional Specify a PFM file for scanning. + filter_redundant : bool, optional + Create a non-redundant set of motifs based on correlation of motif scores in the input data. + + filter_cutoff : float, optional + Cutoff to use for non-redundant motif selection. Default is 0.8. + plot : bool, optional Create heatmaps. @@ -355,6 +365,51 @@ def run_maelstrom( else: logger.info("Scores, using: %s", score_table) + counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t") + scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t") + + if filter_redundant: + logger.info("Selecting non-redundant motifs") + + fa = FeatureAgglomeration(distance_threshold=filter_cutoff, n_clusters=None, affinity="correlation", linkage="complete", compute_full_tree=True) + fa.fit(scores) + X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_}) + X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif") + selected_motifs = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")["motif"].values + nr_motif = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")[["label", "motif"]].set_index("label") + X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label") + motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif") + + scores = scores[selected_motifs] + counts = counts[selected_motifs] + score_table = os.path.join(outdir, "motif.nr.score.txt.gz") + scores.to_csv(score_table, sep="\t", compression="gzip") + count_table = os.path.join(outdir, "motif.nr.count.txt.gz") + counts.to_csv(count_table, sep="\t", compression="gzip") + + m2f = pd.read_table(os.path.join(outdir, mapfile), comment="#") + m2f = m2f.join(motif_map, on="Motif") + m2f.loc[m2f["Motif"] != m2f["motif_nr"], "Curated"] = "N" + 
m2f["Motif"] = m2f["motif_nr"] + m2f = m2f.drop(columns=["motif_nr"]) + + motifs = read_motifs(pfmfile) + pfmfile = os.path.join(outdir, "nonredundant.motifs.pfm") + with open(pfmfile, "w") as f: + for motif in motifs: + f.write(f"{motif.to_pfm()}\n") + mapfile = pfmfile.replace(".pfm", ".motif2factors.txt") + with open(mapfile, "w") as f: + f.write("# Note: this mapping is specifically created for this non-redundant set of motifs.\n") + f.write("# It also includes factors for motifs that were similar, but this can be\n") + f.write("# specific to this analysis.\n") + + with open(mapfile, "a") as f: + m2f.to_csv(f, index=False, sep="\t") + logger.info(f"Selected {len(selected_motifs)} motifs") + logger.info(f"Motifs: {pfmfile}") + logger.info(f"Factor mappings: {mapfile}") + if cluster: cluster = False for method in methods: @@ -401,18 +456,14 @@ def run_maelstrom( for method, scoring, fname in exps: try: - if scoring == "count" and count_table is not None: + if scoring == "count": moap_with_table( fname, count_table, outdir, method, scoring, ncpus=ncpus ) - elif scoring == "score" and score_table is not None: + elif scoring == "score": moap_with_table( fname, score_table, outdir, method, scoring, ncpus=ncpus ) - else: - moap_with_bg( - fname, genome, outdir, method, scoring, pfmfile=pfmfile, ncpus=ncpus - ) except Exception as e: logger.warn("Method %s with scoring %s failed", method, scoring) @@ -428,9 +479,6 @@ def run_maelstrom( except FileNotFoundError: logger.warn("Activity file for {} not found!\n".format(t)) - counts = pd.read_csv(count_table, index_col=0, comment="#", sep="\t") - scores = pd.read_csv(score_table, index_col=0, comment="#", sep="\t") - if len(methods) > 1: logger.info("Rank aggregation") df_p = df_rank_aggregation(df, dfs, exps, method=aggregation) diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index 2be35178..a0f1608c 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -1419,6 +1419,8 @@ def 
_add_factors_from_handle(motifs, handle): m2f_direct = {} m2f_indirect = {} for line in open(map_file): + if line.startswith("#"): + continue try: motif, *factor_info = line.strip().split("\t") if len(factor_info) == 1: @@ -1431,7 +1433,11 @@ def _add_factors_from_handle(motifs, handle): except Exception: pass + m2f = pd.read_csv(map_file, sep="\t", comment="#", index_col=0) + for motif in motifs: + if motif.id in m2f.index: + motif.factor_info = m2f.loc[motif.id] if motif.id in m2f_direct: motif.factors[DIRECT_NAME] = m2f_direct[motif.id] if motif.id in m2f_indirect: diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 53d781dc..beb798b0 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -11,6 +11,7 @@ import re import shutil import logging +from collections import Counter import jinja2 import numpy as np @@ -835,12 +836,18 @@ def format_factors(motif, max_length=5): fmt_d = "{}" fmt_i = "{}" + if hasattr(motif, "factor_info"): + fcount = Counter([x.upper() for x in motif.factor_info["Factor"]]) + else: + fcount = Counter(motif.factors[DIRECT_NAME] + motif.factors[INDIRECT_NAME]) + direct = sorted( list( set( [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]] ) - ) + ), + key=lambda x: fcount[x], reverse=True ) indirect = sorted( list( @@ -851,23 +858,23 @@ def format_factors(motif, max_length=5): if x.upper() not in direct ] ) - ) + ), key=lambda x: fcount[x], reverse=True ) if len(direct) > max_length: show_factors = direct[:max_length] else: show_factors = direct[:] - for f in indirect: + for f in sorted(indirect, key=lambda x: fcount[x], reverse=True): if f not in show_factors: show_factors.append(f) if len(show_factors) >= max_length: break if "de novo" in show_factors: - show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"]) + show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"], key=lambda x: fcount[x], reverse=True) else: - show_factors = 
sorted(show_factors) + show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True) factor_str = ",".join( [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] From 47d2dfeba709b0f1178aebbd0812fe324351696c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 15:08:51 +0200 Subject: [PATCH 51/85] extra help --- gimmemotifs/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index 2f778384..ad07d663 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -337,7 +337,7 @@ def cli(sys_args): "-F", "--filter_cutoff", dest="filter_cutoff", - help="Cutoff to select non-redundant motifs. Default is 0.8.", + help="Cutoff to select non-redundant motifs. Default is 0.8, increase this value to get fewer motifs.", default=0.8, type=float, metavar="FLOAT", From df82b6160964069fda53361daae0da8c9d04180a Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 15:28:48 +0200 Subject: [PATCH 52/85] fix help --- gimmemotifs/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/cli.py b/gimmemotifs/cli.py index ad07d663..b24e3652 100644 --- a/gimmemotifs/cli.py +++ b/gimmemotifs/cli.py @@ -389,7 +389,7 @@ def cli(sys_args): p.add_argument( "--nogc", dest="gc", - help="Don't use GC% bins", + help="Don't use GC%% bins", action="store_false", default=True, ) From 0a86356665a20062f31d010917a44b0f81978310 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 19:57:04 +0200 Subject: [PATCH 53/85] print small numbers as <1 --- gimmemotifs/report.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index beb798b0..b01e2f54 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -360,6 +360,19 @@ def _background_gradient(self, s, m, M, cmap="PuBu", low=0, high=0): for color in c ] + def to_precision_str(self, subset=None, precision=0, 
include_zero=True): + subset = pd.IndexSlice[:, :] if subset is None else subset + subset = _non_reducing_slice(subset) + + def precision_str(x, precision=precision): + if (include_zero or x > 0) and x <= 10 ** -precision: + return f"<{10**-precision}" + else: + return f"{{0:.{precision}f}}".format(x) + + self.display_data.loc[subset] = self.data.loc[subset].applymap(precision_str) + return self + def _circle( self, subset=None, @@ -997,6 +1010,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): .wrap(subset=["% with motif"]) .align(subset=["% with motif"], location="center") .border(subset=["% with motif"], location="left") + .to_precision_str(subset=["% with motif"]) ) df_styled = df_styled.render() @@ -1109,6 +1123,7 @@ def roc_html_report( .wrap(subset=cols) .align(subset=bar_cols, location="center") .rename(columns=rename_columns) + .to_precision_str(subset=["% matches input", "%matches background"]) .render() ) else: From 01207e48a0773ddb2e5be914ae886181cdf5a94b Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 21:35:50 +0200 Subject: [PATCH 54/85] update moaps --- gimmemotifs/commands/motifs.py | 6 +++++- gimmemotifs/maelstrom.py | 33 +++++++++++++++++++++++++++------ gimmemotifs/moap.py | 27 ++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index 6b126a0b..c4489d43 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ -23,7 +23,11 @@ from gimmemotifs.stats import calc_stats_iterator from gimmemotifs.report import roc_html_report from gimmemotifs.scanner import scan_to_file -from gimmemotifs.utils import determine_file_type, narrowpeak_to_bed, write_equalsize_bedfile +from gimmemotifs.utils import ( + determine_file_type, + narrowpeak_to_bed, + write_equalsize_bedfile, +) logger = logging.getLogger("gimme.motifs") diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py 
index a9d50afe..6195960d 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -27,6 +27,7 @@ from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, dendrogram from sklearn.cluster import FeatureAgglomeration + # from scipy.spatial.distance import correlation # Plotting @@ -99,7 +100,9 @@ def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): mapfile = pfmfile.replace(".pwm", ".motif2factors.txt") if os.path.exists(mapfile): - m2f = pd.read_csv(mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#") + m2f = pd.read_csv( + mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#" + ) m2f["factors"] = m2f["factors"].str[:50] else: motifs = [m.id for m in read_motifs(pfmfile)] @@ -371,12 +374,26 @@ def run_maelstrom( if filter_redundant: logger.info("Selecting non-redundant motifs") - fa = FeatureAgglomeration(distance_threshold=filter_cutoff, n_clusters=None, affinity="correlation", linkage="complete", compute_full_tree=True) + fa = FeatureAgglomeration( + distance_threshold=filter_cutoff, + n_clusters=None, + affinity="correlation", + linkage="complete", + compute_full_tree=True, + ) fa.fit(scores) X_cluster = pd.DataFrame({"motif": scores.columns, "label": fa.labels_}) X_cluster = X_cluster.join(scores.var().to_frame(name="var"), on="motif") - selected_motifs = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")["motif"].values - nr_motif = X_cluster.sort_values("var").drop_duplicates(subset=["label"], keep="last")[["label", "motif"]].set_index("label") + selected_motifs = ( + X_cluster.sort_values("var") + .drop_duplicates(subset=["label"], keep="last")["motif"] + .values + ) + nr_motif = ( + X_cluster.sort_values("var") + .drop_duplicates(subset=["label"], keep="last")[["label", "motif"]] + .set_index("label") + ) X_cluster = X_cluster.join(nr_motif, rsuffix="_nr", on="label") motif_map = X_cluster[["motif", "motif_nr"]].set_index("motif") @@ -400,8 
+417,12 @@ def run_maelstrom( f.write(f"{motif.to_pfm()}\n") mapfile = pfmfile.replace(".pfm", ".motif2factors.txt") with open(mapfile, "w") as f: - f.write("# Note: this mapping is specifically created for this non-redundant set of motifs.\n") - f.write("# It also includes factors for motifs that were similar, but this can be\n") + f.write( + "# Note: this mapping is specifically created for this non-redundant set of motifs.\n" + ) + f.write( + "# It also includes factors for motifs that were similar, but this can be\n" + ) f.write("# specific to this analysis.\n") with open(mapfile, "a") as f: diff --git a/gimmemotifs/moap.py b/gimmemotifs/moap.py index 258069a0..d4053bc4 100644 --- a/gimmemotifs/moap.py +++ b/gimmemotifs/moap.py @@ -36,7 +36,9 @@ def warn(*args, **kwargs): from sklearn.linear_model import MultiTaskLassoCV, BayesianRidge from sklearn.multioutput import MultiOutputRegressor from sklearn.preprocessing import scale, LabelEncoder -from sklearn.svm import SVR +from sklearn.svm import LinearSVR +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline import xgboost @@ -592,12 +594,24 @@ def fit(self, df_X, df_y): X = df_X.loc[y.index] - model = MultiTaskLassoCV(selection="random", n_alphas=20, n_jobs=self.ncpus) + model = Pipeline( + [ + ("scale", StandardScaler()), + ( + "reg", + MultiTaskLassoCV( + fit_intercept=False, n_alphas=20, n_jobs=self.ncpus + ), + ), + ] + ) logger.debug("Fitting model") model.fit(df_X, df_y) logger.info("Done") - self.act_ = pd.DataFrame(model.coef_, columns=X.columns, index=y.columns).T + self.act_ = pd.DataFrame( + model.steps[1][1].coef_, index=y.columns, columns=X.columns + ).T def predict(self, df_X): return df_X.dot(self.act_.loc[df_X.columns]) @@ -653,13 +667,16 @@ def fit(self, df_X, df_y): self.columns = df_y.columns X = df_X.loc[y.index] - clf = SVR(kernel="linear") + clf = LinearSVR() self.model = MultiOutputRegressor(clf, n_jobs=1) logger.debug("Fitting model") 
self.model.fit(df_X, df_y) logger.info("Done") - self.act_ = pd.DataFrame({c: e.coef_[0] for c, e in zip(df_y.columns, self.model.estimators_)}, index=X.columns) + self.act_ = pd.DataFrame( + {c: e.coef_ for c, e in zip(df_y.columns, self.model.estimators_)}, + index=X.columns, + ) def predict(self, df_X): # print(self.model.predict(df_X) ) From 5bb73a11cef8103d2baec71416ea0871fb1e7e5f Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 22 Jul 2020 21:36:03 +0200 Subject: [PATCH 55/85] update report --- gimmemotifs/report.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index b01e2f54..20af73a8 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -860,7 +860,8 @@ def format_factors(motif, max_length=5): [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]] ) ), - key=lambda x: fcount[x], reverse=True + key=lambda x: fcount[x], + reverse=True, ) indirect = sorted( list( @@ -871,7 +872,9 @@ def format_factors(motif, max_length=5): if x.upper() not in direct ] ) - ), key=lambda x: fcount[x], reverse=True + ), + key=lambda x: fcount[x], + reverse=True, ) if len(direct) > max_length: @@ -885,7 +888,11 @@ def format_factors(motif, max_length=5): break if "de novo" in show_factors: - show_factors = ["de novo"] + sorted([f for f in show_factors if f != "de novo"], key=lambda x: fcount[x], reverse=True) + show_factors = ["de novo"] + sorted( + [f for f in show_factors if f != "de novo"], + key=lambda x: fcount[x], + reverse=True, + ) else: show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True) From e3270038344361ca0abde6da0cd8b36c63f787eb Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 08:17:41 +0200 Subject: [PATCH 56/85] update test --- test/test_maelstrom.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py index 4edda61b..66f602e2 100644 
--- a/test/test_maelstrom.py +++ b/test/test_maelstrom.py @@ -26,13 +26,30 @@ def test1_maelstrom(self): self.clusters, "mm10", self.outdir, + filter_redundant=False, score_table=self.score_table, count_table=self.count_table, plot=False, ) df = pd.read_table(self.outfile, index_col=0, comment="#") print(df.shape) + self.assertEquals((623, 5), df.shape) + + # Filter redundant motifs + run_maelstrom( + self.clusters, + "mm10", + self.outdir, + filter_redundant=True, + score_table=self.score_table, + count_table=self.count_table, + plot=False, + ) + df = pd.read_table(self.outfile, index_col=0, comment="#") + print(df.shape) + self.assertEquals((156, 5), df.shape) + for fname in glob(os.path.join(self.outdir, "activity*")): os.unlink(fname) From 6165925a0b88b2031cc32d69a648e2d9ebf74447 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 08:45:09 +0200 Subject: [PATCH 57/85] updated for readthedocs --- docs/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2ba060ac..9429d2a2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -12,5 +12,3 @@ six future statsmodels tqdm -xgboost >=0.71 -sklearn-contrib-lightning==0.4.0 From 857f13cb80c73e3aee47d26d61f365dc42104168 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 08:56:41 +0200 Subject: [PATCH 58/85] readthedocs --- .rtd-environment.yml | 30 ++++++++++++++++++++++++++++++ readthedocs.yml | 15 +++++---------- 2 files changed, 35 insertions(+), 10 deletions(-) create mode 100644 .rtd-environment.yml diff --git a/.rtd-environment.yml b/.rtd-environment.yml new file mode 100644 index 00000000..dd64911b --- /dev/null +++ b/.rtd-environment.yml @@ -0,0 +1,30 @@ +name: gimmemotifs +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - configparser + - diskcache + - feather-format + - genomepy >=0.8.3 + - jinja2 + - logomaker + - matplotlib-base >=3.1.2 + - ncurses + - numpy + - 
pandas >=1.0.3 + - pillow + - pyarrow >=0.16.0 + - pybedtools + - python >=3 + - python-xxhash + - pyyaml >=3.10 + - qnorm + - scikit-learn >=0.18 + - scipy >=1.3.0 + - seaborn + - statsmodels + - tqdm >=4.27.0 + - xdg + - xgboost >=0.71 diff --git a/readthedocs.yml b/readthedocs.yml index 9a6a856f..63c8c19b 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -5,14 +5,9 @@ version: 2 sphinx: configuration: docs/conf.py +conda: + file: .rtd-environment.yml + python: - version: 3.7 - install: - - requirements: docs/requirements.txt - - method: pip - path: . - extra_requirements: - - docs - - method: setuptools - path: another/package - system_packages: true + version: 3.7 + setup_py_install: true From 82cbb17ea2d5bb1e72cbdef1fb64f682b911f542 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 09:06:02 +0200 Subject: [PATCH 59/85] readthedocs --- readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs.yml b/readthedocs.yml index 63c8c19b..1e54352b 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -6,7 +6,7 @@ sphinx: configuration: docs/conf.py conda: - file: .rtd-environment.yml + environment: .rtd-environment.yml python: version: 3.7 From 579e9859ab6566d6fb61048e9c6bcde7305da251 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 09:15:02 +0200 Subject: [PATCH 60/85] readthedocs --- readthedocs.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/readthedocs.yml b/readthedocs.yml index 1e54352b..f9d483db 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -10,4 +10,6 @@ conda: python: version: 3.7 - setup_py_install: true + install: + - method: pip + path: . 
From bcd2d8f657700be8c32c23a5459c9ae7e35b0024 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 09:41:41 +0200 Subject: [PATCH 61/85] update readthedocs --- .rtd-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.rtd-environment.yml b/.rtd-environment.yml index dd64911b..5fafc7d3 100644 --- a/.rtd-environment.yml +++ b/.rtd-environment.yml @@ -21,6 +21,7 @@ dependencies: - python-xxhash - pyyaml >=3.10 - qnorm + - represent - scikit-learn >=0.18 - scipy >=1.3.0 - seaborn From ab0432986ea471ba50be149f4e09ba92ed58c34e Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 10:51:57 +0200 Subject: [PATCH 62/85] update MaelstromResult to deal with new format --- gimmemotifs/maelstrom.py | 79 ++++++++++++++++++++++++++++------------ gimmemotifs/report.py | 61 +++++++++++++++++-------------- 2 files changed, 90 insertions(+), 50 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 6195960d..fc5f936b 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -41,7 +41,7 @@ from gimmemotifs.moap import moap, Moap, scan_to_table from gimmemotifs.rank import rankagg from gimmemotifs.motif import read_motifs -from gimmemotifs.report import maelstrom_html_report +from gimmemotifs.report import maelstrom_html_report, format_factors from gimmemotifs.utils import join_max, pfmfile_location from multiprocessing import Pool @@ -563,7 +563,10 @@ def __init__(self, outdir): raise FileNotFoundError("No such directory: " + outdir) # Load motifs - fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) + fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m")) + print(fnames) + if len(fnames) == 0: + fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) if len(fnames) > 0: pfmfile = fnames[0] with open(pfmfile) as fin: @@ -582,6 +585,17 @@ def __init__(self, outdir): self.result = pd.read_table( os.path.join(outdir, "final.out.txt"), comment="#", index_col=0 ) + self.correlation = 
self.result.loc[ + :, self.result.columns.str.contains("correlation") + ] + self.percent_match = self.result.loc[ + :, self.result.columns.str.contains("% with motif") + ] + self.result = self.result.loc[ + :, + ~self.result.columns.str.contains("correlation") + & ~self.result.columns.str.contains("% with motif"), + ] # Read motif results self.scores = pd.read_table( @@ -610,10 +624,11 @@ def plot_heatmap( min_freq=0.01, threshold=2, name=True, - indirect=False, + indirect=True, figsize=None, - max_len=50, + max_number_factors=5, aspect=1, + cmap="RdBu_r", **kwargs, ): """Plot clustered heatmap of predicted motif activity. @@ -622,7 +637,7 @@ def plot_heatmap( ---------- kind : str, optional Which data type to use for plotting. Default is 'final', which will - plot the result of the rang aggregation. Other options are 'freq' + plot the result of the rank aggregation. Other options are 'freq' for the motif frequencies, or any of the individual activities such as 'rf.score'. @@ -636,19 +651,22 @@ def plot_heatmap( Use factor names instead of motif names for plotting. indirect : bool, optional - Include indirect factors. Default is False. + Include indirect factors (computationally predicted or non-curated). Default is True. - max_len : int, optional - Truncate the list of factors to this maximum length. + max_number_factors : int, optional + Truncate the list of factors to this maximum size. figsize : tuple, optional Tuple of figure size (width, height). aspect : int, optional Aspect ratio for tweaking the plot. + + cmap : str, optional + Color paletter to use, RdBu_r by default. 
kwargs : other keyword arguments - All other keyword arguments are passed to sns.clustermap + All other keyword arguments are passed to sns.heatmap Returns ------- @@ -663,13 +681,17 @@ def plot_heatmap( filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq) idx = self.result.loc[filt].index + + if idx.shape[0] == 0: + logger.warning("Empty matrix, try lowering the threshold") + return + if idx.shape[0] >= 100: logger.warning("The filtered matrix has more than 100 rows.") logger.warning( "It might be worthwhile to increase the threshold for visualization" ) - cmap = "RdBu_r" if kind == "final": data = self.result elif kind == "freq": @@ -687,18 +709,26 @@ def plot_heatmap( else: raise ValueError("Unknown dtype") - # print(data.head()) - # plt.figure( m = data.loc[idx] - vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99)) - vmin = -vmax + + if "vmax" in kwargs: + vmax = kwargs.pop("vmax") + else: + vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99)) + + if "vmin" in kwargs: + vmin = kwargs.pop("vmin") + else: + vmin = -vmax + if name: m["factors"] = [ - join_max( - _get_factor_list(self.motifs[n], indirect), - max_len, - ",", - suffix=",(...)", + format_factors( + self.motifs[n], + max_length=max_number_factors, + html=False, + include_indirect=indirect, + extra_str=",..", ) for n in m.index ] @@ -706,7 +736,8 @@ def plot_heatmap( h, w = m.shape if figsize is None: - figsize = (3 + m.shape[1] / 4, 1 + m.shape[0] / 3) + figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3) + fig = plt.figure(figsize=figsize) npixels = 30 g = GridSpec( @@ -714,8 +745,8 @@ def plot_heatmap( ) ax1 = fig.add_subplot(g[0, :]) ax2 = fig.add_subplot(g[1, :]) - ax2.set_title("Significance (-log10(p-value))") - dm = pdist(m, metric="euclidean") + ax2.set_title("aggregated z-score") + dm = pdist(m, metric="correlation") hc = linkage(dm, method="ward") leaves = dendrogram(hc, no_plot=True)["leaves"] cg = sns.heatmap( @@ -727,10 +758,12 @@ def plot_heatmap( linewidths=1, 
vmin=vmin, vmax=vmax, + **kwargs, ) + plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90) plt.tight_layout() # cg.ax_col_dendrogram.set_visible(False) - # plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) + # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) return cg def plot_scores(self, motifs, name=True, max_len=50): diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 20af73a8..dd65aefd 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -845,9 +845,12 @@ def create_denovo_motif_report( ) -def format_factors(motif, max_length=5): - fmt_d = "{}" - fmt_i = "{}" +def format_factors(motif, max_length=5, html=True, include_indirect=True, extra_str=", (...)"): + if html: + fmt_d = "{}" + fmt_i = "{}" + else: + fmt_d = fmt_i = "{}" if hasattr(motif, "factor_info"): fcount = Counter([x.upper() for x in motif.factor_info["Factor"]]) @@ -863,19 +866,22 @@ def format_factors(motif, max_length=5): key=lambda x: fcount[x], reverse=True, ) - indirect = sorted( - list( - set( - [ - x.upper() - for x in motif.factors[INDIRECT_NAME] - if x.upper() not in direct - ] - ) - ), - key=lambda x: fcount[x], - reverse=True, - ) + + indirect = [] + if include_indirect: + indirect = sorted( + list( + set( + [ + x.upper() + for x in motif.factors[INDIRECT_NAME] + if x.upper() not in direct + ] + ) + ), + key=lambda x: fcount[x], + reverse=True, + ) if len(direct) > max_length: show_factors = direct[:max_length] @@ -901,17 +907,18 @@ def format_factors(motif, max_length=5): ) if len(direct + indirect) > max_length: - factor_str += ", (...)" - - tooltip = "" - if len(direct) > 0: - tooltip += "direct: " + ",".join(sorted(direct)) - if len(indirect) > 0: - if tooltip != "": - tooltip += " " - tooltip += "predicted: " + ",".join(sorted(indirect)) - - factor_str = '
' + factor_str + "
" + factor_str += extra_str + + if html: + tooltip = "" + if len(direct) > 0: + tooltip += "direct: " + ",".join(sorted(direct)) + if len(indirect) > 0: + if tooltip != "": + tooltip += " " + tooltip += "predicted: " + ",".join(sorted(indirect)) + + factor_str = '
' + factor_str + "
" return factor_str From a5ae647c4355c9ecbbc3c65feadf9949dd20d04e Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 10:55:22 +0200 Subject: [PATCH 63/85] updated docs --- docs/examples.rst | 8 ++++---- docs/reference.rst | 38 ++++++++++++++++++++++++++++++-------- docs/tutorials.rst | 12 ++++++------ readthedocs.yml | 11 +++++++---- 4 files changed, 47 insertions(+), 22 deletions(-) diff --git a/docs/examples.rst b/docs/examples.rst index 33fd06c7..9eedc21a 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -41,10 +41,10 @@ Compare motifs between data sets $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.out/ The output scores of ``gimme maelstrom`` represent the combined result of multiple methods. -The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using rank aggregation. -The score that is shown is the -log10(p-value), where the p-value (from the rank aggregation) is corrected for multiple testing. -This procedure is then repeated with the ranking reversed. These are shown as negative values. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that presence of +the motif or a higher motif score is associated with higher signal in a specific sample. 
Create sequence logos --------------------- diff --git a/docs/reference.rst b/docs/reference.rst index f29e2044..2ba5b10e 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -362,11 +362,26 @@ This command can be used to identify differential motifs between two or more dat :: -h, --help show this help message and exit - -p PFMFILE, --pfmfile PFMFILE + -p pfmfile, --pfmfile pfmfile PFM file with motifs (default: - gimme.vertebrate.v5.0.pwm) + gimme.vertebrate.v5.0.pfm) + --no-filter Don't remove redundant motifs. + -F FLOAT, --filter_cutoff FLOAT + Cutoff to select non-redundant motifs. Default is 0.8, + increase this value to get fewer motifs. + --nocenter Don't mean-center the rows by default -m NAMES, --methods NAMES Run with specific methods + -a method, --aggregation method + How to combine motifs from individual methods. Default + is "int_stouffer", for inverse normal transform of + ranks, followed by Stouffer's method to combine + z-scores. Alternatively, specify "stuart" for log- + transformed rank aggregation p-values. + -N INT, --nthreads INT + Number of threads (default 12) + --rawscore Don't z-score normalize motif scores + --nogc Don't use GC% bins **Input file formats** @@ -407,16 +422,23 @@ The second option looks like this: This is a tab-separated table, with a header describing the experiments. In case of sequencing data, such as ChIP-seq, ATAC-seq or DNaseI seq, we recommend to use **log-transformed** read counts which are -**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after the log-transformatiion step, -for instance by quantile normalization or scaling. +**mean-centered per row**. For optimal results, it is recommended to normalize between experiments (columns) after + the log-transformatiion step, for instance by quantile normalization or scaling. +By default, ``gimme maelstrom`` will mean-center the input, disable this with ``--nocenter``. 
The second input format generally gives better results than the first one and would be the recommended format. The output scores of ``gimme maelstrom`` represent the combined result of multiple methods. -The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using the rank aggregation method from `Kolde, 2012 `_. -The score that is shown is the -log10(p-value), where the p-value comes from the rank aggregation. -This procedure is then repeated with the ranking reversed. These are shown as negative values. +This z-score represents the combined result of multiple methods. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that presence of +the motif or a higher motif score is associated with higher signal in a specific sample. + +By default, ``gimme maelstrom`` selects a non-redundant set of motifs by clustering the motifs based on scores in the set of +input sequences. You can disable this by using the ``--no-filter`` argument. You can tweak the number of selected motifs by +changing the ``--filter-cutoff`` parameter. By default this is set to ``0.8``. Increase this value to select fewer motifs, +decrease it to select more motifs. Keep in mind that you may start to lose biologically relevant motifs if you set this too high. .. _`gimme_scan`: diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 088f8710..1439dcac 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -330,11 +330,11 @@ There output directory contains several files: The two motif files, ``motif.count.txt.gz`` and ``motif.score.txt.gz`` contain the motif scan results. The ``activity.*.out.txt`` files are tables with the results of the individual methods. 
The main result is ``final.out.txt``, which integrates all individual methods in a final score. -This score represents the combined result of multiple methods. -The individual results from different methods are ranked from high-scoring motif to low-scoring motif -and then aggregated using the rank aggregation method from `Kolde, 2012 `_. -The score that is shown is the -log10(p-value). -This procedure is then repeated with the ranking reversed. These are shown as negative values. +This z-score represents the combined result of multiple methods. +The individual results from different methods are ranked from high-scoring motif to low-scoring motif and converted +to z-scores using the inverse normal transformation. The z-scores from individual methods are then combined using +Stouffer's method. The score that is shown is the aggregated z-score. A higher z-score means that presence of +the motif or a higher motif score is associated with higher signal in a specific sample. The file ``gimme.maelstrom.report.html`` contains a graphical summary of this file that can be opened in your web browser. @@ -359,7 +359,7 @@ This will show a heatmap like this: .. image:: images/heatmap.png We see that the expected motifs for different cell types are identified. GATA/TAL1 for Erythrocytes, CEBP for monocytes, LEF/TCF for T cells (ie. Wnt signaling), SPIB and PAX5 for B cells and so on. -Keep in mind that this shows only the most relevant motifs (-log10 p-value cutoff of 6), there are more relevant motifs. +Keep in mind that this shows only the most relevant motifs (z-score threshold of 6), there are more relevant motifs. This example was run only on 1,000 variable enhancer. A file with more regions, ``hg19.blood.most_variable.10k.txt`` for this example, will usually yield better results. The Jupyter notebook example `maelstrom.ipynb `_ shows a more extensive example on how to work with maelstrom results in Python. 
diff --git a/readthedocs.yml b/readthedocs.yml index f9d483db..e458c84e 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,15 +1,18 @@ # Required version: 2 -# Build documentation in the docs/ directory with Sphinx +conda: + environment: .rtd-environment.yml + + # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py -conda: - environment: .rtd-environment.yml +build: + image: latest python: version: 3.7 install: - - method: pip + - method: setuptools path: . From c448ee8c4fd20bd98fe6461046304957d18a975b Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 11:51:29 +0200 Subject: [PATCH 64/85] trd --- .rtd-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.rtd-environment.yml b/.rtd-environment.yml index 5fafc7d3..2f23c0d2 100644 --- a/.rtd-environment.yml +++ b/.rtd-environment.yml @@ -29,3 +29,4 @@ dependencies: - tqdm >=4.27.0 - xdg - xgboost >=0.71 + - sphinx_bootstrap_theme From bb1f82a803967e8b174dc0c472dac7d1bf4cfe77 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 13:46:16 +0200 Subject: [PATCH 65/85] rtd --- .rtd-environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.rtd-environment.yml b/.rtd-environment.yml index 2f23c0d2..cbcad0d7 100644 --- a/.rtd-environment.yml +++ b/.rtd-environment.yml @@ -30,3 +30,4 @@ dependencies: - xdg - xgboost >=0.71 - sphinx_bootstrap_theme + - numpydoc From 5c49f59953050664ba4bddc3b8065787bf707fa3 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 17:01:20 +0200 Subject: [PATCH 66/85] move format_factors to motif --- gimmemotifs/maelstrom.py | 7 ++-- gimmemotifs/motif.py | 78 ++++++++++++++++++++++++++++++++++++++ gimmemotifs/report.py | 81 +--------------------------------------- 3 files changed, 82 insertions(+), 84 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index fc5f936b..d5847214 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py 
@@ -41,7 +41,7 @@ from gimmemotifs.moap import moap, Moap, scan_to_table from gimmemotifs.rank import rankagg from gimmemotifs.motif import read_motifs -from gimmemotifs.report import maelstrom_html_report, format_factors +from gimmemotifs.report import maelstrom_html_report from gimmemotifs.utils import join_max, pfmfile_location from multiprocessing import Pool @@ -564,7 +564,7 @@ def __init__(self, outdir): # Load motifs fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m")) - print(fnames) + if len(fnames) == 0: fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) if len(fnames) > 0: @@ -723,8 +723,7 @@ def plot_heatmap( if name: m["factors"] = [ - format_factors( - self.motifs[n], + self.motifs[n].format_factors( max_length=max_number_factors, html=False, include_indirect=indirect, diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index a0f1608c..e4eb7f6d 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -10,6 +10,7 @@ import sys import random from math import log, sqrt +from collections import Counter from warnings import warn from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME @@ -1303,6 +1304,83 @@ def wiggle_pwm(self): return self.wiggled_pwm + def format_factors(self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"): + if html: + fmt_d = "{}" + fmt_i = "{}" + else: + fmt_d = fmt_i = "{}" + + if hasattr(self, "factor_info"): + fcount = Counter([x.upper() for x in self.factor_info["Factor"]]) + else: + fcount = Counter(self.factors[DIRECT_NAME] + self.factors[INDIRECT_NAME]) + + direct = sorted( + list( + set( + [x.upper() if x != "de novo" else x for x in self.factors[DIRECT_NAME]] + ) + ), + key=lambda x: fcount[x], + reverse=True, + ) + + indirect = [] + if include_indirect: + indirect = sorted( + list( + set( + [ + x.upper() + for x in self.factors[INDIRECT_NAME] + if x.upper() not in direct + ] + ) + ), + key=lambda x: fcount[x], + reverse=True, + ) + + if len(direct) > max_length: 
+ show_factors = direct[:max_length] + else: + show_factors = direct[:] + for f in sorted(indirect, key=lambda x: fcount[x], reverse=True): + if f not in show_factors: + show_factors.append(f) + if len(show_factors) >= max_length: + break + + if "de novo" in show_factors: + show_factors = ["de novo"] + sorted( + [f for f in show_factors if f != "de novo"], + key=lambda x: fcount[x], + reverse=True, + ) + else: + show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True) + + factor_str = ",".join( + [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] + ) + + if len(direct + indirect) > max_length: + factor_str += extra_str + + if html: + tooltip = "" + if len(direct) > 0: + tooltip += "direct: " + ",".join(sorted(direct)) + if len(indirect) > 0: + if tooltip != "": + tooltip += " " + tooltip += "predicted: " + ",".join(sorted(indirect)) + + factor_str = '
' + factor_str + "
" + + return factor_str + def default_motifs(): """Return list of Motif instances from default motif database.""" diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index dd65aefd..aa0b3a9a 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -11,7 +11,6 @@ import re import shutil import logging -from collections import Counter import jinja2 import numpy as np @@ -845,84 +844,6 @@ def create_denovo_motif_report( ) -def format_factors(motif, max_length=5, html=True, include_indirect=True, extra_str=", (...)"): - if html: - fmt_d = "{}" - fmt_i = "{}" - else: - fmt_d = fmt_i = "{}" - - if hasattr(motif, "factor_info"): - fcount = Counter([x.upper() for x in motif.factor_info["Factor"]]) - else: - fcount = Counter(motif.factors[DIRECT_NAME] + motif.factors[INDIRECT_NAME]) - - direct = sorted( - list( - set( - [x.upper() if x != "de novo" else x for x in motif.factors[DIRECT_NAME]] - ) - ), - key=lambda x: fcount[x], - reverse=True, - ) - - indirect = [] - if include_indirect: - indirect = sorted( - list( - set( - [ - x.upper() - for x in motif.factors[INDIRECT_NAME] - if x.upper() not in direct - ] - ) - ), - key=lambda x: fcount[x], - reverse=True, - ) - - if len(direct) > max_length: - show_factors = direct[:max_length] - else: - show_factors = direct[:] - for f in sorted(indirect, key=lambda x: fcount[x], reverse=True): - if f not in show_factors: - show_factors.append(f) - if len(show_factors) >= max_length: - break - - if "de novo" in show_factors: - show_factors = ["de novo"] + sorted( - [f for f in show_factors if f != "de novo"], - key=lambda x: fcount[x], - reverse=True, - ) - else: - show_factors = sorted(show_factors, key=lambda x: fcount[x], reverse=True) - - factor_str = ",".join( - [fmt_d.format(f) if f in direct else fmt_i.format(f) for f in show_factors] - ) - - if len(direct + indirect) > max_length: - factor_str += extra_str - - if html: - tooltip = "" - if len(direct) > 0: - tooltip += "direct: " + ",".join(sorted(direct)) - if 
len(indirect) > 0: - if tooltip != "": - tooltip += " " - tooltip += "predicted: " + ",".join(sorted(indirect)) - - factor_str = '
' + factor_str + "
" - - return factor_str - - def motif_to_factor_series(series, pfmfile=None, motifs=None): if motifs is None: motifs = read_motifs(pfmfile, as_dict=True) @@ -932,7 +853,7 @@ def motif_to_factor_series(series, pfmfile=None, motifs=None): else: index = series.index - factors = [format_factors(motifs[motif]) for motif in series] + factors = [motifs[motif].format_factors(html=True) for motif in series] return pd.Series(data=factors, index=index) From 951459fa2689084d46ebfdfb35e98bf4746a3b45 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 17:02:12 +0200 Subject: [PATCH 67/85] fix slow threshold determination --- gimmemotifs/scanner.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index bf4c2170..d2726993 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -931,8 +931,11 @@ def best_match(self, seqs, scan_rc=True, zscore=False, gc=False): yield [m[0] for m in matches] def get_seq_bin(self, seq): - useq = seq.upper() - gc = round((useq.count("G") + useq.count("C")) / len(useq), 2) + if len(str(seq)) == 0: + gc = 0 + else: + useq = seq.upper() + gc = round((useq.count("G") + useq.count("C")) / len(useq), 2) if gc == 0: gc = 0.01 for b_start, b_end in self.gc_bins: @@ -1011,29 +1014,23 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): columns=_threshold.columns, ) - min_frac = min(gc_bin_count.values()) + nseqs = int(20000 / np.sum(list(gc_bin_count.values()))) t = {} maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns) # We do this in a loop as the DataFrame will get too big to fit in memory # when the difference between the number of sequences per gc_bin is very # high. 
- for motif in _threshold.columns: - dfs = [ - _threshold.loc[gc_bin, motif].sample( - int(count / min_frac * 1000), replace=True, random_state=42 - ) - for gc_bin, count in gc_bin_count.items() - ] - - fpr_df = pd.concat(dfs) - val = fpr_df.quantile(0.99, interpolation="higher") + _threshold = _threshold.reset_index() + idx = np.hstack([_threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(nseqs * count, re +place=True, random_state=42).index.values for gc_bin, count in gc_bin_count.items()]) + for motif in _threshold.columns[1:]: + val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher") if val < maxt.loc[motif]: t[motif] = val else: t[motif] = None - return t - + def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): scan_func = partial( scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc From 227941e078bb904e38ee94ef5a5861d6eb166fa5 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 17:04:31 +0200 Subject: [PATCH 68/85] style --- gimmemotifs/conversion.py | 103 ++++++++++++++++++++++++++++++++++++++ gimmemotifs/maelstrom.py | 2 +- gimmemotifs/motif.py | 9 +++- gimmemotifs/scanner.py | 12 +++-- 4 files changed, 120 insertions(+), 6 deletions(-) create mode 100644 gimmemotifs/conversion.py diff --git a/gimmemotifs/conversion.py b/gimmemotifs/conversion.py new file mode 100644 index 00000000..ed2a1487 --- /dev/null +++ b/gimmemotifs/conversion.py @@ -0,0 +1,103 @@ +# import mygene +import pandas as pd +import pybedtools +from genomepy import Genome +import sys +from gimmemotifs.fasta import Fasta + +# mg = mygene.MyGeneInfo() + +# xli = ["gata3"] + +# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all") + +# for hit in out: +# print(hit) +# # if "genomic_pos" in hit: +# # print("{}:{}-{}\t{}".format( +# # hit["genomic_pos"]["chr"], +# # hit["genomic_pos"]["start"], +# # hit["genomic_pos"]["end"], +# # hit["query"], +# # )) + +# sys.exit() +from functools import 
singledispatch + + +@singledispatch +def scan(obj): + # default implementation + raise NotImplementedError(f"Not implemented for {type(obj)}") + + +@scan.register(pd.DataFrame) +def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"): + if not set(columns).issubset(df.columns): + raise ValueError(f"Expected columns {columns}") + + if len(columns) == 3: + # Assume this is chromosome start, end + g = Genome(genome) + seqs = list( + ( + df[columns[0]] + + ":" + + df[columns[1]].astype(str) + + "-" + + df[columns[2]].astype(str) + ).values + ) + return g.track2fasta(seqs) + elif len(columns) == 1: + # Assume this is some kind of gene_id + return df[columns[0]].values + + +# @scan.register(pybedtools.BedTool) +# @profile +def _scan_bedtool(bed, genome="hg38"): + g = Genome(genome) + intervals = [g[f.chrom][f.start : f.stop] for f in bed] + return intervals + + +# @profile +def _scan_bedtool2(bed, genome="hg38"): + g = Genome(genome) + return Fasta(bed.sequence(fi=g.filename).seqfn).seqs + + +import requests + +rest_url = "https://rest.ensembl.org/info/species" +r = requests.get(rest_url, headers={"Content-Type": "application/json"}) + +if not r.ok: + r.raise_for_status() + +json = r.json() + +print(json) + + +# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822, +# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]}) +# b = pybedtools.BedTool("5k.bed") +# # for f in b: +# # print(f) +# # break +# seqs = _scan_bedtool(b, genome="Spur_3.1") +# seqs = _scan_bedtool2(b, genome="Spur_3.1") + +# g = Genome("hg19") +# print(g["chr1"][1000000:1000100]) + +# FASTA file +# BED file +# region file +# Gene file +# - promoter (all species) +# - closest accessible region (human) +# - sum / mean / max of regions within distance of promoter +# - sum / mean / max of regions within weighted distance of promoter diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index d5847214..d764a1c9 100644 
--- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -564,7 +564,7 @@ def __init__(self, outdir): # Load motifs fnames = glob.glob(os.path.join(outdir, "nonredundant*.p[fw]m")) - + if len(fnames) == 0: fnames = glob.glob(os.path.join(outdir, "*.p[fw]m")) if len(fnames) > 0: diff --git a/gimmemotifs/motif.py b/gimmemotifs/motif.py index e4eb7f6d..253bc410 100644 --- a/gimmemotifs/motif.py +++ b/gimmemotifs/motif.py @@ -1304,7 +1304,9 @@ def wiggle_pwm(self): return self.wiggled_pwm - def format_factors(self, max_length=5, html=False, include_indirect=True, extra_str=", (...)"): + def format_factors( + self, max_length=5, html=False, include_indirect=True, extra_str=", (...)" + ): if html: fmt_d = "{}" fmt_i = "{}" @@ -1319,7 +1321,10 @@ def format_factors(self, max_length=5, html=False, include_indirect=True, extra_ direct = sorted( list( set( - [x.upper() if x != "de novo" else x for x in self.factors[DIRECT_NAME]] + [ + x.upper() if x != "de novo" else x + for x in self.factors[DIRECT_NAME] + ] ) ), key=lambda x: fcount[x], diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index d2726993..cfccc415 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -1021,8 +1021,14 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): # when the difference between the number of sequences per gc_bin is very # high. 
_threshold = _threshold.reset_index() - idx = np.hstack([_threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(nseqs * count, re -place=True, random_state=42).index.values for gc_bin, count in gc_bin_count.items()]) + idx = np.hstack( + [ + _threshold[_threshold[_threshold.columns[0]] == gc_bin] + .sample(nseqs * count, replace=True, random_state=42) + .index.values + for gc_bin, count in gc_bin_count.items() + ] + ) for motif in _threshold.columns[1:]: val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher") if val < maxt.loc[motif]: @@ -1030,7 +1036,7 @@ def get_gc_thresholds(self, seqs, motifs=None, zscore=False): else: t[motif] = None return t - + def _scan_sequences_with_motif(self, motifs, seqs, nreport, scan_rc): scan_func = partial( scan_seq_mult, motifs=motifs, nreport=nreport, scan_rc=scan_rc From bd321e633e72111b11548ec2da15b9f66b54ab13 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 19:24:16 +0200 Subject: [PATCH 69/85] typo --- docs/tutorials.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 1439dcac..9aae5889 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -320,7 +320,7 @@ There is also a larger file, that contains more regions ``hg19.blood.most_variab $ gimme maelstrom hg19.blood.most_variable.1k.txt hg19 maelstrom.blood.1k.out -There output directory contains several files: +The output directory contains several files: :: @@ -342,7 +342,6 @@ The file ``gimme.maelstrom.report.html`` contains a graphical summary of this fi You can sort on the different columns by clicking on them. - The following Python snippet will create a heatmap of the results. .. 
code-block:: python From 2b4d5e6e58f64e7091cd00bdc5922f9ca801cba9 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Thu, 23 Jul 2020 19:27:37 +0200 Subject: [PATCH 70/85] flakefix --- gimmemotifs/conversion.py | 103 -------------------------------------- gimmemotifs/maelstrom.py | 2 +- gimmemotifs/report.py | 2 +- 3 files changed, 2 insertions(+), 105 deletions(-) delete mode 100644 gimmemotifs/conversion.py diff --git a/gimmemotifs/conversion.py b/gimmemotifs/conversion.py deleted file mode 100644 index ed2a1487..00000000 --- a/gimmemotifs/conversion.py +++ /dev/null @@ -1,103 +0,0 @@ -# import mygene -import pandas as pd -import pybedtools -from genomepy import Genome -import sys -from gimmemotifs.fasta import Fasta - -# mg = mygene.MyGeneInfo() - -# xli = ["gata3"] - -# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all") - -# for hit in out: -# print(hit) -# # if "genomic_pos" in hit: -# # print("{}:{}-{}\t{}".format( -# # hit["genomic_pos"]["chr"], -# # hit["genomic_pos"]["start"], -# # hit["genomic_pos"]["end"], -# # hit["query"], -# # )) - -# sys.exit() -from functools import singledispatch - - -@singledispatch -def scan(obj): - # default implementation - raise NotImplementedError(f"Not implemented for {type(obj)}") - - -@scan.register(pd.DataFrame) -def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"): - if not set(columns).issubset(df.columns): - raise ValueError(f"Expected columns {columns}") - - if len(columns) == 3: - # Assume this is chromosome start, end - g = Genome(genome) - seqs = list( - ( - df[columns[0]] - + ":" - + df[columns[1]].astype(str) - + "-" - + df[columns[2]].astype(str) - ).values - ) - return g.track2fasta(seqs) - elif len(columns) == 1: - # Assume this is some kind of gene_id - return df[columns[0]].values - - -# @scan.register(pybedtools.BedTool) -# @profile -def _scan_bedtool(bed, genome="hg38"): - g = Genome(genome) - intervals = [g[f.chrom][f.start : f.stop] for f in bed] - 
return intervals - - -# @profile -def _scan_bedtool2(bed, genome="hg38"): - g = Genome(genome) - return Fasta(bed.sequence(fi=g.filename).seqfn).seqs - - -import requests - -rest_url = "https://rest.ensembl.org/info/species" -r = requests.get(rest_url, headers={"Content-Type": "application/json"}) - -if not r.ok: - r.raise_for_status() - -json = r.json() - -print(json) - - -# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822, -# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]}) -# b = pybedtools.BedTool("5k.bed") -# # for f in b: -# # print(f) -# # break -# seqs = _scan_bedtool(b, genome="Spur_3.1") -# seqs = _scan_bedtool2(b, genome="Spur_3.1") - -# g = Genome("hg19") -# print(g["chr1"][1000000:1000100]) - -# FASTA file -# BED file -# region file -# Gene file -# - promoter (all species) -# - closest accessible region (human) -# - sum / mean / max of regions within distance of promoter -# - sum / mean / max of regions within weighted distance of promoter diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index d764a1c9..d214ffa8 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -661,7 +661,7 @@ def plot_heatmap( aspect : int, optional Aspect ratio for tweaking the plot. - + cmap : str, optional Color paletter to use, RdBu_r by default. 
diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index aa0b3a9a..bb5caa6f 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -30,7 +30,7 @@ from gimmemotifs.comparison import MotifComparer from gimmemotifs.fasta import Fasta from gimmemotifs.motif import read_motifs -from gimmemotifs.config import MotifConfig, DIRECT_NAME, INDIRECT_NAME +from gimmemotifs.config import MotifConfig from gimmemotifs.plot import roc_plot from gimmemotifs.stats import calc_stats, add_star, write_stats from gimmemotifs import __version__ From 21075494c1ecaaf628526a0f0f71b25cb0da949a Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Fri, 24 Jul 2020 08:08:22 +0200 Subject: [PATCH 71/85] update report --- gimmemotifs/maelstrom.py | 25 ++++++++++++------------- gimmemotifs/report.py | 14 ++++++++------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index d214ffa8..95f68c72 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -197,6 +197,10 @@ def df_rank_aggregation(df, dfs, exps, method="int_stouffer"): if df.shape[1] != 1: df_p = df_p[df.columns] + if method == "int_stouffer": + df_p.columns = ["z-score " + c for c in df_p.columns] + else: + df_p.columns = ["activity " + c for c in df_p.columns] return df_p @@ -504,19 +508,16 @@ def run_maelstrom( logger.info("Rank aggregation") df_p = df_rank_aggregation(df, dfs, exps, method=aggregation) + # Add percentage of input sequences with motif + df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100 + if df.shape[1] > 1: # Add correlation between motif score and signal logger.info("Correlation") - cols = df_p.columns - for col in cols[::-1]: - df_p.insert(0, f"correlation {col}", 0) + for col in df.columns: + df_p[f"corr {col}"] = 0 for motif in df_p.index: - df_p.loc[motif, f"correlation {col}"] = pearsonr( - df[col], scores[motif] - )[0] - - # Add percentage of input sequences with motif - df_p.insert(0, "% 
with motif", counts[df_p.index].sum(0) / df.shape[0] * 100) + df_p.loc[motif, f"corr {col}"] = pearsonr(df[col], scores[motif])[0] df_p.to_csv(os.path.join(outdir, "final.out.txt"), sep="\t") # df_p = df_p.join(m2f) @@ -585,15 +586,13 @@ def __init__(self, outdir): self.result = pd.read_table( os.path.join(outdir, "final.out.txt"), comment="#", index_col=0 ) - self.correlation = self.result.loc[ - :, self.result.columns.str.contains("correlation") - ] + self.correlation = self.result.loc[:, self.result.columns.str.contains("corr")] self.percent_match = self.result.loc[ :, self.result.columns.str.contains("% with motif") ] self.result = self.result.loc[ :, - ~self.result.columns.str.contains("correlation") + ~self.result.columns.str.contains("corr") & ~self.result.columns.str.contains("% with motif"), ] diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index bb5caa6f..381c887a 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -403,6 +403,7 @@ def _circle( self.data.loc[subslice].index, self.data.loc[subslice].select_dtypes(exclude=["object"]).columns, ] + idx = self._current_index(subslice) self.circle_styles = self.circle_styles or [] circle_id = len(self.circle_styles) + 1 @@ -442,8 +443,8 @@ def _circle( if vmax is None else vmax * 1.01 ) - text = self.display_data.loc[subslice].astype(str) if show_text else "" - self.display_data.loc[subslice] = ( + text = self.display_data.iloc[idx].astype(str) if show_text else "" + self.display_data.iloc[idx] = ( f"
" + text + "
" ) @@ -889,10 +890,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): # Columns with maelstrom rank aggregation value value_cols = df.columns[ - ~df.columns.str.contains("correlation") & ~df.columns.isin(["% with motif"]) + ~df.columns.str.contains("corr") & ~df.columns.isin(["% with motif"]) ] # Columns with correlation values - corr_cols = df.columns[df.columns.str.contains("correlation")] + corr_cols = df.columns[df.columns.str.contains("corr")] df = df[np.any(abs(df[value_cols]) >= threshold, 1)] @@ -921,6 +922,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): .set_table_attributes('class="sortable-theme-slick" data-sortable') .align(subset=list(value_cols), location="center") .set_font("Nunito Sans") + .wrap() .rename(columns=rename_columns) ) From 55550992490ca8a6a0a46a27316b74f252371c1c Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 27 Jul 2020 11:12:05 +0200 Subject: [PATCH 72/85] fix '% with motif' column --- gimmemotifs/report.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 381c887a..87bc8398 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -922,7 +922,6 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): .set_table_attributes('class="sortable-theme-slick" data-sortable') .align(subset=list(value_cols), location="center") .set_font("Nunito Sans") - .wrap() .rename(columns=rename_columns) ) @@ -950,7 +949,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): .to_precision_str(subset=["% with motif"]) ) - df_styled = df_styled.render() + df_styled = df_styled.wrap().render() with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f: f.write(df_styled) From b8cb1059c9e3178a96bf377b911d8a7c3a9ee002 Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Fri, 21 Aug 2020 11:57:46 +0200 Subject: [PATCH 73/85] fix another issue with numeric chrom 
names --- scripts/combine_peaks | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/combine_peaks b/scripts/combine_peaks index 155d6e83..3d7c0e49 100644 --- a/scripts/combine_peaks +++ b/scripts/combine_peaks @@ -46,7 +46,7 @@ def read_peak_file_to_df(fname): "qval", "peak", ] - df = pd.read_table(fname, names=header) + df = pd.read_table(fname, names=header, dtype={"chrom": "str"}) df["chrom"] = df["chrom"].astype(str) # get the summit @@ -57,7 +57,7 @@ def read_peak_file_to_df(fname): df["value"] = df["qval"] df = df[summit_header] elif ftype == "bed": - df = pd.read_table(fname, names=summit_header) + df = pd.read_table(fname, names=summit_header, dtype={"chrom": "str"}) if ((df["end"] - df["start"]) != 1).sum() != 0: raise ValueError(f"{fname} does not contain summits.") else: From baf48207a7126c79d98adaaad78865af9a637ee0 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 7 Sep 2020 19:40:58 +0200 Subject: [PATCH 74/85] initial support for multispecies maelstrom --- gimmemotifs/utils.py | 61 ++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 27d7bed3..23747a88 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -23,7 +23,7 @@ from scipy import special import numpy as np import pybedtools -from genomepy import Genome +from genomepy import Genome, list_installed_genomes # gimme imports @@ -470,7 +470,7 @@ def get_seqs_type(seqs): - region file - BED file """ - region_p = re.compile(r"^(.+):(\d+)-(\d+)$") + region_p = re.compile(r"^([^\s:]+\@)?(.+):(\d+)-(\d+)$") if isinstance(seqs, Fasta): return "fasta" elif isinstance(seqs, list) or isinstance(seqs, np.ndarray): @@ -496,24 +496,51 @@ def get_seqs_type(seqs): raise ValueError("unknown type {}".format(type(seqs).__name__)) -def as_fasta(seqs, genome=None): - ftype = get_seqs_type(seqs) +def as_fasta(input_seqs, genome=None): + ftype = 
get_seqs_type(input_seqs) if ftype == "fasta": - return seqs + return input_seqs elif ftype == "fastafile": - return Fasta(seqs) + return Fasta(input_seqs) else: - if genome is None: - raise ValueError("need genome to convert to FASTA") - - tmpfa = NamedTemporaryFile() - if isinstance(genome, str): - genome = Genome(genome) - - if isinstance(seqs, np.ndarray): - seqs = list(seqs) - genome.track2fasta(seqs, tmpfa.name) - return Fasta(tmpfa.name) + if isinstance(input_seqs, np.ndarray): + seqs = list(input_seqs) + + genomic_regions = {} + if "@" in input_seqs[0]: + available = list_installed_genomes() + for seq in input_seqs: + genome, region = seq.split("@") + if genome not in genomic_regions: + if genome not in available: + raise ValueError(f"genome {genome} is not installed!") + genomic_regions[genome] = [] + genomic_regions[genome].append(region) + else: + if genome is None: + raise ValueError("need genome to convert to FASTA") + genomic_regions[genome] = input_seqs + + tmpfa = NamedTemporaryFile(mode="w") + for genome, regions in genomic_regions.items(): + + if isinstance(genome, str): + genome = Genome(genome) + + tmpfa2 = NamedTemporaryFile() + genome.track2fasta(regions, tmpfa2.name) + + fa = Fasta(tmpfa2.name) + for name, seq in fa.items(): + print(f">{genome.name}@{name}\n{fa._format_seq(seq)}", file=tmpfa) + tmpfa.flush() + + # Open tempfile and restore original sequence order + fa = Fasta(tmpfa.name) + seqs = [fa[region] for region in input_seqs] + fa.ids = input_seqs[:] + fa.seqs = seqs[:] + return fa def file_checksum(fname): From 4385c11794ca1737af02aad3b7f95b5f63249855 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 9 Sep 2020 11:29:38 +0200 Subject: [PATCH 75/85] % with motif per category in maelstrom report --- data/templates/sortable/sortable.min.js | 2 +- gimmemotifs/maelstrom.py | 8 +++++++- gimmemotifs/report.py | 26 +++++++++++++------------ 3 files changed, 22 insertions(+), 14 deletions(-) diff --git 
a/data/templates/sortable/sortable.min.js b/data/templates/sortable/sortable.min.js index b968cf01..11d419dd 100644 --- a/data/templates/sortable/sortable.min.js +++ b/data/templates/sortable/sortable.min.js @@ -1,2 +1,2 @@ /*! sortable.js 0.8.0 */ -(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(-?[£$¤]?[\d,.e\-]+%?|inf)$/,g=/^\s+|\s+$/g,c=["click"],f="ontouchstart"in document.documentElement,f&&c.push("touchstart"),b=function(a,b,c){return null!=a.addEventListener?a.addEventListener(b,c,!1):a.attachEvent("on"+b,c)},e={init:function(b){var c,d,f,g,h;for(null==b&&(b={}),null==b.selector&&(b.selector=a),d=document.querySelectorAll(b.selector),h=[],f=0,g=d.length;g>f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return 
a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this); +(function(){var a,b,c,d,e,f,g;a="table[data-sortable]",d=/^(.+\>)?f;f++)c=d[f],h.push(e.initTable(c));return h},initTable:function(a){var b,c,d,f,g,h;if(1===(null!=(h=a.tHead)?h.rows.length:void 
0)&&"true"!==a.getAttribute("data-sortable-initialized")){for(a.setAttribute("data-sortable-initialized","true"),d=a.querySelectorAll("th"),b=f=0,g=d.length;g>f;b=++f)c=d[b],"false"!==c.getAttribute("data-sortable")&&e.setupClickableTH(a,c,b);return a}},setupClickableTH:function(a,d,f){var g,h,i,j,k,l;for(i=e.getColumnType(a,f),h=function(b){var c,g,h,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D;if(b.handled===!0)return!1;for(b.handled=!0,m="true"===this.getAttribute("data-sorted"),n=this.getAttribute("data-sorted-direction"),h=m?"ascending"===n?"descending":"ascending":i.defaultSortDirection,p=this.parentNode.querySelectorAll("th"),s=0,w=p.length;w>s;s++)d=p[s],d.setAttribute("data-sorted","false"),d.removeAttribute("data-sorted-direction");if(this.setAttribute("data-sorted","true"),this.setAttribute("data-sorted-direction",h),o=a.tBodies[0],l=[],m){for(D=o.rows,v=0,z=D.length;z>v;v++)g=D[v],l.push(g);for(l.reverse(),B=0,A=l.length;A>B;B++)k=l[B],o.appendChild(k)}else{for(r=null!=i.compare?i.compare:function(a,b){return b-a},c=function(a,b){return a[0]===b[0]?a[2]-b[2]:i.reverse?r(b[0],a[0]):r(a[0],b[0])},C=o.rows,j=t=0,x=C.length;x>t;j=++t)k=C[j],q=e.getNodeValue(k.cells[f]),null!=i.comparator&&(q=i.comparator(q)),l.push([q,k,j]);for(l.sort(c),u=0,y=l.length;y>u;u++)k=l[u],o.appendChild(k[1])}return"function"==typeof window.CustomEvent&&"function"==typeof a.dispatchEvent?a.dispatchEvent(new CustomEvent("Sortable.sorted",{bubbles:!0})):void 0},l=[],j=0,k=c.length;k>j;j++)g=c[j],l.push(b(d,g,h));return l},getColumnType:function(a,b){var c,d,f,g,h,i,j,k,l,m,n;if(d=null!=(l=a.querySelectorAll("th")[b])?l.getAttribute("data-sortable-type"):void 0,null!=d)return e.typesObject[d];for(m=a.tBodies[0].rows,h=0,j=m.length;j>h;h++)for(c=m[h],f=e.getNodeValue(c.cells[b]),n=e.types,i=0,k=n.length;k>i;i++)if(g=n[i],g.match(f))return g;return e.typesObject.alpha},getNodeValue:function(a){var b;return a?(b=a.getAttribute("data-value"),null!==b?b:"undefined"!=typeof 
a.innerText?a.innerText.replace(g,""):a.textContent.replace(g,"")):""},setupTypes:function(a){var b,c,d,f;for(e.types=a,e.typesObject={},f=[],c=0,d=a.length;d>c;c++)b=a[c],f.push(e.typesObject[b.name]=b);return f}},e.setupTypes([{name:"numeric",defaultSortDirection:"descending",match:function(a){return a.match(d)},comparator:function(a){if(a=="inf"){return Infinity}else{return parseFloat(a.replace(/^[^0-9\-]+/g,""),10)||0}}},{name:"date",defaultSortDirection:"ascending",reverse:!0,match:function(a){return!isNaN(Date.parse(a))},comparator:function(a){return Date.parse(a)||0}},{name:"alpha",defaultSortDirection:"ascending",match:function(){return!0},compare:function(a,b){return a.localeCompare(b)}}]),setTimeout(e.init,0),"function"==typeof define&&define.amd?define(function(){return e}):"undefined"!=typeof exports?module.exports=e:window.Sortable=e}).call(this); diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index 95f68c72..f257bdf7 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -509,7 +509,13 @@ def run_maelstrom( df_p = df_rank_aggregation(df, dfs, exps, method=aggregation) # Add percentage of input sequences with motif - df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100 + if df.shape[1] > 1: + df_p["% with motif"] = counts[df_p.index].sum(0) / df.shape[0] * 100 + else: + bla = counts.join(df).groupby(df.columns[0]).mean() * 100 + bla = bla.T + bla = bla.rename(columns={col: f"{col} % with motif" for col in bla.columns}) + df_p = df_p.join(bla) if df.shape[1] > 1: # Add correlation between motif score and signal diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 87bc8398..7a77d51d 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -890,7 +890,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): # Columns with maelstrom rank aggregation value value_cols = df.columns[ - ~df.columns.str.contains("corr") & ~df.columns.isin(["% with motif"]) + 
~df.columns.str.contains("corr") & ~df.columns.str.contains("% with motif") ] # Columns with correlation values corr_cols = df.columns[df.columns.str.contains("corr")] @@ -907,8 +907,9 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) rename_columns = {"factors": FACTOR_TOOLTIP} - if "% with motif" in df.columns: - df["% with motif"] = df["% with motif"].astype(int) + for col in df.columns: + if "% with motif" in col: + df[col] = df[col].astype(int) df_styled = ( ExtraStyler(df) @@ -938,16 +939,17 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): ) ) - if "% with motif" in df.columns: - df_styled = ( - df_styled.add_circle( - subset=["% with motif"], cmap="Purples", vmax=100, size=40 + for col in df.columns: + if "% with motif" in col: + df_styled = ( + df_styled.add_circle( + subset=[col], cmap="Purples", vmax=100, size=40 + ) + .wrap(subset=[col]) + .align(subset=[col], location="center") + .border(subset=[col], location="left") + .to_precision_str(subset=[col]) ) - .wrap(subset=["% with motif"]) - .align(subset=["% with motif"], location="center") - .border(subset=["% with motif"], location="left") - .to_precision_str(subset=["% with motif"]) - ) df_styled = df_styled.wrap().render() From 88783d9232fb600217ab8a218359ce19242dd781 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Wed, 9 Sep 2020 17:03:03 +0200 Subject: [PATCH 76/85] * Safe motif names (fixes #135) * Multiples species in maelstrom now supported (#141) --- gimmemotifs/commands/motifs.py | 4 +- gimmemotifs/fasta.py | 6 +- gimmemotifs/maelstrom.py | 4 +- gimmemotifs/report.py | 8 +- gimmemotifs/utils.py | 228 +++++++++++++++++++++++++++------ 5 files changed, 201 insertions(+), 49 deletions(-) diff --git a/gimmemotifs/commands/motifs.py b/gimmemotifs/commands/motifs.py index c4489d43..f5b72737 100755 --- a/gimmemotifs/commands/motifs.py +++ b/gimmemotifs/commands/motifs.py @@ 
-7,6 +7,7 @@ """Command line function 'roc'.""" from __future__ import print_function import os +import re import sys import shutil import logging @@ -246,10 +247,11 @@ def motifs(args): with NamedTemporaryFile(mode="w") as f: print(motif_dict[motif].to_pwm(), file=f) f.flush() + safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif) scan_to_file( sample, f.name, - filepath_or_buffer=os.path.join(scan_dir, f"{motif}.matches.bed"), + filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"), bed=True, fpr=0.01, genome=args.genome, diff --git a/gimmemotifs/fasta.py b/gimmemotifs/fasta.py index 6cc1d5d4..781d1cf0 100644 --- a/gimmemotifs/fasta.py +++ b/gimmemotifs/fasta.py @@ -12,7 +12,7 @@ class Fasta(object): - def __init__(self, fname=None, split_whitespace=False): + def __init__(self, fname=None, split_whitespace=False, fdict=None): """ Instantiate fasta object. Optional Fasta-formatted file as argument""" self.ids = [] self.seqs = [] @@ -35,6 +35,10 @@ def __init__(self, fname=None, split_whitespace=False): if p.match(sequence): raise IOError("Not a valid FASTA file") self.seqs.append(sequence) + elif fdict is not None: + for name, seq in fdict.items(): + self.ids.append(name) + self.seqs.append(seq) def hardmask(self): """ Mask all lowercase nucleotides with N's """ diff --git a/gimmemotifs/maelstrom.py b/gimmemotifs/maelstrom.py index f257bdf7..fbe46b03 100644 --- a/gimmemotifs/maelstrom.py +++ b/gimmemotifs/maelstrom.py @@ -514,7 +514,9 @@ def run_maelstrom( else: bla = counts.join(df).groupby(df.columns[0]).mean() * 100 bla = bla.T - bla = bla.rename(columns={col: f"{col} % with motif" for col in bla.columns}) + bla = bla.rename( + columns={col: f"{col} % with motif" for col in bla.columns} + ) df_p = df_p.join(bla) if df.shape[1] > 1: diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index 7a77d51d..d0cdcfa8 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -871,7 +871,7 @@ def motif_to_img_series(series, pfmfile=None, 
motifs=None, outdir=".", subdir="l for motif in series: if motif not in motifs: raise ValueError(f"Motif {motif} does not occur in motif database") - fname = subdir + "/{}.png".format(re.sub("[()/]", "_", motif)) + fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)) if not os.path.exists(fname): motifs[motif].plot_logo(fname=os.path.join(outdir, fname)) img_series.append(fname) @@ -942,9 +942,7 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): for col in df.columns: if "% with motif" in col: df_styled = ( - df_styled.add_circle( - subset=[col], cmap="Purples", vmax=100, size=40 - ) + df_styled.add_circle(subset=[col], cmap="Purples", vmax=100, size=40) .wrap(subset=[col]) .align(subset=[col], location="center") .border(subset=[col], location="left") @@ -994,7 +992,7 @@ def roc_html_report( if link_matches: df["# matches"] = ( "
" + df["# matches"].astype(str) + "" diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 23747a88..fd2ca25b 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -15,15 +15,19 @@ import random import tempfile import requests +from io import TextIOWrapper +from functools import singledispatch from subprocess import Popen from tempfile import NamedTemporaryFile from shutil import copyfile # External imports +import pyfaidx from scipy import special import numpy as np import pybedtools -from genomepy import Genome, list_installed_genomes +from genomepy import Genome +from Bio.SeqIO.FastaIO import SimpleFastaParser # gimme imports @@ -496,51 +500,193 @@ def get_seqs_type(seqs): raise ValueError("unknown type {}".format(type(seqs).__name__)) -def as_fasta(input_seqs, genome=None): - ftype = get_seqs_type(input_seqs) - if ftype == "fasta": - return input_seqs - elif ftype == "fastafile": - return Fasta(input_seqs) - else: - if isinstance(input_seqs, np.ndarray): - seqs = list(input_seqs) - - genomic_regions = {} - if "@" in input_seqs[0]: - available = list_installed_genomes() - for seq in input_seqs: - genome, region = seq.split("@") - if genome not in genomic_regions: - if genome not in available: - raise ValueError(f"genome {genome} is not installed!") - genomic_regions[genome] = [] - genomic_regions[genome].append(region) - else: - if genome is None: - raise ValueError("need genome to convert to FASTA") - genomic_regions[genome] = input_seqs +# Regular expression to check for region (chr:start-end or genome@chr:start-end) +region_p = re.compile(r"^[^@]+@([^\s]+):(\d+)-(\d+)$") - tmpfa = NamedTemporaryFile(mode="w") - for genome, regions in genomic_regions.items(): - if isinstance(genome, str): - genome = Genome(genome) +def _check_minsize(fa, minsize): + """ + Raise ValueError if there is any sequence that is shorter than minsize. + If minsize is None the size will not be checked. 
+ """ + if minsize is None: + return fa - tmpfa2 = NamedTemporaryFile() - genome.track2fasta(regions, tmpfa2.name) + for name, seq in fa.items(): + if len(seq) < minsize: + raise ValueError(f"sequence {name} is shorter than {minsize}") - fa = Fasta(tmpfa2.name) - for name, seq in fa.items(): - print(f">{genome.name}@{name}\n{fa._format_seq(seq)}", file=tmpfa) - tmpfa.flush() + return fa - # Open tempfile and restore original sequence order - fa = Fasta(tmpfa.name) - seqs = [fa[region] for region in input_seqs] - fa.ids = input_seqs[:] - fa.seqs = seqs[:] - return fa + +def _genomepy_convert(to_convert, genome, minsize=None): + """ + Convert a variety of inputs using track2fasta(). + """ + if genome is None: + raise ValueError("input file is not a FASTA file, need a genome!") + + g = Genome(genome) + tmpfile = NamedTemporaryFile() + g.track2fasta(to_convert, tmpfile.name) + + fa = as_seqdict(tmpfile.name) + return _check_minsize(fa, minsize) + + +def _as_seqdict_genome_regions(regions, minsize=None): + """ + Accepts list of regions where the genome is encoded in the region, + using the genome@chrom:start-end format. + """ + genomic_regions = {} + for region in regions: + genome, region = region.split("@") + if genome not in genomic_regions: + Genome(genome) + genomic_regions[genome] = [] + genomic_regions[genome].append(region) + + tmpfa = NamedTemporaryFile(mode="w", delete=False) + for genome, g_regions in genomic_regions.items(): + g = Genome(genome) + + fa = g.track2fasta(g_regions) + + for seq in fa: + seq.name = f"{genome}@{seq.name}" + print(seq.__repr__(), file=tmpfa) + + tmpfa.flush() + + # Open tempfile and restore original sequence order + fa = as_seqdict(tmpfa.name) + fa = {region: fa[region] for region in regions} + return _check_minsize(fa, minsize) + + +@singledispatch +def as_seqdict(to_convert, genome=None, minsize=None): + """ + Convert input to a dictionary with name as key and sequence as value. 
+ + If the input contains genomic coordinates, the genome needs to be + specified. If minsize is specified all sequences will be checked if they + are not shorter than minsize. If regions (or a region file) are used as + the input, the genome can optionally be specified in the region using the + following format: genome@chrom:start-end. + + Current supported input types include: + * FASTA, BED and region files. + * List or numpy.ndarray of regions. + * pyfaidx.Fasta object. + * pybedtools.BedTool object. + + Parameters + ---------- + to_convert : list, str, pyfaidx.Fasta or pybedtools.BedTool + Input to convert to FASTA-like dictionary + + genome : str, optional + Genomepy genome name. + + minsize : int or None, optional + If specified, check if all sequences have at least size minsize. + + Returns + ------- + dict with sequence names as key and sequences as value. + """ + raise NotImplementedError(f"Not implement for {type(to_convert)}") + + +@as_seqdict.register(list) +def _as_seqdict_list(to_convert, genome=None, minsize=None): + """ + Accepts list of regions as input. + """ + if region_p.match(to_convert[0]): + return _as_seqdict_genome_regions(to_convert, minsize) + + return _genomepy_convert(to_convert, genome, minsize) + + +@as_seqdict.register(TextIOWrapper) +def _as_seqdict_file_object(to_convert, genome=None, minsize=None): + """ + Accepts file object as input, should be a FASTA file. + """ + fa = {x: y for x, y in SimpleFastaParser(to_convert)} + return _check_minsize(fa, minsize) + + +@as_seqdict.register(str) +def _as_seqdict_filename(to_convert, genome=None, minsize=None): + """ + Accepts filename as input. 
+ """ + if not os.path.exists(to_convert): + raise ValueError("Assuming filename, but it does not exist") + + f = open(to_convert) + fa = as_seqdict(f) + + if any(fa): + return _check_minsize(fa, minsize) + + with open(to_convert) as f: + line = "" + while True: + line = f.readline() + if line == "": + break + if not line.startswith("#"): + break + + if line == "": + raise IOError(f"empty file {to_convert}") + + if region_p.match(line.strip()): + regions = [l.strip() for l in [line] + f.readlines()] + return _as_seqdict_genome_regions(regions, minsize=None) + + # Biopython parser resulted in empty dict + # Assuming it's a BED or region file + return _genomepy_convert(to_convert, genome, minsize) + + +@as_seqdict.register(pyfaidx.Fasta) +def _as_seqdict_pyfaidx(to_convert, genome=None, minsize=None): + """ + Accepts pyfaidx.Fasta object as input. + """ + fa = {k: str(v) for k, v in to_convert.items()} + return _check_minsize(fa, minsize) + + +@as_seqdict.register(pybedtools.BedTool) +def _as_seqdict_bedtool(to_convert, genome=None, minsize=None): + """ + Accepts pybedtools.BedTool as input. + """ + return _genomepy_convert( + ["{}:{}-{}".format(*f[:3]) for f in to_convert], genome, minsize + ) + + +@as_seqdict.register(np.ndarray) +def _as_seqdict_array(to_convert, genome=None, minsize=None): + """ + Accepts numpy.ndarray with regions as input. 
+ """ + return as_seqdict(list(to_convert), genome, minsize) + + +def as_fasta(to_convert, genome=None, minsize=None): + if isinstance(to_convert, Fasta): + return to_convert + + return Fasta(fdict=as_seqdict(to_convert, genome, minsize)) def file_checksum(fname): From 3c96fda3e6cbd4516065144c0bc033f76c9888e5 Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Fri, 25 Sep 2020 15:33:31 +0200 Subject: [PATCH 77/85] "easier" imports --- gimmemotifs/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py index 3d63dd7f..c7dafa84 100644 --- a/gimmemotifs/__init__.py +++ b/gimmemotifs/__init__.py @@ -52,3 +52,23 @@ def filter(self, record): __version__ = get_versions()["version"] del get_versions + +# easier import of gimme (config and cli left out) +from . import background +from . import cluster +from . import comparison +from . import denovo +from . import fasta +from . import maelstrom +from . import moap +from . import motif +from . import plot +from . import prediction +from . import rank +from . import report +from . import rocmetrics +from . import scanner +from . import shutils +from . import stats +from . import utils +from . import validation From 389549ecddbdb4980742fd1f88944c35a3a45a1a Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Fri, 25 Sep 2020 16:05:11 +0200 Subject: [PATCH 78/85] fix flake8 --- gimmemotifs/__init__.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py index c7dafa84..c34b617e 100644 --- a/gimmemotifs/__init__.py +++ b/gimmemotifs/__init__.py @@ -54,21 +54,21 @@ def filter(self, record): del get_versions # easier import of gimme (config and cli left out) -from . import background -from . import cluster -from . import comparison -from . import denovo -from . import fasta -from . import maelstrom -from . import moap -from . import motif -from . 
import plot -from . import prediction -from . import rank -from . import report -from . import rocmetrics -from . import scanner -from . import shutils -from . import stats -from . import utils -from . import validation +from . import background # noqa: F401 +from . import cluster # noqa: F401 +from . import comparison # noqa: F401 +from . import denovo # noqa: F401 +from . import fasta # noqa: F401 +from . import maelstrom # noqa: F401 +from . import moap # noqa: F401 +from . import motif # noqa: F401 +from . import plot # noqa: F401 +from . import prediction # noqa: F401 +from . import rank # noqa: F401 +from . import report # noqa: F401 +from . import rocmetrics # noqa: F401 +from . import scanner # noqa: F401 +from . import shutils # noqa: F401 +from . import stats # noqa: F401 +from . import utils # noqa: F401 +from . import validation # noqa: F401 From 5367259e75e1436a00690e92577431be29656431 Mon Sep 17 00:00:00 2001 From: Maarten-vd-Sande Date: Fri, 25 Sep 2020 16:15:07 +0200 Subject: [PATCH 79/85] ignore indentation for black --- gimmemotifs/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gimmemotifs/__init__.py b/gimmemotifs/__init__.py index c34b617e..fcd514c2 100644 --- a/gimmemotifs/__init__.py +++ b/gimmemotifs/__init__.py @@ -53,6 +53,7 @@ def filter(self, record): __version__ = get_versions()["version"] del get_versions +# fmt: off # easier import of gimme (config and cli left out) from . import background # noqa: F401 from . import cluster # noqa: F401 @@ -72,3 +73,4 @@ def filter(self, record): from . import stats # noqa: F401 from . import utils # noqa: F401 from . 
import validation # noqa: F401 +# fmt: on From a8181d8ed97bc3e49a067058f2da2e18e7d570b0 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 28 Sep 2020 15:00:17 +0200 Subject: [PATCH 80/85] new black version --- gimmemotifs/comparison.py | 6 +- gimmemotifs/conversion.tryout.py | 103 +++++++++++++++++++++++++++++++ gimmemotifs/plot.py | 3 +- gimmemotifs/rank.py | 20 +++--- gimmemotifs/report.py | 10 ++- gimmemotifs/scanner.py | 3 +- gimmemotifs/utils.py | 9 ++- 7 files changed, 132 insertions(+), 22 deletions(-) create mode 100644 gimmemotifs/conversion.tryout.py diff --git a/gimmemotifs/comparison.py b/gimmemotifs/comparison.py index f964205d..911cd9d6 100644 --- a/gimmemotifs/comparison.py +++ b/gimmemotifs/comparison.py @@ -950,7 +950,11 @@ def select_nonredundant_motifs( y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0]))) X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.4, random_state=2, shuffle=True, + X, + y, + test_size=0.4, + random_state=2, + shuffle=True, ) X_bla = X_train[keep] diff --git a/gimmemotifs/conversion.tryout.py b/gimmemotifs/conversion.tryout.py new file mode 100644 index 00000000..ed2a1487 --- /dev/null +++ b/gimmemotifs/conversion.tryout.py @@ -0,0 +1,103 @@ +# import mygene +import pandas as pd +import pybedtools +from genomepy import Genome +import sys +from gimmemotifs.fasta import Fasta + +# mg = mygene.MyGeneInfo() + +# xli = ["gata3"] + +# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all") + +# for hit in out: +# print(hit) +# # if "genomic_pos" in hit: +# # print("{}:{}-{}\t{}".format( +# # hit["genomic_pos"]["chr"], +# # hit["genomic_pos"]["start"], +# # hit["genomic_pos"]["end"], +# # hit["query"], +# # )) + +# sys.exit() +from functools import singledispatch + + +@singledispatch +def scan(obj): + # default implementation + raise NotImplementedError(f"Not implemented for {type(obj)}") + + +@scan.register(pd.DataFrame) +def _scan_dataframe(df, 
columns=["chrom", "start", "end"], genome="hg38"): + if not set(columns).issubset(df.columns): + raise ValueError(f"Expected columns {columns}") + + if len(columns) == 3: + # Assume this is chromosome start, end + g = Genome(genome) + seqs = list( + ( + df[columns[0]] + + ":" + + df[columns[1]].astype(str) + + "-" + + df[columns[2]].astype(str) + ).values + ) + return g.track2fasta(seqs) + elif len(columns) == 1: + # Assume this is some kind of gene_id + return df[columns[0]].values + + +# @scan.register(pybedtools.BedTool) +# @profile +def _scan_bedtool(bed, genome="hg38"): + g = Genome(genome) + intervals = [g[f.chrom][f.start : f.stop] for f in bed] + return intervals + + +# @profile +def _scan_bedtool2(bed, genome="hg38"): + g = Genome(genome) + return Fasta(bed.sequence(fi=g.filename).seqfn).seqs + + +import requests + +rest_url = "https://rest.ensembl.org/info/species" +r = requests.get(rest_url, headers={"Content-Type": "application/json"}) + +if not r.ok: + r.raise_for_status() + +json = r.json() + +print(json) + + +# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822, +# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]}) +# b = pybedtools.BedTool("5k.bed") +# # for f in b: +# # print(f) +# # break +# seqs = _scan_bedtool(b, genome="Spur_3.1") +# seqs = _scan_bedtool2(b, genome="Spur_3.1") + +# g = Genome("hg19") +# print(g["chr1"][1000000:1000100]) + +# FASTA file +# BED file +# region file +# Gene file +# - promoter (all species) +# - closest accessible region (human) +# - sum / mean / max of regions within distance of promoter +# - sum / mean / max of regions within weighted distance of promoter diff --git a/gimmemotifs/plot.py b/gimmemotifs/plot.py index 6703b6e4..d27f9d21 100644 --- a/gimmemotifs/plot.py +++ b/gimmemotifs/plot.py @@ -32,8 +32,7 @@ def axes_off(ax): - """Get rid of all axis ticks, lines, etc. 
- """ + """Get rid of all axis ticks, lines, etc.""" ax.set_frame_on(False) ax.axes.get_yaxis().set_visible(False) ax.axes.get_xaxis().set_visible(False) diff --git a/gimmemotifs/rank.py b/gimmemotifs/rank.py index 1a8cfde6..5ae37690 100644 --- a/gimmemotifs/rank.py +++ b/gimmemotifs/rank.py @@ -81,16 +81,16 @@ def qStuart(r): def _rank_int(series, c=3.0 / 8, stochastic=True): # Based on code by Edward Mountjoy # See: https://github.com/edm1/rank-based-INT - """ Perform rank-based inverse normal transformation on pandas series. - If stochastic is True ties are given rank randomly, otherwise ties will - share the same value. NaN values are ignored. - Args: - param1 (pandas.Series): Series of values to transform - param2 (Optional[float]): Constand parameter (Bloms constant) - param3 (Optional[bool]): Whether to randomise rank of ties - - Returns: - pandas.Series + """Perform rank-based inverse normal transformation on pandas series. + If stochastic is True ties are given rank randomly, otherwise ties will + share the same value. NaN values are ignored. 
+ Args: + param1 (pandas.Series): Series of values to transform + param2 (Optional[float]): Constand parameter (Bloms constant) + param3 (Optional[bool]): Whether to randomise rank of ties + + Returns: + pandas.Series """ # Check input diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index d0cdcfa8..cb03e713 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -914,7 +914,10 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): df_styled = ( ExtraStyler(df) .set_precision(2) - .convert_to_image(subset=["logo"], height=30,) + .convert_to_image( + subset=["logo"], + height=30, + ) .scaled_background_gradient( subset=value_cols, center_zero=True, low=1 / 1.75, high=1 / 1.75 ) @@ -1033,7 +1036,10 @@ def roc_html_report( if df.shape[0] > 0: f.write( ExtraStyler(df) - .convert_to_image(subset=["logo"], height=30,) + .convert_to_image( + subset=["logo"], + height=30, + ) .add_circle( subset=["% matches input", "%matches background"], vmax=100, diff --git a/gimmemotifs/scanner.py b/gimmemotifs/scanner.py index cfccc415..8397fb8f 100644 --- a/gimmemotifs/scanner.py +++ b/gimmemotifs/scanner.py @@ -250,8 +250,7 @@ def scan_to_file( zscore=True, gcnorm=True, ): - """Scan an inputfile with motifs. - """ + """Scan an inputfile with motifs.""" should_close = False if filepath_or_buffer is None: fo = sys.stdout diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index fd2ca25b..b6a9a336 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -50,8 +50,7 @@ def rc(seq): def narrowpeak_to_bed(inputfile, bedfile, size=0): - """Convert narrowPeak file to BED file. 
- """ + """Convert narrowPeak file to BED file.""" p = re.compile(r"^(#|track|browser)") warn_no_summit = True with open(bedfile, "w") as f_out: @@ -133,7 +132,7 @@ def phyper_single(k, good, bad, N): def phyper(k, good, bad, N): - """ Current hypergeometric implementation in scipy is broken, + """Current hypergeometric implementation in scipy is broken, so here's the correct version. """ pvalues = [phyper_single(x, good, bad, N) for x in range(k + 1, N + 1)] @@ -294,8 +293,8 @@ def motif_localization(fastafile, motif, size, outfile, cutoff=0.9): def parse_cutoff(motifs, cutoff, default=0.9): - """ Provide either a file with one cutoff per motif or a single cutoff - returns a hash with motif id as key and cutoff as value + """Provide either a file with one cutoff per motif or a single cutoff + returns a hash with motif id as key and cutoff as value """ cutoffs = {} From fcbf34d0b509ddac35a45d435db045a07f02c4f2 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 28 Sep 2020 15:08:25 +0200 Subject: [PATCH 81/85] fix tests --- gimmemotifs/utils.py | 8 ++++++-- test/test_maelstrom.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index b6a9a336..99c3a281 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -525,7 +525,11 @@ def _genomepy_convert(to_convert, genome, minsize=None): if genome is None: raise ValueError("input file is not a FASTA file, need a genome!") - g = Genome(genome) + if isinstance(genome, Genome): + g = genome + else: + g = Genome(genome) + tmpfile = NamedTemporaryFile() g.track2fasta(to_convert, tmpfile.name) @@ -646,7 +650,7 @@ def _as_seqdict_filename(to_convert, genome=None, minsize=None): raise IOError(f"empty file {to_convert}") if region_p.match(line.strip()): - regions = [l.strip() for l in [line] + f.readlines()] + regions = [myline.strip() for myline in [line] + f.readlines()] return _as_seqdict_genome_regions(regions, minsize=None) # Biopython parser 
resulted in empty dict diff --git a/test/test_maelstrom.py b/test/test_maelstrom.py index 66f602e2..53bffc77 100644 --- a/test/test_maelstrom.py +++ b/test/test_maelstrom.py @@ -34,7 +34,7 @@ def test1_maelstrom(self): df = pd.read_table(self.outfile, index_col=0, comment="#") print(df.shape) - self.assertEquals((623, 5), df.shape) + self.assertEquals((623, 8), df.shape) # Filter redundant motifs run_maelstrom( @@ -48,7 +48,7 @@ def test1_maelstrom(self): ) df = pd.read_table(self.outfile, index_col=0, comment="#") print(df.shape) - self.assertEquals((156, 5), df.shape) + self.assertEquals((156, 8), df.shape) for fname in glob(os.path.join(self.outdir, "activity*")): From 9cb4f6fafd1bb63b730fe4d9042465c4f7bf15e4 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 28 Sep 2020 15:22:52 +0200 Subject: [PATCH 82/85] style --- gimmemotifs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gimmemotifs/utils.py b/gimmemotifs/utils.py index 99c3a281..a1b3c71b 100644 --- a/gimmemotifs/utils.py +++ b/gimmemotifs/utils.py @@ -529,7 +529,7 @@ def _genomepy_convert(to_convert, genome, minsize=None): g = genome else: g = Genome(genome) - + tmpfile = NamedTemporaryFile() g.track2fasta(to_convert, tmpfile.name) From fb638a581253612de1e11c6a7123d05c156651bc Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 28 Sep 2020 15:23:11 +0200 Subject: [PATCH 83/85] fix unnecessary conversion --- gimmemotifs/report.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gimmemotifs/report.py b/gimmemotifs/report.py index cb03e713..e41951dc 100644 --- a/gimmemotifs/report.py +++ b/gimmemotifs/report.py @@ -907,9 +907,6 @@ def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=3): df.insert(0, "factors", motif_to_factor_series(df.index, pfmfile=pfmfile)) rename_columns = {"factors": FACTOR_TOOLTIP} - for col in df.columns: - if "% with motif" in col: - df[col] = df[col].astype(int) df_styled = ( ExtraStyler(df) From 
a84ab48e5cc1b4f9987e7c43f62316da48e3cc4f Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Mon, 28 Sep 2020 15:24:38 +0200 Subject: [PATCH 84/85] remove --- gimmemotifs/conversion.tryout.py | 103 ------------------------------- 1 file changed, 103 deletions(-) delete mode 100644 gimmemotifs/conversion.tryout.py diff --git a/gimmemotifs/conversion.tryout.py b/gimmemotifs/conversion.tryout.py deleted file mode 100644 index ed2a1487..00000000 --- a/gimmemotifs/conversion.tryout.py +++ /dev/null @@ -1,103 +0,0 @@ -# import mygene -import pandas as pd -import pybedtools -from genomepy import Genome -import sys -from gimmemotifs.fasta import Fasta - -# mg = mygene.MyGeneInfo() - -# xli = ["gata3"] - -# out = mg.querymany(xli, scopes="symbol", fields="genomic_pos", species="all") - -# for hit in out: -# print(hit) -# # if "genomic_pos" in hit: -# # print("{}:{}-{}\t{}".format( -# # hit["genomic_pos"]["chr"], -# # hit["genomic_pos"]["start"], -# # hit["genomic_pos"]["end"], -# # hit["query"], -# # )) - -# sys.exit() -from functools import singledispatch - - -@singledispatch -def scan(obj): - # default implementation - raise NotImplementedError(f"Not implemented for {type(obj)}") - - -@scan.register(pd.DataFrame) -def _scan_dataframe(df, columns=["chrom", "start", "end"], genome="hg38"): - if not set(columns).issubset(df.columns): - raise ValueError(f"Expected columns {columns}") - - if len(columns) == 3: - # Assume this is chromosome start, end - g = Genome(genome) - seqs = list( - ( - df[columns[0]] - + ":" - + df[columns[1]].astype(str) - + "-" - + df[columns[2]].astype(str) - ).values - ) - return g.track2fasta(seqs) - elif len(columns) == 1: - # Assume this is some kind of gene_id - return df[columns[0]].values - - -# @scan.register(pybedtools.BedTool) -# @profile -def _scan_bedtool(bed, genome="hg38"): - g = Genome(genome) - intervals = [g[f.chrom][f.start : f.stop] for f in bed] - return intervals - - -# @profile -def _scan_bedtool2(bed, genome="hg38"): - g = 
Genome(genome) - return Fasta(bed.sequence(fi=g.filename).seqfn).seqs - - -import requests - -rest_url = "https://rest.ensembl.org/info/species" -r = requests.get(rest_url, headers={"Content-Type": "application/json"}) - -if not r.ok: - r.raise_for_status() - -json = r.json() - -print(json) - - -# #'9:120165822-120173708, 'ensemblgene': 'MGP_SPRETEiJ_G0033934', 'start': 120165822, -# df = pd.DataFrame({"chrom":["chr1", "chr2"], "start":[100, 1000], "end":[200, 200]}) -# b = pybedtools.BedTool("5k.bed") -# # for f in b: -# # print(f) -# # break -# seqs = _scan_bedtool(b, genome="Spur_3.1") -# seqs = _scan_bedtool2(b, genome="Spur_3.1") - -# g = Genome("hg19") -# print(g["chr1"][1000000:1000100]) - -# FASTA file -# BED file -# region file -# Gene file -# - promoter (all species) -# - closest accessible region (human) -# - sum / mean / max of regions within distance of promoter -# - sum / mean / max of regions within weighted distance of promoter From faa31a3872d9d5004a46b1a197e8008df80d3e23 Mon Sep 17 00:00:00 2001 From: Simon van Heeringen Date: Tue, 29 Sep 2020 15:40:25 +0200 Subject: [PATCH 85/85] Update CHANGELOG --- CHANGELOG.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0638e19..a109ed18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,33 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +## [0.15.0] - 2020-09-29 + +### Added + +- Added additional columns to `gimme maelstrom` output for better interpretation (correlation of motif to signal and % of regions with motif). +- Added support for multi-species input in `genome@chrom:start-end` format. +- `gimme maelstrom` warns if data is not row-centered and will center by default. +- `gimme maelstrom` selects a set of non-redundant (or less redundant) motifs by default. +- Added SVR regressor for `gimme maelstrom`. +- Added quantile normalization to `coverage_table`. 
+ +### Removed + +- Removed the lightning classifiers and regressors as the package is no longer actively maintained. + +### Changed + +- Visually improved HTML output. +- Score of `maelstrom` is now an aggregate z-score based on combining z-scores from individual methods using Stouffer's method. The z-scores of individual methods are generated using the inverse normal transform. +- Reorganized some classes and functions. + +### Fixed + +- Fixed minor issues with sorting columns in HTML output. +- `gimme motifs` doesn't crash when no motifs are found. +- Fixed error with Ensembl chromosome names in `combine_peaks`. + ## [0.14.4] - 2020-04-02 ### Fixed