From 295d7b8f86771cd5a05f7ef6c3eb71abb404e7f0 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 11:27:37 +0100 Subject: [PATCH 01/34] Fix tests for old tempfile --- tests/test_plots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_plots.py b/tests/test_plots.py index b1c3abd662..9d8bd43536 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -244,7 +244,7 @@ def test_linegraph_multiple_datasets(): ) @pytest.mark.filterwarnings("ignore:setDaemon") def test_flat_plot(tmp_path, monkeypatch, development, export_plot_formats, export_plots): - monkeypatch.setattr(tempfile, "mkdtemp", lambda: tmp_path) + monkeypatch.setattr(tempfile, "mkdtemp", lambda *args, **kwargs: tmp_path) plot_id = "test_plot" plot = linegraph.plot( From 30e7492f8e8973d3efb2c5f9fbfc85a6fd5ce504 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 11:37:57 +0100 Subject: [PATCH 02/34] Docker build: add workflow_dispatch, do not push on pull_request (#2962) --- .github/workflows/docker.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 02e1bc982c..8f356cd499 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -15,6 +15,7 @@ on: release: # Build release types: [published] + workflow_dispatch: jobs: build: @@ -35,12 +36,14 @@ jobs: - name: "Log in to Docker Hub" uses: docker/login-action@v2 + if: github.event_name != 'pull_request' with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: "Login to GitHub Container Registry" uses: docker/login-action@v2 + if: github.event_name != 'pull_request' with: registry: ghcr.io username: ${{ github.repository_owner }} @@ -48,7 +51,7 @@ jobs: - name: "Build dev" uses: docker/build-push-action@v3 - if: github.event_name != 'release' + if: github.event_name != 'pull_request' && github.event_name != 'release' with: # All available with python:3.X-slim are: # platforms: linux/386,linux/amd64,linux/arm/v5,linux/arm/v7,linux/arm64/v8,linux/ppc64le,linux/s390x @@ -62,7 +65,7 @@ jobs: - name: "Build release" uses: docker/build-push-action@v3 - if: github.event_name == 'release' + if: github.event_name != 'pull_request' && github.event_name == 'release' with: # All available with python:3.X-slim are: # platforms: linux/386,linux/amd64,linux/arm/v5,linux/arm/v7,linux/arm64/v8,linux/ppc64le,linux/s390x From 29c2c1840167d2cb649cf30425501ae66dbb00a3 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 13:40:18 +0100 Subject: [PATCH 03/34] Pin kaleido to 0.2.1 (0.4.1 requires external browser) (#2963) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 08c8b78333..9e4aa705b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dependencies = [ "humanize", "importlib_metadata", "jinja2>=3.0.0", - "kaleido", # for flat plot export + "kaleido==0.2.1", # for flat plot export "markdown", "numpy", "packaging", From b1fa683961dd233ff425751c3578dc322bbe0ca8 Mon Sep 17 00:00:00 2001 From: Peter Pruisscher <57712924+peterpru@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:13:55 +0100 Subject: [PATCH 04/34] Add module Haplocheck (#2933) * placeholder files * save progress * add parsing and color * remove print line * switch to search pattern * remove duplicate html * Unique IDs * Unique IDs, redundant namespace * SP * Update search_patterns.yaml * SP --------- Co-authored-by: Vlad 
Savelyev
---
 multiqc/config_defaults.yaml             |   1 +
 multiqc/modules/haplocheck/__init__.py   |   3 +
 multiqc/modules/haplocheck/haplocheck.py | 110 +++++++++++++++++++++++
 multiqc/search_patterns.yaml             |   3 +
 pyproject.toml                           |   1 +
 5 files changed, 118 insertions(+)
 create mode 100644 multiqc/modules/haplocheck/__init__.py
 create mode 100644 multiqc/modules/haplocheck/haplocheck.py

diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml
index 32857039ea..7fb67ec16c 100644
--- a/multiqc/config_defaults.yaml
+++ b/multiqc/config_defaults.yaml
@@ -406,6 +406,7 @@ module_order:
   - isoseq
   - lima
   - peddy
+  - haplocheck
   - somalier
   - methylqa
   - mosdepth
diff --git a/multiqc/modules/haplocheck/__init__.py b/multiqc/modules/haplocheck/__init__.py
new file mode 100644
index 0000000000..08071d8b4b
--- /dev/null
+++ b/multiqc/modules/haplocheck/__init__.py
@@ -0,0 +1,3 @@
+from .haplocheck import MultiqcModule
+
+__all__ = ["MultiqcModule"]
diff --git a/multiqc/modules/haplocheck/haplocheck.py b/multiqc/modules/haplocheck/haplocheck.py
new file mode 100644
index 0000000000..0f1291e831
--- /dev/null
+++ b/multiqc/modules/haplocheck/haplocheck.py
@@ -0,0 +1,110 @@
+from typing import Dict, Union
+from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import table
+import logging
+
+log = logging.getLogger(__name__)
+
+
+class MultiqcModule(BaseMultiqcModule):
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="Haplocheck",
+            anchor="haplocheck",
+            href="https://github.com/genepi/haplocheck/",
+            info="Haplocheck detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content.",
+            doi=["10.1101/gr.256545.119"],
+        )
+        haplocheck_data: Dict = dict()
+
+        for f in self.find_log_files("haplocheck"):
+            haplocheck_data = self.parse_logs(f)
+
+        haplocheck_data = self.ignore_samples(haplocheck_data)
+
+        if len(haplocheck_data) == 0:
+            raise ModuleNoSamplesFound
+
+        log.info(f"Found {len(haplocheck_data)} Haplocheck reports")
+
+        # Write data to file
+        self.write_data_file(haplocheck_data, "multiqc_haplocheck")
+
+        self.add_software_version(None)
+
+        # Headers dictionary for Contamination Data
+        headers = {
+            "Contamination Status": {
+                "title": "Contamination Status",
+                "description": "Indicates whether contamination was detected in the sample.",
+                "scale": False,
+            },
+            "Contamination Level": {
+                "title": "Contamination Level",
+                "description": "Estimated level of contamination (if applicable).",
+                "min": 0,
+                "scale": "Oranges",
+                "format": "{:,.2f}",
+            },
+            "Distance": {
+                "title": "Distance",
+                "description": "Genomic distance value associated with contamination.",
+                "min": 0,
+                "scale": "Greens",
+                "format": "{:,.0f}",
+            },
+            "Sample Coverage": {
+                "title": "Sample Coverage",
+                "description": "The total coverage of the sample sequence.",
+                "min": 0,
+                "scale": "Blues",
+                "format": "{:,.0f}",
+            },
+        }
+
+        self.add_section(
+            name="Haplocheck",
+            anchor="haplocheck-section",
+            description="Haplocheck detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content.",
+            plot=table.plot(
+                data=haplocheck_data,
+                headers=headers,
+                pconfig={
+                    "id": "haplocheck-table",
+                    "title": "Haplocheck",
+                },
+            ),
+        )
+
+        for header in headers.values():
+            header["hidden"] = True
+        headers["Contamination Status"]["hidden"] = False
+
+        self.general_stats_addcols(haplocheck_data, headers)
+
+    def parse_logs(self, f: str) -> Dict[str, Dict[str, Union[float, str]]]:
+        parsed_data: Dict[str, Dict[str, Union[float, str]]] = {}
+        file_content = f["f"]
+        lines = file_content.strip().splitlines()
+
+        # Assuming the first line contains headers
+        headers = [header.strip().replace('"', "") for header in lines[0].strip().split("\t")[1:]]
+
+        for line in lines[1:]:
+            columns = line.strip().split("\t")
+            sample_name = columns[0]  # First column is the sample name
+            s_name = sample_name.replace("_haplocheck", "").replace('"', "")  # Clean sample name and remove quotes
+            values = columns[1:]
+            self.add_data_source(f, s_name)
+
+            # Map headers to values for this sample
+            parsed_data[s_name] = {
+                key: (
+                    float(value.strip().replace('"', ""))
+                    if value.strip().replace('"', "").replace(".", "", 1).isdigit()
+                    else value.strip().replace('"', "")
+                )
+                for key, value in zip(headers, values)
+            }
+
+        return parsed_data
diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml
index 1cc901a27a..699b618c21 100644
--- a/multiqc/search_patterns.yaml
+++ b/multiqc/search_patterns.yaml
@@ -384,6 +384,9 @@ goleft_indexcov/ped:
   fn: "*-indexcov.ped"
 gopeaks:
   fn: "*_gopeaks.json"
+haplocheck:
+  contents: '"Sample" "Contamination Status" "Contamination Level" "Distance" "Sample Coverage"'
+  num_lines: 10
 happy:
   fn: "*.summary.csv"
   contents: "Type,Filter,TRUTH"
diff --git a/pyproject.toml b/pyproject.toml
index 9e4aa705b5..2acecb4453 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,6 +134,7 @@ gatk = "multiqc.modules.gatk:MultiqcModule"
 glimpse = "multiqc.modules.glimpse:MultiqcModule"
 goleft_indexcov = "multiqc.modules.goleft_indexcov:MultiqcModule"
 gopeaks = "multiqc.modules.gopeaks:MultiqcModule"
+haplocheck = "multiqc.modules.haplocheck:MultiqcModule"
 happy = "multiqc.modules.happy:MultiqcModule"
 hicexplorer = "multiqc.modules.hicexplorer:MultiqcModule"
 hicpro = "multiqc.modules.hicpro:MultiqcModule"

From e99541d5c13c6edfaa52832429f09fd5161cad0b Mon Sep 17 00:00:00 2001
From: Sebastian Schmeier <6385851+sschmeier@users.noreply.github.com>
Date: Mon, 18 Nov 2024 18:43:09 +0100
Subject: [PATCH 05/34] bcl2fastq: fix missing `R1_*`/`R2_*` metrics (#2965)

* fix(bcl2fastq): Fixed the yield vars in the bcl2fastq.py module

* style(bcl2fastq): changed the variable name in function
split_data_by_lane_and_sample from r to r_dict as here it is a dict and
in all other functions r denotes the read_number.
* Rename the rest of r to r_num --------- Co-authored-by: Sebastian Schmeier Co-authored-by: Vlad Savelyev --- multiqc/modules/bcl2fastq/bcl2fastq.py | 72 ++++++++++++++------------ 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/multiqc/modules/bcl2fastq/bcl2fastq.py b/multiqc/modules/bcl2fastq/bcl2fastq.py index 58b321a178..898991b97c 100644 --- a/multiqc/modules/bcl2fastq/bcl2fastq.py +++ b/multiqc/modules/bcl2fastq/bcl2fastq.py @@ -207,10 +207,10 @@ def parse_file_as_json(self, f): } # simplify the population of dictionaries lsample = run_data[lane]["samples"][sample] - for r in range(1, 5): - lsample[f"R{r}_yield"] = 0 - lsample[f"R{r}_Q30"] = 0 - lsample[f"R{r}_trimmed_bases"] = 0 + for r_num in range(1, 5): + lsample[f"R{r_num}_yield"] = 0 + lsample[f"R{r_num}_Q30"] = 0 + lsample[f"R{r_num}_trimmed_bases"] = 0 rlane["total"] += demuxResult["NumberReads"] rlane["total_yield"] += demuxResult["Yield"] lsample["total"] += demuxResult["NumberReads"] @@ -219,20 +219,24 @@ def parse_file_as_json(self, f): rlane["perfectIndex"] += indexMetric["MismatchCounts"]["0"] lsample["perfectIndex"] += indexMetric["MismatchCounts"]["0"] for readMetric in demuxResult.get("ReadMetrics", []): - r = readMetric["ReadNumber"] + r_num = readMetric["ReadNumber"] rlane["yieldQ30"] += readMetric["YieldQ30"] rlane["qscore_sum"] += readMetric["QualityScoreSum"] lsample["yieldQ30"] += readMetric["YieldQ30"] lsample["qscore_sum"] += readMetric["QualityScoreSum"] - lsample[f"R{r}_yield"] += readMetric["Yield"] - lsample[f"R{r}_Q30"] += readMetric["YieldQ30"] - lsample[f"R{r}_trimmed_bases"] += readMetric["TrimmedBases"] + lsample[f"R{r_num}_yield"] += readMetric["Yield"] + lsample[f"R{r_num}_Q30"] += readMetric["YieldQ30"] + lsample[f"R{r_num}_trimmed_bases"] += readMetric["TrimmedBases"] # Remove unpopulated read keys - for r in range(1, 5): - if not lsample[f"R{r}_yield"] and not lsample[f"R{r}_Q30"] and not lsample[f"R{r}_trimmed_bases"]: - lsample.pop(f"R{r}_yield") - lsample.pop(f"R{r}_Q30") - lsample.pop(f"R{r}_trimmed_bases") + for r_num in range(1, 5): + if ( + not lsample[f"R{r_num}_yield"] + and not lsample[f"R{r_num}_Q30"] + and not lsample[f"R{r_num}_trimmed_bases"] + ): + lsample.pop(f"R{r_num}_yield") + lsample.pop(f"R{r_num}_Q30") + lsample.pop(f"R{r_num}_trimmed_bases") undeterminedYieldQ30 = 0 undeterminedQscoreSum = 0 undeterminedTrimmedBases = 0 @@ -282,8 +286,8 @@ def parse_file_as_json(self, f): sample["mean_qscore"] = "NA" def split_data_by_lane_and_sample(self): - for run_id, r in self.bcl2fastq_data.items(): - for lane_id, lane in r.items(): + for run_id, run_dict in self.bcl2fastq_data.items(): + for lane_id, lane in run_dict.items(): uniqLaneName = self.prepend_runid(run_id, lane_id) self.bcl2fastq_bylane[uniqLaneName] = { "total": lane["total"], @@ -319,9 +323,9 @@ def split_data_by_lane_and_sample(self): # Undetermined samples did not have R1 and R2 information for r_num in range(1, 5): try: - s[f"R{r}_yield"] += sample[f"R{r_num}_yield"] - s[f"R{r}_Q30"] += sample[f"R{r_num}_Q30"] - s[f"R{r}_trimmed_bases"] += sample[f"R{r_num}_trimmed_bases"] + s[f"R{r_num}_yield"] += sample[f"R{r_num}_yield"] + s[f"R{r_num}_Q30"] += sample[f"R{r_num}_Q30"] + s[f"R{r_num}_trimmed_bases"] += sample[f"R{r_num}_trimmed_bases"] except KeyError: pass try: @@ -345,7 +349,7 @@ def split_data_by_lane_and_sample(self): for r_num in range(1, 5): try: if ( - not self.bcl2fastq_bysample[sample_id][f"R{r}_yield"] + not self.bcl2fastq_bysample[sample_id][f"R{r_num}_yield"] and not 
self.bcl2fastq_bysample[sample_id][f"R{r_num}_Q30"] and not self.bcl2fastq_bysample[sample_id][f"R{r_num}_trimmed_bases"] ): @@ -359,14 +363,14 @@ def add_general_stats(self): data: Dict[str, Dict[str, ValueT]] = dict() for sample_id, sample in self.bcl2fastq_bysample.items(): percent_R_Q30 = dict() - for r in range(1, 5): + for r_num in range(1, 5): # Zero division is possible try: - percent_R_Q30[r] = "{0:.1f}".format( - float(100.0 * sample["R{}_Q30".format(r)] / sample["R{}_yield".format(r)]) + percent_R_Q30[r_num] = "{0:.1f}".format( + float(100.0 * sample[f"R{r_num}_Q30"] / sample[f"R{r_num}_yield"]) ) except ZeroDivisionError: - percent_R_Q30[r] = "0.0" + percent_R_Q30[r_num] = "0.0" except KeyError: pass try: @@ -379,10 +383,10 @@ def add_general_stats(self): "total": sample["total"], "perfectPercent": perfect_percent, } - for r in range(1, 5): + for r_num in range(1, 5): try: - data[sample_id][f"percent_R{r}_Q30"] = percent_R_Q30[r] - data[sample_id][f"R{r}_trimmed_bases"] = sample[f"R{r}_trimmed_bases"] + data[sample_id][f"percent_R{r_num}_Q30"] = percent_R_Q30[r_num] + data[sample_id][f"R{r_num}_trimmed_bases"] = sample[f"R{r_num}_trimmed_bases"] except KeyError: pass @@ -402,10 +406,10 @@ def add_general_stats(self): }, } # If no data for a column, header will be automatically ignored - for r in range(1, 5): - headers[f"percent_R{r}_Q30"] = { - "title": f"R{r} yield ≥ Q30", - "description": f"Percent of bases in R{r} with a Phred score of 30 or higher", + for r_num in range(1, 5): + headers[f"percent_R{r_num}_Q30"] = { + "title": f"R{r_num} yield ≥ Q30", + "description": f"Percent of bases in R{r_num} with a Phred score of 30 or higher", "scale": "RdYlGn", "max": 100, "min": 0, @@ -420,17 +424,17 @@ def add_general_stats(self): "suffix": "%", } # If no data for a column, header will be automatically ignored - for r in range(1, 5): + for r_num in range(1, 5): hide_col = True for sname in data: try: - if data[sname][f"R{r}_trimmed_bases"] > 0: + if data[sname][f"R{r_num}_trimmed_bases"] > 0: hide_col = False except KeyError: pass try: - headers[f"R{r}_trimmed_bases"] = { - "title": f"{config.base_count_prefix} R{r} trimmed", + headers[f"R{r_num}_trimmed_bases"] = { + "title": f"{config.base_count_prefix} R{r_num} trimmed", "description": f"Number of bases trimmed ({config.base_count_desc})", "scale": "RdYlBu", "modify": lambda x: x * 0.000001, From 9d8241ea0c82afa84dc3beea2afe1bdb5ec89065 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 19:14:07 +0100 Subject: [PATCH 06/34] Docs: remove outdated way to customize cc barplot category colors --- docs/markdown/reports/customisation.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/docs/markdown/reports/customisation.md b/docs/markdown/reports/customisation.md index 08eedcaf23..7bdd4f0cc1 100644 --- a/docs/markdown/reports/customisation.md +++ b/docs/markdown/reports/customisation.md @@ -656,19 +656,6 @@ custom_plot_config: color: "#c3e6c3" ``` -As of version 1.8, this also works for customising the config of bargraph categories: - -```yaml -custom_plot_config: - bowtie1_alignment: - reads_aligned: - color: "#d84e2f" - multimapped: - color: "#f2e63f" - not_aligned: - color: "#8bbc21" -``` - ## Customising tables Much like with the custom plot config above, you can override almost any configuration options for tables. 
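A note on the bcl2fastq renames in PATCH 05 above: changing `r` to `r_num` (and the outer loop variable to `r_dict`) guards against a stale-variable bug, where a dictionary key is built from a variable that is no longer the one being iterated. A minimal, self-contained sketch of that failure mode follows — illustrative only, not MultiQC code, and the metric names are made up:

```python
metrics = {}
for r in range(1, 3):
    metrics[f"R{r}_yield"] = 100 * r  # R1_yield = 100, R2_yield = 200

# In Python, a plain for-loop variable survives the loop: `r` is now 2.
total = 0
for r_num in range(1, 3):
    # Bug: the key uses the stale outer `r` instead of `r_num`,
    # so "R2_yield" is read twice and "R1_yield" is never read.
    total += metrics[f"R{r}_yield"]

print(total)  # 400, not the intended 300
```

Giving each loop variable a distinct, descriptive name, as the patch does, makes this kind of silent key mismatch much harder to reintroduce.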
From 7ce065530f31d94474085b77b094138c7e2a2ad5 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 19:36:22 +0100 Subject: [PATCH 07/34] featurecounts: missing section name and anchor (#2967) --- multiqc/modules/featurecounts/featurecounts.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/multiqc/modules/featurecounts/featurecounts.py b/multiqc/modules/featurecounts/featurecounts.py index 32d8bfafcb..5246e67440 100755 --- a/multiqc/modules/featurecounts/featurecounts.py +++ b/multiqc/modules/featurecounts/featurecounts.py @@ -76,7 +76,11 @@ def __init__(self): self.general_stats_table(data_by_sample) # Assignment bar plot - self.add_section(plot=featurecounts_chart(data_keys, data_by_sample)) + self.add_section( + name="Assignments", + anchor="featurecounts_assignments", + plot=featurecounts_chart(data_keys, data_by_sample), + ) def parse_featurecounts_report(self, f, data_by_sample, data_keys): """Parse the featureCounts log file.""" From 79c904dde5499d71c9aa6792fba62a3ddfa0c7e8 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 19:46:39 +0100 Subject: [PATCH 08/34] Clean .gitignore in favour of local .git/info/exclude --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index f6c97159f7..558444c5f2 100644 --- a/.gitignore +++ b/.gitignore @@ -3,11 +3,6 @@ multiqc_config.yaml multiqc_report.html multiqc_data/ multiqc_plots/ -/examples/ -/result -/benchmarks -/tmp -/playground # CI tooling .ruff_cache From 08b5da6abf0932c93d4b27a69120daf1db147701 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 18 Nov 2024 20:02:49 +0100 Subject: [PATCH 09/34] Add conditions to satisfy mypy --- multiqc/core/write_results.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/multiqc/core/write_results.py b/multiqc/core/write_results.py index b8fa7e6f7f..a43258db75 100644 --- a/multiqc/core/write_results.py +++ b/multiqc/core/write_results.py @@ -65,14 +65,14 @@ def write_results() -> None: if len(report.modules) == 0: raise NoAnalysisFound("No analysis data for any module. Check that input files and directories exist") - output_names: OutputNames = _set_output_names() + output_file_names: OutputNames = _set_output_names() - render_and_export_plots(plots_dir_name=output_names.plots_dir_name) + render_and_export_plots(plots_dir_name=output_file_names.plots_dir_name) if not config.skip_generalstats: - _render_general_stats_table(plots_dir_name=output_names.plots_dir_name) + _render_general_stats_table(plots_dir_name=output_file_names.plots_dir_name) - paths: OutputPaths = _create_or_override_dirs(output_names) + paths: OutputPaths = _create_or_override_dirs(output_file_names) if config.make_data_dir and not paths.to_stdout and paths.data_dir: _write_data_files(paths.data_dir) @@ -233,9 +233,9 @@ def _create_or_override_dirs(output_names: OutputNames) -> OutputPaths: report_num += 1 if config.make_report and isinstance(paths.report_path, Path): output_names.output_fn_name = paths.report_path.name - if config.data_dir: + if config.data_dir and paths.data_dir: output_names.data_dir_name = paths.data_dir.name - if config.plots_dir: + if config.plots_dir and paths.plots_dir: output_names.plots_dir_name = paths.plots_dir.name logger.info("Existing reports found, adding suffix to filenames. 
Use '--force' to overwrite.")

From 464766c5f640752d076e201faa3eb98605d9cd9e Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Mon, 18 Nov 2024 20:10:41 +0100
Subject: [PATCH 10/34] Table: do not generate title from ID

---
 multiqc/plots/plotly/table.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/multiqc/plots/plotly/table.py b/multiqc/plots/plotly/table.py
index 457a1435c7..b5d6aea6c6 100644
--- a/multiqc/plots/plotly/table.py
+++ b/multiqc/plots/plotly/table.py
@@ -51,8 +51,6 @@ def make_table(
     # empty_cells: Dict[ColumnKeyT, str] = dict()
     hidden_cols = 1
     table_title = dt.pconfig.title
-    if table_title is None:
-        table_title = dt.id.replace("_", " ").title()
 
     def escape(s: str) -> str:
         return s.replace('"', "&quot;").replace("'", "&#39;").replace("<", "&lt;").replace(">", "&gt;")
@@ -287,8 +285,9 @@ def escape(s: str) -> str:
     # Put everything together
     #
 
-    # Buttons above the table
     html = ""
+
+    # Buttons above the table
     if not config.simple_output and add_control_panel:
         # Copy Table Button
         buttons: List[str] = []

From a75ed60a77390072a080cc8255f6999bb3d1ee09 Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Mon, 18 Nov 2024 20:25:42 +0100
Subject: [PATCH 11/34] Violin: add dt to Dataset objects instead of
 main_table_dt (#2968)

---
 multiqc/plots/plotly/violin.py | 37 +++++++++++++++++-----------------
 multiqc/report.py              |  4 ++--
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/multiqc/plots/plotly/violin.py b/multiqc/plots/plotly/violin.py
index 0a90f836b6..e2c98a93ed 100644
--- a/multiqc/plots/plotly/violin.py
+++ b/multiqc/plots/plotly/violin.py
@@ -8,6 +8,7 @@
 import plotly.graph_objects as go  # type: ignore
 
 from multiqc import config, report
+from multiqc.base_module import Section
 from multiqc.plots.plotly.plot import BaseDataset, Plot, PlotType
 from multiqc.plots.plotly.table import make_table
 from multiqc.plots.table_object import ColumnAnchor, ColumnMeta, DataTable, ValueT, TableConfig
@@ -62,6 +63,7 @@ class Dataset(BaseDataset):
     scatter_value_by_sample_by_metric: Dict[ColumnAnchor, Dict[SampleName, Union[int, float, str, None]]]
     all_samples: List[SampleName]  # unique list of all samples in this dataset
     scatter_trace_params: Dict[str, Any]
+    dt: DataTable
 
     @staticmethod
     def values_and_headers_from_dt(
@@ -220,6 +222,7 @@ def create(
             scatter_value_by_sample_by_metric=scatter_value_by_sample_by_metric,
             all_samples=sorted(all_samples),
             scatter_trace_params=dict(),
+            dt=dt,
         )
 
         ds.trace_params.update(
@@ -382,7 +385,6 @@ class ViolinPlot(Plot[Dataset, TableConfig]):
     show_table: bool
     show_table_by_default: bool
     n_samples: int
-    main_table_dt: DataTable
 
     @staticmethod
     def create(
@@ -398,13 +400,11 @@ def create(
             ds_samples.update(section.rows_by_sgroup.keys())
             samples_per_dataset.append(ds_samples)
 
-        main_table_dt: DataTable = dts[0]  # used for the table
-
         model: Plot[Dataset, TableConfig] = Plot.initialize(
             plot_type=PlotType.VIOLIN,
-            pconfig=main_table_dt.pconfig,
+            pconfig=dts[0].pconfig,
             n_samples_per_dataset=[len(x) for x in samples_per_dataset],
-            id=main_table_dt.id,
+            id=dts[0].id,
             default_tt_label=": %{x}",
             # Violins scale well, so can always keep them interactive and visible:
             defer_render_if_large=False,
@@ -469,7 +469,6 @@ def create(
             show_table=show_table,
             show_table_by_default=show_table_by_default,
             n_samples=max_n_samples,
-            main_table_dt=main_table_dt,
         )
 
     def buttons(self, flat: bool) -> List[str]:
@@ -480,7 +479,7 @@ def buttons(self, flat: bool) -> List[str]:
                 self._btn(
                     cls="mqc_table_config_modal_btn",
                     label=" Configure columns",
-                    data_attrs={"toggle": "modal", "target": f"#{self.main_table_dt.anchor}_config_modal"},
+                    data_attrs={"toggle": "modal", "target": f"#{self.datasets[0].dt.anchor}_config_modal"},
                 )
             )
         if self.show_table:
@@ -488,7 +487,7 @@ def buttons(self, flat: bool) -> List[str]:
                 self._btn(
                     cls="mqc-violin-to-table",
                     label=" Table",
-                    data_attrs={"table-anchor": self.main_table_dt.anchor, "violin-anchor": self.anchor},
+                    data_attrs={"table-anchor": self.datasets[0].dt.anchor, "violin-anchor": self.anchor},
                 )
             )
 
@@ -510,9 +509,9 @@ def show(
         # we only support one dataset, and the flat mode is not applicable.
 
         data: Dict[str, Dict[str, Union[int, float, str, None]]] = {}
-        for idx, col_key, header in self.main_table_dt.get_headers_in_order():
+        for idx, col_key, header in self.datasets[0].dt.get_headers_in_order():
             rid = header.rid
-            for _, group_rows in self.main_table_dt.sections[idx].rows_by_sgroup.items():
+            for _, group_rows in self.datasets[0].dt.sections[idx].rows_by_sgroup.items():
                 for row in group_rows:
                     if col_key in row.raw_data:
                         val = row.raw_data[col_key]
@@ -544,16 +543,16 @@ def save(
         if self.show_table_by_default and violin is not True or table is True:
             # Make Plotly go.Table object and save it
             data: Dict[str, Dict[str, Union[int, float, str, None]]] = {}
-            for idx, metric, header in self.main_table_dt.get_headers_in_order():
+            for idx, metric, header in self.datasets[0].dt.get_headers_in_order():
                 rid = header.rid
-                for _, group_rows in self.main_table_dt.sections[idx].rows_by_sgroup.items():
+                for _, group_rows in self.datasets[0].dt.sections[idx].rows_by_sgroup.items():
                     for row in group_rows:
                         if metric in row.raw_data:
                             val = row.raw_data[metric]
                             data.setdefault(row.sample, {})[rid] = val
 
             values: List[List[Any]] = [list(data.keys())]
-            for idx, metric, header in self.main_table_dt.get_headers_in_order():
+            for idx, metric, header in self.datasets[0].dt.get_headers_in_order():
                 rid = header.rid
                 values.append([data[s].get(rid, "") for s in data.keys()])
 
@@ -588,7 +587,7 @@ def save(
         else:
             super().save(filename, **kwargs)
 
-    def add_to_report(self, plots_dir_name: Optional[str] = None, clean_html_id: bool = True) -> str:
+    def add_to_report(self, plots_dir_name: Optional[str] = None, section: Optional[Section] = None) -> str:
         warning = ""
         if self.show_table_by_default and not self.show_table:
             warning = (
@@ -615,14 +614,14 @@ def add_to_report(self, plots_dir_name: Optional[str] = None, clean_html_id: boo
         # happen if violin.plot() is called directly, and "no_violin" is passed, which doesn't make sense.
html = warning + super().add_to_report(plots_dir_name=plots_dir_name)
         elif self.no_violin:
-            assert self.main_table_dt is not None
+            assert self.datasets[0].dt is not None
             # Show table alone
-            table_html, configuration_modal = make_table(self.main_table_dt)
+            table_html, configuration_modal = make_table(self.datasets[0].dt)
             html = warning + table_html + configuration_modal
         else:
-            assert self.main_table_dt is not None
+            assert self.datasets[0].dt is not None
             # Render both, add a switch between table and violin
-            table_html, configuration_modal = make_table(self.main_table_dt, violin_anchor=self.anchor)
+            table_html, configuration_modal = make_table(self.datasets[0].dt, violin_anchor=self.anchor)
             violin_html = super().add_to_report(plots_dir_name=plots_dir_name)
 
             violin_visibility = "style='display: none;'" if self.show_table_by_default else ""
 
             table_visibility = "style='display: none;'" if not self.show_table_by_default else ""
             html += (
-                f"<div id='mqc_violintable_wrapper_{self.main_table_dt.anchor}' {table_visibility}>{table_html}</div>"
+                f"<div id='mqc_violintable_wrapper_{self.datasets[0].dt.anchor}' {table_visibility}>{table_html}</div>"
             )
             html += configuration_modal
diff --git a/multiqc/report.py b/multiqc/report.py
index 31bda91b08..f95a5a0666 100644
--- a/multiqc/report.py
+++ b/multiqc/report.py
@@ -39,7 +39,7 @@
 
 # This does not cause circular imports because BaseMultiqcModule is used only in
 # quoted type hints, and quoted type hints are lazily evaluated:
-from multiqc.base_module import BaseMultiqcModule
+from multiqc.base_module import BaseMultiqcModule, Section
 from multiqc.core import log_and_rich, tmp_dir
 from multiqc.core.exceptions import NoAnalysisFound
 from multiqc.core.log_and_rich import iterate_using_progress_bar
@@ -1026,7 +1026,7 @@ def multiqc_dump_json():
     return exported_data
 
 
-def get_all_sections() -> List:
+def get_all_sections() -> List[Section]:
     return [s for mod in modules for s in mod.sections if not mod.hidden]
 

From d71b784d488538046cac8d152fc99036ee6ab4bf Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Wed, 20 Nov 2024 13:34:02 +0100
Subject: [PATCH 12/34] bclconvert: fix undetermined barcodes plot (#2976)

* bclconvert: fix undetermined barcodes plot: show top 20 barcodes

* Sort barcodes and show top 40
---
 multiqc/modules/bclconvert/bclconvert.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/multiqc/modules/bclconvert/bclconvert.py b/multiqc/modules/bclconvert/bclconvert.py
index 704f4a8a9f..11c20bf41e 100644
--- a/multiqc/modules/bclconvert/bclconvert.py
+++ b/multiqc/modules/bclconvert/bclconvert.py
@@ -238,23 +238,26 @@ def __init__(self):
 
         self._clusters_by_sample_barplot(data_by_sample)
 
+        TOP_N_UNDETERMINED_BARCODES = 40
+
         # Add section with undetermined barcodes
         if len(data_by_run) == 1:
-            undetermined_data = self.get_bar_data_from_undetermined(data_by_run)
+            undetermined_data = self.get_bar_data_from_undetermined(data_by_run, top_n=TOP_N_UNDETERMINED_BARCODES)
             if undetermined_data:
                 self.add_section(
-                    name="Undetermined barcodes by lane",
+                    name="Undetermined barcodes",
                     anchor="undetermine_by_lane",
-                    description="Undetermined barcodes by lanes",
+                    description=f"Top {TOP_N_UNDETERMINED_BARCODES} undetermined barcodes",
                     plot=bargraph.plot(
                         undetermined_data,
                         None,
                         {
                             "id": "bclconvert_undetermined",
-                            "title": "bclconvert: Undetermined barcodes by lane",
+                            "title": f"bclconvert: top {TOP_N_UNDETERMINED_BARCODES} undetermined barcodes",
                             "ylab": "Count",
                             "use_legend": True,
                             "tt_suffix": "reads",
+                            "sort_samples": False,
                         },
                     ),
                 )
@@ -1073,7 +1076,9 @@ def get_bar_data_from_counts(
         return bar_data
 
     @staticmethod
-    def get_bar_data_from_undetermined(data_by_run: Dict[str, RunSummary]) -> Dict[str, Dict[str, int]]:
+    def get_bar_data_from_undetermined(
+        data_by_run: Dict[str, RunSummary], top_n: int = 20
+    ) -> Dict[str, Dict[str, int]]:
         """
         Get data to plot for undetermined barcodes.
""" @@ -1083,11 +1088,13 @@ def get_bar_data_from_undetermined(data_by_run: Dict[str, RunSummary]) -> Dict[s for _, run in data_by_run.items(): for lane_id, lane in run.lanes.items(): try: - for barcode, count in islice(lane.top_unknown_barcodes.items(), 20): + for barcode, count in islice(lane.top_unknown_barcodes.items(), top_n): bar_data[barcode][lane_id] = count except AttributeError: pass except KeyError: pass - return {key: value for key, value in islice(bar_data.items(), 20)} + # Sort by value + bar_data = dict(sorted(bar_data.items(), key=lambda item: sum(item[1].values()), reverse=True)) + return {key: value for key, value in islice(bar_data.items(), top_n)} From c91615e0a2f6b912467b61f2f4c5ce3e33445a9a Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 20 Nov 2024 14:11:53 +0100 Subject: [PATCH 13/34] Release checklist: mention GITHUB_TOKEN --- .github/RELEASE_CHECKLIST.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 34fb8de2f2..5853bc4e27 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -4,9 +4,10 @@ This checklist is for my own reference, as I forget the steps every time. 1. Check that everything is up-to-date and ready to go 2. Update version numbers in `pyproject.toml` -3. Generate a new changelog section stub: +3. Generate a new changelog section stub (make sure to export a GitHub token to avoid rate limits): ```bash + export GITHUB_TOKEN= python scripts/print_changelog.py ``` From 7f20f44e905484d93ffcd3137adcc96253ccc6cb Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 20 Nov 2024 16:15:36 +0100 Subject: [PATCH 14/34] Update RELEASE_CHECKLIST.md: website examples and cache cleaning --- .github/RELEASE_CHECKLIST.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 5853bc4e27..cae5840394 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -19,17 +19,20 @@ This checklist is for my own reference, as I forget the steps every time. python scripts/make_module_docs.py ``` -5. Install the package again in the `install` mode: +5. Install the package again in the `install` mode. Make sure to remove the build directory and egg info that might cache outdated metadata such as entry point names: ```bash + rm -rf build/ *.egg-info pip install . ``` This removes the commit hash from the version number when MultiQC runs. 6. Run using test data + - Check for any command line or javascript errors - Check version numbers are printed correctly + 7. Create new demo reports for the website - Comment out any config in `~/.multiqc_config.yaml` @@ -38,9 +41,10 @@ This checklist is for my own reference, as I forget the steps every time. mv ~/.multiqc_config.yml ~/.multiqc_config.yml.bkup ``` - - Generate reports in the multiqc/website repo. + - Generate reports in the seqeralabs/web repo. 
```bash + cd ../seqeralabs/web/packages/website bash update_examples.sh ``` From 51864f5f147535212ffe690c074cc1160d6c9081 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 20 Nov 2024 16:16:37 +0100 Subject: [PATCH 15/34] Bump v1.25.2 (#2977) * Bump v1.25.2 * Add haplocheck docs * No need to wrap in Anchor --- CHANGELOG.md | 50 +++++++++++++++++++ docs/markdown/modules.mdx | 10 +++- docs/markdown/modules/haplocheck.md | 30 +++++++++++ .../modules/featurecounts/featurecounts.py | 4 +- pyproject.toml | 37 ++++++++------ 5 files changed, 113 insertions(+), 18 deletions(-) create mode 100644 docs/markdown/modules/haplocheck.md diff --git a/CHANGELOG.md b/CHANGELOG.md index fd8cbd34f1..d1cd15ec5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,55 @@ # MultiQC Version History +## [MultiQC v1.25.2](https://github.com/MultiQC/MultiQC/releases/tag/v1.25.2) - 2024-11-20 + +Multiple bug fixes and minor updates. + +### Updates + +- Add natural sort for sample sorting ([#2959](https://github.com/MultiQC/MultiQC/pull/2959)) +- Custom content: for `plot_type: image`, support `custom_data` config with section name and description. Fix misleading logging ([#2939](https://github.com/MultiQC/MultiQC/pull/2939)) +- Config validation improvements (group messages, cast types, validate column headers) ([#2899](https://github.com/MultiQC/MultiQC/pull/2899)) + +### Fixes + +- Workaround for displaying sample grouping in Safari because of missing `visibility: collapse` ([#2941](https://github.com/MultiQC/MultiQC/pull/2941)) +- Fix table CSV export where a title contains a comma ([#2911](https://github.com/MultiQC/MultiQC/pull/2911)) +- Showing table in notebooks: respect `col1_header` ([#2914](https://github.com/MultiQC/MultiQC/pull/2914)) +- Customizing `custom_table_header_config`: fix docs, support both the old and the new ways ([#2955](https://github.com/MultiQC/MultiQC/pull/2955)) +- Table scatter mini-plots: fix rounding and range ([#2956](https://github.com/MultiQC/MultiQC/pull/2956)) +- File line block iterator: fix reading long lines that do not fit one block ([#2935](https://github.com/MultiQC/MultiQC/pull/2935)) +- Fix `cond_formatting_rules` type hint to avoid validation error ([#2922](https://github.com/MultiQC/MultiQC/pull/2922)) +- Fix `config.prepend_dirs` or `-d -dd 1` ([#2913](https://github.com/MultiQC/MultiQC/pull/2913)) +- Sample grouping fixes ([#2920](https://github.com/MultiQC/MultiQC/pull/2920)): + - Keep sample name column fix width to avoid jumping + - Fix hiding columns through the modal +- Custom content fixes: + - Avoid showing `section_comment` both for module and section when they have the same ID ([#2954](https://github.com/MultiQC/MultiQC/pull/2954)) + - Address issue of sections without search patterns and headers in files ([#2921](https://github.com/MultiQC/MultiQC/pull/2921)) + - Fix duplicated custom content sections in the report ([#2921](https://github.com/MultiQC/MultiQC/pull/2921)) + - Fix support for `plot_type: violin` ([#2957](https://github.com/MultiQC/MultiQC/pull/2957)) + +### Module updates + +- ngsbits: add submodule samplegender ([#2854](https://github.com/MultiQC/MultiQC/pull/2854)) +- nanoq: change lineplots for barplots ([#2934](https://github.com/MultiQC/MultiQC/pull/2934)) +- Qualimap: clarify the direction of the transcript in coverage plot ([#2946](https://github.com/MultiQC/MultiQC/pull/2946)) +- picard: add table with all metrics to VariantCallingMetrics section ([#2885](https://github.com/MultiQC/MultiQC/pull/2885)) +- Nanostat: add general 
stats columns ([#2961](https://github.com/MultiQC/MultiQC/pull/2961))
+- Samtools: add insert size to general stats table ([#2905](https://github.com/MultiQC/MultiQC/pull/2905))
+
+### Module fixes
+
+- bcl2fastq: fix missing `R1_*`/`R2_*` metrics ([#2965](https://github.com/MultiQC/MultiQC/pull/2965))
+- Cutadapt: fix for null values from r2 data ([#2936](https://github.com/MultiQC/MultiQC/pull/2936))
+- Qualimap: fix parsing ∞ value ([#2937](https://github.com/MultiQC/MultiQC/pull/2937))
+- bclconvert: fix undetermined barcodes plot ([#2976](https://github.com/MultiQC/MultiQC/pull/2976))
+- featurecounts: fix missing section name and anchor ([#2967](https://github.com/MultiQC/MultiQC/pull/2967))
+
+### Infrastructure
+
+- Pin kaleido to 0.2.1 (new 0.4.1 does not embed a browser and thus not portable) ([#2963](https://github.com/MultiQC/MultiQC/pull/2963))
+
 ## [MultiQC v1.25.1](https://github.com/MultiQC/MultiQC/releases/tag/v1.25.1) - 2024-09-30
 
 Python 3.13, bugs fixed, improved sample grouping UI, and handling freezes in containers with incompatible architectures.
diff --git a/docs/markdown/modules.mdx b/docs/markdown/modules.mdx
index 8543c9758b..c059fd5f79 100644
--- a/docs/markdown/modules.mdx
+++ b/docs/markdown/modules.mdx
@@ -10,7 +10,7 @@ This file is autogenerated. Do not edit the markdown, it will be overwritten.
 ~~~~~~~~~~~~~~~~~~~~~~~
 -->
 
-MultiQC currently has modules to support 153 bioinformatics tools, listed below.
+MultiQC currently has modules to support 154 bioinformatics tools, listed below.
 Click the tool name to go to the MultiQC documentation for that tool.
 
@@ -282,6 +282,14 @@ import MultiqcModules from "@site/src/components/MultiqcModules";
       },
     },
     { id: "modules/gopeaks", data: { name: "GoPeaks", summary: "Calls peaks in CUT&TAG/CUT&RUN datasets" } },
+    {
+      id: "modules/haplocheck",
+      data: {
+        name: "Haplocheck",
+        summary:
+          "Haplocheck detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content",
+      },
+    },
     {
       id: "modules/happy",
       data: { name: "hap.py", summary: "Benchmarks variant calls against gold standard truth datasets" },
diff --git a/docs/markdown/modules/haplocheck.md b/docs/markdown/modules/haplocheck.md
new file mode 100644
index 0000000000..49a85dccde
--- /dev/null
+++ b/docs/markdown/modules/haplocheck.md
@@ -0,0 +1,30 @@
+---
+title: Haplocheck
+displayed_sidebar: multiqcSidebar
+description: >
+  Haplocheck detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content
+---
+
+
+
+:::note
+Haplocheck detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content
+
+[https://github.com/genepi/haplocheck/](https://github.com/genepi/haplocheck/)
+:::
+
+### File search patterns
+
+```yaml
+haplocheck:
+  contents: "\"Sample\"\t\"Contamination Status\"\t\"Contamination Level\"\t\"Distance\"\
+    \t\"Sample Coverage\""
+  num_lines: 10
+```
diff --git a/multiqc/modules/featurecounts/featurecounts.py b/multiqc/modules/featurecounts/featurecounts.py
index 5246e67440..1c8af3c6c4 100755
--- a/multiqc/modules/featurecounts/featurecounts.py
+++ b/multiqc/modules/featurecounts/featurecounts.py
@@ -5,7 +5,7 @@
 from multiqc import config
 from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
 from multiqc.plots import bargraph
-from multiqc.types import Anchor, ColumnKey
+from multiqc.types import ColumnKey
 
 log = logging.getLogger(__name__)
 
@@ -46,7 +46,7 @@ class MultiqcModule(BaseMultiqcModule):
     def __init__(self):
super(MultiqcModule, self).__init__( name="featureCounts", - anchor=Anchor("featurecounts"), + anchor="featurecounts", target="Subread featureCounts", href="http://subread.sourceforge.net/", info="Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, " diff --git a/pyproject.toml b/pyproject.toml index 2acecb4453..e7705764a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,38 +4,45 @@ build-backend = "setuptools.build_meta" [project] name = "multiqc" -version = "1.26dev" +version = "1.25.2" dependencies = [ "click", "humanize", "importlib_metadata", "jinja2>=3.0.0", - "kaleido==0.2.1", # for flat plot export + "kaleido==0.2.1", # for flat plot export "markdown", "numpy", "packaging", "requests", - "Pillow>=10", # to add a logo into flat plots + "Pillow>=10", # to add a logo into flat plots "plotly>=5.18", "pyyaml>=4", "rich>=10", "rich-click", - "coloredlogs", # logging in notebooks + "coloredlogs", # logging in notebooks "spectra>=0.0.10", "pydantic>=2.7.0", "typeguard", - "tqdm", # progress bar in notebooks + "tqdm", # progress bar in notebooks "natsort", ] requires-python = ">=3.8" authors = [ - {name = "Phil Ewels", email = "phil.ewels@seqera.io"}, - {name = "Vlad Savelyev", email = "vladislav.savelyev@seqera.io"}, + { name = "Phil Ewels", email = "phil.ewels@seqera.io" }, + { name = "Vlad Savelyev", email = "vladislav.savelyev@seqera.io" }, ] description = "Create aggregate bioinformatics analysis reports across many samples and tools" readme = "README.md" -license = {file = "LICENSE"} -keywords = ["bioinformatics", "biology", "sequencing", "NGS", "next generation sequencing", "quality control"] +license = { file = "LICENSE" } +keywords = [ + "bioinformatics", + "biology", + "sequencing", + "NGS", + "next generation sequencing", + "quality control", +] classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", @@ -60,10 +67,10 @@ dev = [ "pdoc3", "pytest", "pytest-cov", - "pytest-xdist", # for parallel testing with `pytest -n` - "syrupy", # to compare html snpashots, alternative to pytest-snapshot that + "pytest-xdist", # for parallel testing with `pytest -n` + "syrupy", # to compare html snpashots, alternative to pytest-snapshot that # does work better with parametrized tests - "pygithub", # to generate changelog + "pygithub", # to generate changelog "mypy", "types-PyYAML", "types-tqdm", @@ -109,7 +116,7 @@ checkqc = "multiqc.modules.checkqc:MultiqcModule" clipandmerge = "multiqc.modules.clipandmerge:MultiqcModule" clusterflow = "multiqc.modules.clusterflow:MultiqcModule" conpair = "multiqc.modules.conpair:MultiqcModule" -custom_content = "multiqc.modules.custom_content:custom_module_classes" # special case +custom_content = "multiqc.modules.custom_content:custom_module_classes" # special case cutadapt = "multiqc.modules.cutadapt:MultiqcModule" damageprofiler = "multiqc.modules.damageprofiler:MultiqcModule" dedup = "multiqc.modules.dedup:MultiqcModule" @@ -261,12 +268,12 @@ geo = "multiqc.templates.geo" [tool.setuptools] packages = ["multiqc"] -package-data = {multiqc = ["py.typed"]} +package-data = { multiqc = ["py.typed"] } [tool.ruff] line-length = 120 target-version = "py312" -lint.ignore = ["F401"] # unused-import +lint.ignore = ["F401"] # unused-import [tool.mypy] check_untyped_defs = true From 85299ee83db20b0c539116ca9d2a77b7e451d61f Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 25 Nov 2024 15:52:21 +0100 Subject: [PATCH 16/34] Dedup: more specific search pattern (#2988) --- 
multiqc/search_patterns.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index 699b618c21..f8ab5973a1 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -244,7 +244,9 @@ cutadapt: damageprofiler: fn: "*dmgprof.json" dedup: - fn: "*dedup.json" + fn: "*.json" + contents: '"tool_name": "DeDup"' + num_lines: 20 deeptools/bamPEFragmentSizeTable: contents: " Frag. Sampled Frag. Len. Min. Frag. Len. 1st. Qu. Frag. Len. Mean Frag. Len. Median Frag. Len. 3rd Qu." num_lines: 1 From 18faffbf8e1fbe1981b6246a77f2e3ede8263292 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Mon, 25 Nov 2024 16:41:29 +0100 Subject: [PATCH 17/34] Qualimap BamQC: add dups number rate, Ns, mapped paired reads (#2989) --- multiqc/modules/qualimap/QM_BamQC.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/multiqc/modules/qualimap/QM_BamQC.py b/multiqc/modules/qualimap/QM_BamQC.py index cf84f6b8cf..ff171e9006 100644 --- a/multiqc/modules/qualimap/QM_BamQC.py +++ b/multiqc/modules/qualimap/QM_BamQC.py @@ -85,8 +85,11 @@ def parse_genome_results(module: BaseMultiqcModule) -> Tuple[Dict, Dict]: "number of mapped reads": "mapped_reads", "number of mapped reads inside": "regions_mapped_reads", "number of mapped bases": "mapped_bases", + "number of mapped paired reads (both in pair)": "mapped_paired_reads", "number of sequenced bases": "sequenced_bases", "regions size": "regions_size", + "number of N's": "ns", + "number of duplicated reads (flagged)": "duplicated_reads_flagged", } float_metrics = { "mean insert size": "mean_insert_size", @@ -170,12 +173,16 @@ def parse_genome_results(module: BaseMultiqcModule) -> Tuple[Dict, Dict]: in [ "total_reads", "mapped_reads", + "mapped_paired_reads", "general_error_rate", "mean_coverage", "regions_size", "regions_mapped_reads", "percentage_aligned", "percentage_aligned_on_target", + "ns", + "duplication_rate", + "duplicated_reads_flagged", ] } @@ -694,6 +701,13 @@ def general_stats_headers(threshs, hidden_threshs) -> Dict: "shared_key": "read_count", "hidden": True, } + headers["mapped_paired_reads"] = { + "title": f"{config.read_count_prefix} Paired", + "description": f"Number of mapped paired reads ({config.read_count_desc})", + "scale": "RdYlGn", + "shared_key": "read_count", + "hidden": True, + } headers["total_reads"] = { "title": f"{config.read_count_prefix} Total reads", "description": f"Number of reads ({config.read_count_desc})", @@ -701,6 +715,24 @@ def general_stats_headers(threshs, hidden_threshs) -> Dict: "shared_key": "read_count", "hidden": True, } + headers["ns"] = { + "title": "N's", + "description": "Number of N's", + "scale": "RdYlGn", + "hidden": True, + } + headers["duplicated_reads_flagged"] = { + "title": "Duplicated", + "description": "Number of duplicated reads (flagged)", + "scale": "RdYlGn", + "hidden": True, + } + headers["duplication_rate"] = { + "title": "Dup rate", + "description": "Duplication rate", + "suffix": "%", + "scale": "RdYlGn", + } return headers From 2714562b0e0c1e023e23aebe801b6e0aff7f8846 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Mon, 25 Nov 2024 17:44:19 +0200 Subject: [PATCH 18/34] Docs: fix `table_headers_txt_mqc.txt` reference (#2986) Signed-off-by: Emmanuel Ferdman Co-authored-by: Vlad Savelyev --- docs/markdown/custom_content/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/markdown/custom_content/index.md b/docs/markdown/custom_content/index.md index 
3485850c6e..d8354123f0 100644
--- a/docs/markdown/custom_content/index.md
+++ b/docs/markdown/custom_content/index.md
@@ -448,4 +448,4 @@ The MultiQC automated testing runs with a
 which you can look through for inspiration.
 
 For example, to see a file which generates a table in a report by itself, you can
-have a look at `embedded_config/table_headers_mqc.txt` ([link](https://github.com/MultiQC/test-data/blob/main/data/custom_content/embedded_config/table_headers_mqc.txt)).
+have a look at `embedded_config/table_headers_txt_mqc.txt` ([link](https://github.com/MultiQC/test-data/blob/main/data/custom_content/embedded_config/table_headers_txt_mqc.txt)).

From 8ec5406336f7dc3f422241342d9ca4593970fb63 Mon Sep 17 00:00:00 2001
From: Phil Ewels
Date: Mon, 25 Nov 2024 16:47:49 +0100
Subject: [PATCH 19/34] Change heading in the changelog (#2985)

Instead of "Updates" use "Feature updates and improvements" for
cross-product changelog consistency in the Seqera docs.
---
 scripts/print_changelog.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/print_changelog.py b/scripts/print_changelog.py
index 7728eeda17..a2a9c6ca7d 100755
--- a/scripts/print_changelog.py
+++ b/scripts/print_changelog.py
@@ -86,7 +86,7 @@ def main():
         "module: new": "New modules",
         "bug: module": "Module fixes",
         "module: enhancement": "Module updates",
-        "module: change": "Updates",
+        "module: change": "Feature updates and improvements",
         "bug: core": "Fixes",
         "core: infrastructure": "Infrastructure and packaging",
         "core: refactoring": "Refactoring and typing",
@@ -94,7 +94,7 @@
     }
     sections_to_prs: Dict[str, List[PullRequest]] = {
         "New modules": [],
-        "Updates": [],
+        "Feature updates and improvements": [],
         "Module updates": [],
         "Fixes": [],
         "Module fixes": [],
@@ -107,10 +107,10 @@
             continue
         if pr.labels:
            for label in pr.labels:
-                section_name = label_to_section.get(label.name, "Updates")
+                section_name = label_to_section.get(label.name, "Feature updates and improvements")
                 sections_to_prs[section_name].append(pr)
         else:
-            sections_to_prs["Updates"].append(pr)
+            sections_to_prs["Feature updates and improvements"].append(pr)
 
     print("")
     print(f"## [MultiQC {current_tag}]({REPO_URL}/releases/tag/{current_tag}) - {datetime.date.today().isoformat()}")

From 2984c39ff786bfd1599c883df1c00730a92f874d Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Wed, 27 Nov 2024 21:43:33 +0100
Subject: [PATCH 20/34] Workflow to sync changelog with Seqera docs (#2993)

* Add automated workflow to sync changelog with Seqera docs

* Add temporary branch trigger for testing docs update workflow

* Use SEQERALABS_DOCS PAT

* Prefix with v

* Remove push trigger

* Move script to workflows folder

* Try as a personal user

* Remove push trigger

---
 .github/workflows/seqera_docs_changelog.py  | 94 +++++++++++++++++++++
 .github/workflows/seqera_docs_changelog.yml | 42 +++++++++
 CHANGELOG.md                                |  2 +-
 3 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/seqera_docs_changelog.py
 create mode 100644 .github/workflows/seqera_docs_changelog.yml

diff --git a/.github/workflows/seqera_docs_changelog.py b/.github/workflows/seqera_docs_changelog.py
new file mode 100644
index 0000000000..27f1e8fcf3
--- /dev/null
+++ b/.github/workflows/seqera_docs_changelog.py
@@ -0,0 +1,94 @@
+import re
+from pathlib import Path
+import subprocess
+from typing import TypedDict
+
+
+def extract_first_second_level_section(markdown) -> str:
+    # Match the first second-level section and capture its content
+    pattern = 
r"(##\s+.*?)(?=(\n##\s+|$))" + match = re.search(pattern, markdown, re.DOTALL) + + if not match: + raise ValueError("Could not find first second-level section in changelog") + + # Return the matched section + return match.group(1).strip() + + +class ChangelogData(TypedDict): + version: str + url: str + date: str + summary: str + the_rest: str + + +def extract_latest_version(changelog_content) -> ChangelogData: + """Extract the latest version section from the changelog.""" + # Skip the first header line + last_version_changelog = extract_first_second_level_section(changelog_content) + + # Find the first version section and extract the version, url and date + matches = re.search(r"^## \[MultiQC v([\d.]+)\]\((.*?)\) - ([\d-]+)", last_version_changelog) + if not matches: + raise ValueError("Could not find version information in changelog") + version, url, date = matches.groups() + + # Remove the first line + last_version_changelog = "\n".join(last_version_changelog.splitlines()[1:]).strip() + # Get data until the third-level header + next_header_index = last_version_changelog.find("###") + if next_header_index == -1: + raise ValueError("Could not find next header in changelog") + summary = last_version_changelog[:next_header_index].strip() + the_rest = last_version_changelog[next_header_index:].strip() + + return {"version": version, "date": date, "url": url, "summary": summary, "the_rest": the_rest} + + +# Read CHANGELOG.md +changelog_path = Path("CHANGELOG.md") +if not changelog_path.exists(): + raise FileNotFoundError("CHANGELOG.md not found") + +# Create seqeralabs-docs directory if it doesn't exist +docs_dir = Path("seqeralabs-docs") +if not docs_dir.exists(): + raise FileNotFoundError("seqeralabs-docs directory not found. Cloning the repository") + +# Clone seqeralabs-docs repo if it doesn't exist +if not docs_dir.exists(): + print("Cloning seqeralabs-docs repository...") + repo_url = "https://github.com/seqeralabs/seqeralabs-docs.git" + try: + subprocess.run(["git", "clone", repo_url, str(docs_dir)], check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to clone repository: {e}") + +# Extract latest version +with open(changelog_path) as f: + changelog_data: ChangelogData = extract_latest_version(f.read()) + +# Create output directory +output_dir = docs_dir / "changelog" / "multiqc" +output_dir.mkdir(parents=True, exist_ok=True) + +# Create output file +mdx_content: str = f"""--- +title: MultiQC v{changelog_data['version']} +date: {changelog_data['date']} +tags: [multiqc] +--- + +{changelog_data['summary']} + +{{/* truncate */}} + +{changelog_data['the_rest']} +""" +with open(output_path := output_dir / f"v{changelog_data['version']}.mdx", "w") as f: + f.write(mdx_content) + +# Print version for GitHub Actions +print(f"::set-output name=version::{changelog_data['version']}") diff --git a/.github/workflows/seqera_docs_changelog.yml b/.github/workflows/seqera_docs_changelog.yml new file mode 100644 index 0000000000..442a1dcc2b --- /dev/null +++ b/.github/workflows/seqera_docs_changelog.yml @@ -0,0 +1,42 @@ +name: Push changelog to Seqera Docs + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + update-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Clone seqeralabs/docs + run: | + git clone https://github.com/seqeralabs/docs.git seqeralabs-docs + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + + - name: Convert 
changelog + id: convert + run: python ${GITHUB_WORKSPACE}/.github/workflows/seqera_docs_changelog.py + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.SEQERALABS_DOCS }} + path: seqeralabs-docs + commit-message: "Changelog: MultiQC ${{ steps.convert.outputs.version }}" + title: "Changelog: MultiQC ${{ steps.convert.outputs.version }}" + body: | + This PR adds the changelog for MultiQC ${{ steps.convert.outputs.version }} to the Seqera documentation. + + This is an automated PR created from the MultiQC repository. + branch: changelog-multiqc-${{ steps.convert.outputs.version }} + base: master + delete-branch: true diff --git a/CHANGELOG.md b/CHANGELOG.md index d1cd15ec5b..4451ce3b02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ Multiple bug fixes and minor updates. -### Updates +### Feature updates and improvements - Add natural sort for sample sorting ([#2959](https://github.com/MultiQC/MultiQC/pull/2959)) - Custom content: for `plot_type: image`, support `custom_data` config with section name and description. Fix misleading logging ([#2939](https://github.com/MultiQC/MultiQC/pull/2939)) From 6b3a5586f81fd1f225eef132ba2e21d6db18d864 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 27 Nov 2024 23:59:33 +0100 Subject: [PATCH 21/34] Fix changelog scripts --- .github/workflows/seqera_docs_changelog.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/seqera_docs_changelog.py b/.github/workflows/seqera_docs_changelog.py index 27f1e8fcf3..7196bc3d76 100644 --- a/.github/workflows/seqera_docs_changelog.py +++ b/.github/workflows/seqera_docs_changelog.py @@ -54,13 +54,10 @@ def extract_latest_version(changelog_content) -> ChangelogData: # Create seqeralabs-docs directory if it doesn't exist docs_dir = Path("seqeralabs-docs") -if not docs_dir.exists(): - raise FileNotFoundError("seqeralabs-docs directory not found. 
Cloning the repository") - # Clone seqeralabs-docs repo if it doesn't exist if not docs_dir.exists(): print("Cloning seqeralabs-docs repository...") - repo_url = "https://github.com/seqeralabs/seqeralabs-docs.git" + repo_url = "https://github.com/seqeralabs/docs.git" try: subprocess.run(["git", "clone", repo_url, str(docs_dir)], check=True) except subprocess.CalledProcessError as e: From 9bd1cf8416d04d2882a36c233089dd44cd5c2e58 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 28 Nov 2024 00:04:53 +0100 Subject: [PATCH 22/34] Fix docker build: remove docs from .dockerignore (#2994) --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 2372587f01..00a58a99f2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -28,5 +28,4 @@ demo # Other things .idea .vscode -docs test-data From 057de1245cb04993b476cb4d5250a08934920094 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 28 Nov 2024 00:16:27 +0100 Subject: [PATCH 23/34] Interop: remove % suffix for nan values (#2995) --- multiqc/modules/interop/interop.py | 34 ++++++++++++------------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/multiqc/modules/interop/interop.py b/multiqc/modules/interop/interop.py index 15758a5d8a..8d4688244f 100755 --- a/multiqc/modules/interop/interop.py +++ b/multiqc/modules/interop/interop.py @@ -103,6 +103,16 @@ def __init__(self): plot=self.index_metrics_details_table(self.indexSummary), ) + @staticmethod + def parse_value(value: str): + try: + return int(value) + except ValueError: + try: + return float(value) + except ValueError: + return value + @staticmethod def parse_summary_csv(f): metrics = {"summary": {}, "details": {}} @@ -130,13 +140,7 @@ def parse_summary_csv(f): # process summary metrics["summary"][data[0]] = {} for idx in range(1, len(data)): - try: - metrics["summary"][data[0]][header[idx]] = int(data[idx]) - except ValueError: - try: - metrics["summary"][data[0]][header[idx]] = float(data[idx]) - except ValueError: - metrics["summary"][data[0]][header[idx]] = data[idx] + metrics["summary"][data[0]][header[idx]] = MultiqcModule.parse_value(data[idx]) if line.startswith("Total"): section = None elif line.startswith("Read") and (section is None or section == "details"): @@ -168,19 +172,10 @@ def parse_summary_csv(f): vals = data[idx].split(" - ") linedata[header[idx]] = max(int(v) for v in vals) else: - try: - linedata[header[idx]] = int(data[idx]) - except ValueError: - linedata[header[idx]] = float(data[idx]) + linedata[header[idx]] = MultiqcModule.parse_value(data[idx]) except ValueError: val = re.sub(pattern=r"\+/-.*", repl="", string=data[idx]).strip() - try: - linedata[header[idx]] = int(val) - except ValueError: - try: - linedata[header[idx]] = float(val) - except ValueError: - linedata[header[idx]] = val + linedata[header[idx]] = MultiqcModule.parse_value(val) metrics["details"][f"Lane {data[0]} - {read}"] = linedata return metrics, version @@ -259,7 +254,6 @@ def run_metrics_summary_table(data): "description": "", "min": 0, "max": 100, - "suffix": "%", "scale": "OrRd", }, "Intensity C1": { @@ -356,7 +350,6 @@ def run_metrics_details_table(data): "description": "The percentage that aligned to the PhiX genome.", "min": 0, "max": 100, - "suffix": "%", "scale": "PiYG", }, "Error": { @@ -364,7 +357,6 @@ def run_metrics_details_table(data): "description": "The calculated error rate, as determined by the PhiX alignment.", "min": 0, "max": 100, - "suffix": "%", "scale": "OrRd", }, "Error (35)": { From 
From 25eb3cff74e043fa8e7fae0142dd9811d3907393 Mon Sep 17 00:00:00 2001
From: Phil Ewels
Date: Mon, 2 Dec 2024 11:28:03 +0100
Subject: [PATCH 24/34] Remove conda defaults channel (#2996)

---
 docs/markdown/getting_started/quick_start.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/markdown/getting_started/quick_start.md b/docs/markdown/getting_started/quick_start.md
index 9371f5842e..53ea83429c 100644
--- a/docs/markdown/getting_started/quick_start.md
+++ b/docs/markdown/getting_started/quick_start.md
@@ -20,7 +20,6 @@ Arguably, the easiest way to do this is with Conda
 3. Restart your terminal shell.
 4. [Configure your conda channels](https://bioconda.github.io/#usage) to work with BioConda:
    ```bash
-   conda config --add channels defaults
    conda config --add channels bioconda
    conda config --add channels conda-forge
    conda config --set channel_priority strict

From 99e4a15fc978eae3b14d29c08427d9e32802d808 Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Thu, 5 Dec 2024 01:13:10 +0100
Subject: [PATCH 25/34] Naturally sort bargraph samples (#2999)

---
 multiqc/plots/bargraph.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/multiqc/plots/bargraph.py b/multiqc/plots/bargraph.py
index 40867174b1..0ab518158f 100644
--- a/multiqc/plots/bargraph.py
+++ b/multiqc/plots/bargraph.py
@@ -6,6 +6,7 @@
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Union, cast

 from importlib_metadata import EntryPoint
+from natsort import natsorted

 from multiqc import config
 from multiqc.core.exceptions import RunError
@@ -135,7 +136,7 @@ def plot(
             # want to keep the sample order https://github.com/MultiQC/MultiQC/issues/2204
             pass
         elif pconf.sort_samples:
-            hc_samples = sorted([SampleName(s) for s in d.keys()])
+            hc_samples = natsorted([SampleName(s) for s in d.keys()])
         hc_data: List[CatDataDict] = list()
         sample_d_count: Dict[SampleName, int] = dict()
         for cat_idx, c in enumerate(categories_per_ds[ds_idx].keys()):
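
A quick illustration of what natural sorting changes in the patch above, using the same natsort library (sample names invented):

    from natsort import natsorted

    samples = ["sample_10", "sample_2", "sample_1"]
    print(sorted(samples))     # ['sample_1', 'sample_10', 'sample_2']
    print(natsorted(samples))  # ['sample_1', 'sample_2', 'sample_10']
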
From f2a7997a37b5b4128608c73f4ca2ca49468348bb Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Fri, 6 Dec 2024 14:24:36 +0100
Subject: [PATCH 26/34] Tooltips about merged values in grouped samples
 (#3002)

---
 multiqc/base_module.py        | 13 +++++++++++++
 multiqc/plots/table_object.py |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/multiqc/base_module.py b/multiqc/base_module.py
index 881b77c935..a794e9676a 100755
--- a/multiqc/base_module.py
+++ b/multiqc/base_module.py
@@ -1004,6 +1004,19 @@ def general_stats_addcols(
             if "description" not in _headers[col_id]:
                 _headers[col_id]["description"] = _col["title"] if "title" in _col else col_id

+            # Add grouping information to description if table_sample_merge is enabled
+            if config.table_sample_merge:
+                desc = _headers[col_id].get("description", "")
+                if group_samples_config.cols_to_weighted_average and any(
+                    col_id == c for c, _ in group_samples_config.cols_to_weighted_average
+                ):
+                    desc += " (weighted average for grouped samples)"
+                elif group_samples_config.cols_to_average and col_id in group_samples_config.cols_to_average:
+                    desc += " (averaged for grouped samples)"
+                elif group_samples_config.cols_to_sum and col_id in group_samples_config.cols_to_sum:
+                    desc += " (summed for grouped samples)"
+                _headers[col_id]["description"] = desc
+
         # Append to report.general_stats for later assembly into table
         report.general_stats_data.append(rows_by_group)
         report.general_stats_headers.append(_headers)  # type: ignore
diff --git a/multiqc/plots/table_object.py b/multiqc/plots/table_object.py
index b1acd61d8e..ac4c69f654 100644
--- a/multiqc/plots/table_object.py
+++ b/multiqc/plots/table_object.py
@@ -260,7 +260,7 @@ class InputRow(BaseModel):
     """

     sample: SampleName
-    data: Dict[ColumnKey, Optional[ValueT]] = dict()
+    data: Dict[ColumnKey, Optional[ValueT]] = Field(default_factory=dict)

     def __init__(self, sample: SampleName, data: Mapping[Union[str, ColumnKey], Any]):
         super().__init__(
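
The table_object.py change above swaps a plain dict default for Field(default_factory=dict), the idiomatic way to declare a per-instance mutable default on a pydantic model. A minimal sketch of the pattern (the Row class is illustrative, not MultiQC's actual model):

    from typing import Dict
    from pydantic import BaseModel, Field

    class Row(BaseModel):
        # default_factory builds a fresh dict for every instance
        data: Dict[str, int] = Field(default_factory=dict)

    a, b = Row(), Row()
    a.data["x"] = 1
    assert b.data == {}  # each instance has its own dict
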
From a71cb106c3acd59a3e0c2e470dd7eec00f6cc897 Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Fri, 6 Dec 2024 17:57:07 +0100
Subject: [PATCH 27/34] fastp: Add overrepresented sequences plot and table
 (#3000)

---
 multiqc/modules/fastp/fastp.py | 125 +++++++++++++++++++++++++++++++--
 1 file changed, 121 insertions(+), 4 deletions(-)

diff --git a/multiqc/modules/fastp/fastp.py b/multiqc/modules/fastp/fastp.py
index c01ede09ff..73e09ad20e 100644
--- a/multiqc/modules/fastp/fastp.py
+++ b/multiqc/modules/fastp/fastp.py
@@ -1,12 +1,13 @@
+from collections import Counter, defaultdict
 import json
 import logging
 import re
-from typing import Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple

 from multiqc import config
 from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
-from multiqc.plots import bargraph, linegraph
-from multiqc.plots.table_object import ColumnDict
+from multiqc.plots import bargraph, linegraph, table
+from multiqc.types import ColumnKey

 log = logging.getLogger(__name__)

@@ -61,6 +62,7 @@ def __init__(self):
         self.fastp_qual_plotdata = dict()
         self.fastp_gc_content_data = dict()
         self.fastp_n_content_data = dict()
+        self.fastp_overrepresented_sequences = dict()
         for k in [
             "read1_before_filtering",
             "read2_before_filtering",
@@ -71,6 +73,7 @@ def __init__(self):
             self.fastp_qual_plotdata[k] = dict()
             self.fastp_gc_content_data[k] = dict()
             self.fastp_n_content_data[k] = dict()
+            self.fastp_overrepresented_sequences[k] = dict()

         for s_name, parsed_json in data_by_sample.items():
             self.process_parsed_data(parsed_json, s_name)
@@ -163,6 +166,17 @@ def __init__(self):
         except RuntimeError:
             log.debug("No data found for 'N content' plot")

+        # Overrepresented sequences plot
+        try:
+            self.add_section(
+                name="Overrepresented Sequences",
+                anchor="fastp-overrepresented-sequences",
+                description="Overrepresented sequences in the reads.",
+                plot=self.fastp_overrepresented_sequences_plot(),
+            )
+        except RuntimeError:
+            log.debug("No data found for 'Overrepresented Sequences' plot")
+
     def parse_fastp_log(self, f) -> Tuple[Optional[str], Dict]:
         """Parse the JSON output from fastp and save the summary statistics"""
         try:
@@ -339,6 +353,12 @@ def process_parsed_data(self, parsed_json: Dict, s_name: str):
             except KeyError:
                 log.debug(f"Content curve data {k} not found: {s_name}")

+            # Overrepresented sequences
+            try:
+                self.fastp_overrepresented_sequences[k][s_name] = parsed_json[k]["overrepresented_sequences"]
+            except KeyError:
+                log.debug(f"Overrepresented sequences data {k} not found: {s_name}")
+
         # Remove empty dicts
         if len(self.fastp_data[s_name]) == 0:
             del self.fastp_data[s_name]
@@ -477,6 +497,103 @@ def fastp_read_n_plot(self):
         }
         return linegraph.plot(pdata, pconfig)

+    def fastp_overrepresented_sequences_plot(self):
+        cnt_by_sample = defaultdict(Counter)
+        cnt_by_seq = Counter()
+        pct_by_seq = Counter()
+        samples_by_seq = defaultdict(set)
+
+        for read_name, by_sample in self.fastp_overrepresented_sequences.items():
+            for s_name, by_seq in by_sample.items():
+                for seq, count in by_seq.items():
+                    cnt_by_seq[seq] += count
+                    pct_by_seq[seq] += count / self.fastp_data[s_name]["before_filtering_total_reads"]
+                    cnt_by_sample[read_name][s_name] += count
+                    samples_by_seq[seq].add(s_name)
+
+        data_labels, cnt_by_sample_pdata = self.filter_pconfig_pdata_subplots(
+            cnt_by_sample, "Overrepresented Sequences"
+        )
+        self.add_section(
+            name="Overrepresented sequences by sample",
+            anchor="fastp_overrepresented_sequences",
+            description="The total amount of overrepresented sequences found in each library.",
+            plot=bargraph.plot(
+                [
+                    {sn: {"overrepresented_sequences": cnt} for sn, cnt in cnt_by_sample.items()}
+                    for cnt_by_sample in cnt_by_sample_pdata
+                ],
+                {
+                    "overrepresented_sequences": {"name": "Overrepresented sequences"},
+                },
+                {
+                    "id": "fastp_overrepresented_sequences_plot",
+                    "title": "Fastp: Overrepresented sequences",
+                    "cpswitch": False,
+                    "ylab": "Number of overrepresented sequences",
+                    "data_labels": data_labels,
+                    "tt_decimals": 0,
+                },
+            ),
+        )
+
+        # Top overrepresented sequences across all samples
+        top_n = getattr(config, "fastp_config", {}).get("top_overrepresented_sequences", 20)
+        top_seqs = cnt_by_seq.most_common(top_n)
+        table_data: Dict[str, Dict[str, Any]] = {
+            seq: {
+                "sequence": seq,
+                "total_percent": pct_by_seq[seq],
+                "total_count": cnt_by_seq[seq],
+                "samples": len(samples_by_seq[seq]),
+            }
+            for seq, _ in top_seqs
+        }
+
+        self.add_section(
+            name="Top overrepresented sequences",
+            anchor="fastp_top_overrepresented_sequences",
+            description=f"""
+            Top overrepresented sequences across all samples. The table shows {top_n}
+            most overrepresented sequences across all samples, ranked by the number of occurrences across all samples.
+            """,
+            plot=table.plot(
+                table_data,
+                headers={
+                    ColumnKey("samples"): {
+                        "title": "Reports",
+                        "description": "Number of fastp reports where this sequence is found as overrepresented",
+                        "scale": "Greens",
+                        "min": 0,
+                        "format": "{:,d}",
+                    },
+                    ColumnKey("total_count"): {
+                        "title": "Occurrences",
+                        "description": "Total number of occurrences of the sequence (among the samples where the sequence is overrepresented)",
+                        "scale": "Blues",
+                        "min": 0,
+                        "format": "{:,d}",
+                    },
+                    ColumnKey("total_percent"): {
+                        "title": "% of all reads",
+                        "description": "Total number of occurrences as the percentage of all reads (among samples where the sequence is overrepresented)",
+                        "scale": "Blues",
+                        "min": 0,
+                        "max": 100,
+                        "suffix": "%",
+                        "format": "{:,.4f}",
+                    },
+                },
+                pconfig={
+                    "namespace": self.name,
+                    "id": "fastp_top_overrepresented_sequences_table",
+                    "title": "fastp: Top overrepresented sequences",
+                    "col1_header": "Overrepresented sequence",
+                    "sort_rows": False,
+                },
+            ),
+        )
+
     @staticmethod
     def filter_pconfig_pdata_subplots(data, label):
         data_labels = []
@@ -503,7 +620,7 @@ def filter_pconfig_pdata_subplots(data, label):
                 "ylab": f"{label}",
             },
         }.items():
-            if sum([len(data[k][x]) for x in data[k]]) > 0:
+            if any(data[k][x] for x in data[k]):
                 data_labels.append(dl)
                 pdata.append(data[k])
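
The core of the new fastp section is a roll-up of per-sample counts into global counters. A toy model of that aggregation (data invented, not real fastp output):

    from collections import Counter, defaultdict

    per_sample = {"s1": {"ACGT": 5, "TTTT": 2}, "s2": {"ACGT": 3}}

    cnt_by_seq = Counter()
    samples_by_seq = defaultdict(set)
    for s_name, by_seq in per_sample.items():
        for seq, count in by_seq.items():
            cnt_by_seq[seq] += count
            samples_by_seq[seq].add(s_name)

    print(cnt_by_seq.most_common(1))       # [('ACGT', 8)]
    print(sorted(samples_by_seq["ACGT"]))  # ['s1', 's2']
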
From 78e26e80cc886040ff9d511cd8f6bfbb04e1d269 Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Sat, 7 Dec 2024 20:58:46 +0100
Subject: [PATCH 28/34] fastqc: round values for the seq_content plot dump to
 save space

---
 multiqc/modules/fastqc/fastqc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/multiqc/modules/fastqc/fastqc.py b/multiqc/modules/fastqc/fastqc.py
index 88e31c2195..15fa9584c5 100755
--- a/multiqc/modules/fastqc/fastqc.py
+++ b/multiqc/modules/fastqc/fastqc.py
@@ -737,6 +737,8 @@ def sequence_content_plot(self):
                 for base in ["a", "c", "t", "g"]:
                     if math.isnan(float(data_by_sample[s_name][b][base])):
                         data_by_sample[s_name][b][base] = 0
+                    else:
+                        data_by_sample[s_name][b][base] = round(data_by_sample[s_name][b][base], 2)

         if len(data_by_sample) == 0:
             log.debug("sequence_content not found in FastQC reports")

From 6af15b324375100e48862c73d4efacd5f8a7f6f9 Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Wed, 11 Dec 2024 12:20:22 +0100
Subject: [PATCH 29/34] Document barplot sort_samples (#3006)

---
 docs/markdown/development/plots.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/markdown/development/plots.md b/docs/markdown/development/plots.md
index c93d06ed5e..867153ac9b 100644
--- a/docs/markdown/development/plots.md
+++ b/docs/markdown/development/plots.md
@@ -127,6 +127,7 @@ config = {
     "xsuffix": "%",  # Suffix for the X-axis values and labels. Parsed from tt_label by default
     "tt_label": "{x}: {y:.2f}%",  # Customise tooltip label, e.g. '{point.x} base pairs'
     "stacking": "relative",  # Set to "group" to have category bars side by side
+    "sort_samples": True,  # Sort samples by name
     "tt_decimals": 0,  # Number of decimal places to use in the tooltip number
     "tt_suffix": "",  # Suffix to add after tooltip number
     "height": 500  # The default height of the plot, in pixels
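
For module authors, the newly documented flag is used like any other barplot config key. An untested sketch (data values invented):

    from multiqc.plots import bargraph

    data = {"sample_10": {"reads": 100}, "sample_2": {"reads": 80}}
    plot = bargraph.plot(data, pconfig={
        "id": "example_reads_plot",
        "title": "Example: Read counts",
        "sort_samples": True,  # natural sample order, per the bargraph patch above
    })
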
From 1665bd93e65aaf695b271517d1fd42a974f2e48f Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Wed, 11 Dec 2024 12:44:03 +0100
Subject: [PATCH 30/34] Nanostat new format: fix parsing sample name (#3007)

---
 multiqc/modules/nanostat/nanostat.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/multiqc/modules/nanostat/nanostat.py b/multiqc/modules/nanostat/nanostat.py
index 288d127906..19f3c5ba1f 100644
--- a/multiqc/modules/nanostat/nanostat.py
+++ b/multiqc/modules/nanostat/nanostat.py
@@ -94,28 +94,33 @@ def parse_nanostat_log(self, f):
         Note: Tool can be run in two different modes, giving two variants to the output.
         To avoid overwriting keys from different modes, keys are given a suffix.
         """
-        snames = [f["s_name"]]
-        nano_stats_by_sname = defaultdict(dict)
+        sname = f["s_name"]
+        nano_stats_by_sname = {sname: dict()}
         for line in f["f"]:
+            if line.strip() == "":
+                continue
             parts = line.strip().split("\t")
             if len(parts) < 2:
+                log.warning(f"Expected at least 2 parts in line, got: {len(parts)}: {parts} - in {f['root']}/{f['fn']}")
+                continue
+            if len(parts) > 2:
+                log.warning(f"Expected at most 2 parts in line, got: {len(parts)}: {parts} - in {f['root']}/{f['fn']}")
                 continue

-            key = parts[0]
+            key, val = parts
             if key == "Metrics":
-                snames = parts[1:]
+                # A static header line "Metrics dataset", ignore.
+                pass
             elif key in self._KEYS_MAPPING.keys():
                 key = self._KEYS_MAPPING.get(key)
                 if key:
-                    for sname, val in zip(snames, parts[1:]):
-                        nano_stats_by_sname[sname][key] = float(val)
+                    nano_stats_by_sname[sname][key] = float(val)
             else:
                 key = key.replace("Reads ", "").strip(":")
                 if key.startswith(">Q"):
-                    for sname, val in zip(snames, parts[1:]):
-                        # Number of reads above Q score cutoff
-                        nano_stats_by_sname[sname][key] = int(val.split()[0])
+                    # Number of reads above Q score cutoff
+                    nano_stats_by_sname[sname][key] = int(val.split()[0])

         return nano_stats_by_sname

     def parse_legacy_nanostat_log(self, f):
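
The stricter parsing above assumes every data line is exactly "key<TAB>value". A toy walk-through (input line invented):

    line = "Mean read length\t1,234.5\n"

    parts = line.strip().split("\t")
    if len(parts) == 2:
        key, val = parts  # safe unpack: exactly two fields
        print(key, "->", float(val.replace(",", "")))
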
From 521534dfc4c0a380756ff9bd3630a536e9aad01d Mon Sep 17 00:00:00 2001
From: Vlad Savelyev
Date: Wed, 11 Dec 2024 23:40:54 +0100
Subject: [PATCH 31/34] GATK BQSR: support Sentieon QualCal output (#3008)

* GATK: refactor without mixins

* Support sentieon qualcal

* Fix typing
---
 .../gatk/analyze_saturation_mutagenesis.py    | 792 +++++++++---------
 multiqc/modules/gatk/base_recalibrator.py     | 373 +++++----
 multiqc/modules/gatk/gatk.py                  |  63 +-
 multiqc/modules/gatk/utils.py                 |  48 ++
 multiqc/modules/gatk/varianteval.py           | 135 ++-
 multiqc/plots/plotly/violin.py                |   2 +-
 multiqc/search_patterns.yaml                  |   6 +-
 7 files changed, 718 insertions(+), 701 deletions(-)
 create mode 100644 multiqc/modules/gatk/utils.py

diff --git a/multiqc/modules/gatk/analyze_saturation_mutagenesis.py b/multiqc/modules/gatk/analyze_saturation_mutagenesis.py
index 5f3fea8580..89f9e0b705 100644
--- a/multiqc/modules/gatk/analyze_saturation_mutagenesis.py
+++ b/multiqc/modules/gatk/analyze_saturation_mutagenesis.py
@@ -4,404 +4,406 @@
 from collections import defaultdict

 from multiqc import config
+from multiqc.base_module import BaseMultiqcModule
 from multiqc.plots import bargraph, table

 # Initialise the logger
 log = logging.getLogger(__name__)


-class AnalyzeSaturationMutagenesisMixin:
-    """Mixin class to add GATK AnalyzeSaturationMutagenesis reports to MultiQC."""
-
-    def parse_gatk_analyze_saturation_mutagenesis(self):
-        """Find GATK AnalyzeSaturationMutagenesis logs and parse their data"""
-
-        self.gatk_analyze_saturation_mutagenesis = {}
-
-        for file_handle in self.find_log_files("gatk/analyze_saturation_mutagenesis", filehandles=True):
-            parsed_data = self.parse_read_counts_file(file_handle["f"])
-            if len(parsed_data) > 1:
-                if file_handle["s_name"] in self.gatk_analyze_saturation_mutagenesis:
-                    log.debug(f"Duplicate sample name found! Overwriting: {file_handle['s_name']}")
-                self.add_data_source(file_handle, section="analyze_saturation_mutagenesis")
-                self.gatk_analyze_saturation_mutagenesis[file_handle["s_name"]] = parsed_data
-
-        # Filter to strip out ignored sample names
-        self.gatk_analyze_saturation_mutagenesis = self.ignore_samples(self.gatk_analyze_saturation_mutagenesis)
-
-        n_reports_found = len(self.gatk_analyze_saturation_mutagenesis)
-        if n_reports_found == 0:
-            return 0
-
-        log.info("Found %d AnalyzeSaturationMutagenesis reports", n_reports_found)
-
-        # Superfluous function call to confirm that it is used in this module
-        # Replace None with actual version if it is available
-        self.add_software_version(None)
-
-        # Write parsed report data to a file (restructure first)
-        self.write_data_file(self.gatk_analyze_saturation_mutagenesis, "multiqc_gatk_analyze_saturation_mutagenesis")
-
-        self.gatk_analyze_saturation_mutagenesis_table(self.gatk_analyze_saturation_mutagenesis)
-        # Add plots
-        self.gatk_analyze_saturation_mutagenesis_plot_reads(self.gatk_analyze_saturation_mutagenesis)
-        self.gatk_analyze_saturation_mutagenesis_plot_base_calls(self.gatk_analyze_saturation_mutagenesis)
-
-        return n_reports_found
-
-    def parse_read_counts_file(self, file_handle):
-        """Parse a readCounts output file from GATK AnalyzeSaturationMutagenesis
-        These files are tab delimited, with some hierarchical structuring.
-
-        This does not parse percentages, since those are readily calculated
-        and MultiQC automatically generates them for plotting."""
-
-        label_to_key = {
-            "Total Reads:": "total_reads",
-            ">Unmapped Reads:": "unmapped_reads",
-            ">LowQ Reads:": "lowq_reads",
-            ">Evaluable Reads:": "evaluable_reads",
-            ">>Reads in disjoint pairs evaluated separately:": "disjoint_pairs",
-            ">>Reads in overlapping pairs evaluated together:": "overlapping_pairs",
-            ">>>Wild type:": "wt_reads",
-            ">>>Called variants:": "called_variants",
-            ">>>Mate ignored:": "mate_ignored",
-            ">>>Inconsistent pair:": "inconsistent",
-            ">>>Low quality variation:": "low_quality_variation",
-            ">>>Insufficient flank:": "insufficient_flank",
-            "Total base calls:": "total_base_calls",
-            ">Base calls evaluated for variants:": "evaluated_base_calls",
-            ">Base calls unevaluated:": "unevaluated_base_calls",
-        }
-
-        data = defaultdict(int)
-
-        # For keeping track of whether we are in disjoint or overlapping reads
-        # Disjoint are first, so set to True initially
-        disjoint_reads = True
-
-        for line in file_handle:
-            fields = line.split("\t")
-            label = fields[0]
-            key = label_to_key[label]
-            if key == "disjoint_pairs":
-                disjoint_reads = True
-            elif key == "overlapping_pairs":
-                disjoint_reads = False
-            elif label.startswith(">>>"):
-                key = key + "_" + ("disjoint" if disjoint_reads else "overlapping")
-            data[key] = int(fields[1])
-
-        # Create some summary fields
-        data["wt_total_reads"] = data["wt_reads_disjoint"] + data["wt_reads_overlapping"]
-        data["variants_total_reads"] = data["called_variants_disjoint"] + data["called_variants_overlapping"]
-        data["filtered_reads"] = sum(
-            [
-                data["lowq_reads"],
-                data["mate_ignored_disjoint"],
-                data["inconsistent_overlapping"],
-                data["low_quality_variation_disjoint"],
-                data["low_quality_variation_overlapping"],
-                data["insufficient_flank_disjoint"],
-                data["insufficient_flank_overlapping"],
-            ]
-        )
-        return data
-
-    def gatk_analyze_saturation_mutagenesis_plot_reads(self, data):
-        """Make the plot for GATK AnalyzeSaturationMutagenesis read counts and add the section."""
-        cats = {
-            "filtered_reads": {"name": "Filtered reads"},
-            "wt_total_reads": {"name": "WT reads"},
-            "variants_total_reads": {"name": "Variant reads"},
-        }
-
-        pconfig = {
-            "id": "gatk_ASM_reads_plot",
-            "title": "GATK AnalyzeSaturationMutagenesis: Read counts",
-            "ylab": "Number of reads",
-            "cpswitch_counts_label": "Counts",
-        }
-
-        self.add_section(
-            name="Read counts",
-            anchor="gatk-asm-read-counts",
-            description="Read counts and read fate. Filtered reads include unmapped, low quality, and other pathologies.",
-            helptext="""Reads can be filtered by GATK AnalyzeSaturationMutagenesis for a number of reasons, including low quality, insufficient flank, and other pathologies.
-            This plot shows the number of reads that were mapped to WT, called as variants, and the number that were filtered.""",
-            plot=bargraph.plot(data, cats, pconfig),
-        )
-
-    def gatk_analyze_saturation_mutagenesis_plot_base_calls(self, data):
-        """Make the plot for GATK AnalyzeSaturationMutagenesis base calls and add the section."""
-        cats = {
-            "evaluated_base_calls": {"name": "Base calls evaluated for variants"},
-            "unevaluated_base_calls": {"name": "Base calls not evaluated for variants"},
-        }
-
-        pconfig = {
-            "id": "gatk_ASM_base_calls_plot",
-            "title": "GATK AnalyzeSaturationMutagenesis: Base calls",
-            "ylab": "Number of bases",
-            "cpswitch_counts_label": "Counts",
-        }
-
-        self.add_section(
-            name="Base calls",
-            anchor="gatk-asm-bases",
-            description="Base calls evaluated for variants and base calls not evaluated for variants.",
-            helptext="""Bases can be filtered by GATK AnalyzeSaturationMutagenesis for a number of reasons, including low quality, insufficient flank, and other pathologies.
-            This plot shows the number of base calls that were evaluated for variants and the number of base calls that were not evaluated for variants.""",
-            plot=bargraph.plot(data, cats, pconfig),
-        )
-
-    def gatk_analyze_saturation_mutagenesis_table(self, data):
-        """Make the table for GATK AnalyzeSaturationMutagenesis and add the section."""
-        asm_headers = {
-            "total_reads": {
-                "title": f"Total reads ({config.read_count_prefix})",
-                "description": f"Total reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Greys",
-                "shared_key": "read_count",
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "wt_total_reads": {
-                "title": f"WT reads ({config.read_count_prefix})",
-                "description": f"Total evaluated reads mapped to WT ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues",
-                "shared_key": "read_count",
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "variants_total_reads": {
-                "title": f"Variant reads ({config.read_count_prefix})",
-                "description": f"Reads with a variant called ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Greens",
-                "shared_key": "read_count",
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "filtered_reads": {
-                "title": f"Filtered reads ({config.read_count_prefix})",
-                "description": f"Reads filtered from sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Reds-rev",
-                "shared_key": "read_count",
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "unmapped_reads": {
-                "title": f"Unmapped ({config.read_count_prefix})",
-                "description": f"Unmapped reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Purples-rev",
-                "shared_key": "read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "lowq_reads": {
-                "title": f"LowQ ({config.read_count_prefix})",
-                "description": f"Low quality reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues-rev",
-                "shared_key": "read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "evaluable_reads": {
-                "title": f"Evaluable ({config.read_count_prefix})",
-                "description": f"Evaluable reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Greys",
-                "shared_key": "read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "disjoint_pairs": {
-                "title": f"Disjoint ({config.read_count_prefix})",
-                "description": f"Reads from disjoint (non-overlapping) paired-end reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Oranges",
-                "shared_key": "read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "wt_reads_disjoint": {
-                "title": f"WT (disjoint) ({config.read_count_prefix})",
-                "description": f"WT reads called from disjoint pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Purples",
-                "shared_key": "disjoint_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "called_variants_disjoint": {
-                "title": f"Called variants (disjoint) ({config.read_count_prefix})",
-                "description": f"Reads with variants called from disjoint pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues",
-                "shared_key": "disjoint_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "mate_ignored_disjoint": {
-                "title": f"Mateless ({config.read_count_prefix})",
-                "description": f"Reads with ignored mates called from disjoint pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Greens-rev",
-                "shared_key": "disjoint_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "low_quality_variation_disjoint": {
-                "title": f"LowQ (disjoint) ({config.read_count_prefix})",
-                "description": f"Reads with low quality variation from disjoint pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Reds-rev",
-                "shared_key": "disjoint_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "insufficient_flank_disjoint": {
-                "title": f"No flank (disjoint) ({config.read_count_prefix})",
-                "description": f"Reads with insufficient flank from disjoint pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues-rev",
-                "shared_key": "disjoint_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "overlapping_pairs": {
-                "title": f"Overlapping ({config.read_count_prefix})",
-                "description": f"Reads from overlapping paired-end reads in sample ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Oranges",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "wt_reads_overlapping": {
-                "title": f"WT (overlapping) ({config.read_count_prefix})",
-                "description": f"WT reads called from overlapping pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Purples",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "called_variants_overlapping": {
-                "title": f"Called variants (overlapping) ({config.read_count_prefix})",
-                "description": f"Reads with variants called from overlapping pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "inconsistent_overlapping": {
-                "title": f"Inconsistent (overlapping) ({config.read_count_prefix})",
-                "description": f"Reads with inconsistent pairs from overlapping pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Greens-rev",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "low_quality_variation_overlapping": {
-                "title": f"Low quality reads (overlapping) ({config.read_count_prefix})",
-                "description": f"Reads with low quality variation from overlapping pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Reds-rev",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "insufficient_flank_overlapping": {
-                "title": f"No flank (overlapping) ({config.read_count_prefix})",
-                "description": f"Reads with insufficient flank from overlapping pairs ({config.read_count_desc})",
-                "min": 0,
-                "scale": "Blues-rev",
-                "shared_key": "overlapping_read_count",
-                "hidden": True,
-                "modify": lambda x: x * config.read_count_multiplier,
-                "namespace": "GATK",
-            },
-            "total_base_calls": {
-                "title": f"Total bases ({config.base_count_prefix})",
-                "description": f"Total base calls in sample ({config.base_count_desc})",
-                "min": 0,
-                "scale": "Greys",
-                "shared_key": "base_calls",
-                "hidden": True,
-                "modify": lambda x: x * config.base_count_multiplier,
-                "namespace": "GATK",
-            },
-            "evaluated_base_calls": {
-                "title": f"Evaluated bases ({config.base_count_prefix})",
-                "description": f"Evaluated base calls in sample ({config.base_count_desc})",
-                "min": 0,
-                "scale": "Blues",
-                "hidden": True,
-                "shared_key": "base_calls",
-                "modify": lambda x: x * config.base_count_multiplier,
-                "namespace": "GATK",
-            },
-            "unevaluated_base_calls": {
-                "title": f"Unevaluated bases ({config.base_count_prefix})",
-                "description": f"Unevaluated base calls in sample ({config.base_count_desc})",
-                "min": 0,
-                "scale": "Reds-rev",
-                "hidden": True,
-                "shared_key": "base_calls",
-                "modify": lambda x: x * config.base_count_multiplier,
-                "namespace": "GATK",
-            },
-        }
-
-        # Add module specific prefix to all keys to be safe
-        prefix = "gatk_ask_"
-        asm_headers = {f"{prefix}{k}": v for k, v in asm_headers.items()}
-        data = {sn: {f"{prefix}{k}": v for k, v in d.items()} for sn, d in data.items()}
-
-        pconfig = {"id": f"{prefix}stats", "namespace": "GATK", "title": "GATK ASM counts"}
-
-        self.add_section(
-            name="GATK ASM counts",
-            anchor="gatk-asm-stats",
-            description="Per-sample read count and base call fates from GATK AnalyzeSaturationMutagenesis.",
-            helptext="""
-            This table shows the distribution of calls (for reads or for bases) across all samples.
-            Reads are categorized as WT, a variant, or filtered. Bases can be either evaluated or unevaluated, corresponding to the reads they come from.
-
-            Reads are filtered for the following reasons:
-
-            * Unmapped: the map quality is below the minimum MapQ (default = 4)'
-            * Low quality reads: Reads are trimmed for quality > minQ (default 30) before calling variants. If the final size is less than the minimum length (default 15), or if the remaining segment does not cover the ORF, the read is filtered.
-
-            Paired reads are also split into overlapping and disjoint sets, with further filters for both.
-
-            * Inconsistent: If overlapping reads disagree on the called variant, the read is filtered.
-            * Ignored mate: If the pairs are disjoint, the first read of the pair is used for analysis, and the second is ignored.
-            * Low quality variation: If the variant includes ambiguous bases (not A, C, G, or T, or -), the read is filtered.
-            * Insufficient flank: If the variant does not include a certain number of WT bases (default 2) flanking the variant, the read is filtered.
-            """,
-            plot=table.plot(data, asm_headers, pconfig),
-        )
+def parse_gatk_analyze_saturation_mutagenesis(module: BaseMultiqcModule) -> int:
+    """Find GATK AnalyzeSaturationMutagenesis logs and parse their data"""
+
+    result = {}
+
+    for file_handle in module.find_log_files("gatk/analyze_saturation_mutagenesis", filehandles=True):
+        parsed_data = parse_read_counts_file(file_handle["f"])
+        if len(parsed_data) > 1:
+            if file_handle["s_name"] in result:
+                log.debug(f"Duplicate sample name found! Overwriting: {file_handle['s_name']}")
+            module.add_data_source(file_handle, section="analyze_saturation_mutagenesis")
+            result[file_handle["s_name"]] = parsed_data
+
+    # Filter to strip out ignored sample names
+    result = module.ignore_samples(result)
+
+    n_reports_found = len(result)
+    if n_reports_found == 0:
+        return 0
+
+    log.info("Found %d AnalyzeSaturationMutagenesis reports", n_reports_found)
+
+    # Superfluous function call to confirm that it is used in this module
+    # Replace None with actual version if it is available
+    module.add_software_version(None)
+
+    # Write parsed report data to a file (restructure first)
+    module.write_data_file(result, "multiqc_gatk_analyze_saturation_mutagenesis")
+
+    gatk_analyze_saturation_mutagenesis_table(module, result)
+    # Add plots
+    gatk_analyze_saturation_mutagenesis_plot_reads(module, result)
+    gatk_analyze_saturation_mutagenesis_plot_base_calls(module, result)
+
+    return n_reports_found
+
+
+def parse_read_counts_file(file_handle):
+    """Parse a readCounts output file from GATK AnalyzeSaturationMutagenesis
+    These files are tab delimited, with some hierarchical structuring.
+
+    This does not parse percentages, since those are readily calculated
+    and MultiQC automatically generates them for plotting."""
+
+    label_to_key = {
+        "Total Reads:": "total_reads",
+        ">Unmapped Reads:": "unmapped_reads",
+        ">LowQ Reads:": "lowq_reads",
+        ">Evaluable Reads:": "evaluable_reads",
+        ">>Reads in disjoint pairs evaluated separately:": "disjoint_pairs",
+        ">>Reads in overlapping pairs evaluated together:": "overlapping_pairs",
+        ">>>Wild type:": "wt_reads",
+        ">>>Called variants:": "called_variants",
+        ">>>Mate ignored:": "mate_ignored",
+        ">>>Inconsistent pair:": "inconsistent",
+        ">>>Low quality variation:": "low_quality_variation",
+        ">>>Insufficient flank:": "insufficient_flank",
+        "Total base calls:": "total_base_calls",
+        ">Base calls evaluated for variants:": "evaluated_base_calls",
+        ">Base calls unevaluated:": "unevaluated_base_calls",
+    }
+
+    data = defaultdict(int)
+
+    # For keeping track of whether we are in disjoint or overlapping reads
+    # Disjoint are first, so set to True initially
+    disjoint_reads = True
+
+    for line in file_handle:
+        fields = line.split("\t")
+        label = fields[0]
+        key = label_to_key[label]
+        if key == "disjoint_pairs":
+            disjoint_reads = True
+        elif key == "overlapping_pairs":
+            disjoint_reads = False
+        elif label.startswith(">>>"):
+            key = key + "_" + ("disjoint" if disjoint_reads else "overlapping")
+        data[key] = int(fields[1])
+
+    # Create some summary fields
+    data["wt_total_reads"] = data["wt_reads_disjoint"] + data["wt_reads_overlapping"]
+    data["variants_total_reads"] = data["called_variants_disjoint"] + data["called_variants_overlapping"]
+    data["filtered_reads"] = sum(
+        [
+            data["lowq_reads"],
+            data["mate_ignored_disjoint"],
+            data["inconsistent_overlapping"],
+            data["low_quality_variation_disjoint"],
+            data["low_quality_variation_overlapping"],
+            data["insufficient_flank_disjoint"],
+            data["insufficient_flank_overlapping"],
+        ]
+    )
+    return data
+
+
+def gatk_analyze_saturation_mutagenesis_plot_reads(module, data):
+    """Make the plot for GATK AnalyzeSaturationMutagenesis read counts and add the section."""
+    cats = {
+        "filtered_reads": {"name": "Filtered reads"},
+        "wt_total_reads": {"name": "WT reads"},
+        "variants_total_reads": {"name": "Variant reads"},
+    }
+
+    pconfig = {
+        "id": "gatk_ASM_reads_plot",
+        "title": "GATK AnalyzeSaturationMutagenesis: Read counts",
+        "ylab": "Number of reads",
+        "cpswitch_counts_label": "Counts",
+    }
+
+    module.add_section(
+        name="Read counts",
+        anchor="gatk-asm-read-counts",
+        description="Read counts and read fate. Filtered reads include unmapped, low quality, and other pathologies.",
+        helptext="""Reads can be filtered by GATK AnalyzeSaturationMutagenesis for a number of reasons, including low quality, insufficient flank, and other pathologies.
+        This plot shows the number of reads that were mapped to WT, called as variants, and the number that were filtered.""",
+        plot=bargraph.plot(data, cats, pconfig),
+    )
+
+
+def gatk_analyze_saturation_mutagenesis_plot_base_calls(module, data):
+    """Make the plot for GATK AnalyzeSaturationMutagenesis base calls and add the section."""
+    cats = {
+        "evaluated_base_calls": {"name": "Base calls evaluated for variants"},
+        "unevaluated_base_calls": {"name": "Base calls not evaluated for variants"},
+    }
+
+    pconfig = {
+        "id": "gatk_ASM_base_calls_plot",
+        "title": "GATK AnalyzeSaturationMutagenesis: Base calls",
+        "ylab": "Number of bases",
+        "cpswitch_counts_label": "Counts",
+    }
+
+    module.add_section(
+        name="Base calls",
+        anchor="gatk-asm-bases",
+        description="Base calls evaluated for variants and base calls not evaluated for variants.",
+        helptext="""Bases can be filtered by GATK AnalyzeSaturationMutagenesis for a number of reasons, including low quality, insufficient flank, and other pathologies.
+        This plot shows the number of base calls that were evaluated for variants and the number of base calls that were not evaluated for variants.""",
+        plot=bargraph.plot(data, cats, pconfig),
+    )
+
+
+def gatk_analyze_saturation_mutagenesis_table(module, data):
+    """Make the table for GATK AnalyzeSaturationMutagenesis and add the section."""
+    asm_headers = {
+        "total_reads": {
+            "title": f"Total reads ({config.read_count_prefix})",
+            "description": f"Total reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Greys",
+            "shared_key": "read_count",
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "wt_total_reads": {
+            "title": f"WT reads ({config.read_count_prefix})",
+            "description": f"Total evaluated reads mapped to WT ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues",
+            "shared_key": "read_count",
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "variants_total_reads": {
+            "title": f"Variant reads ({config.read_count_prefix})",
+            "description": f"Reads with a variant called ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Greens",
+            "shared_key": "read_count",
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "filtered_reads": {
+            "title": f"Filtered reads ({config.read_count_prefix})",
+            "description": f"Reads filtered from sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Reds-rev",
+            "shared_key": "read_count",
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "unmapped_reads": {
+            "title": f"Unmapped ({config.read_count_prefix})",
+            "description": f"Unmapped reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Purples-rev",
+            "shared_key": "read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "lowq_reads": {
+            "title": f"LowQ ({config.read_count_prefix})",
+            "description": f"Low quality reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues-rev",
+            "shared_key": "read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "evaluable_reads": {
+            "title": f"Evaluable ({config.read_count_prefix})",
+            "description": f"Evaluable reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Greys",
+            "shared_key": "read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "disjoint_pairs": {
+            "title": f"Disjoint ({config.read_count_prefix})",
+            "description": f"Reads from disjoint (non-overlapping) paired-end reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Oranges",
+            "shared_key": "read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "wt_reads_disjoint": {
+            "title": f"WT (disjoint) ({config.read_count_prefix})",
+            "description": f"WT reads called from disjoint pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Purples",
+            "shared_key": "disjoint_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "called_variants_disjoint": {
+            "title": f"Called variants (disjoint) ({config.read_count_prefix})",
+            "description": f"Reads with variants called from disjoint pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues",
+            "shared_key": "disjoint_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "mate_ignored_disjoint": {
+            "title": f"Mateless ({config.read_count_prefix})",
+            "description": f"Reads with ignored mates called from disjoint pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Greens-rev",
+            "shared_key": "disjoint_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "low_quality_variation_disjoint": {
+            "title": f"LowQ (disjoint) ({config.read_count_prefix})",
+            "description": f"Reads with low quality variation from disjoint pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Reds-rev",
+            "shared_key": "disjoint_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "insufficient_flank_disjoint": {
+            "title": f"No flank (disjoint) ({config.read_count_prefix})",
+            "description": f"Reads with insufficient flank from disjoint pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues-rev",
+            "shared_key": "disjoint_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "overlapping_pairs": {
+            "title": f"Overlapping ({config.read_count_prefix})",
+            "description": f"Reads from overlapping paired-end reads in sample ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Oranges",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "wt_reads_overlapping": {
+            "title": f"WT (overlapping) ({config.read_count_prefix})",
+            "description": f"WT reads called from overlapping pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Purples",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "called_variants_overlapping": {
+            "title": f"Called variants (overlapping) ({config.read_count_prefix})",
+            "description": f"Reads with variants called from overlapping pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "inconsistent_overlapping": {
+            "title": f"Inconsistent (overlapping) ({config.read_count_prefix})",
+            "description": f"Reads with inconsistent pairs from overlapping pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Greens-rev",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "low_quality_variation_overlapping": {
+            "title": f"Low quality reads (overlapping) ({config.read_count_prefix})",
+            "description": f"Reads with low quality variation from overlapping pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Reds-rev",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "insufficient_flank_overlapping": {
+            "title": f"No flank (overlapping) ({config.read_count_prefix})",
+            "description": f"Reads with insufficient flank from overlapping pairs ({config.read_count_desc})",
+            "min": 0,
+            "scale": "Blues-rev",
+            "shared_key": "overlapping_read_count",
+            "hidden": True,
+            "modify": lambda x: x * config.read_count_multiplier,
+            "namespace": "GATK",
+        },
+        "total_base_calls": {
+            "title": f"Total bases ({config.base_count_prefix})",
+            "description": f"Total base calls in sample ({config.base_count_desc})",
+            "min": 0,
+            "scale": "Greys",
+            "shared_key": "base_calls",
+            "hidden": True,
+            "modify": lambda x: x * config.base_count_multiplier,
+            "namespace": "GATK",
+        },
+        "evaluated_base_calls": {
+            "title": f"Evaluated bases ({config.base_count_prefix})",
+            "description": f"Evaluated base calls in sample ({config.base_count_desc})",
+            "min": 0,
+            "scale": "Blues",
+            "hidden": True,
+            "shared_key": "base_calls",
+            "modify": lambda x: x * config.base_count_multiplier,
+            "namespace": "GATK",
+        },
+        "unevaluated_base_calls": {
+            "title": f"Unevaluated bases ({config.base_count_prefix})",
+            "description": f"Unevaluated base calls in sample ({config.base_count_desc})",
+            "min": 0,
+            "scale": "Reds-rev",
+            "hidden": True,
+            "shared_key": "base_calls",
+            "modify": lambda x: x * config.base_count_multiplier,
+            "namespace": "GATK",
+        },
+    }
+
+    # Add module specific prefix to all keys to be safe
+    prefix = "gatk_ask_"
+    asm_headers = {f"{prefix}{k}": v for k, v in asm_headers.items()}
+    data = {sn: {f"{prefix}{k}": v for k, v in d.items()} for sn, d in data.items()}
+
+    pconfig = {"id": f"{prefix}stats", "namespace": "GATK", "title": "GATK ASM counts"}
+
+    module.add_section(
+        name="GATK ASM counts",
+        anchor="gatk-asm-stats",
+        description="Per-sample read count and base call fates from GATK AnalyzeSaturationMutagenesis.",
+        helptext="""
+        This table shows the distribution of calls (for reads or for bases) across all samples.
+        Reads are categorized as WT, a variant, or filtered. Bases can be either evaluated or unevaluated, corresponding to the reads they come from.
+
+        Reads are filtered for the following reasons:
+
+        * Unmapped: the map quality is below the minimum MapQ (default = 4)'
+        * Low quality reads: Reads are trimmed for quality > minQ (default 30) before calling variants. If the final size is less than the minimum length (default 15), or if the remaining segment does not cover the ORF, the read is filtered.
+
+        Paired reads are also split into overlapping and disjoint sets, with further filters for both.
+
+        * Inconsistent: If overlapping reads disagree on the called variant, the read is filtered.
+        * Ignored mate: If the pairs are disjoint, the first read of the pair is used for analysis, and the second is ignored.
+        * Low quality variation: If the variant includes ambiguous bases (not A, C, G, or T, or -), the read is filtered.
+        * Insufficient flank: If the variant does not include a certain number of WT bases (default 2) flanking the variant, the read is filtered.
+        """,
+        plot=table.plot(data, asm_headers, pconfig),
+    )
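
The recurring pattern in this refactor: each mixin method on the module class becomes a free function that receives the module instance explicitly. A simplified skeleton of the before/after shape (names invented):

    # Before: class SomeToolMixin: def parse_some_tool(self): ...
    # After:
    def parse_some_tool(module) -> int:
        n_found = 0
        for f in module.find_log_files("some_tool"):
            n_found += 1
        return n_found
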
diff --git a/multiqc/modules/gatk/base_recalibrator.py b/multiqc/modules/gatk/base_recalibrator.py
index 5de3ef7cfc..9497147702 100644
--- a/multiqc/modules/gatk/base_recalibrator.py
+++ b/multiqc/modules/gatk/base_recalibrator.py
@@ -4,7 +4,9 @@
 from collections import namedtuple
 from itertools import groupby

+from multiqc.modules.gatk.utils import parse_report
 from multiqc.plots import linegraph, scatter
+from multiqc.base_module import BaseMultiqcModule

 # Initialise the logger
 log = logging.getLogger(__name__)
@@ -13,187 +15,204 @@
 recal_table_type = RecalTableType(0, 1)


-class BaseRecalibratorMixin:
-    def parse_gatk_base_recalibrator(self):
-        """Find GATK BaseRecalibrator logs and parse their data"""
-
-        report_table_headers = {
-            "#:GATKTable:Arguments:Recalibration argument collection values used in this run": "arguments",
-            "#:GATKTable:Quantized:Quality quantization map": "quality_quantization_map",
-            "#:GATKTable:RecalTable0:": "recal_table_0",
-            "#:GATKTable:RecalTable1:": "recal_table_1",
-            "#:GATKTable:RecalTable2:": "recal_table_2",
-        }
-        samples_kept = {rt_type: set() for rt_type in recal_table_type}
-        self.gatk_base_recalibrator = {
-            recal_type: {table_name: {} for table_name in report_table_headers.values()}
-            for recal_type in recal_table_type
-        }
-
-        for f in self.find_log_files("gatk/base_recalibrator", filehandles=True):
-            # Check that we're not ignoring this sample name
-            if self.is_ignore_sample(f["s_name"]):
-                continue
-
-            # Superfluous function call to confirm that it is used in this module
-            # Replace None with actual version if it is available
-            self.add_software_version(None, f["s_name"])
-
-            parsed_data = self.parse_report(f["f"].readlines(), report_table_headers)
-            rt_type = determine_recal_table_type(parsed_data)
-            if len(parsed_data) > 0:
-                if f["s_name"] in samples_kept[rt_type]:
-                    log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
-                samples_kept[rt_type].add(f["s_name"])
-
-                self.add_data_source(f, section="base_recalibrator")
-                for table_name, sample_tables in parsed_data.items():
-                    self.gatk_base_recalibrator[rt_type][table_name][f["s_name"]] = sample_tables
-
-        # Filter to strip out ignored sample names. Again.
-        for rt_type in recal_table_type:
-            for table_name, sample_tables in self.gatk_base_recalibrator[rt_type].items():
-                self.gatk_base_recalibrator[rt_type][table_name] = self.ignore_samples(sample_tables)
-
-        n_reports_found = sum([len(samples_kept[rt_type]) for rt_type in recal_table_type])
-
-        if n_reports_found > 0:
-            log.info(f"Found {n_reports_found} BaseRecalibrator reports")
-
-            # Write data to file
-            self.write_data_file(self.gatk_base_recalibrator, "gatk_base_recalibrator")
-
-            self.add_quality_score_vs_no_of_observations_section()
-            self.add_reported_vs_empirical_section()
-
-        return n_reports_found
-
-    def add_quality_score_vs_no_of_observations_section(self):
-        """Add a section for the quality score vs number of observations line plot"""
-
-        sample_data = []
-        data_labels = []
-
-        # Loop through the different data types
-        for rt_type_name, rt_type in recal_table_type._asdict().items():
-            sample_tables = self.gatk_base_recalibrator[rt_type]["quality_quantization_map"]
-            if len(sample_tables) == 0:
-                continue
-
-            count_data = {}
-            pct_data = {}
-            for sample, table in sample_tables.items():
-                count_data[sample] = {}
-                pct_data[sample] = {}
-
-                # Get the total count for this sample
-                sample_y_sum = sum(int(y) for y in table["Count"])
-
-                # Collect the data for the plots
-                for x, y in zip(table["QualityScore"], table["Count"]):
-                    # Quality score counts
-                    count_data[sample][int(x)] = int(y)
-                    # Quality score percentages
-                    try:
-                        pct_data[sample][int(x)] = float(y) / sample_y_sum
-                    except ZeroDivisionError:
-                        pct_data[sample][int(x)] = 0
-
-            # Append the datasets for this data type
-            sample_data.append(count_data)
-            sample_data.append(pct_data)
-
-            # Build data label configs for this data type
-            data_labels.append({"name": f"{rt_type_name.capitalize().replace('_', '-')} Count", "ylab": "Count"})
-            data_labels.append({"name": f"{rt_type_name.capitalize().replace('_', '-')} Percent", "ylab": "Percent"})
-
-        plot = linegraph.plot(
-            sample_data,
-            pconfig={
-                "title": "GATK: Observed Quality Score Counts",
-                "id": "gatk-base-recalibrator-quality-scores-plot",
-                "xlab": "Observed Quality Score",
-                "ylab": "Count",
-                "x_decimals": False,
-                "data_labels": data_labels,
-            },
-        )
-
-        # Reported vs empirical quality scores
-        self.add_section(
-            name="Observed Quality Scores",
-            anchor="gatk-base-recalibrator-quality-scores",
-            description=(
-                "This plot shows the distribution of base quality scores in each sample before and "
-                "after base quality score recalibration (BQSR). Applying BQSR should broaden the "
-                "distribution of base quality scores."
-            ),
-            helptext=(
-                "For more information see "
-                "[the Broad's description of BQSR]"
-                "(https://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr)"
-                "."
-            ),
-            plot=plot,
-        )
-
-    def add_reported_vs_empirical_section(self):
-        sample_data = []
-        data_labels = []
-
-        # Loop through the different data types
-        for (
-            rt_type_name,
-            rt_type,
-        ) in recal_table_type._asdict().items():
-            # This table appears to be the correct one to use for reported vs empirical
-            # https://github.com/broadinstitute/gatk/blob/853b53ec2a3ac2d90d7d82a6c8451e29a34692d2/src/main/resources/org/broadinstitute/hellbender/utils/recalibration/BQSR.R#L148
-            sample_tables = self.gatk_base_recalibrator[rt_type]["recal_table_1"]
-            if len(sample_tables) == 0:
-                continue
-
-            reported_empirical = {}
-            for sample, table in sample_tables.items():
-                reported_empirical[sample] = []
-                table_rows = [dict(zip(table, r)) for r in zip(*table.values())]
-                table_rows.sort(key=lambda r: r["QualityScore"])
-                for reported, group in groupby(table_rows, lambda r: r["QualityScore"]):
-                    g = list(group)
-                    reported_empirical[sample].append(
-                        {
-                            "x": int(reported),
-                            "y": sum(float(r["EmpiricalQuality"]) for r in g) / len(g) if len(g) > 0 else 0,
-                        }
-                    )
-
-            sample_data.append(reported_empirical)
-
-            # Build data label configs for this data type
-            data_labels.append(
-                {"name": f"{rt_type_name} Reported vs. Empirical Quality", "ylab": "Empirical quality score"}
-            )
-
-        plot = scatter.plot(
-            sample_data,
-            pconfig={
-                "title": "Reported vs. Empirical Quality",
-                "id": "gatk-base-recalibrator-reported-empirical-plot",
-                "xlab": "Reported quality score",
-                "ylab": "Empirical quality score",
-                "x_decimals": False,
-                "data_labels": data_labels,
-                "xmin": 0,
-                "ymin": 0,
-                "square": True,
-            },
-        )
-
-        self.add_section(
-            name="Reported Quality vs. Empirical Quality",
-            anchor="gatk-base-recalibrator-reported-empirical",
-            description="Plot shows the reported quality score vs the empirical quality score.",
-            plot=plot,
-        )
+def parse_gatk_base_recalibrator(module: BaseMultiqcModule) -> int:
+    """Find GATK BaseRecalibrator logs and parse their data"""
+
+    PREFIXES = {
+        "gatk": "#:GATKTable:",
+        "sentieon": "#:SENTIEON_QCAL_TABLE:",
+    }
+
+    REPORT_TABLE_HEADERS = {
+        "Arguments:Recalibration argument collection values used in this run": "arguments",
+        "Quantized:Quality quantization map": "quality_quantization_map",
+        "RecalTable0:": "recal_table_0",
+        "RecalTable1:": "recal_table_1",
+        "RecalTable2:": "recal_table_2",
+    }
+    samples_kept = {rt_type: set() for rt_type in recal_table_type}
+    data = {
+        recal_type: {table_name: {} for table_name in REPORT_TABLE_HEADERS.values()} for recal_type in recal_table_type
+    }
+
+    for f in module.find_log_files("gatk/base_recalibrator"):
+        # Check that we're not ignoring this sample name
+        if module.is_ignore_sample(f["s_name"]):
+            continue
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        module.add_software_version(None, f["s_name"])
+
+        # Try GATK first
+        parsed_data = parse_report(
+            f["f"].splitlines(),
+            {PREFIXES["gatk"] + k: v for k, v in REPORT_TABLE_HEADERS.items()},
+        )
+        if len(parsed_data) == 0:
+            parsed_data = parse_report(
+                f["f"].splitlines(),
+                {PREFIXES["sentieon"] + k: v for k, v in REPORT_TABLE_HEADERS.items()},
+            )
+        if len(parsed_data) == 0:
+            continue
+
+        rt_type = determine_recal_table_type(parsed_data)
+        if len(parsed_data) > 0:
+            if f["s_name"] in samples_kept[rt_type]:
+                log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
+            samples_kept[rt_type].add(f["s_name"])
+
+            module.add_data_source(f, section="base_recalibrator")
+            for table_name, sample_tables in parsed_data.items():
+                data[rt_type][table_name][f["s_name"]] = sample_tables
+
+    # Filter to strip out ignored sample names. Again.
+    for rt_type in recal_table_type:
+        for table_name, sample_tables in data[rt_type].items():
+            data[rt_type][table_name] = module.ignore_samples(sample_tables)
+
+    n_reports_found = sum([len(samples_kept[rt_type]) for rt_type in recal_table_type])
+    if n_reports_found == 0:
+        return 0
+
+    log.info(f"Found {n_reports_found} BaseRecalibrator reports")
+
+    # Write data to file
+    module.write_data_file(data, "gatk_base_recalibrator")
+
+    add_quality_score_vs_no_of_observations_section(module, data)
+    add_reported_vs_empirical_section(module, data)
+    return n_reports_found
+
+
+def add_quality_score_vs_no_of_observations_section(module, data):
+    """Add a section for the quality score vs number of observations line plot"""
+
+    sample_data = []
+    data_labels = []
+
+    # Loop through the different data types
+    for rt_type_name, rt_type in recal_table_type._asdict().items():
+        sample_tables = data[rt_type]["quality_quantization_map"]
+        if len(sample_tables) == 0:
+            continue
+
+        count_data = {}
+        pct_data = {}
+        for sample, table in sample_tables.items():
+            count_data[sample] = {}
+            pct_data[sample] = {}
+
+            # Get the total count for this sample
+            sample_y_sum = sum(int(y) for y in table["Count"])
+
+            # Collect the data for the plots
+            for x, y in zip(table["QualityScore"], table["Count"]):
+                # Quality score counts
+                count_data[sample][int(x)] = int(y)
+                # Quality score percentages
+                try:
+                    pct_data[sample][int(x)] = float(y) / sample_y_sum
+                except ZeroDivisionError:
+                    pct_data[sample][int(x)] = 0
+
+        # Append the datasets for this data type
+        sample_data.append(count_data)
+        sample_data.append(pct_data)
+
+        # Build data label configs for this data type
+        data_labels.append({"name": f"{rt_type_name.capitalize().replace('_', '-')} Count", "ylab": "Count"})
+        data_labels.append({"name": f"{rt_type_name.capitalize().replace('_', '-')} Percent", "ylab": "Percent"})
+
+    plot = linegraph.plot(
+        sample_data,
+        pconfig={
+            "title": "GATK: Observed Quality Score Counts",
+            "id": "gatk-base-recalibrator-quality-scores-plot",
+            "xlab": "Observed Quality Score",
+            "ylab": "Count",
+            "x_decimals": False,
+            "data_labels": data_labels,
+        },
+    )
+
+    # Reported vs empirical quality scores
+    module.add_section(
+        name="Observed Quality Scores",
+        anchor="gatk-base-recalibrator-quality-scores",
+        description=(
+            "This plot shows the distribution of base quality scores in each sample before and "
+            "after base quality score recalibration (BQSR). Applying BQSR should broaden the "
+            "distribution of base quality scores."
+        ),
+        helptext=(
+            "For more information see "
+            "[the Broad's description of BQSR]"
+            "(https://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr)"
+            "."
+        ),
+        plot=plot,
+    )
+
+
+def add_reported_vs_empirical_section(module, data):
+    sample_data = []
+    data_labels = []
+
+    # Loop through the different data types
+    for (
+        rt_type_name,
+        rt_type,
+    ) in recal_table_type._asdict().items():
+        # This table appears to be the correct one to use for reported vs empirical
+        # https://github.com/broadinstitute/gatk/blob/853b53ec2a3ac2d90d7d82a6c8451e29a34692d2/src/main/resources/org/broadinstitute/hellbender/utils/recalibration/BQSR.R#L148
+        sample_tables = data[rt_type]["recal_table_1"]
+        if len(sample_tables) == 0:
+            continue
+
+        reported_empirical = {}
+        for sample, table in sample_tables.items():
+            reported_empirical[sample] = []
+            table_rows = [dict(zip(table, r)) for r in zip(*table.values())]
+            table_rows.sort(key=lambda r: r["QualityScore"])
+            for reported, group in groupby(table_rows, lambda r: r["QualityScore"]):
+                g = list(group)
+                reported_empirical[sample].append(
+                    {
+                        "x": int(reported),
+                        "y": sum(float(r["EmpiricalQuality"]) for r in g) / len(g) if len(g) > 0 else 0,
+                    }
+                )
+
+        sample_data.append(reported_empirical)
+
+        # Build data label configs for this data type
+        data_labels.append(
+            {"name": f"{rt_type_name} Reported vs. Empirical Quality", "ylab": "Empirical quality score"}
+        )
+
+    plot = scatter.plot(
+        sample_data,
+        pconfig={
+            "title": "Reported vs. Empirical Quality",
+            "id": "gatk-base-recalibrator-reported-empirical-plot",
+            "xlab": "Reported quality score",
+            "ylab": "Empirical quality score",
+            "x_decimals": False,
+            "data_labels": data_labels,
+            "xmin": 0,
+            "ymin": 0,
+            "square": True,
+        },
+    )
+
+    module.add_section(
+        name="Reported Quality vs. Empirical Quality",
+        anchor="gatk-base-recalibrator-reported-empirical",
+        description="Plot shows the reported quality score vs the empirical quality score.",
+        plot=plot,
+    )


 def determine_recal_table_type(parsed_data):
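
Note the sort before groupby in add_reported_vs_empirical_section above: itertools.groupby only groups consecutive items, so rows must be sorted on the same key first. A toy demonstration (values invented):

    from itertools import groupby

    rows = [("Q30", 29.1), ("Q20", 19.8), ("Q30", 30.4)]
    rows.sort(key=lambda r: r[0])
    for key, grp in groupby(rows, key=lambda r: r[0]):
        vals = [v for _, v in grp]
        print(key, sum(vals) / len(vals))  # Q20 19.8, then Q30 29.75
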
Empirical Quality", + anchor="gatk-base-recalibrator-reported-empirical", + description="Plot shows the reported quality score vs the empirical quality score.", + plot=plot, + ) def determine_recal_table_type(parsed_data): diff --git a/multiqc/modules/gatk/gatk.py b/multiqc/modules/gatk/gatk.py index f2e0b245af..ac24c038ff 100644 --- a/multiqc/modules/gatk/gatk.py +++ b/multiqc/modules/gatk/gatk.py @@ -2,14 +2,14 @@ from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound -from .analyze_saturation_mutagenesis import AnalyzeSaturationMutagenesisMixin -from .base_recalibrator import BaseRecalibratorMixin -from .varianteval import VariantEvalMixin +from .analyze_saturation_mutagenesis import parse_gatk_analyze_saturation_mutagenesis +from .base_recalibrator import parse_gatk_base_recalibrator +from .varianteval import parse_gatk_varianteval log = logging.getLogger(__name__) -class MultiqcModule(BaseMultiqcModule, AnalyzeSaturationMutagenesisMixin, BaseRecalibratorMixin, VariantEvalMixin): +class MultiqcModule(BaseMultiqcModule): """ Supported tools: @@ -55,9 +55,9 @@ def __init__(self): # Call submodule functions n_reports_found = 0 - n_reports_found += self.parse_gatk_analyze_saturation_mutagenesis() - n_reports_found += self.parse_gatk_base_recalibrator() - n_reports_found += self.parse_gatk_varianteval() + n_reports_found += parse_gatk_analyze_saturation_mutagenesis(self) + n_reports_found += parse_gatk_base_recalibrator(self) + n_reports_found += parse_gatk_varianteval(self) # Exit if we didn't find anything if n_reports_found == 0: @@ -65,52 +65,3 @@ def __init__(self): # Add to the General Stats table (has to be called once per MultiQC module) self.general_stats_addcols(self.general_stats_data, self.general_stats_headers) - - def parse_report(self, lines, table_names): - """Parse a GATK report https://software.broadinstitute.org/gatk/documentation/article.php?id=1244 - - Only GATTable entries are parsed. Tables are returned as a dict of tables. - Each table is a dict of arrays, where names correspond to column names, and arrays - correspond to column values. - - Args: - lines (file handle): an iterable over the lines of a GATK report. - table_names (dict): a dict with keys that are GATK report table names - (e.g. "#:GATKTable:Quantized:Quality quantization map"), and values that are the - keys in the returned dict. - - Returns: - { - table_1: - { - col_1: [ val_1, val_2, ... ] - col_2: [ val_1, val_2, ... ] - ... - } - table_2: - ... - } - """ - - report = dict() - lines = (line for line in lines) - for line in lines: - line = line.rstrip() - if line in table_names.keys(): - report[table_names[line]] = self.parse_gatk_report_table(lines) - return report - - @staticmethod - def parse_gatk_report_table(lines): - headers = next(lines).rstrip().split() - table = {h: [] for h in headers} - for line in lines: - line = line.rstrip() - - # testing to see if we have reached the end of a table in a GATKReport - if line == "": - break - - for index, value in enumerate(line.split()): - table[headers[index]].append(value) - return table diff --git a/multiqc/modules/gatk/utils.py b/multiqc/modules/gatk/utils.py new file mode 100644 index 0000000000..54ed481e85 --- /dev/null +++ b/multiqc/modules/gatk/utils.py @@ -0,0 +1,48 @@ +def parse_report(lines, table_names): + """Parse a GATK report https://software.broadinstitute.org/gatk/documentation/article.php?id=1244 + + Only GATTable entries are parsed. Tables are returned as a dict of tables. 
+    Each table is a dict of arrays, where names correspond to column names, and arrays
+    correspond to column values.
+
+    Args:
+        lines (file handle): an iterable over the lines of a GATK report.
+        table_names (dict): a dict with keys that are GATK report table names
+            (e.g. "#:GATKTable:Quantized:Quality quantization map"), and values that are the
+            keys in the returned dict.
+
+    Returns:
+        {
+            table_1:
+                {
+                    col_1: [ val_1, val_2, ... ]
+                    col_2: [ val_1, val_2, ... ]
+                    ...
+                }
+            table_2:
+                ...
+        }
+    """
+
+    report = dict()
+    lines = (line for line in lines)
+    for line in lines:
+        line = line.rstrip()
+        if line in table_names.keys():
+            report[table_names[line]] = parse_gatk_report_table(lines)
+    return report
+
+
+def parse_gatk_report_table(lines):
+    headers = next(lines).rstrip().split()
+    table = {h: [] for h in headers}
+    for line in lines:
+        line = line.rstrip()
+
+        # testing to see if we have reached the end of a table in a GATKReport
+        if line == "":
+            break
+
+        for index, value in enumerate(line.split()):
+            table[headers[index]].append(value)
+    return table
diff --git a/multiqc/modules/gatk/varianteval.py b/multiqc/modules/gatk/varianteval.py
index 452a2efb8f..318aa97a76 100644
--- a/multiqc/modules/gatk/varianteval.py
+++ b/multiqc/modules/gatk/varianteval.py
@@ -8,87 +8,82 @@
 log = logging.getLogger(__name__)


-class VariantEvalMixin:
-    def parse_gatk_varianteval(self):
-        """Find GATK varianteval logs and parse their data"""
+def parse_gatk_varianteval(module):
+    """Find GATK varianteval logs and parse their data"""

-        self.gatk_varianteval = dict()
-        for f in self.find_log_files("gatk/varianteval", filehandles=True):
-            parsed_data = parse_single_report(f["f"])
-            if len(parsed_data) > 1:
-                if f["s_name"] in self.gatk_varianteval:
-                    log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
-                self.add_data_source(f, section="varianteval")
-                self.gatk_varianteval[f["s_name"]] = parsed_data
+    data = dict()
+    for f in module.find_log_files("gatk/varianteval", filehandles=True):
+        parsed_data = parse_single_report(f["f"])
+        if len(parsed_data) > 1:
+            if f["s_name"] in data:
+                log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
+            module.add_data_source(f, section="varianteval")
+            data[f["s_name"]] = parsed_data

-        # Filter to strip out ignored sample names
-        self.gatk_varianteval = self.ignore_samples(self.gatk_varianteval)
+    # Filter to strip out ignored sample names
+    data = module.ignore_samples(data)

-        n_reports_found = len(self.gatk_varianteval)
-        if n_reports_found == 0:
-            return 0
+    n_reports_found = len(data)
+    if n_reports_found == 0:
+        return 0

-        log.info(f"Found {n_reports_found} VariantEval reports")
+    log.info(f"Found {n_reports_found} VariantEval reports")

-        # Superfluous function call to confirm that it is used in this module
-        # Replace None with actual version if it is available
-        self.add_software_version(None)
+    # Superfluous function call to confirm that it is used in this module
+    # Replace None with actual version if it is available
+    module.add_software_version(None)

-        # Write parsed report data to a file (restructure first)
-        self.write_data_file(self.gatk_varianteval, "multiqc_gatk_varianteval")
+    # Write parsed report data to a file (restructure first)
+    module.write_data_file(data, "multiqc_gatk_varianteval")

-        # Get consensus TiTv references
-        titv_ref = None
-        for s_name in self.gatk_varianteval:
-            if titv_ref is None:
-                titv_ref = self.gatk_varianteval[s_name]["titv_reference"]
-            elif titv_ref != self.gatk_varianteval[s_name]["titv_reference"]:
-                titv_ref = "Multiple"
-                break
+    # Get consensus TiTv references
+    titv_ref = None
+    for s_name in data:
+        if titv_ref is None:
+            titv_ref = data[s_name]["titv_reference"]
+        elif titv_ref != data[s_name]["titv_reference"]:
+            titv_ref = "Multiple"
+            break

-        # General Stats Table
-        varianteval_headers = dict()
-        varianteval_headers["known_titv"] = {
-            "title": "TiTV ratio (known)",
-            "description": f"TiTV ratio from variants found in '{titv_ref}'",
-            "min": 0,
-            "scale": "Blues",
-            "shared_key": "titv_ratio",
-        }
-        varianteval_headers["novel_titv"] = {
-            "title": "TiTV ratio (novel)",
-            "description": f"TiTV ratio from variants NOT found in '{titv_ref}'",
-            "min": 0,
-            "scale": "Blues",
-            "shared_key": "titv_ratio",
-        }
-        varianteval_headers["called_titv"] = {
-            "title": "TiTV ratio (called)",
-            "description": f"TiTV ratio from variants found in '{titv_ref}'",
-            "min": 0,
-            "scale": "Blues",
-            "shared_key": "titv_ratio",
-        }
-        varianteval_headers["filtered_titv"] = {
-            "title": "TiTV ratio (filtered)",
-            "description": f"TiTV ratio from variants NOT found in '{titv_ref}'",
-            "min": 0,
-            "scale": "Blues",
-            "shared_key": "titv_ratio",
-        }
-        self.general_stats_addcols(self.gatk_varianteval, varianteval_headers, "GATK VariantEval")
+    # General Stats Table
+    headers = dict()
+    headers["known_titv"] = {
+        "title": "TiTV ratio (known)",
+        "description": f"TiTV ratio from variants found in '{titv_ref}'",
+        "min": 0,
+        "scale": "Blues",
+        "shared_key": "titv_ratio",
+    }
+    headers["novel_titv"] = {
+        "title": "TiTV ratio (novel)",
+        "description": f"TiTV ratio from variants NOT found in '{titv_ref}'",
+        "min": 0,
+        "scale": "Blues",
+        "shared_key": "titv_ratio",
+    }
+    headers["called_titv"] = {
+        "title": "TiTV ratio (called)",
+        "description": f"TiTV ratio from variants found in '{titv_ref}'",
+        "min": 0,
+        "scale": "Blues",
+        "shared_key": "titv_ratio",
+    }
+    headers["filtered_titv"] = {
+        "title": "TiTV ratio (filtered)",
+        "description": f"TiTV ratio from variants NOT found in '{titv_ref}'",
+        "min": 0,
+        "scale": "Blues",
+        "shared_key": "titv_ratio",
+    }
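+    # The four TiTv columns above share "shared_key": "titv_ratio", so MultiQC
+    # applies one common value range and colour scale to them in the General
+    # Stats table, keeping the ratios directly comparable.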
+    module.general_stats_addcols(data, headers, "GATK VariantEval")

-        # Variant Counts plot
-        self.add_section(
-            name="Variant Counts", anchor="gatk-count-variants", plot=count_variants_barplot(self.gatk_varianteval)
-        )
+    # Variant Counts plot
+    module.add_section(name="Variant Counts", anchor="gatk-count-variants", plot=count_variants_barplot(data))

-        # Compare Overlap Table
-        self.add_section(
-            name="Compare Overlap", anchor="gatk-compare-overlap", plot=comp_overlap_table(self.gatk_varianteval)
-        )
+    # Compare Overlap Table
+    module.add_section(name="Compare Overlap", anchor="gatk-compare-overlap", plot=comp_overlap_table(data))

-        return n_reports_found
+    return n_reports_found


 def parse_single_report(f):
diff --git a/multiqc/plots/plotly/violin.py b/multiqc/plots/plotly/violin.py
index e2c98a93ed..c4b217dcde 100644
--- a/multiqc/plots/plotly/violin.py
+++ b/multiqc/plots/plotly/violin.py
@@ -691,5 +691,5 @@ def find_outliers(
         outlier_status[indices] = True

     if added_values:
-        outlier_status = outlier_status[: -len(added_values)]
+        outlier_status = outlier_status[: -len(added_values)]  # type: ignore
     return outlier_status
diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml
index f8ab5973a1..e774633214 100644
--- a/multiqc/search_patterns.yaml
+++ b/multiqc/search_patterns.yaml
@@ -364,8 +364,10 @@ ganon:
 gatk/varianteval:
   contents: "#:GATKTable:TiTvVariantEvaluator"
 gatk/base_recalibrator:
-  contents: "#:GATKTable:Arguments:Recalibration"
-  num_lines: 3
+  - contents: "#:GATKTable:Arguments:Recalibration"
+    num_lines: 3
+  - contents: "#:SENTIEON_QCAL_TABLE:Arguments:Recalibration"
+    num_lines: 3
 gatk/analyze_saturation_mutagenesis:
   fn: "*.readCounts"
   contents: ">>Reads in disjoint pairs evaluated separately:"

From 5abf8111a0fd759e4835d8b824f3bec8cdcec8fd Mon Sep 17 00:00:00 2001
From: G Fedewa
Date: Wed, 11 Dec 2024 15:09:45 -0800
Subject: [PATCH 32/34] Added module Checkm2: Rapid assessment of genome bin
 quality using machine learning (#2978)

* initial checkm2 commit

* working outputs

* cleaned and added docstring

* Use contents in search pattern

* Ignore None values

* Fix raised exception

* Only filter None

* allow for multiple log files

* Refactor

* Add missing self.write_data_file

---------

Co-authored-by: Vlad Savelyev
---
 multiqc/config_defaults.yaml        |   1 +
 multiqc/modules/checkm2/__init__.py |   3 +
 multiqc/modules/checkm2/checkm2.py  | 168 ++++++++++++++++++++++++++++
 multiqc/search_patterns.yaml        |   3 +
 pyproject.toml                      |   1 +
 5 files changed, 176 insertions(+)
 create mode 100644 multiqc/modules/checkm2/__init__.py
 create mode 100644 multiqc/modules/checkm2/checkm2.py

diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml
index 7fb67ec16c..73dbb11ef6 100644
--- a/multiqc/config_defaults.yaml
+++ b/multiqc/config_defaults.yaml
@@ -459,6 +459,7 @@ module_order:
   - vep
   - bakta
   - prokka
+  - checkm2
   - qc3C
   - nanoq
   - nanostat
diff --git a/multiqc/modules/checkm2/__init__.py b/multiqc/modules/checkm2/__init__.py
new file mode 100644
index 0000000000..6255dbb3f7
--- /dev/null
+++ b/multiqc/modules/checkm2/__init__.py
@@ -0,0 +1,3 @@
+from .checkm2 import MultiqcModule
+
+__all__ = ["MultiqcModule"]
diff --git a/multiqc/modules/checkm2/checkm2.py b/multiqc/modules/checkm2/checkm2.py
new file mode 100644
index 0000000000..ccade1db23
--- /dev/null
+++ b/multiqc/modules/checkm2/checkm2.py
@@ -0,0 +1,168 @@
+import logging
+from typing import Dict, Union
+
+from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import table
+from multiqc.plots.table_object import TableConfig
+
+log = logging.getLogger(__name__)
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """The module parses the `quality_report.tsv` files generated by CheckM2.
+    All statistics for all samples are saved to `multiqc_data/checkm2-first-table.txt`.
+
+    This has been tested with CheckM2 v1.0.2.
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="CheckM2",
+            anchor="checkm2",
+            href="https://github.com/chklovski/CheckM2",
+            info="A rapid, scalable and accurate tool for assessing microbial genome quality using machine learning.",
+            doi=["10.1038/s41592-023-01940-w"],
+        )
+
+        data_by_sample = {}
+        for f in self.find_log_files("checkm2"):
+            self.parse_file(f, data_by_sample)
+            self.add_data_source(f)
+
+        data_by_sample = self.ignore_samples(data_by_sample)
+        if len(data_by_sample) == 0:
+            raise ModuleNoSamplesFound
+        log.info(f"Found {len(data_by_sample)} reports")
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        self.add_software_version()
+
+        # Write parsed report data to a file
+        self.write_data_file(data_by_sample, "multiqc_checkm2")
+
+        self.mag_quality_table(data_by_sample)
+
+    def parse_file(self, f, data_by_sample):
+        """Parse the quality_report.tsv output."""
+        column_names = (
+            "Name",
+            "Completeness",
+            "Contamination",
+            "Completeness_Model_Used",
+            "Translation_Table_Used",
+            "Coding_Density",
+            "Contig_N50",
+            "Average_Gene_Length",
+            "Genome_Size",
+            "GC_Content",
+            "Total_Coding_Sequences",
+            "Total_Contigs",
+            "Max_Contig_Length",
+            "Additional_Notes",
+        )
+        lines = f["f"].splitlines()
+        if len(lines) <= 1:
+            log.warning(f"Skipping file {f['fn']} because it has no data")
+            return
+        for line in lines[1:]:
+            row = line.rstrip().split("\t")
+            if len(row) != len(column_names):
+                log.warning(f"Skipping line {line} because it has {len(row)} columns instead of {len(column_names)}")
+                continue
+            sname = row[0]
+            if sname in data_by_sample:
+                log.debug(f"Duplicate sample name found! Overwriting: {sname}")
+            data_by_sample[sname] = {k: v for k, v in zip(column_names[1:], row[1:]) if v != "None"}
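+            # CheckM2 writes the literal string "None" for fields it could not
+            # compute; the comprehension above drops those values so they do not
+            # show up in the report table.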
+
+    def mag_quality_table(self, data_by_sample):
+        """Write some quality stats and measures into a table."""
+        headers = {
+            "Completeness": {
+                "title": "Predicted Completeness",
+                "description": "The percentage of MAG length relative to predicted total MAG length.",
+                "min": 0,
+                "max": 100,
+                "suffix": "%",
+                "scale": "YlGn",
+            },
+            "Contamination": {
+                "title": "Predicted Contamination",
+                "description": "The length of the contaminating portion relative to the expected (complete, uncontaminated) genome length.",
+                "min": 0,
+                "suffix": "%",
+                "format": "{:,.2f}",
+                "scale": "YlOrRd",
+            },
+            "Completeness_Model_Used": {
+                "title": "Completeness Model Used",
+                "description": "Which ML model was used to predict completeness.",
+                "hidden": True,
+            },
+            "Translation_Table_Used": {
+                "title": "Translation Table Used",
+                "description": "Genetic code translation table Prodigal used for gene prediction.",
+                "scale": False,
+                "hidden": True,
+            },
+            "Coding_Density": {
+                "title": "Coding Density",
+                "description": "Fraction of bases that are in predicted coding regions.",
+                "min": 0,
+                "max": 1,
+                "scale": "YlGn",
+                "format": "{:,.3f}",
+            },
+            "Contig_N50": {
+                "title": "Contig N50",
+                "description": "The contig length such that the sum of all contigs at least as long will be 50% of the total MAG length.",
+                "hidden": True,
+            },
+            "Average_Gene_Length": {
+                "title": "Average Gene Length",
+                "description": "The average number of amino acids in predicted genes.",
+                "suffix": "a.a.",
+                "format": "{:,.0f}",
+            },
+            "Genome_Size": {
+                "title": "Genome Size",
+                "description": "The predicted size of the genome.",
+                "scale": "YlGn",
+            },
+            "GC_Content": {
+                "title": "GC Content",
+                "description": "The fraction of the binned contig sequence that is G or C.",
+                "format": "{:,.2f}",
+            },
+            "Total_Coding_Sequences": {
+                "title": "Total Coding Sequences",
+                "description": "The number of predicted coding sequences from Prodigal.",
+                "scale": "YlGn",
+            },
+            "Total_Contigs": {
+                "title": "Total Contigs",
+                "description": "The number of contigs in the bin.",
+                "hidden": True,
+            },
+            "Max_Contig_Length": {
+                "title": "Max Contig Length",
+                "description": "The length of the largest contig.",
+                "hidden": True,
+                "scale": "YlGn",
+            },
+            "Additional_Notes": {
+                "title": "Additional Notes",
+                "description": "Any additional notes output by CheckM2.",
+            },
+        }
+        pconfig = TableConfig(
+            title="Genome Quality",
+            id="checkm2-first-table",
+        )
+        self.add_section(
+            name="Bin quality",
+            anchor="checkm2-quality",
+            description="Rapid assessment of genome bin quality using machine learning.",
+            helptext="The main use of CheckM2 is to predict the completeness and contamination of metagenome-assembled genomes (MAGs) and single-amplified genomes (SAGs), although it can also be applied to isolate genomes.",
+            plot=table.plot(data=data_by_sample, headers=headers, pconfig=pconfig),
+        )
diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml
index e774633214..59c896e9ba 100644
--- a/multiqc/search_patterns.yaml
+++ b/multiqc/search_patterns.yaml
@@ -215,6 +215,9 @@ cellranger/vdj_html:
   - fn: "*.html"
     contents: '"command": "Cell Ranger", "subcommand": "vdj"'
     num_lines: 20
+checkm2:
+  contents: "Name Completeness Contamination Completeness_Model_Used Translation_Table_Used Coding_Density Contig_N50 Average_Gene_Length Genome_Size GC_Content Total_Coding_Sequences Total_Contigs Max_Contig_Length"
+  num_lines: 10
 checkqc:
   contents: "instrument_and_reagent_type"
   fn: "*.json"
diff --git a/pyproject.toml b/pyproject.toml
index e7705764a9..e059536f16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,7 @@ busco = "multiqc.modules.busco:MultiqcModule"
 bustools = "multiqc.modules.bustools:MultiqcModule"
 ccs = "multiqc.modules.ccs:MultiqcModule"
 cellranger = "multiqc.modules.cellranger:MultiqcModule"
+checkm2 = "multiqc.modules.checkm2:MultiqcModule"
 checkqc = "multiqc.modules.checkqc:MultiqcModule"
 clipandmerge = "multiqc.modules.clipandmerge:MultiqcModule"
 clusterflow = "multiqc.modules.clusterflow:MultiqcModule"

From 0c8e44788adbaea1555230d77d4c3dc4930ed259 Mon Sep 17 00:00:00 2001
From: G Fedewa
Date: Wed, 11 Dec 2024 15:33:02 -0800
Subject: [PATCH 33/34] Add module Checkm: automated method for assessing the
 quality of a genome using a broader set of marker genes (#2990)

* initial checkm function and dummy parse & table functions

* parse function

* changed checkm_data to update

* added table plot

* forgot to commit multiqc changes

* fixed linting

* Refactor

* Color code lineage

---------

Co-authored-by: Vlad Savelyev
---
 multiqc/config_defaults.yaml       |   1 +
 multiqc/modules/checkm/__init__.py |   3 +
 multiqc/modules/checkm/checkm.py   | 190 +++++++++++++++++++++++++++++
 multiqc/search_patterns.yaml       |   3 +
 pyproject.toml                     |   1 +
 5 files changed, 198 insertions(+)
 create mode 100644 multiqc/modules/checkm/__init__.py
 create mode 100644 multiqc/modules/checkm/checkm.py

diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml
index 73dbb11ef6..c370dccfd6 100644
--- a/multiqc/config_defaults.yaml
+++ b/multiqc/config_defaults.yaml
@@ -422,6 +422,7 @@ module_order:
   - rsem
   - rseqc
   - busco
+  - checkm
   - bustools
   - goleft_indexcov
   - gffcompare
diff --git a/multiqc/modules/checkm/__init__.py b/multiqc/modules/checkm/__init__.py
new file mode 100644
index 0000000000..e6822afd4b
--- /dev/null
+++ b/multiqc/modules/checkm/__init__.py
@@ -0,0 +1,3 @@
+from .checkm import MultiqcModule
+
+__all__ = ["MultiqcModule"]
diff --git a/multiqc/modules/checkm/checkm.py b/multiqc/modules/checkm/checkm.py
new file mode 100644
index 0000000000..0274360b4c
--- /dev/null
+++ b/multiqc/modules/checkm/checkm.py
@@ -0,0 +1,190 @@
+import logging
+import re
+
+from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import table
+from multiqc.plots.table_object import TableConfig
+from multiqc.utils import mqc_colour
+
+log = logging.getLogger(__name__)
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """The module parses the output files generated by CheckM.
+    It will only parse an output file from `checkm lineage_wf`, `checkm taxonomy_wf`, and `checkm qa`.
+    The output file needs to be in format 1 or 2 (`-o 1` or `-o 2`).
+    All statistics for all samples are saved to `multiqc_data/checkm-table.txt`.
+
+    This has been tested with CheckM v1.2.1.
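+
+    For reference, a format 1 header row carries these columns (tab- or
+    space-separated; see parse_file below). The spacing here is illustrative:
+
+        Bin Id  Marker lineage  # genomes  # markers  # marker sets  0  1  2  3  4  5+  Completeness  Contamination  Strain heterogeneity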
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="CheckM",
+            anchor="checkm",
+            href="https://github.com/Ecogenomics/CheckM",
+            info="CheckM estimates genome completeness and contamination based on the presence or absence of marker genes.",
+            doi=["10.1101/gr.186072.114"],
+        )
+
+        data_by_sample = {}
+        for f in self.find_log_files("checkm"):
+            self.parse_file(f, data_by_sample)
+            self.add_data_source(f)
+
+        data_by_sample = self.ignore_samples(data_by_sample)
+        if len(data_by_sample) == 0:
+            raise ModuleNoSamplesFound
+        log.info(f"Found {len(data_by_sample)} reports")
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        self.add_software_version()
+
+        # Write parsed report data to a file
+        self.write_data_file(data_by_sample, "multiqc_checkm")
+
+        self.mag_quality_table(data_by_sample)
+
+    def parse_file(self, f, data_by_sample):
+        """Parses the file from `checkm qa`.
+        Outputs from this command can come in several formats and with spaces or tabs.
+        This is tested with formats 1 and 2 `-o [1|2]`, and with spaces (default) and tabs `--tab-file`
+        """
+
+        column_names_format_1 = (
+            "Bin Id",
+            "Marker lineage",
+            "# genomes",
+            "# markers",
+            "# marker sets",
+            "0",
+            "1",
+            "2",
+            "3",
+            "4",
+            "5+",
+            "Completeness",
+            "Contamination",
+            "Strain heterogeneity",
+        )
+        column_names_format_2 = (
+            "Bin Id",
+            "Marker lineage",
+            "# genomes",
+            "# markers",
+            "# marker sets",
+            "Completeness",
+            "Contamination",
+            "Strain heterogeneity",
+            "Genome size (bp)",
+            "# ambiguous bases",
+            "# scaffolds",
+            "# contigs",
+            "N50 (scaffolds)",
+            "N50 (contigs)",
+            "Mean scaffold length (bp)",
+            "Mean contig length (bp)",
+            "Longest scaffold (bp)",
+            "Longest contig (bp)",
+            "GC",
+            "GC std (scaffolds > 1kbp)",
+            "Coding density",
+            "Translation table",
+            "# predicted genes",
+            "0",
+            "1",
+            "2",
+            "3",
+            "4",
+            "5+",
+        )
+        lines = f["f"].splitlines()
+        lines = [line.strip() for line in lines if line.strip() and not line.startswith("--")]
+        if len(lines) <= 1:
+            log.warning(f"Skipping file {f['fn']} because it has no data")
+            return
+
+        header = lines[0].strip()
+        if not header.startswith("Bin Id"):
+            log.warning(f"Unrecognized header in {f['fn']}: {header}")
+            return
+
+        # Check which format the data is in so we can grab the correct columns later
+        column_names = []
+        cols = re.split(r"\t| {3,}", header.rstrip())
+        format_different_column = cols[5]
+        if format_different_column == "0":
+            column_names = column_names_format_1
+        elif format_different_column == "Completeness":
+            column_names = column_names_format_2
+        else:
+            log.warning(f"Unrecognized header in {f['fn']}: {header}")
+            return
+
+        for line in lines[1:]:
+            row = re.split(r"\t| {3,}", line.rstrip())
+            sname = row[0]
+            if sname in data_by_sample:
+                log.debug(f"Duplicate sample name found! Overwriting: {sname}")
+            data_by_sample[sname] = {k: v for k, v in zip(column_names[1:], row[1:]) if v is not None}
+
+    def mag_quality_table(self, data_by_sample):
+        lineages = list(set(d.get("Marker lineage") for d in data_by_sample.values()))
+        scale = mqc_colour.mqc_colour_scale("Dark2")
+        lineages_colors = [{v: scale.get_colour(i, lighten=0.5)} for i, v in enumerate(lineages)]
+        headers = {
+            "Marker lineage": {
+                "title": "Marker lineage",
+                "description": "Indicates lineage used for inferring marker set (a precise indication of where a bin was placed in CheckM's reference tree can be obtained with the tree_qa command).",
+                "cond_formatting_colours": lineages_colors,
+                "cond_formatting_rules": {v: [{"s_eq": v}] for v in lineages},
+            },
+            "# genomes": {
+                "title": "Genomes",
+                "description": "Number of reference genomes used to infer marker set.",
+                "min": 0,
+            },
+            "# markers": {
+                "title": "Markers",
+                "description": "Number of inferred marker genes.",
+                "min": 0,
+                "scale": "YlGn",
+            },
+            "# marker sets": {
+                "title": "Marker sets",
+                "description": "Number of inferred co-located marker sets.",
+                "min": 0,
+                "scale": "YlOrRd-rev",
+            },
+            "Completeness": {
+                "title": "Completeness",
+                "description": "Estimated completeness of genome as determined from the presence/absence of marker genes and the expected collocalization of these genes",
+                "min": 0,
+                "max": 100,
+                "suffix": "%",
+                "scale": "Purples",
+                "format": "{:,.2f}",
+            },
+            "Contamination": {
+                "title": "Contamination",
+                "description": "Estimated contamination of genome as determined by the presence of multi-copy marker genes and the expected collocalization of these genes",
+                "min": 0,
+                "max": 100,
+                "suffix": "%",
+                "scale": "Reds",
+                "format": "{:,.2f}",
+            },
+        }
+        pconfig = TableConfig(
+            title="Genome Quality",
+            id="checkm-first-table",
+            col1_header="Bin Id",
+        )
+        self.add_section(
+            name="Bin quality",
+            anchor="checkm-quality",
+            description="The quality of microbial genomes recovered from isolates, single cells, and metagenomes.",
+            helptext="An automated method for assessing the quality of a genome using a broader set of marker genes specific to the position of a genome within a reference genome tree and information about the collocation of these genes.",
+            plot=table.plot(data=data_by_sample, headers=headers, pconfig=pconfig),
+        )
diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml
index 59c896e9ba..8e309a2f51 100644
--- a/multiqc/search_patterns.yaml
+++ b/multiqc/search_patterns.yaml
@@ -215,6 +215,9 @@ cellranger/vdj_html:
   - fn: "*.html"
     contents: '"command": "Cell Ranger", "subcommand": "vdj"'
     num_lines: 20
+checkm:
+  - contents_re: ".*Bin Id(?:\t| {3,})Marker lineage(?:\t| {3,})# genomes(?:\t| {3,})# markers(?:\t| {3,})# marker sets.*"
+    num_lines: 10
 checkm2:
   contents: "Name Completeness Contamination Completeness_Model_Used Translation_Table_Used Coding_Density Contig_N50 Average_Gene_Length Genome_Size GC_Content Total_Coding_Sequences Total_Contigs Max_Contig_Length"
   num_lines: 10
diff --git a/pyproject.toml b/pyproject.toml
index e059536f16..68e5ae2ca2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -112,6 +112,7 @@ busco = "multiqc.modules.busco:MultiqcModule"
 bustools = "multiqc.modules.bustools:MultiqcModule"
 ccs = "multiqc.modules.ccs:MultiqcModule"
 cellranger = "multiqc.modules.cellranger:MultiqcModule"
+checkm = "multiqc.modules.checkm:MultiqcModule"
 checkm2 = "multiqc.modules.checkm2:MultiqcModule"
 checkqc = "multiqc.modules.checkqc:MultiqcModule"
 clipandmerge = "multiqc.modules.clipandmerge:MultiqcModule"

From fa8f476e15f9471862286519abd7f15e4f1ba4f0 Mon Sep 17 00:00:00 2001
From: G Fedewa
Date: Wed, 11 Dec 2024 16:02:42 -0800
Subject: [PATCH 34/34] Add module GTDB-Tk: a software toolkit for assigning
 objective taxonomic classifications to bacterial and archaeal genomes (#2970)

* GTDBTK module minimum run

* parsing and plotting a table

* added defaults to prevent crashes if there is no classification

* removed redundant column

* cleaned up the table

* formatting and docstrings

* importing TableConfig

* allow for multiple log files

* fix no sample found error

* filter None

* Refactor

* Do not strip leading spaces to assure same number of columns

* Hide long classifications and method by default

* Lint

---------

Co-authored-by: Vlad Savelyev
---
 multiqc/config_defaults.yaml       |   1 +
 multiqc/modules/checkm/checkm.py   |   4 +-
 multiqc/modules/checkm2/checkm2.py |   2 +-
 multiqc/modules/gtdbtk/__init__.py |   3 +
 multiqc/modules/gtdbtk/gtdbtk.py   | 166 +++++++++++++++++++++++++++++
 multiqc/search_patterns.yaml       |   3 +
 pyproject.toml                     |   1 +
 7 files changed, 177 insertions(+), 3 deletions(-)
 create mode 100644 multiqc/modules/gtdbtk/__init__.py
 create mode 100644 multiqc/modules/gtdbtk/gtdbtk.py

diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml
index c370dccfd6..4cdf4f4115 100644
--- a/multiqc/config_defaults.yaml
+++ b/multiqc/config_defaults.yaml
@@ -484,6 +484,7 @@ module_order:
   - truvari
   - megahit
   - ganon
+  - gtdbtk
   # Alignment tool stats
   - bbmap
   - bismark
diff --git a/multiqc/modules/checkm/checkm.py b/multiqc/modules/checkm/checkm.py
index 0274360b4c..975030338b 100644
--- a/multiqc/modules/checkm/checkm.py
+++ b/multiqc/modules/checkm/checkm.py
@@ -112,7 +112,7 @@ def parse_file(self, f, data_by_sample):

         # Check which format the data is in so we can grab the correct columns later
         column_names = []
-        cols = re.split(r"\t| {3,}", header.rstrip())
+        cols = re.split(r"\t| {3,}", header.rstrip("\n"))
         format_different_column = cols[5]
         if format_different_column == "0":
             column_names = column_names_format_1
@@ -123,7 +123,7 @@ def parse_file(self, f, data_by_sample):
             return

         for line in lines[1:]:
-            row = re.split(r"\t| {3,}", line.rstrip())
+            row = re.split(r"\t| {3,}", line.rstrip("\n"))
             sname = row[0]
             if sname in data_by_sample:
                 log.debug(f"Duplicate sample name found! Overwriting: {sname}")
diff --git a/multiqc/modules/checkm2/checkm2.py b/multiqc/modules/checkm2/checkm2.py
index ccade1db23..974bec3961 100644
--- a/multiqc/modules/checkm2/checkm2.py
+++ b/multiqc/modules/checkm2/checkm2.py
@@ -66,7 +66,7 @@ def parse_file(self, f, data_by_sample):
             log.warning(f"Skipping file {f['fn']} because it has no data")
             return
         for line in lines[1:]:
-            row = line.rstrip().split("\t")
+            row = line.rstrip("\n").split("\t")
             if len(row) != len(column_names):
                 log.warning(f"Skipping line {line} because it has {len(row)} columns instead of {len(column_names)}")
                 continue
diff --git a/multiqc/modules/gtdbtk/__init__.py b/multiqc/modules/gtdbtk/__init__.py
new file mode 100644
index 0000000000..f6122f3a4d
--- /dev/null
+++ b/multiqc/modules/gtdbtk/__init__.py
@@ -0,0 +1,3 @@
+from .gtdbtk import MultiqcModule
+
+__all__ = ["MultiqcModule"]
diff --git a/multiqc/modules/gtdbtk/gtdbtk.py b/multiqc/modules/gtdbtk/gtdbtk.py
new file mode 100644
index 0000000000..a47503c46c
--- /dev/null
+++ b/multiqc/modules/gtdbtk/gtdbtk.py
@@ -0,0 +1,166 @@
+import logging
+
+from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import table
+from multiqc.plots.table_object import TableConfig
+
+log = logging.getLogger(__name__)
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """
+    The module parses `summary.tsv` outputs from GTDB-Tk's `classify` and `classify_wf` commands.
+
+    The module only works for version >= 2.4.0 because column names changed.
+
+    `classify` and `classify_wf` are used to determine the taxonomic classification of input genomes.
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="GTDB-Tk",
+            anchor="gtdbtk",
+            href="https://ecogenomics.github.io/GTDBTk/index.html",
+            info="Toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes.",
+            doi=["10.1093/bioinformatics/btac672"],
+        )
+
+        data_by_sample = {}
+        for f in self.find_log_files("gtdbtk"):
+            self.parse_file(f, data_by_sample)
+            self.add_data_source(f)
+
+        data_by_sample = self.ignore_samples(data_by_sample)
+        if len(data_by_sample) == 0:
+            raise ModuleNoSamplesFound
+        log.info(f"Found {len(data_by_sample)} reports")
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        self.add_software_version()
+
+        self.write_data_file(data_by_sample, "multiqc_gtdbtk")
+
+        self.closest_taxa_table(data_by_sample)
+
+    def parse_file(self, f, data_by_sample):
+        """Parse the summary.tsv outputs."""
+        column_names = (
+            "user_genome",
+            "classification",
+            "closest_genome_reference",
+            "closest_genome_reference_radius",
+            "closest_genome_taxonomy",
+            "closest_genome_ani",
+            "closest_genome_af",
+            "closest_placement_reference",
+            "closest_placement_radius",
+            "closest_placement_taxonomy",
+            "closest_placement_ani",
+            "closest_placement_af",
+            "pplacer_taxonomy",
+            "classification_method",
+            "note",
+            "other_related_references",
+            "msa_percent",
+            "translation_table",
+            "red_value",
+            "warnings",
+        )
+        lines = f["f"].splitlines()
+        if len(lines) <= 1:
+            log.warning(f"Skipping file {f['fn']} because it has no data")
+            return
+        for line in lines[1:]:
+            row = line.rstrip("\n").split("\t")
+            if len(row) != len(column_names):
+                log.warning(f"Skipping line {line} because it has {len(row)} columns instead of {len(column_names)}")
+                continue
+            sname = row[0]
+            if sname in data_by_sample:
+                log.debug(f"Duplicate sample name found! Overwriting: {sname}")
+            data_by_sample[sname] = {k: v for k, v in zip(column_names[1:], row[1:]) if v}
+
+    def closest_taxa_table(self, data_by_sample):
+        """Add a table showing the closest taxa for each query genome."""
+        classification_method_translate_dict = {
+            "taxonomic classification defined by topology and ANI": ["closest_genome_ani", "closest_genome_af"],
+            "ani_screen": ["closest_genome_ani", "closest_genome_af"],
+            "taxonomic classification fully defined by topology": [
+                "closest_placement_ani",
+                "closest_placement_af",
+            ],
+            "taxonomic novelty determined using RED": [None, None],
+        }
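+        # The mapping above mirrors where GTDB-Tk reports ANI/AF: ANI-based
+        # classifications (including "ani_screen") fill the closest_genome_*
+        # columns, topology-only classifications fill closest_placement_*, and
+        # RED-based novelty calls provide neither value.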
+        table_data = {}
+        for sname, data in data_by_sample.items():
+            classification_value_columns = classification_method_translate_dict.get(
+                data.get("classification_method"), [None, None]
+            )
+            classification = data.get("classification", "").split(";")
+            last_classification = classification[-1]
+            table_data[sname] = {
+                "classification": last_classification,
+                "full_classification": "; ".join(classification),
+                "classification_method": data.get("classification_method", None),
+                "ANI": data.get(classification_value_columns[0], None),
+                "AF": data.get(classification_value_columns[1], None),
+                "red_value": data.get("red_value", None),
+                "note": data.get("note"),
+                "warnings": data.get("warnings"),
+            }
+        headers = {
+            "classification": {
+                "title": "Classification",
+                "description": "GTDB taxonomy string inferred by GTDB-Tk.",
+            },
+            "full_classification": {
+                "title": "Full classification",
+                "description": "Full GTDB taxonomy string inferred by GTDB-Tk.",
+                "hidden": True,
+            },
+            "classification_method": {
+                "title": "Classification method",
+                "description": "Indicates the rule used to classify the genome.",
+                "hidden": True,
+            },
+            "ANI": {
+                "title": "ANI to closest genome",
+                "description": "Depending on the classification method, either the 'closest_genome_ani' or 'closest_placement_ani'.",
+                "min": 0,
+                "max": 100,
+            },
+            "AF": {
+                "title": "AF to closest genome",
+                "description": "Depending on the classification method, either the 'closest_genome_af' or 'closest_placement_af'.",
+                "min": 0,
+                "max": 1,
+                "scale": "Purples",
+            },
+            "red_value": {
+                "title": "RED",
+                "description": "Indicates the relative evolutionary divergence (RED) for a query genome. RED is not calculated when a query genome can be classified based on ANI.",
+                "min": 0,
+                "max": 1,
+            },
+            "warnings": {
+                "title": "Warnings",
+                "description": "Indicates unusual characteristics of the query genome that may impact the taxonomic assignment.",
+            },
+            "note": {
+                "title": "Notes",
+                "description": "Provides additional information regarding the classification of the genome.",
+            },
+        }
+        pconfig = TableConfig(
+            title="Taxonomy classifications",
+            id="gtdbtk-first-table",
+            col1_header="User genome",
+        )
+        self.add_section(
+            name="MAG taxonomy",
+            anchor="gtdbtk-taxonomy",
+            description="The taxonomy of a MAG as found by GTDB.",
+            helptext="GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also be applied to isolate and single-cell genomes.",
+            plot=table.plot(data=table_data, headers=headers, pconfig=pconfig),
+        )
diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml
index 8e309a2f51..020d46a39f 100644
--- a/multiqc/search_patterns.yaml
+++ b/multiqc/search_patterns.yaml
@@ -394,6 +394,9 @@ goleft_indexcov/ped:
   fn: "*-indexcov.ped"
 gopeaks:
   fn: "*_gopeaks.json"
+gtdbtk:
+  contents: "user_genome classification closest_genome_reference closest_genome_reference_radius closest_genome_taxonomy closest_genome_ani"
+  num_lines: 10
 haplocheck:
   contents: '"Sample" "Contamination Status" "Contamination Level" "Distance" "Sample Coverage"'
   num_lines: 10
diff --git a/pyproject.toml b/pyproject.toml
index 68e5ae2ca2..069726da94 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -143,6 +143,7 @@ gatk = "multiqc.modules.gatk:MultiqcModule"
 glimpse = "multiqc.modules.glimpse:MultiqcModule"
 goleft_indexcov = "multiqc.modules.goleft_indexcov:MultiqcModule"
 gopeaks = "multiqc.modules.gopeaks:MultiqcModule"
+gtdbtk = "multiqc.modules.gtdbtk:MultiqcModule"
 haplocheck = "multiqc.modules.haplocheck:MultiqcModule"
 happy = "multiqc.modules.happy:MultiqcModule"
 hicexplorer = "multiqc.modules.hicexplorer:MultiqcModule"