diff --git a/.dockstore.yml b/.dockstore.yml
index 72dd5485eab..a042e3a1bb4 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -309,8 +309,7 @@ workflows:
branches:
- master
- ah_var_store
- - vs_1396_bq_query_audit
- - gg_VS-1395_Have_CreateVATFromVDS_MoreGoodness
+ - vs_1178_merge_master_to_ah_var_store_again
tags:
- /.*/
- name: GvsIngestTieout
@@ -439,7 +438,7 @@ workflows:
- master
- ah_var_store
- EchoCallset
- - vs_1412_pgen_pvars_not_compressed
+ - vs_1178_merge_master_to_ah_var_store_again
- name: MergePgenHierarchicalWdl
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/MergePgenHierarchical.wdl
diff --git a/.github/actions/upload-gatk-test-results/action.yml b/.github/actions/upload-gatk-test-results/action.yml
index b210216d2f0..e2c267e2c75 100644
--- a/.github/actions/upload-gatk-test-results/action.yml
+++ b/.github/actions/upload-gatk-test-results/action.yml
@@ -40,16 +40,17 @@ runs:
name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }}
path: build/reports/tests
- - name: Upload to codecov
- run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
- shell: bash
+ # Disabling codecov because it is timing out and failing builds that otherwise succeed.
+ ## - name: Upload to codecov
+ ## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
+ ## shell: bash
- name: Upload Reports
if: ${{ inputs.only-artifact != 'true' }}
id: uploadreports
run: |
gsutil -m cp -z html -z js -z xml -z css -r build/reports/tests gs:/${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/;
- VIEW_URL=https://storage.googleapis.com${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/tests/testOnPackagedReleaseJar/index.html
+ VIEW_URL=https://storage.googleapis.com${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/tests/test/index.html
echo "See the test report at ${VIEW_URL}";
echo view_url="${VIEW_URL}" >> $GITHUB_OUTPUT
@@ -91,4 +92,4 @@ runs:
run: |
pip install --user PyGithub;
python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }};
- shell: bash
\ No newline at end of file
+ shell: bash
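The report-path change above points the published link at `tests/test/index.html` instead of `tests/testOnPackagedReleaseJar/index.html`. A minimal sketch for spot-checking an upload from a workstation, assuming read access to the logs bucket; the bucket name and prefix below are illustrative placeholders, not the real `HELLBENDER_TEST_LOGS` value:

```bash
# List the uploaded report tree (placeholder bucket/prefix, for illustration only).
gsutil ls -r gs://hellbender-test-logs-example/gatk/tests/test/ | head

# Objects uploaded to GCS are browsable via the storage.googleapis.com mirror of the object path.
echo "https://storage.googleapis.com/hellbender-test-logs-example/gatk/tests/test/index.html"
```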
diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml
index 2f97d560b65..76fa7a9d07e 100644
--- a/.github/workflows/gatk-tests.yml
+++ b/.github/workflows/gatk-tests.yml
@@ -116,7 +116,7 @@ jobs:
- name: 'Set up Cloud SDK'
if: needs.check-secrets.outputs.google-credentials == 'true'
- uses: google-github-actions/setup-gcloud@v0
+ uses: google-github-actions/setup-gcloud@v2
- name: pull lfs files
run: git lfs pull
@@ -188,7 +188,7 @@ jobs:
- name: 'Set up Cloud SDK'
if: needs.check-secrets.outputs.google-credentials == 'true'
- uses: google-github-actions/setup-gcloud@v0
+ uses: google-github-actions/setup-gcloud@v2
- name: build test jars
run: ./gradlew clean shadowTestClassJar shadowTestJar
@@ -230,10 +230,6 @@ jobs:
bash --init-file /gatk/gatkenv.rc /root/run_unit_tests.sh;
TEST_EXIT_VALUE=$?;
$( exit ${TEST_EXIT_VALUE} );
- sudo chmod -R a+w build/reports/;
- mkdir build/reports/tests/test \
- && cp -rp build/reports/tests/testOnPackagedReleaseJar/* build/reports/tests/test \
- && rm -r build/reports/tests/testOnPackagedReleaseJar;
- uses: ./.github/actions/upload-gatk-test-results
if: always()
diff --git a/.gitignore b/.gitignore
index e95e5ab6094..a73810708a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,4 @@ funcotator_tmp
#Test generated dot files
test*.dot
+.vscode/
diff --git a/Dockerfile b/Dockerfile
index d39a6ce95da..ceec398a9d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,18 @@
-ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.1.0
+ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
# stage 1 for constructing the GATK zip
FROM ${BASE_DOCKER} AS gradleBuild
LABEL stage=gatkIntermediateBuildImage
ARG RELEASE=false
-RUN ls .
+
ADD . /gatk
WORKDIR /gatk
# Get an updated gcloud signing key, in case the one in the base image has expired
-RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
+#Download only resources required for the build, not for testing
+RUN ls . && \
+ rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt update &&\
apt-key list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
@@ -19,16 +21,13 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt-get -y clean && \
apt-get -y autoclean && \
apt-get -y autoremove && \
- rm -rf /var/lib/apt/lists/*
-RUN git lfs install --force
-
-#Download only resources required for the build, not for testing
-RUN git lfs pull --include src/main/resources/large
-
-RUN export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE
-RUN cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/
-RUN unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts
-RUN chmod -R a+rw /gatk/unzippedJar
+ rm -rf /var/lib/apt/lists/* && \
+ git lfs install --force && \
+ git lfs pull --include src/main/resources/large && \
+ export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE && \
+ cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ && \
+ unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts && \
+ chmod -R a+rw /gatk/unzippedJar
FROM ${BASE_DOCKER}
@@ -47,17 +46,17 @@ RUN chmod -R a+rw /gatk
COPY --from=gradleBuild /gatk/unzippedJar .
#Setup linked jars that may be needed for running gatk
-RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar
-RUN ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar
-RUN ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
+RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar && \
+ ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar && \
+ ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
WORKDIR /root
# Make sure we can see a help message
-RUN java -jar gatk.jar -h
-RUN mkdir /gatkCloneMountPoint
-RUN mkdir /jars
-RUN mkdir .gradle
+RUN java -jar gatk.jar -h && \
+ mkdir /gatkCloneMountPoint && \
+ mkdir /jars && \
+ mkdir .gradle
WORKDIR /gatk
@@ -80,28 +79,16 @@ RUN echo "source activate gatk" > /root/run_unit_tests.sh && \
echo "ln -s /gatkCloneMountPoint/build/ /gatkCloneMountPoint/scripts/docker/build" >> /root/run_unit_tests.sh && \
echo "cd /gatk/ && /gatkCloneMountPoint/gradlew -Dfile.encoding=UTF-8 -b /gatkCloneMountPoint/dockertest.gradle testOnPackagedReleaseJar jacocoTestReportOnPackagedReleaseJar -a -p /gatkCloneMountPoint" >> /root/run_unit_tests.sh
-# TODO: Determine whether we actually need this. For now it seems to be required because the version of libstdc++ on
-# TODO: the gatk base docker is out of date (maybe?)
-RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test && \
- apt-get update && \
- apt-get -y upgrade libstdc++6 && \
- apt-get -y dist-upgrade
-
-WORKDIR /root
-RUN cp -r /root/run_unit_tests.sh /gatk
-RUN cp -r gatk.jar /gatk
-ENV CLASSPATH /gatk/gatk.jar:$CLASSPATH
+RUN cp -r /root/run_unit_tests.sh /gatk && \
+ cp -r /root/gatk.jar /gatk
+ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
# Start GATK Python environment
-WORKDIR /gatk
-ENV PATH $CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
conda clean -afy && \
- find /opt/miniconda/ -follow -type f -name '*.a' -delete && \
- find /opt/miniconda/ -follow -type f -name '*.pyc' -delete && \
rm -rf /root/.cache/pip
CMD ["bash", "--init-file", "/gatk/gatkenv.rc"]
diff --git a/README.md b/README.md
index 1dc2bb366b8..26e731c26db 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ releases of the toolkit.
* [Requirements](#requirements)
* [Quick Start Guide](#quickstart)
* [Downloading GATK4](#downloading)
+ * [Tools Included in Docker Image](#dockerSoftware)
* [Building GATK4](#building)
* [Running GATK4](#running)
* [Passing JVM options to gatk](#jvmoptions)
@@ -115,6 +116,34 @@ You can download and run pre-built versions of GATK4 from the following places:
* You can download a GATK4 docker image from [our dockerhub repository](https://hub.docker.com/r/broadinstitute/gatk/). We also host unstable nightly development builds on [this dockerhub repository](https://hub.docker.com/r/broadinstitute/gatk-nightly/).
* Within the docker image, run gatk commands as usual from the default startup directory (/gatk).
+### Tools Included in Docker Image
+
+Our docker image contains the following bioinformatics tools, which can be run by invoking the tool name from the command line:
+* bedtools (v2.30.0)
+* samtools (1.13)
+* bcftools (1.13)
+* tabix (1.13+ds)
+
+We also include an installation of Python3 (3.6.10) with the following popular packages:
+* numpy
+* scipy
+* tensorflow
+* pymc3
+* keras
+* scikit-learn
+* matplotlib
+* pandas
+* biopython
+* pyvcf
+* pysam
+
+We also include an installation of R (3.6.2) with the following popular packages:
+* data.table
+* dplyr
+* ggplot2
+
+For more details on system packages, see the GATK [Base Dockerfile](scripts/docker/gatkbase/Dockerfile); for details and versions of the Python3/R packages, see the [Conda environment setup file](scripts/gatkcondaenv.yml.template).
+
## Building GATK4
* **To do a full build of GATK4, first clone the GATK repository using "git clone", then run:**
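The new README section lists the tools baked into the image; a minimal sketch of exercising them, assuming the published `broadinstitute/gatk` image from Docker Hub and that the listed tools are on the container's `PATH`:

```bash
# Pull the published image and invoke a few of the bundled tools by name.
docker pull broadinstitute/gatk:latest
docker run --rm broadinstitute/gatk:latest samtools --version
docker run --rm broadinstitute/gatk:latest bcftools --version
# The bundled Python3 environment ships numpy, pysam, etc.
docker run --rm broadinstitute/gatk:latest python3 -c "import numpy, pysam; print(numpy.__version__)"
```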
diff --git a/build.gradle b/build.gradle
index a41a1364aef..a8b3c976f64 100644
--- a/build.gradle
+++ b/build.gradle
@@ -16,6 +16,7 @@ plugins {
id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars
id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating
id 'com.palantir.git-version' version '0.5.1' //version helper
+ id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies
}
@@ -56,20 +57,20 @@ repositories {
mavenLocal()
}
-final htsjdkVersion = System.getProperty('htsjdk.version','3.0.5')
-final picardVersion = System.getProperty('picard.version','3.1.0')
+final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
+final picardVersion = System.getProperty('picard.version','3.1.1')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
-final sparkVersion = System.getProperty('spark.version', '3.3.1')
-final hadoopVersion = System.getProperty('hadoop.version', '3.3.1')
-final disqVersion = System.getProperty('disq.version','0.3.6')
-final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.0')
-final bigQueryVersion = System.getProperty('bigQuery.version', '2.31.0')
-final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.41.0')
-final guavaVersion = System.getProperty('guava.version', '32.1.2-jre')
+final sparkVersion = System.getProperty('spark.version', '3.5.0')
+final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
+final disqVersion = System.getProperty('disq.version','0.3.8')
+final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.3')
+final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
+final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
+final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
final log4j2Version = System.getProperty('log4j2Version', '2.17.1')
final testNGVersion = '7.0.0'
-final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.0'
+final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8'
final baseJarName = 'gatk'
final secondaryBaseJarName = 'hellbender'
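The dependency versions above are all read via `System.getProperty(..., default)`, so a build can override any of them without editing `build.gradle` by passing JVM system properties to Gradle. A hedged sketch (the task and versions shown are illustrative, not recommendations):

```bash
# Override the htsjdk and picard versions for a single invocation via -D system properties.
./gradlew shadowJar -Dhtsjdk.version=4.1.0 -Dpicard.version=3.1.1
```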
@@ -267,12 +268,12 @@ dependencies {
// are routed to log4j
implementation 'org.apache.logging.log4j:log4j-jcl:' + log4j2Version
- implementation 'org.apache.commons:commons-lang3:3.5'
- implementation 'org.apache.commons:commons-math3:3.5'
+ implementation 'org.apache.commons:commons-lang3:3.14.0'
+ implementation 'org.apache.commons:commons-math3:3.6.1'
implementation 'org.hipparchus:hipparchus-stat:2.0'
- implementation 'org.apache.commons:commons-collections4:4.1'
- implementation 'org.apache.commons:commons-vfs2:2.0'
- implementation 'org.apache.commons:commons-configuration2:2.4'
+ implementation 'org.apache.commons:commons-collections4:4.4'
+ implementation 'org.apache.commons:commons-vfs2:2.9.0'
+ implementation 'org.apache.commons:commons-configuration2:2.9.0'
constraints {
implementation('org.apache.commons:commons-text') {
version {
@@ -287,7 +288,7 @@ dependencies {
implementation 'commons-io:commons-io:2.5'
implementation 'org.reflections:reflections:0.9.10'
- implementation 'it.unimi.dsi:fastutil:7.0.6'
+ implementation 'it.unimi.dsi:fastutil:7.0.13'
implementation 'org.broadinstitute:hdf5-java-bindings:1.1.0-hdf5_2.11.0'
implementation 'org.broadinstitute:gatk-native-bindings:1.0.0'
@@ -297,8 +298,8 @@ dependencies {
exclude group: 'org.apache.commons'
}
- //there is no mllib_2.12.15:3.3.0, so stay use 2.12:3.3.0
- implementation ('org.apache.spark:spark-mllib_2.12:3.3.0') {
+ // TODO: migrate to mllib_2.12.15?
+ implementation ('org.apache.spark:spark-mllib_2.12:' + sparkVersion) {
// JUL is used by Google Dataflow as the backend logger, so exclude jul-to-slf4j to avoid a loop
exclude module: 'jul-to-slf4j'
exclude module: 'javax.servlet'
@@ -306,15 +307,6 @@ dependencies {
}
implementation 'com.thoughtworks.paranamer:paranamer:2.8'
- implementation 'org.bdgenomics.bdg-formats:bdg-formats:0.5.0'
- implementation('org.bdgenomics.adam:adam-core-spark2_2.12:0.28.0') {
- exclude group: 'org.slf4j'
- exclude group: 'org.apache.hadoop'
- exclude group: 'org.scala-lang'
- exclude module: 'kryo'
- exclude module: 'hadoop-bam'
- }
-
implementation 'org.jgrapht:jgrapht-core:1.1.0'
implementation 'org.jgrapht:jgrapht-io:1.1.0'
@@ -344,10 +336,10 @@ dependencies {
implementation 'org.broadinstitute:gatk-bwamem-jni:1.0.4'
implementation 'org.broadinstitute:gatk-fermilite-jni:1.2.0'
- implementation 'org.broadinstitute:http-nio:0.1.0-rc1'
+ implementation 'org.broadinstitute:http-nio:1.1.0'
// Required for COSMIC Funcotator data source:
- implementation 'org.xerial:sqlite-jdbc:3.36.0.3'
+ implementation 'org.xerial:sqlite-jdbc:3.44.1.0'
// natural sort
implementation('net.grey-panther:natural-comparator:1.1')
@@ -986,6 +978,18 @@ task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) {
}
}
+// scan-gradle-plugin security vulnerability scan
+ossIndexAudit {
+ allConfigurations = false // if true, includes dependencies from all resolvable configurations; defaults to false, meaning only 'compileClasspath', 'runtimeClasspath', 'releaseCompileClasspath' and 'releaseRuntimeClasspath' are considered
+ useCache = true // true by default
+ outputFormat = 'DEFAULT' // optional; other values are 'DEPENDENCY_GRAPH' (prints a dependency graph showing direct/transitive dependencies) and 'JSON_CYCLONE_DX_1_4' (prints a CycloneDX 1.4 SBOM in JSON format)
+ showAll = false // if true, prints all dependencies; defaults to false, meaning only dependencies with vulnerabilities are printed
+ printBanner = true // if true, prints an ASCII text banner; defaults to true
+
+ // ossIndexAudit can be configured to exclude vulnerabilities from matching
+ // excludeVulnerabilityIds = ['39d74cc8-457a-4e57-89ef-a258420138c5'] // list containing ids of vulnerabilities to be ignored
+ // excludeCoordinates = ['commons-fileupload:commons-fileupload:1.3'] // list containing coordinate of components which if vulnerable should be ignored
+}
/**
*This specifies what artifacts will be built and uploaded when performing a maven upload.
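The `ossIndexAudit { ... }` block added above configures the Sonatype scan plugin declared in the `plugins` section; the scan itself runs as a Gradle task of the same name. A minimal invocation sketch:

```bash
# Run the OSS Index vulnerability audit with the settings configured in build.gradle.
./gradlew ossIndexAudit
```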
diff --git a/build_docker_remote.sh b/build_docker_remote.sh
index 72534a02b03..7eada9565c3 100755
--- a/build_docker_remote.sh
+++ b/build_docker_remote.sh
@@ -1,12 +1,26 @@
#!/usr/bin/env bash
#
-# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
+# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built
+# in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
#
-# If you are pushing an image to our release repositories, be sure that you've followed
-# the setup instructions here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_docker
-# and here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_gcloud
+# By default the images are pushed to the following GCR repository:
+#
+# us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds
+#
+# with a name like "${YOUR_USERNAME}-${GITHUB_TAG}-${GIT_HASH_FOR_TAG}"
+#
+# This script should not be used to push to our release repositories. After
+# you've built and staged an image, run the scripts/docker/release_prebuilt_docker_image.sh
+# script to push to the release repositories.
+#
+# Usage: build_docker_remote.sh -e
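The header above says remote builds are pushed to `us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds` under a `${YOUR_USERNAME}-${GITHUB_TAG}-${GIT_HASH_FOR_TAG}` name. A hedged sketch of pulling such a staged image, assuming that name is applied as the image tag (the specific tag below is made up for illustration):

```bash
# Pull a staged remote-build image; the username-tag-hash tag is a fabricated example
# of the naming convention described in the script header.
docker pull us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds:jdoe-4.5.0.0-a1b2c3d
```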
* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
*
- *
* {@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}
{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}
{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}
{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}
* sample1 sample1.vcf.gz * sample2 sample2.vcf.gz sample2.vcf.gz.tbi @@ -218,12 +216,14 @@ public final class GenomicsDBImport extends GATKTool { public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel"; public static final String MERGE_CONTIGS_INTO_NUM_PARTITIONS = "merge-contigs-into-num-partitions"; public static final String BYPASS_FEATURE_READER = "bypass-feature-reader"; + public static final String VCF_HEADER_OVERRIDE = "header"; public static final int INTERVAL_LIST_SIZE_WARNING_THRESHOLD = 100; public static final int ARRAY_COLUMN_BOUNDS_START = 0; public static final int ARRAY_COLUMN_BOUNDS_END = 1; public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS; public static final String USE_GCS_HDFS_CONNECTOR = GenomicsDBArgumentCollection.USE_GCS_HDFS_CONNECTOR; + public static final String AVOID_NIO = "avoid-nio"; @Argument(fullName = WORKSPACE_ARG_LONG_NAME, doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " + @@ -239,7 +239,7 @@ public final class GenomicsDBImport extends GATKTool { "when using the "+INTERVAL_LIST_LONG_NAME+" option. " + "Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " + "Must point to an existing workspace.", - mutex = {WORKSPACE_ARG_LONG_NAME}) + mutex = {WORKSPACE_ARG_LONG_NAME, VCF_HEADER_OVERRIDE}) private String incrementalImportWorkspace; @Argument(fullName = SEGMENT_SIZE_ARG_LONG_NAME, @@ -254,7 +254,7 @@ public final class GenomicsDBImport extends GATKTool { " data for only a single sample. Either this or " + SAMPLE_NAME_MAP_LONG_NAME + " must be specified.", optional = true, - mutex = {SAMPLE_NAME_MAP_LONG_NAME}) + mutex = {SAMPLE_NAME_MAP_LONG_NAME, AVOID_NIO}) private ListvariantPaths; @Argument(fullName = VCF_BUFFER_SIZE_ARG_NAME, @@ -364,6 +364,13 @@ public final class GenomicsDBImport extends GATKTool { optional = true) private boolean sharedPosixFSOptimizations = false; + @Argument(fullName = VCF_HEADER_OVERRIDE, + doc = "Specify a vcf file to use instead of reading and combining headers from the input vcfs", + optional = true, + mutex ={INCREMENTAL_WORKSPACE_ARG_LONG_NAME} + ) + private FeatureInput headerOverride = null; + @Argument(fullName = BYPASS_FEATURE_READER, doc = "Use htslib to read input VCFs instead of GATK's FeatureReader. This will reduce memory usage and potentially speed up " + "the import. Lower memory requirements may also enable parallelism through " + MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL + @@ -371,6 +378,15 @@ public final class GenomicsDBImport extends GATKTool { optional = true) private boolean bypassFeatureReader = false; + @Argument(fullName = AVOID_NIO, + doc = "Do not attempt to open the input vcf file paths in java. This can only be used with " + BYPASS_FEATURE_READER + + ". It allows operating on file systems which GenomicsDB understands how to open but GATK does not. 
This will disable " + + "many of the sanity checks.", + mutex = {StandardArgumentDefinitions.VARIANT_LONG_NAME} + ) + @Advanced + private boolean avoidNio = false; + @Argument(fullName = USE_GCS_HDFS_CONNECTOR, doc = "Use the GCS HDFS Connector instead of the native GCS SDK client with gs:// URLs.", optional = true) @@ -440,10 +456,6 @@ public int getDefaultCloudIndexPrefetchBufferSize() { // Path to combined VCF header file to be written by GenomicsDBImporter private String vcfHeaderFile; - // GenomicsDB callset map protobuf structure containing all callset names - // used to write the callset json file on traversal success - private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB; - //in-progress batchCount private int batchCount = 1; @@ -463,11 +475,14 @@ public void onStartup() { initializeWorkspaceAndToolMode(); assertVariantPathsOrSampleNameFileWasSpecified(); assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive(); + assertAvoidNioConditionsAreValid(); initializeHeaderAndSampleMappings(); initializeIntervals(); super.onStartup(); } + + private void initializeWorkspaceAndToolMode() { if (incrementalImportWorkspace != null && !incrementalImportWorkspace.isEmpty()) { doIncrementalImport = true; @@ -495,6 +510,24 @@ private void assertVariantPathsOrSampleNameFileWasSpecified(){ } } + private void assertAvoidNioConditionsAreValid() { + if (avoidNio && (!bypassFeatureReader || headerOverride == null) ){ + final List missing = new ArrayList<>(); + if(!bypassFeatureReader){ + missing.add(BYPASS_FEATURE_READER); + } + if(headerOverride == null){ + missing.add(VCF_HEADER_OVERRIDE); + } + final String missingArgs = String.join(" and ", missing); + + // this potentially produces and exception with bad grammar but that's probably ok + throw new CommandLineException.MissingArgument(missingArgs, "If --" +AVOID_NIO + " is set then --" + BYPASS_FEATURE_READER + + " and --" + VCF_HEADER_OVERRIDE + " must also be specified."); + + } + } + private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importer, List intervals) { GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping(); @@ -523,32 +556,37 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe */ private void initializeHeaderAndSampleMappings() { // Only one of -V and --sampleNameMapFile may be specified - if (variantPaths != null && variantPaths.size() > 0) { + if (variantPaths != null && !variantPaths.isEmpty()) { // -V was specified final List headers = new ArrayList<>(variantPaths.size()); sampleNameMap = new SampleNameMap(); - for (final String variantPathString : variantPaths) { - final Path variantPath = IOUtils.getPath(variantPathString); - if (bypassFeatureReader) { - GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath); - } - final VCFHeader header = getHeaderFromPath(variantPath, null); - Utils.validate(header != null, "Null header was found in " + variantPath + "."); - assertGVCFHasOnlyOneSample(variantPathString, header); - headers.add(header); - final String sampleName = header.getGenotypeSamples().get(0); - try { - sampleNameMap.addSample(sampleName, new URI(variantPathString)); - } - catch(final URISyntaxException e) { - throw new UserException("Malformed URI "+e.toString(), e); + if(headerOverride == null) { + for (final String variantPathString : variantPaths) { + final Path variantPath = IOUtils.getPath(variantPathString); + if (bypassFeatureReader) { // avoid-nio can't be set here because it requires headerOverride + 
GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath); + } + final VCFHeader header = getHeaderFromPath(variantPath); + Utils.validate(header != null, "Null header was found in " + variantPath + "."); + assertGVCFHasOnlyOneSample(variantPathString, header); + headers.add(header); + + final String sampleName = header.getGenotypeSamples().get(0); + try { + sampleNameMap.addSample(sampleName, new URI(variantPathString)); + } catch (final URISyntaxException e) { + throw new UserException("Malformed URI " + e.getMessage(), e); + } } + mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true); + mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary(); + } else { + final VCFHeader header = getHeaderFromPath(headerOverride.toPath()); + mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder()); + mergedHeaderSequenceDictionary = header.getSequenceDictionary(); } - mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true); - mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary(); mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines()); - } else if (sampleNameMapFile != null) { // --sampleNameMap was specified @@ -556,31 +594,34 @@ private void initializeHeaderAndSampleMappings() { //the resulting database will have incorrect sample names //see https://github.com/broadinstitute/gatk/issues/3682 for more information // The SampleNameMap class guarantees that the samples will be sorted correctly. - sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader); + sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), + bypassFeatureReader && !avoidNio); final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey(); - final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample); - final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample); - final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath); + final VCFHeader header; + if(headerOverride == null){ + final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample); + header = getHeaderFromPath(firstVCFPath); + } else { + header = getHeaderFromPath(headerOverride.toPath()); + } //getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering - mergedHeaderLines = new LinkedHashSet (header.getMetaDataInInputOrder()); + mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder()); mergedHeaderSequenceDictionary = header.getSequenceDictionary(); mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines()); - } - else if (getIntervalsFromExistingWorkspace){ + } else if (getIntervalsFromExistingWorkspace){ final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME); IOUtils.assertPathsAreReadable(vcfHeader); final String header = GenomicsDBUtils.readEntireFile(vcfHeader); try { File tempHeader = IOUtils.createTempFile("tempheader", ".vcf"); - Files.write(tempHeader.toPath(), header.getBytes(StandardCharsets.UTF_8)); + Files.writeString(tempHeader.toPath(), header); mergedHeaderSequenceDictionary = VCFFileReader.getSequenceDictionary(tempHeader); } catch (final IOException e) { - throw new UserException("Unable to create temporary header file to get sequence dictionary"); + throw new UserException("Unable to create temporary header file to get sequence dictionary", e); } - } - else { + } else { throw 
new UserException(StandardArgumentDefinitions.VARIANT_LONG_NAME+" or "+ SAMPLE_NAME_MAP_LONG_NAME+" must be specified unless "+ INTERVAL_LIST_LONG_NAME+" is specified"); @@ -599,8 +640,12 @@ else if (getIntervalsFromExistingWorkspace){ } } - private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) { - try(final FeatureReader reader = getReaderFromPath(variantPath, variantIndexPath)) { + private VCFHeader getHeaderFromPath(final Path variantPath) { + //TODO make this mangling unecessary + final String variantURI = variantPath.toAbsolutePath().toUri().toString(); + try(final FeatureReader reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false, + BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer), + BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer))) { return (VCFHeader) reader.getHeader(); } catch (final IOException e) { throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e); @@ -633,9 +678,9 @@ private void writeIntervalListToFile() { @Override public void onTraversalStart() { String workspaceDir = overwriteCreateOrCheckWorkspace(); - vidMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME); - callsetMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME); - vcfHeaderFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME); + vidMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME); + callsetMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME); + vcfHeaderFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME); if (getIntervalsFromExistingWorkspace) { // intervals may be null if merge-contigs-into-num-partitions was used to create the workspace // if so, we need to wait for vid to be generated before writing out the interval list @@ -775,7 +820,7 @@ private List generateIntervalListFromWorkspace() { final int start = Integer.parseInt(partitionInfo[1]); final int end = Integer.parseInt(partitionInfo[2]); return new SimpleInterval(contig, start, end); - }).filter(o -> o != null).collect(Collectors.toList()); + }).filter(Objects::nonNull).collect(Collectors.toList()); } private ImportConfig createImportConfig(final int batchSize) { @@ -785,7 +830,7 @@ private ImportConfig createImportConfig(final int batchSize) { GenomicsDBImportConfiguration.ImportConfiguration.newBuilder(); importConfigurationBuilder.addAllColumnPartitions(partitions); importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample); - importConfigurationBuilder.setFailIfUpdating(true && !doIncrementalImport); + importConfigurationBuilder.setFailIfUpdating(!doIncrementalImport); importConfigurationBuilder.setSegmentSize(segmentSize); importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation); importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations); @@ -936,7 +981,7 @@ private FeatureReader getReaderFromPath(final Path variantPath, /* Anonymous FeatureReader subclass that wraps returned iterators to ensure that the GVCFs do not * contain MNPs. */ - return new FeatureReader () { + return new FeatureReader<>() { /** Iterator that asserts that variants are not MNPs. 
*/ class NoMnpIterator implements CloseableTribbleIterator { private final CloseableTribbleIterator inner; @@ -971,7 +1016,8 @@ public VariantContext next() { return new NoMnpIterator(reader.query(chr, start, end)); } - @Override public CloseableTribbleIterator iterator() throws IOException { + @Override + public CloseableTribbleIterator iterator() throws IOException { return new NoMnpIterator(reader.iterator()); } }; @@ -990,7 +1036,7 @@ public VariantContext next() { * @return The workspace directory */ private String overwriteCreateOrCheckWorkspace() { - String workspaceDir = genomicsDBGetAbsolutePath(workspace); + String workspaceDir = GATKGenomicsDBUtils.genomicsDBGetAbsolutePath(workspace); // From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid // returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first. // returnCode = -1 : path was not a directory @@ -1016,7 +1062,7 @@ private String overwriteCreateOrCheckWorkspace() { } static class UnableToCreateGenomicsDBWorkspace extends UserException { - private static final long serialVersionUID = 1L; + @Serial private static final long serialVersionUID = 1L; UnableToCreateGenomicsDBWorkspace(final String message){ super(message); @@ -1028,7 +1074,7 @@ static class UnableToCreateGenomicsDBWorkspace extends UserException { * dictionary (as returned by {@link #getBestAvailableSequenceDictionary}) * to parse/verify them. Does nothing if no intervals were specified. */ - protected void initializeIntervals() { + void initializeIntervals() { if (intervalArgumentCollection.intervalsSpecified()) { if (getIntervalsFromExistingWorkspace || doIncrementalImport) { logger.warn(INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" was set, so ignoring specified intervals." + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java index 234fee68b3b..83f1048f535 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java @@ -3,7 +3,7 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList; import java.util.ArrayList; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index 935a52a159f..6cd7246fe92 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -5,8 +5,8 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import org.apache.avro.generic.GenericRecord; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang.math.LongRange; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.LongRange; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.engine.FeatureContext; @@ -39,10 +39,10 @@ public class ExtractCohortEngine { // a better, ref-dependent different manner if/when we get around to supporting other references // See: 
https://en.wikipedia.org/wiki/Pseudoautosomal_region private final List PARRegions = List.of( - new LongRange(23000000010001L, 23000002781479L), // PAR1, X - new LongRange(24000000010001L, 24000002781479L), // PAR1, Y - new LongRange(23000155701383L, 23000156030895L), // PAR2, X - new LongRange(24000056887903L, 24000057217415L)); // PAR2, Y + LongRange.of(23000000010001L, 23000002781479L), // PAR1, X + LongRange.of(24000000010001L, 24000002781479L), // PAR1, Y + LongRange.of(23000155701383L, 23000156030895L), // PAR2, X + LongRange.of(24000056887903L, 24000057217415L)); // PAR2, Y private final boolean printDebugInformation; private final int localSortMaxRecordsInRam; @@ -679,7 +679,7 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI private boolean isInPAR(final long location) { for (LongRange region : PARRegions) { - if (region.containsLong(location)) { + if (region.contains(location)) { return true; } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java index a81a57d8be3..8ab3b6b9d6f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java @@ -2,7 +2,7 @@ import htsjdk.samtools.util.Locatable; import org.apache.avro.generic.GenericRecord; -import org.apache.commons.lang.builder.ReflectionToStringBuilder; +import org.apache.commons.lang3.builder.ReflectionToStringBuilder; import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; public class ExtractCohortFilterRecord implements Locatable { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java index ffa1e44e77e..623c0678902 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java @@ -2,7 +2,7 @@ import htsjdk.samtools.util.Locatable; import org.apache.avro.generic.GenericRecord; -import org.apache.commons.lang.builder.ReflectionToStringBuilder; +import org.apache.commons.lang3.builder.ReflectionToStringBuilder; import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; import java.util.Objects; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java index 8f929262b7a..e5cda0ef47e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java @@ -3,7 +3,7 @@ import htsjdk.samtools.util.Locatable; import org.apache.avro.generic.GenericRecord; import org.apache.avro.util.Utf8; -import org.apache.commons.lang.builder.ReflectionToStringBuilder; +import org.apache.commons.lang3.builder.ReflectionToStringBuilder; import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; public class ReferenceRecord implements Locatable, Comparable { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java index a82ad989d26..6df437f01e2 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java @@ -4,7 +4,7 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.UserException; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java index de6a914e42c..f1f71a6eb7f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.spark; +import com.google.common.io.Files; import htsjdk.samtools.*; import htsjdk.samtools.BAMSBIIndexer; import htsjdk.samtools.seekablestream.SeekableFileStream; @@ -18,7 +19,6 @@ import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.ReadConstants; -import org.codehaus.plexus.util.FileUtils; import picard.cmdline.programgroups.OtherProgramGroup; import java.io.*; @@ -166,7 +166,7 @@ private static void assertBamIsCoordinateSorted(final SAMFileHeader header) { private static void assertIsBam(final File inputBam) { if(!BamFileIoUtils.isBamFile(inputBam)) { throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a " - + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified."); + + "file with extension "+ Files.getFileExtension(inputBam.getName()) + " was specified."); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 95bb083f03b..de91be5e5b6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -89,7 +89,10 @@ public enum ComplexVariantSubtype { piDUP_RF, dDUP, dDUP_iDEL, - INS_iDEL + INS_iDEL, + CTX_PP_QQ, + CTX_PQ_QP, + CTX_INV } // not defined in output vcf header but used in internal id that is currently output in the ID column @@ -163,6 +166,7 @@ public enum ComplexVariantSubtype { public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT"; public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS"; public static final String TSS_DUP = "PREDICTED_TSS_DUP"; + public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP"; // SVTYPE classes public enum StructuralVariantAnnotationType { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java index 6c016f84436..2f51a7df114 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java @@ -4,7 +4,7 @@ import com.esotericsoftware.kryo.Kryo; import 
com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; -import com.netflix.servo.util.VisibleForTesting; +import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.*; import htsjdk.samtools.util.Locatable; import htsjdk.samtools.util.SequenceUtil; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java index 8edcd595881..0f95878b389 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java @@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) { Utils.nonNull(dictionary); validatePosition(contigA, positionA, dictionary); validatePosition(contigB, positionB, dictionary); - Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0, - "End coordinate cannot precede start"); + // CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and + // B references the source sequence. + if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) { + Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0, + "End coordinate cannot precede start"); + } } private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java index da4e66f3fb6..9ca5056a679 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java @@ -31,6 +31,7 @@ */ public class ClosestSVFinder { + protected final boolean sortOutput; protected final Map truthIdToItemMap; protected final Map idToClusterMap; private final SVConcordanceLinkage linkage; @@ -49,7 +50,9 @@ public class ClosestSVFinder { */ public ClosestSVFinder(final SVConcordanceLinkage linkage, final Function collapser, + final boolean sortOutput, final SAMSequenceDictionary dictionary) { + this.sortOutput = sortOutput; this.linkage = Utils.nonNull(linkage); this.collapser = Utils.nonNull(collapser); outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary)); @@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage, lastItemContig = null; } + /** + * Sorts output by default + */ + public ClosestSVFinder(final SVConcordanceLinkage linkage, + final Function collapser, + final SAMSequenceDictionary dictionary) { + this(linkage, collapser, true, dictionary); + } + /** * Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered. 
* @param force flushes all variants in the output buffer regardless of position @@ -70,8 +82,12 @@ public List flush(final boolean force) { .map(c -> new ClosestPair(c.getItem(), c.getClosest())) .map(collapser) .collect(Collectors.toList()); - outputBuffer.addAll(collapsedRecords); - return flushBuffer(force); + if (sortOutput) { + outputBuffer.addAll(collapsedRecords); + return flushBuffer(force); + } else { + return collapsedRecords; + } } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java index 50c98a43f3d..23942237008 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java @@ -461,7 +461,7 @@ private VariantContext referenceBlockMerge(final List vcs, final final GenotypeBuilder gBuilder = new GenotypeBuilder(g); GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gBuilder, assignmentMethod, - g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g.getAlleles(), null); + g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g, null); genotypes.add(gBuilder.make()); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java index 312a90872e7..9036ecdd44d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java @@ -150,8 +150,9 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer final VariantContext result; + // only re-genotype polymorphic sites if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { - // only re-genotype polymorphic sites + // note that the calculateGenotypes method also calculates the QUAL score final VariantContext regenotypedVC = calculateGenotypes(originalVC, includeNonVariants); if (regenotypedVC == null) { return null; @@ -186,7 +187,7 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer //don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes())); VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true); - return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false, keepSB)).make(); + return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(annotated, false, keepSB)).make(); } else if (includeNonVariants) { // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. 
VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true, false)).make(); @@ -461,24 +462,29 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, GenotypeGVCFs.PHASED_HOM_VAR_STRING); } - // create AD if it's not there - if ( !oldGT.hasAD() && vc.isVariant() ) { + // create AD if it's not there, but only if there's data + if ( !oldGT.hasAD() && vc.isVariant() && depth > 0) { final int[] AD = new int[vc.getNAlleles()]; AD[0] = depth; builder.AD(AD); } if ( createRefGTs ) { - // move the GQ to RGQ - if (oldGT.hasGQ()) { + //keep 0 depth samples and 0 GQ samples as no-call + if (depth > 0 && oldGT.hasGQ()) { + if (oldGT.getGQ() > 0) { + final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference()); + builder.alleles(refAlleles); + } else { + builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL)); + } + + // move the GQ to RGQ builder.noGQ(); attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ()); - } - - //keep 0 depth samples and 0 GQ samples as no-call - if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) { - final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference()); - builder.alleles(refAlleles); + } else { + builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL)); + builder.noGQ().noDP(); } // also, the PLs are technically no longer usable @@ -494,8 +500,8 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final * Does this genotype look like it has no reads and should be excluded from annotations? */ static boolean excludeFromAnnotations(Genotype oldGT) { - return oldGT.isHomRef() && !oldGT.hasPL() - && ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP()) + return (oldGT.isHomRef() || oldGT.isNoCall()) + && (!oldGT.hasDP() || oldGT.getDP() == 0) && oldGT.hasGQ() && oldGT.getGQ() == 0; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java index 6c36592b6ae..16b5142fb84 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java @@ -4,7 +4,7 @@ import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import org.aeonbits.owner.util.Collections; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.argparser.ExperimentalFeature; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java index f57d50a7cad..9484da25a9a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java @@ -439,7 +439,8 @@ protected static void removeStaleAttributesAfterMerge(final Map attributes.remove(GATKVCFConstants.MLE_ALLELE_COUNT_KEY); attributes.remove(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY); attributes.remove(VCFConstants.END_KEY); - attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); 
//median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT + attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); + attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT } /** @@ -579,6 +580,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc, final int ploidy = g.getPloidy(); final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g); if (!doSomaticMerge) { + //do attribute subsetting if (g.hasPL() || g.hasAD()) { int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false); if (g.hasPL()) { @@ -590,8 +592,10 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc, if (g.hasAD()) { genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles)); } + + } //clean up low confidence hom refs for better annotations later - } else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) { + if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) { genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); } } @@ -658,7 +662,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc, GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), genotypeBuilder, assignmentMethod, g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, - targetAlleles, originalGTAlleles, null); + targetAlleles, new GenotypeBuilder(g).alleles(originalGTAlleles).make(), null); mergedGenotypes.add(genotypeBuilder.make()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java index b0ff4985cec..7ba0a0baa24 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java @@ -3,14 +3,19 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import java.util.*; public final class AnnotationUtils { + + public static final String ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX = "AS_"; public static final String ALLELE_SPECIFIC_RAW_DELIM = "|"; public static final String ALLELE_SPECIFIC_REDUCED_DELIM = ","; public static final String ALLELE_SPECIFIC_SPLIT_REGEX = "\\|"; //String.split takes a regex, so we need to escape the pipe @@ -74,16 +79,23 @@ public static List decodeAnyASList( final String somethingList) { * @param annotation the annotation to be tested * @return true if the annotation is expected to have values per-allele */ - public static boolean isAlleleSpecific(final InfoFieldAnnotation annotation) { + public static boolean isAlleleSpecific(final VariantAnnotation annotation) { return annotation instanceof AlleleSpecificAnnotation; } + 
public static boolean isAlleleSpecificGatkKey(final String annotationKey) { + final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey); + return header.getCountType().equals(VCFHeaderLineCount.A) || + header.getCountType().equals(VCFHeaderLineCount.R) || + annotationKey.startsWith(ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX); + } + /** - * Handles all the Java and htsjdk parsing shenanigans - * @param rawDataString should not have surrounding brackets + * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString + * @param rawDataString may have surrounding brackets, with raw delimiter * @return */ - public static List getAlleleLengthListOfString(String rawDataString) { + public static List getAlleleLengthListOfStringFromRawData(String rawDataString) { if (rawDataString == null) { return Collections.emptyList(); } @@ -93,6 +105,21 @@ public static List getAlleleLengthListOfString(String rawDataString) { return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data } + /** + * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString + * @param dataString may have surrounding brackets, with reduced delimieter + * @return + */ + public static List getAlleleLengthListOfString(String dataString) { + if (dataString == null) { + return Collections.emptyList(); + } + if (dataString.startsWith("[")) { + dataString = dataString.substring(1, dataString.length() - 1).replaceAll("\\s", ""); + } + return Arrays.asList(dataString.split(ALLELE_SPECIFIC_REDUCED_DELIM, -1)); //-1 to keep empty data + } + static public String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods likelihoods) { final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() + " and possibly subsequent"); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java index 931a6bb5256..8ea5771b78e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java @@ -4,12 +4,13 @@ import htsjdk.samtools.util.Locatable; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.mutable.MutableInt; import org.broadinstitute.barclay.argparser.Argument; import org.apache.commons.lang3.tuple.Triple; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotation; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.haplotype.Event; @@ -27,7 +28,7 @@ */ @DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Describe the complexity of an assembly region") -public class AssemblyComplexity implements JumboInfoAnnotation { +public class AssemblyComplexity implements JumboInfoAnnotation, AlleleSpecificAnnotation { @Argument(fullName = 
"assembly-complexity-reference-mode", doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes", @@ -189,5 +190,4 @@ private static int uniqueVariants(final Haplotype hap1, final Haplotype hap2, fi private static int editDistance(final Haplotype hap1, final Haplotype hap2, final int excludedPosition) { return uniqueVariants(hap1, hap2, excludedPosition) + uniqueVariants(hap2, hap1, excludedPosition); } - } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java deleted file mode 100644 index c870d9693ee..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java +++ /dev/null @@ -1,175 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.annotator; - - -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; -import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang.StringUtils; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; -import org.broadinstitute.hellbender.engine.FeatureContext; -import org.broadinstitute.hellbender.engine.ReferenceContext; -import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; -import org.broadinstitute.hellbender.utils.haplotype.Haplotype; -import org.broadinstitute.hellbender.utils.help.HelpConstants; -import org.broadinstitute.hellbender.utils.read.AlignmentUtils; -import org.broadinstitute.hellbender.utils.read.Fragment; -import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; -import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; -import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; - -import java.util.*; -import java.util.stream.Collectors; - -/** - * For each sample and for each allele a list feature vectors of supporting reads - * In order to reduce the number of delimiter characters, we flatten featurized reads. 
For example, suppose allele 1 has featurized reads - * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is - * 1,2,3,4|5,6,7,8 - */ -@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, - summary="Featurized read sets for Mutect3 training data") -public class FeaturizedReadSets implements JumboGenotypeAnnotation { - public static final int DEFAULT_BASE_QUALITY = 25; - - private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE; - - public static final int FEATURES_PER_READ = 11; - - private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA); - - // downsample ref reads to this count if needed - private final int maxRefCount; - - public FeaturizedReadSets(final int maxRefCount) { - this.maxRefCount = maxRefCount; - } - - public FeaturizedReadSets() { - this(DEFAULT_MAX_REF_COUNT); - } - - @Override - public void annotate(final ReferenceContext ref, - final FeatureContext features, - final VariantContext vc, - final Genotype g, - final GenotypeBuilder gb, - final AlleleLikelihoods likelihoods, - final AlleleLikelihoods fragmentLikelihoods, - final AlleleLikelihoods haplotypeLikelihoods) { - Utils.nonNull(vc); - Utils.nonNull(g); - Utils.nonNull(gb); - - if ( likelihoods == null) { - return; - } - - final List >> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()), - likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE); - - // flatten twice: over all reads supporting each allele and over all alleles - // we can partition by allele with the countsInAlleleOrder annotation - // and by read using the constant feature vector size - final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream() - .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream)) - .mapToInt(n -> n) - .toArray(); - - final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray(); - - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder); - gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder); - } - - public static List
>> getReadVectors(final VariantContext vc, - final Collection
samples, - final AlleleLikelihoods likelihoods, - final AlleleLikelihoods haplotypeLikelihoods, - final int refDownsample, - final int altDownsample) { - return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap()); - } - - // returns Lists (in allele order) of lists of read vectors supporting each allele - public static List >> getReadVectors(final VariantContext vc, - final Collection
samples, - final AlleleLikelihoods likelihoods, - final AlleleLikelihoods haplotypeLikelihoods, - final int refDownsample, - final int altDownsample, - final Map altDownsampleMap) { - final Map > readsByAllele = likelihoods.alleles().stream() - .collect(Collectors.toMap(a -> a, a -> new ArrayList<>())); - - samples.stream().flatMap(s -> likelihoods.bestAllelesBreakingTies(s).stream()) - .filter(ba -> ba.isInformative()) - .forEach(ba -> readsByAllele.get(ba.allele).add(ba.evidence)); - - // downsample if necessary - final Allele refAllele = likelihoods.alleles().stream().filter(Allele::isReference).findFirst().get(); - for (final Allele allele : likelihoods.alleles()) { - final int downsample = allele.isReference() ? refDownsample : altDownsampleMap.getOrDefault(allele, altDownsample); - if (readsByAllele.get(allele).size() > downsample) { - Collections.shuffle(readsByAllele.get(allele)); - readsByAllele.put(allele, readsByAllele.get(allele).subList(0, downsample)); - } - } - - final Map bestHaplotypes = new HashMap<>(); - samples.stream().flatMap(s -> haplotypeLikelihoods.bestAllelesBreakingTies(s).stream()) - .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele))); - - return vc.getAlleles().stream() - .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList())) - .collect(Collectors.toList()); - } - - - @Override - public List getKeyNames() { - return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY); - } - - private static List featurize(final GATKRead read, final VariantContext vc, final Map bestHaplotypes) { - final List result = new ArrayList<>(); - result.add(read.getMappingQuality()); - result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY)); - result.add(read.isFirstOfPair() ? 1 : 0); - result.add(read.isReverseStrand() ? 
1 : 0); - - // distances from ends of read - final int readPosition = ReadPosition.getPosition(read, vc).orElse(0); - result.add(readPosition); - result.add(read.getLength() - readPosition); - - - result.add(Math.abs(read.getFragmentLength())); - - // distances from ends of fragment - final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); - final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); - result.add(vc.getStart() - fragmentStart); - result.add(fragmentEnd - vc.getEnd()); - - // mismatches versus best haplotype - final Haplotype haplotype = bestHaplotypes.get(read); - final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); - final GATKRead copy = read.copy(); - copy.setCigar(readToHaplotypeAlignment.getCigar()); - final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; - result.add(mismatchCount); - - final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); - result.add((int) indelsVsBestHaplotype); - Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features"); - - return result; - } - -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java index addbd29dfd7..96a749de50c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java @@ -4,7 +4,7 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.mutable.MutableInt; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java index 9c2d99b97b7..3ef5ade7148 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java @@ -4,7 +4,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.utils.Utils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java index e83be35f50f..930a6a51033 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -145,6 +145,10 @@ public List getInfoAnnotations() { return Collections.unmodifiableList(infoAnnotations); } + public List getJumboInfoAnnotations() { + return Collections.unmodifiableList(jumboInfoAnnotations); + } + /** * * @param infoAnnotationClassName the name of the Java class, NOT the annotation VCF key diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java index 9d78efd6d1b..f8bb3b3cea9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java @@ -7,7 +7,7 @@ import htsjdk.variant.vcf.VCFCompoundHeaderLine; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java index 5b4af7ec0ff..ef01d6fbf62 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java @@ -5,7 +5,7 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.engine.ReferenceContext; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java index d8466d0d997..b521f157d28 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java @@ -153,7 +153,7 @@ public Map finalizeRawData(final VariantContext vc, final Varia } protected void parseRawDataString(ReducibleAnnotationData > myData) { - List
values = AnnotationUtils.getAlleleLengthListOfString(myData.getRawData()); + List values = AnnotationUtils.getAlleleLengthListOfStringFromRawData(myData.getRawData()); if (values.size() != myData.getAlleles().size()) { throw new IllegalStateException("Number of alleles and number of allele-specific entries do not match. " + "Allele-specific annotations should have an entry for each allele including the reference."); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java index 0c4a07f015a..11890925065 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java @@ -4,6 +4,7 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.walkers.annotator.InfoFieldAnnotation; @@ -33,7 +34,6 @@ * are accumulated as well. */ public abstract class FlowAnnotatorBase implements InfoFieldAnnotation { - private final static Logger logger = LogManager.getLogger(FlowAnnotatorBase.class); protected final OneShotLogger flowMissingOneShotLogger = new OneShotLogger(FlowAnnotatorBase.class); @@ -149,7 +149,7 @@ private String establishReadGroupFlowOrder(final LocalContext localContext, fina // if here, no flow order was found. may we use a default? if ( isActualFlowOrderRequired() ) { localContext.generateAnnotation = false; - flowMissingOneShotLogger.warn("this.getClass().getSimpleName() + \" annotation will not be calculated, no '\" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + \"' argument provided\""); + flowMissingOneShotLogger.warn(this.getClass().getSimpleName() + " annotation will not be calculated, no '" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + "' argument provided"); } return FlowBasedRead.DEFAULT_FLOW_ORDER; @@ -201,7 +201,7 @@ protected void variantType(final VariantContext vc, final LocalContext localCont if (isSpecial(alleles.get(i))){ continue; } - if ((localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){ + if ((localContext.hmerIndelLength==null) || (localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){ isHmer=false; } } @@ -249,7 +249,12 @@ protected void isHmerIndel(final VariantContext vc, final LocalContext localCont // get byte before and after final byte before = getReferenceNucleotide(localContext, vc.getStart() - 1); final byte[] after = getReferenceHmerPlus(localContext, vc.getEnd() + 1, MOTIF_SIZE); - + if (after.length==0){ + flowMissingOneShotLogger.warn("Failed to get hmer from reference, probably because the " + + "variant is very close to the end of the chromosome, isHmerIndel and RightMotif annotations will not be calculated. " + + "Variant position: " + vc.getContig() + ":" + vc.getEnd() + 1); + return; + } // build two haplotypes. 
add byte before and after final byte[] refHap = buildHaplotype(before, ref.getBases(), after); final byte[] altHap = buildHaplotype(before, alt.getBases(), after); @@ -337,6 +342,12 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon } String motif = getRefMotif(localContext, vc.getStart() - MOTIF_SIZE, MOTIF_SIZE); + if (motif.length() != MOTIF_SIZE){ + flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because the variant is very close to the " + + "start of the chromosome. LeftMotif annotation will not be calculated. " + + "Variant position: " + vc.getContig() + ":" + vc.getStart()); + return; + } if ( a.length() != refLength ) { motif = motif.substring(1) + vc.getReference().getBaseString().substring(0, 1); } @@ -349,8 +360,13 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon protected void getRightMotif(final VariantContext vc, final LocalContext localContext) { final int refLength = vc.getReference().length(); - final String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE); - + String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE); + if (motif.length() != MOTIF_SIZE){ + flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because " + + "the variant is close to the end of the chromosome. RightMotif annotation will not be calculated. " + + "Variant position: " + vc.getContig() + ":" + vc.getStart()); + return; + } // fill empty entries (non indel alelles) for ( int i = 0 ; i < localContext.rightMotif.size() ; i++ ) { if ( localContext.rightMotif.get(i) == null ) { @@ -365,6 +381,11 @@ protected void gcContent(final VariantContext vc, final LocalContext localContex final int begin = vc.getStart() - (GC_CONTENT_SIZE / 2); final String seq = getRefMotif(localContext, begin + 1, GC_CONTENT_SIZE); + if ( seq.length() != GC_CONTENT_SIZE ) { + flowMissingOneShotLogger.warn("gcContent will not be calculated at position " + vc.getContig() + ":" + vc.getStart() + + " because the variant is too close to the edge of the chromosome"); + return; + } int gcCount = 0; for ( byte b : seq.getBytes() ) { if ( b == 'G' || b == 'C' ) { @@ -423,11 +444,11 @@ protected void cycleSkip(final VariantContext vc, final LocalContext localContex localContext.attributes.put(GATKVCFConstants.FLOW_CYCLESKIP_STATUS, css); } - // get a single nucleoid from reference + // get a single nucleotid from reference private byte getReferenceNucleotide(final LocalContext localContext, final int start) { final int index = start - localContext.ref.getWindow().getStart(); final byte[] bases = localContext.ref.getBases(); - Utils.validIndex(index, bases.length); + Utils.validIndex(index, bases.length); // do not catch, if here the location is outside of the reference, there is a problem! return bases[index]; } @@ -435,7 +456,13 @@ private byte getReferenceNucleotide(final LocalContext localContext, final int s private byte[] getReferenceHmerPlus(final LocalContext localContext, final int start, final int additional) { int index = start - localContext.ref.getWindow().getStart(); final byte[] bases = localContext.ref.getBases(); - Utils.validIndex(index, bases.length); + try { + Utils.validIndex(index, bases.length); + } catch (IllegalArgumentException e) { + flowMissingOneShotLogger.warn("Failed to get hmer from reference, too close to the edge. 
" + + "Position: " + localContext.ref.getContig() + ":" + index); + return new byte[0]; + } // get hmer final StringBuilder sb = new StringBuilder(); @@ -457,8 +484,12 @@ private String getRefMotif(final LocalContext localContext, final int start, fin final byte[] bases = localContext.ref.getBases(); final int startIndex = start - localContext.ref.getWindow().getStart(); final int endIndex = startIndex + length; - Utils.validIndex(startIndex, bases.length); - Utils.validIndex(endIndex-1, bases.length); + try { + Utils.validIndex(startIndex, bases.length); + Utils.validIndex(endIndex-1, bases.length); + } catch (IllegalArgumentException e) { + return ""; + } return new String(Arrays.copyOfRange(bases, startIndex, endIndex)); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java index 44b07f6fcfe..93420133913 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java @@ -1,7 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.contamination; import htsjdk.samtools.util.OverlapDetector; -import org.apache.commons.lang.mutable.MutableDouble; +import org.apache.commons.lang3.mutable.MutableDouble; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.optim.univariate.UnivariatePointValuePair; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java index 1a023488270..607f6691c58 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java @@ -13,7 +13,7 @@ import java.util.stream.IntStream; public class ContaminationSegmenter { - public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.between(0.1, 0.9); + public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.of(0.1, 0.9); public static final double KERNEL_SEGMENTER_LINEAR_COST = 1.0; public static final double KERNEL_SEGMENTER_LOG_LINEAR_COST = 1.0; public static final int KERNEL_SEGMENTER_DIMENSION = 100; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java index 9c4f09d1bcd..9d6c40a691c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java @@ -9,6 +9,7 @@ import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup; import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; @@ -100,6 +101,13 @@ * file that have AF of 0.01 or more. 
* * + * + * Finally, for those using mappers other than bwa mem or dragen-os, the {@code --minimum-mapping-quality} threshold is + * set to 50, which limits the usable reads the tool considers for generating pileups. Certain mappers are known to + * assign scores less than this threshold even for unique mappings. If you observe only empty results in your + * summary file, please adjust the {@code --minimum-mapping-quality} parameter according to your input files. + *
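+ * An illustrative invocation (the file names, population resource and interval list below are placeholders)
+ * that relaxes the default mapping-quality filter for such mappers:
+ * <pre>
+ *   gatk GetPileupSummaries \
+ *     -I sample.bam \
+ *     -V af-only-population-resource.vcf.gz \
+ *     -L common_sites.interval_list \
+ *     -O sample.pileups.table \
+ *     --minimum-mapping-quality 20
+ * </pre>
+ *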
+ * */ @CommandLineProgramProperties( summary = "Tabulates pileup metrics for inferring contamination", @@ -112,8 +120,7 @@ public class GetPileupSummaries extends LocusWalker { public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency"; public static final String MAX_SITE_AF_SHORT_NAME = "max-af"; public static final String MIN_SITE_AF_SHORT_NAME = "min-af"; - public static final String MIN_MAPPING_QUALITY_LONG_NAME = "min-mapping-quality"; - public static final String MIN_MAPPING_QUALITY_SHORT_NAME = "mmq"; + private static final double DEFAULT_MIN_POPULATION_AF = 0.01; private static final double DEFAULT_MAX_POPULATION_AF = 0.2; @@ -137,9 +144,6 @@ public class GetPileupSummaries extends LocusWalker { doc = "Maximum population allele frequency of sites to consider.", optional = true) private double maxPopulationAlleleFrequency = DEFAULT_MAX_POPULATION_AF; - @Argument(fullName = MIN_MAPPING_QUALITY_LONG_NAME, shortName = MIN_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum read mapping quality", optional = true) - private int minMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY; - private boolean sawVariantsWithoutAlleleFrequency = false; private boolean sawVariantsWithAlleleFrequency = false; @@ -168,6 +172,7 @@ public boolean requiresFeatures() { @Override public ListgetDefaultReadFilters() { final List filters = new ArrayList<>(); + filters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY)); filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE); filters.add(ReadFilterLibrary.MAPPING_QUALITY_NOT_ZERO); filters.add(ReadFilterLibrary.MAPPED); @@ -208,8 +213,7 @@ public void apply(AlignmentContext alignmentContext, ReferenceContext referenceC final VariantContext vc = vcs.get(0); if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) { - final ReadPileup pileup = alignmentContext.getBasePileup() - .makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality); + final ReadPileup pileup = alignmentContext.getBasePileup(); try { writer.writeRecord(new PileupSummary(vc, pileup)); } catch (final IOException ex) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java new file mode 100644 index 00000000000..83ac040011c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java @@ -0,0 +1,626 @@ +package org.broadinstitute.hellbender.tools.walkers.featuremapping; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMUtils; +import org.apache.commons.lang3.StringUtils; +import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.engine.ReadWalker; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.groundtruth.SeriesStats; +import org.broadinstitute.hellbender.utils.read.*; + +import 
java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +@CommandLineProgramProperties( + summary = "This program converts the flow qualities that Ultima Genomics CRAM reports to more conventional base qualities. " + + "Specifically, the generated quality will report the probability that a specific base is a sequencing error mismatch, " + + "while auxilary tags qa, qt, qg, qc report specific probability that a specific base X is a A->X error. " + + "Since mismatch error in flow-based chemistries can only occur as a result of several indel errors, " + + "we implemented several strategies to estimate the probability of a mismatch which can be specified" + + "using the svnq-mode parameter: " + + "Legacy - the quality value from flow matrix is used. " + + "Optimistic - assuming that the probability of the indel errors are p1 and p2, then snvq=p1*p2 - assuming they always coincide. " + + "Pessimistic - snvq=(1-p1)*(1-p2) - assuming they never coincide. " + + "Geometric - snvq=sqrt(Optimistic*Pessimistic) - i.e. the geometric mean of the optimistic and Pessimistic modes. " + + "The Geometric is set as the default mode", + oneLineSummary = "Add SNV Quality to the flow-based CRAM", + programGroup = FlowBasedProgramGroup.class +) + +@DocumentedFeature +@ExperimentalFeature +public final class AddFlowSNVQuality extends ReadWalker { + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "File to which reads should be written") + @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) + public GATKPath output = null; + private SAMFileGATKReadWriter outputWriter; + + @ArgumentCollection + public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); + + @ArgumentCollection + public AddFlowSNVQualityArgumentCollection aqArgs = new AddFlowSNVQualityArgumentCollection(); + + public static final char PHRED_ASCII_BASE = '!'; + + public static final int ERROR_PROB_BAND_1LESS = 0; + public static final int ERROR_PROB_BAND_KEY = 1; + public static final int ERROR_PROB_BAND_1MORE = 2; + public static final int ERROR_PROB_BANDS = 3; + + public double minLikelihoodProbRate = 1e-6; + public int maxQualityScore = 60; + + // locals + private SeriesStats inputQualStats = new SeriesStats(); + private SeriesStats outputBQStats = new SeriesStats(); + private SeriesStats outputQAltStats = new SeriesStats(); + private SeriesStats outputQCalledStats = new SeriesStats(); + private SeriesStats outputSumPStats = new SeriesStats(); + + // private class to hold the base probabilities and SNVQ probabilties for a read + class ReadProbs { + double[] baseProbs; + double[][] snvqProbs; // length of first dimension is flow order length + } + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + outputWriter = createSAMWriter(output, true); + } + + @Override + public void closeTool() { + super.closeTool(); + if ( outputWriter != null ) { + outputWriter.close(); + } + + try { + if ( aqArgs.debugCollectStatsInto != null ) + printStats(aqArgs.debugCollectStatsInto); + } catch (IOException e) { + throw new GATKException("", e); + } + } + + @Override + public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { + + // include supplementary alignments? + if ( read.isSupplementaryAlignment() && !aqArgs.keepSupplementaryAlignments ) { + return; + } + + // include qc-failed reads? 
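+        // (illustrative note) reads that fail the vendor quality check (SAM flag 0x200) are kept by default,
+        // because include-qc-failed-read defaults to true; set it to false to drop them here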
+ if ( read.failsVendorQualityCheck() && !aqArgs.includeQcFailedReads ) { + return; + } + + // collect input stats + if ( aqArgs.debugCollectStatsInto != null ) { + collectInputStats(read); + } + + // add SNVQ attributes + addBaseQuality(read, getHeaderForReads(), aqArgs.maxPhredScore, fbargs); + + // collect output stats + if ( aqArgs.debugCollectStatsInto != null ) { + collectOutputStats(read); + if ( aqArgs.debugReadName.size() != 0 && aqArgs.debugReadName.contains(read.getName()) ) { + dumpOutputRead(read); + } + } + + // write read to output + outputWriter.addRead(read); + } + + private void collectInputStats(GATKRead read) { + for ( byte q : read.getBaseQualitiesNoCopy() ) { + inputQualStats.add(q); + } + } + + private void collectOutputStats(GATKRead read) { + if ( aqArgs.outputQualityAttribute != null ) { + if (read.hasAttribute(aqArgs.outputQualityAttribute)) { + for (byte q : read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes()) { + outputBQStats.add(SAMUtils.fastqToPhred((char) q)); + } + } + } else { + for (byte q : read.getBaseQualitiesNoCopy()) { + outputBQStats.add(q); + } + } + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); + final byte[] bases = read.getBasesNoCopy(); + final double[] sumP = new double[bases.length]; + for ( int i = 0 ; i < 4 ; i++ ) { + byte altBase = rgInfo.flowOrder.getBytes()[i]; + String attrValue = read.getAttributeAsString(attrNameForNonCalledBase(altBase)); + int ofs = 0; + for ( byte q : attrValue.getBytes() ) { + if ( bases[ofs] != altBase ) { + outputQAltStats.add(SAMUtils.fastqToPhred((char)q)); + } else { + outputQCalledStats.add(SAMUtils.fastqToPhred((char)q)); + } + sumP[ofs] += Math.pow(10.0, SAMUtils.fastqToPhred((char)q) / -10.0); + ofs++; + + } + } + for ( double p : sumP ) { + outputSumPStats.add(p); + } + } + + // dump read as a csv for easier analysis + private void dumpOutputRead(GATKRead read) { + + try { + // open file + final String fname = aqArgs.debugCollectStatsInto + "." + read.getName() + ".csv"; + logger.info("dumping read into: " + fname); + final PrintWriter pw = new PrintWriter(fname); + + // write header + final StringBuilder hdr = new StringBuilder(); + hdr.append("pos,base,qual,tp,t0,bq"); + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); + for (int i = 0; i < 4; i++) { + hdr.append(","); + hdr.append(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i))); + } + hdr.append(",qCalled"); + pw.println(hdr); + + // access data + final byte[] bases = read.getBasesNoCopy(); + final byte[] quals = read.getBaseQualitiesNoCopy(); + final byte[] tp = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_TAG_NAME); + final byte[] t0 = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_T0_TAG_NAME); + final byte[] bq = (aqArgs.outputQualityAttribute != null) + ? 
read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes() + : null; + final byte[][] qX = new byte[4][]; + for (int i = 0; i < 4; i++) { + qX[i] = read.getAttributeAsString(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i))).getBytes(); + } + + // write lines + List line = new LinkedList<>(); + for (int pos = 0; pos < bases.length; pos++) { + line.clear(); + + // position + line.add(Integer.toString(pos)); + + // base, qual + line.add(Character.toString(bases[pos])); + line.add(Integer.toString(quals[pos])); + + // tp,t0,bq + line.add(Integer.toString(tp[pos])); + line.add(Integer.toString(SAMUtils.fastqToPhred((char)t0[pos]))); + if ( bq != null ) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char) bq[pos]))); + } + + // qX + int calledIndex = -1; + for (int i = 0; i < 4; i++) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[i][pos]))); + if ( bases[pos] == rgInfo.flowOrder.charAt(i) ) { + calledIndex = i; + } + } + + // qCalled + if ( calledIndex >= 0 ) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[calledIndex][pos]))); + } else { + line.add("-1"); + } + + // write the line + pw.println(StringUtils.join(line, ',')); + } + + // close file + pw.close(); + } catch (IOException e) { + throw new GATKException("", e); + } + } + + private void printStats(final String fname) throws IOException { + + inputQualStats.csvWrite(fname + ".inputQual.csv"); + outputBQStats.csvWrite(fname + ".outputBQ.csv"); + outputQAltStats.csvWrite(fname + ".outputQAlt.csv"); + outputQCalledStats.csvWrite(fname + ".outputQCalled.csv"); + outputSumPStats.csvWrite(fname + ".outputSumP.csv"); + } + + static public String attrNameForNonCalledBase(byte nonCalledBase) { + return attrNameForNonCalledBase((char)nonCalledBase); + } + + static public String attrNameForNonCalledBase(char nonCalledBase) { + return "q" + Character.toLowerCase(nonCalledBase); + } + + public void addBaseQuality(final GATKRead read, final SAMFileHeader hdr, double maxPhredScore, FlowBasedArgumentCollection fbargs) { + + // take in phred score limit + if ( !Double.isNaN(maxPhredScore) ) { + maxQualityScore = (int)maxPhredScore; + minLikelihoodProbRate = Math.pow(10, -maxPhredScore / 10.0); + } + + // convert to a flow base read + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(hdr, read); + final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final int flowOrderLength = FlowBasedReadUtils.calcFlowOrderLength(rgInfo.flowOrder); + + // generate base and snvq probabilities for the read + final ReadProbs readProbs = generateFlowReadBaseAndSNVQErrorProbabilities(fbRead, flowOrderLength, rgInfo.flowOrder.getBytes()); + + // install in read + if ( aqArgs.outputQualityAttribute != null ) { + read.setAttribute(aqArgs.outputQualityAttribute, new String(convertErrorProbToFastq(readProbs.baseProbs))); + } else { + read.setBaseQualities(convertErrorProbToPhred(readProbs.baseProbs)); + } + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + final String name = AddFlowSNVQuality.attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i)); + read.setAttribute(name, new String(convertErrorProbToFastq(readProbs.snvqProbs[i]))); + } + } + + // Not using SamUtils function since normally an error probability can not be zero. + // still, this method is called to convert base quality as well as snvq, which is computed. + // the following check is a safety, in case snvq produces a zero. 
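+    // Illustrative values (assumed, not taken from the tool): errorProb = 1e-3 maps to phred 30, which is written
+    // as the FASTQ character '?'; an errorProb of exactly 0 is clamped to maxQualityScore (60 by default).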
+ private char[] convertErrorProbToFastq(double[] errorProb) { + + byte[] phred = convertErrorProbToPhred(errorProb); + return SAMUtils.phredToFastq(phred).toCharArray(); + } + + // Not using SamUtils function since normally an error probability can not be zero. + // still, this method is called to convert base quality as well as snvq, which is computed. + // the following check is a safety, in case snvq produces a zero. + private byte[] convertErrorProbToPhred(double[] errorProb) { + + final byte[] phred = new byte[errorProb.length]; + for ( int i = 0 ; i < errorProb.length ; i++ ) { + + if ( errorProb[i] == 0 ) { + phred[i] = (byte)maxQualityScore; + } else { + final double p = errorProb[i]; + phred[i] = (byte)Math.round(-10 * Math.log10(p)); + } + } + return phred; + } + + /** + * generate base and snvq probabilties for a read. + * + * @param fbRead a flow based read + * @param flowOrderLength number of bases in flow order (essentially number of valid base values) + * @param flowOrder the flow order itself (which can be the size of flowOrderLength or a repeat of it + * + * @return an instance of a private class containing the base probabilities as well as the snvq probabilities + */ + private ReadProbs generateFlowReadBaseAndSNVQErrorProbabilities(final FlowBasedRead fbRead, final int flowOrderLength, byte[] flowOrder) { + + /** + * access key and error probabilities + * for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix} + */ + final int[] key = fbRead.getKey(); + final double[][] errorProbBands = extractErrorProbBands(fbRead, minLikelihoodProbRate); + + // allocate returned prob arrays + final double[] baseProbs = new double[fbRead.getBasesNoCopy().length]; + final double[][] snvqProbs = new double[flowOrderLength][]; + for ( int i = 0 ; i < snvqProbs.length ; i++ ) { + snvqProbs[i] = new double[baseProbs.length]; + } + + // loop over hmers via flow key + int base = 0; + Map allBaseProb0 = new LinkedHashMap<>(); + Map allBaseProb1 = new LinkedHashMap<>(); + + for ( int flow = 0 ; flow < key.length ; flow++ ) { + if ( key[flow] != 0 ) { + + // establish initial stat + allBaseProb0.clear(); + allBaseProb1.clear(); + int flow_i = (flow % flowOrderLength); + + // establish hmer quality + final int hmerLength = key[flow]; + final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength, flowOrder, allBaseProb0, allBaseProb1); + + // install value in first byte of the hmer + baseProbs[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + if ( allBaseProb0.containsKey(flowOrder[i]) ) { + snvqProbs[i][base - 1] = allBaseProb0.get(flowOrder[i]); + } else if ( i != flow_i ) { + snvqProbs[i][base - 1] = minLikelihoodProbRate; + } + } + + // for hmers longer than 1 + if ( hmerLength > 1 ) { + + // skip all but last (leave with zero error probability) + base += (hmerLength - 2); + + // fill last base from computed error probability + baseProbs[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1 + + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + if ( allBaseProb1.containsKey(flowOrder[i]) ) { + final double p = allBaseProb1.get(flowOrder[i]); + for ( int j = 0 ; j < hmerLength - 1 ; j++ ) { + snvqProbs[i][base - 1 - j] = (j == 0) ? 
p : minLikelihoodProbRate; // all but last get the min prob + } + } else if ( i != flow_i ) { + for ( int j = 0 ; j < hmerLength - 1 ; j++ ) { + snvqProbs[i][base - 1 - j] = minLikelihoodProbRate; + } + } + } + } + + // override result for the last base with the original hmer error probability + if ( base == baseProbs.length ) { + baseProbs[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; + } + } + } + + // adjust probability of called bases so that sum will be 1, also enforce min prob + final byte[] bases = fbRead.getBasesNoCopy(); + for ( int ofs = 0 ; ofs < bases.length ; ofs++ ) { + + // go through alt bases and accumulate p, find out index of called bases (in flow order) + final byte calledBase = bases[ofs]; + double altP = 0; + int calledIndex = -1; + for (int i = 0; i < flowOrderLength; i++) { + if ( calledBase != flowOrder[i] ) { + snvqProbs[i][ofs] = Math.max(minLikelihoodProbRate, snvqProbs[i][ofs]); + altP += snvqProbs[i][ofs]; + } else { + calledIndex = i; + } + } + if ( calledBase < 0 ) { + throw new GATKException(String.format("failed to locate called base %c in flow order %s", (char)calledBase, flowOrder)); + } + + // install probability in called base slot + snvqProbs[calledIndex][ofs] = Math.max(0, 1 - altP); + + // at this point, bq becomes trivial (?) + baseProbs[ofs] = 1 - snvqProbs[calledIndex][ofs]; + } + + // build return value + ReadProbs readProbs = new ReadProbs(); + readProbs.baseProbs = baseProbs; + readProbs.snvqProbs = snvqProbs; + return readProbs; + } + + // extract error probability bands. middle (1) band is the key prob. + // lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths + private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) { + + // access key + final int[] key = flowRead.getKey(); + + // allocate result + double[][] result = new double[ERROR_PROB_BANDS][]; + for ( int i = 0 ; i < result.length ; i++ ) { + result[i] = new double[key.length]; + } + + for ( int i = 0 ; i < key.length ; i++ ) { + + // extract key probability + result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue); + + // -1 + if ( key[i] > 0 ) { + result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue); + } else { + result[ERROR_PROB_BAND_1LESS][i] = minValue; + } + + // +1 + if ( key[i] < flowRead.getMaxHmer() ) { + result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue); + } else { + result[ERROR_PROB_BAND_1MORE][i] = minValue; + } + } + + return result; + } + + @VisibleForTesting + protected double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, + final int flowOrderLength, byte[] flowOrder, + Map allBaseProb0, Map allBaseProb1) { + + // result is left/right error probabilities + final double[] errorProbs = new double[2]; + final int hmerLength = key[flow]; + + errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength, flowOrder, allBaseProb0); + if ( hmerLength != 1 ) { + errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength, flowOrder, allBaseProb1); + } + + return errorProbs; + } + + private double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, + final int flowOrderLength, final byte[] flowOrder, final Map allBaseProb) { + + // create a key slice of the area around the flow/hmer. 
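+        // e.g. (illustrative numbers) with flowOrderLength = 4, flow = 10 and key.length = 20, the slice built
+        // below covers flows 7..13, i.e. minIndex = 7 and maxIndex = 13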
+ final int minIndex = Math.max(flow - (flowOrderLength - 1), 0); + final int maxIndex = Math.min(flow + (flowOrderLength - 1), key.length - 1); + final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1); + final int hmerLength = key[flow]; + + // walk the flows towards the side until (and including) the first non-zero key + // on hmers of length 1 we walk both sides + final class SliceInfo { + int[] slice; + byte altByte; + int sideFlow; + } + final List slices = new LinkedList<>(); + final int[] incrs = (hmerLength != 1) + ? new int[] { sideIncr } + : new int[] { sideIncr, -sideIncr}; + for (int incr : incrs) { + for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) { + + // side flow can no overflow the slice + if ( sideFlow < minIndex || sideFlow > maxIndex ) { + break; + } + + // create a alternative key slice by incrementing sideFlow and decrementing flow + final int[] altSlice = Arrays.copyOf(slice, slice.length); + altSlice[sideFlow - minIndex] += 1; + altSlice[flow - minIndex] -= 1; + if ( sliceIsValidForConsideration(altSlice, flowOrderLength) ) { + SliceInfo si = new SliceInfo(); + si.slice = altSlice; + si.altByte = flowOrder[sideFlow % flowOrderLength]; + si.sideFlow = sideFlow; + slices.add(si); + } + + // is the sideFlow (the first encountered) non-zero? if so, break + if (key[sideFlow] != 0) { + break; + } + } + } + + // at this point, we have a list of valid slices. figure out the error probability for each of them + // and compute the base quality + final double keyP = sliceProbs(slice, minIndex, key, errorProbBands, flow, flow)[0]; + double sumP = keyP; + for ( final SliceInfo si : slices ) { + final double[] sliceP = sliceProbs(si.slice, minIndex, key, errorProbBands, flow, si.sideFlow); + if ( allBaseProb != null ) { + allBaseProb.put(si.altByte, getSnvq(sliceP[0], sliceP[1], sliceP[2], aqArgs.snvMode)); + } + sumP += sliceP[0]; + } + final double ep = 1 - (keyP / sumP); + + return ep; + } + + static double getSnvq(final double sliceP, final double p1, final double p2, AddFlowSNVQualityArgumentCollection.SnvqModeEnum snvMode) { + if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Legacy ) { + return sliceP; + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Optimistic ) { + return (p1 * p2); + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Pessimistic ) { + return (1 - (1 - p1) * (1 - p2)); + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Geometric ) { + return Math.sqrt((p1 * p2) * (1 - (1 - p1) * (1 - p2))); + } else { + throw new GATKException("unknown snvqMode: " + snvMode); + } + } + + // compute probability for a slice + private static double[] sliceProbs(final int[] slice, final int minIndex, final int[] key, final double[][] errorProbBands, + final int flow, final int sideFlow) { + + double accumulatedP = 1.0; + double p1 = 0.0; + double p2 = 0.0; + int key_i = minIndex; + for ( int i = 0 ; i < slice.length ; i++, key_i++ ) { + final int hmer = key[key_i]; + final int band; + if ( slice[i] == (hmer - 1) ) { + band = ERROR_PROB_BAND_1LESS; + } else if ( slice[i] == (hmer + 1) ) { + band = ERROR_PROB_BAND_1MORE; + } else if ( slice[i] == hmer ){ + band = ERROR_PROB_BAND_KEY; + } else { + throw new GATKException("slice[i] and hmer are too far apart: " + slice[i] + " " + hmer); + } + final double p = errorProbBands[band][key_i]; + accumulatedP *= p; + + // collect p1/p2 (flow and sideFlow probs) + if ( key_i == flow ) { + p1 = p; + 
} + if ( key_i == sideFlow ) { + p2 = p; + } + } + + return new double[] {accumulatedP, p1, p2}; + } + + static boolean sliceIsValidForConsideration(final int[] slice, final int flowOrderLength) { + + // look for strings of consecutive zeros in length of flowOrderLength - 1 + int consecutiveZeros = 0; + for ( int key : slice ) { + if ( key != 0 ) { + consecutiveZeros = 0; + } else { + consecutiveZeros++; + if ( consecutiveZeros >= (flowOrderLength - 1) ) { + return false; + } + } + } + + // if here, not found -> valid + return true; + } +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java new file mode 100644 index 00000000000..4c20b14e59c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java @@ -0,0 +1,70 @@ +package org.broadinstitute.hellbender.tools.walkers.featuremapping; + +import org.broadinstitute.barclay.argparser.Advanced; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.Hidden; + +import java.io.Serializable; +import java.util.List; + +/** + * Set of arguments for the {@link AddFlowSNVQuality} + */ +public class AddFlowSNVQualityArgumentCollection implements Serializable{ + private static final long serialVersionUID = 1L; + public static final String MAX_PHRED_SCORE_FULL_NAME = "max-phred-score"; + public static final String KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME = "keep-supplementary-alignments"; + public static final String INCLUDE_QC_FAILED_READ_FULL_NAME = "include-qc-failed-read"; + public static final String SNVQ_MODE_FULL_NAME = "snvq-mode"; + public static final String OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME = "output-quality-attribute"; + public static final String DEBUG_READ_NAME_FULL_NAME = "debug-read-name"; + public static final String DEBUG_COLLECT_STATS_INTO_FULL_NAME = "debug-collect-stats-into"; + + public enum SnvqModeEnum { + Legacy, + Optimistic, + Pessimistic, + Geometric + }; + + /** + * maximum value for + * delta in score + **/ + @Argument(fullName = MAX_PHRED_SCORE_FULL_NAME, doc = "Limit value for phred scores", optional = true) + public double maxPhredScore = Double.NaN; + + /** + * keep supplementary alignments? + **/ + @Argument(fullName = KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME, doc = "keep supplementary alignments ?", optional = true) + public boolean keepSupplementaryAlignments = true; + + @Advanced + @Argument(fullName= INCLUDE_QC_FAILED_READ_FULL_NAME, doc = "include reads with QC failed flag", optional = true) + public boolean includeQcFailedReads = true; + + /** + * snvq computation mode + */ + @Argument(fullName = SNVQ_MODE_FULL_NAME, doc = "snvq calculation mode.", optional = true) + public SnvqModeEnum snvMode = SnvqModeEnum.Geometric; + + /** + * By default this tool overwrites the QUAL field with the new qualities. Setting this argument saves the original qualities in the specified SAM tag. + */ + @Argument(fullName = OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME, doc = "alternate SAM tag to put original quality scores instead of overwriting the QUAL field. If not used, QUAL will be overwritten.", optional = true) + public String outputQualityAttribute = null; + + /** + * debug read names? + **/ + @Hidden + @Argument(fullName = DEBUG_READ_NAME_FULL_NAME, doc = "Read names of reads to output details of as part of debugging. 
", optional = true) + public List debugReadName = null; + + @Advanced + @Hidden + @Argument(fullName= DEBUG_COLLECT_STATS_INTO_FULL_NAME, doc = "Statistics about the reads will be output to given filename.", optional = true) + public String debugCollectStatsInto = null; +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java index b3ddc3e21b6..2014ce3030e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java @@ -2,12 +2,14 @@ import htsjdk.samtools.*; import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.SequenceUtil; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.math3.util.Precision; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -81,6 +83,19 @@ @ExperimentalFeature public final class FlowFeatureMapper extends ReadWalker { + static class CopyAttrInfo { + public final String name; + public final VCFHeaderLineType type; + public final String desc; + + public CopyAttrInfo(final String spec) { + final String[] toks = spec.split(","); + name = toks[0]; + type = toks.length > 1 ? VCFHeaderLineType.valueOf(toks[1]) : VCFHeaderLineType.String; + desc = toks.length > 2 ? StringUtils.join(Arrays.copyOfRange(toks, 2, toks.length), ",") : ("copy-attr: " + name); + } + } + private static final Logger logger = LogManager.getLogger(FlowFeatureMapper.class); private static final String VCB_SOURCE = "fm"; @@ -101,8 +116,18 @@ public final class FlowFeatureMapper extends ReadWalker { private static final String VCF_SMQ_RIGHT = "X_SMQ_RIGHT"; private static final String VCF_SMQ_LEFT_MEAN = "X_SMQ_LEFT_MEAN"; private static final String VCF_SMQ_RIGHT_MEAN = "X_SMQ_RIGHT_MEAN"; + private static final String VCF_ADJACENT_REF_DIFF = "X_ADJACENT_REF_DIFF"; + private static final Double LOWEST_PROB = 0.0001; + private static final int VENDOR_QUALITY_CHECK_FLAG = 0x200; + + private static final String INCLUDE_QC_FAILED_READS_FULL_NAME = "include-qc-failed-reads"; + + final private List copyAttrInfo = new LinkedList<>(); + + // order here is according to SequenceUtil.VALID_BASES_UPPER + final private String scoreForBaseNames[] = new String[SequenceUtil.VALID_BASES_UPPER.length]; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -131,6 +156,10 @@ public final class FlowFeatureMapper extends ReadWalker { @Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true) public boolean floorBlocks = false; + @Advanced + @Argument(fullName=INCLUDE_QC_FAILED_READS_FULL_NAME, doc = "include reads with QC failed flag", optional = true) + public boolean includeQcFailedReads = false; + @ArgumentCollection public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); @@ -175,6 +204,9 @@ protected static class MappedFeature 
implements Comparable { int smqLeftMean; int smqRightMean; + double scoreForBase[]; + boolean adjacentRefDiff; + public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases, byte[] refBases, int readBasesOffset, int start, int offsetDelta) { this.read = read; @@ -315,8 +347,20 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the right of the feature")); headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the left of the feature")); headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the right of the feature")); - for ( String name : fmArgs.copyAttr ) { - headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name)); + for ( String spec : fmArgs.copyAttr ) { + final CopyAttrInfo info = new CopyAttrInfo(spec); + headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + info.name, 1, info.type, info.desc)); + copyAttrInfo.add(info); + } + + // validation mode? + if ( fmArgs.reportAllAlts ) { + for ( int baseIndex = 0 ; baseIndex < SequenceUtil.VALID_BASES_UPPER.length ; baseIndex++ ) { + headerInfo.add(new VCFInfoHeaderLine(scoreNameForBase(baseIndex), 1, VCFHeaderLineType.Float, "Base specific mapping score")); + } + } + if ( fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff ) { + headerInfo.add(new VCFInfoHeaderLine(VCF_ADJACENT_REF_DIFF, 1, VCFHeaderLineType.Flag, "Adjacent base filter indication: indel in the adjacent 5 bases to the considered base on the read")); } final VCFHeader vcfHeader = new VCFHeader(headerInfo); @@ -324,6 +368,13 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f return vcfHeader; } + private String scoreNameForBase(int baseIndex) { + if ( scoreForBaseNames[baseIndex] == null ) { + scoreForBaseNames[baseIndex] = VCF_SCORE + "_" + new String(new byte[]{SequenceUtil.VALID_BASES_UPPER[baseIndex]}); + } + return scoreForBaseNames[baseIndex]; + } + @Override public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { @@ -337,6 +388,11 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext, return; } + // include qc-failed reads? + if ( ((read.getFlags() & VENDOR_QUALITY_CHECK_FLAG) != 0) && !includeQcFailedReads ) { + return; + } + // flush qeues up to this read flushQueue(read, referenceContext); @@ -349,6 +405,20 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext, // score the feature fr.score = scoreFeature(fr); + // score for validation mode? 
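+            // (illustrative) if the called base at the feature is 'A', a score is computed for C, G and T only;
+            // the called base's own slot is left as NaN and is skipped later when the per-base score attributes are written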
+ if ( fmArgs.reportAllAlts) { + fr.scoreForBase = new double[SequenceUtil.VALID_BASES_UPPER.length]; + for ( int baseIndex = 0 ; baseIndex < fr.scoreForBase.length ; baseIndex++ ) { + final byte base = SequenceUtil.VALID_BASES_UPPER[baseIndex]; + boolean incl = base != fr.readBases[0]; + if ( incl ) { + fr.scoreForBase[baseIndex] = scoreFeature(fr, base); + } else { + fr.scoreForBase[baseIndex] = Double.NaN; + } + } + } + // emit feature if filters in if ( filterFeature(fr) ) { featureQueue.add(fr); @@ -413,14 +483,17 @@ private void enrichFeature(final MappedFeature fr) { } private double scoreFeature(final MappedFeature fr) { + return scoreFeature(fr, (byte)0); + } + private double scoreFeature(final MappedFeature fr, byte altBase) { // build haplotypes final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), fr.read); - final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder); + final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder, altBase); // create flow read - final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, - rgInfo.maxClass, fbargs); + final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final int diffLeft = haplotypes[0].getStart() - flowRead.getStart() + fr.offsetDelta; final int diffRight = flowRead.getEnd() - haplotypes[0].getEnd(); flowRead.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), false); @@ -524,16 +597,24 @@ public static double computeLikelihoodLocal(final FlowBasedRead read, final Flow return result; } - private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder) { + private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder, byte altBase) { // build bases for flow haplotypes // NOTE!!!: this code assumes length of feature on read and reference is the same // this is true for SNP but not for INDELs - it will have to be re-written! // TODO: write for INDEL - final byte[] bases = fr.read.getBasesNoCopy(); + byte[] bases = fr.read.getBasesNoCopy(); int offset = fr.readBasesOffset; int refStart = fr.start; int refModOfs = 0; + + // install alt base? 
+ byte orgBase = 0; + if ( altBase != 0 ) { + orgBase = fr.refBases[0]; + fr.refBases[0] = altBase; + } + if ( offset > 0 ) { // reach into hmer before offset--; @@ -569,6 +650,11 @@ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final Strin new FlowBasedHaplotype(refHaplotype, flowOrder) }; + // restore changes + if ( altBase != 0 ) { + fr.refBases[0] = orgBase; + } + // return return result; } @@ -590,7 +676,11 @@ private void emitFeature(final MappedFeature fr) { // create alleles final Collection alleles = new LinkedList<>(); - alleles.add(Allele.create(fr.readBases, false)); + if ( fmArgs.reportAllAlts && Arrays.equals(fr.readBases, fr.refBases) ) { + alleles.add(Allele.create("*".getBytes(), false)); + } else { + alleles.add(Allele.create(fr.readBases, false)); + } alleles.add(Allele.create(fr.refBases, true)); // create variant context builder @@ -625,11 +715,34 @@ private void emitFeature(final MappedFeature fr) { vcb.attribute(VCF_SMQ_RIGHT_MEAN, fr.smqRightMean); } - for ( String name : fmArgs.copyAttr ) { - if ( fr.read.hasAttribute(name) ) { - vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name)); + for ( CopyAttrInfo info : copyAttrInfo ) { + if ( fr.read.hasAttribute(info.name) ) { + final String attrName = fmArgs.copyAttrPrefix + info.name; + if ( info.type == VCFHeaderLineType.Integer ) { + vcb.attribute(attrName, fr.read.getAttributeAsInteger(info.name)); + } else if ( info.type == VCFHeaderLineType.Float ) { + vcb.attribute(attrName, fr.read.getAttributeAsFloat(info.name)); + } else { + vcb.attribute(attrName, fr.read.getAttributeAsString(info.name)); + } + } + } + + // validation mode? + if ( fmArgs.reportAllAlts ) { + if ( fr.scoreForBase != null ) { + for (int baseIndex = 0; baseIndex < SequenceUtil.VALID_BASES_UPPER.length; baseIndex++) { + if (!Double.isNaN(fr.scoreForBase[baseIndex])) { + vcb.attribute(scoreNameForBase(baseIndex), String.format("%.5f", fr.scoreForBase[baseIndex])); + } + } } } + if ( fr.adjacentRefDiff ) { + vcb.attribute(VCF_ADJACENT_REF_DIFF, true); + } + + // build it! final VariantContext vc = vcb.make(); // write to file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java index 22c13b00878..03660746d2b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java @@ -32,7 +32,7 @@ enum MappingFeatureEnum { /** * attributes to copy from bam **/ - @Argument(fullName = "copy-attr", doc = "attributes to copy from bam", optional = true) + @Argument(fullName = "copy-attr", doc = "attributes to copy from bam. format , , . 
types: Integer, Float, String, Character, Flag", optional = true) public List copyAttr = new LinkedList<>(); /** @@ -116,4 +116,18 @@ enum MappingFeatureEnum { @Hidden @Argument(fullName = "surrounding-mean-quality-size", doc = "number of bases around the feature to calculate surrounding mean quality", optional = true) public Integer surroundingMeanQualitySize = null; + + /** + * validation mode - if not specified, this feature is off + **/ + @Hidden + @Argument(fullName = "report-all-alts", doc = "In this mode (aka validation mode), every base of every read in the input CRAM and interval is reported, and an X_SCORE value is calculated for all 3 possible alts", optional = true) + public boolean reportAllAlts = false; + + /** + * adjacent-ref-diff mode - if not specified, this feature is off + **/ + @Hidden + @Argument(fullName = "tag-bases-with-adjacent-ref-diff", doc = "In this mode bases that have an adjacent difference from the reference on the same read are not discarded, and tagged with X_ADJACENT_REF_DIFFm", optional = true) + public boolean tagBasesWithAdjacentRefDiff = false; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java index 05c952c317a..3f23cb5a6e6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java @@ -1,7 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.featuremapping; import htsjdk.samtools.CigarElement; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.text.similarity.LevenshteinDistance; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -21,22 +21,34 @@ public class SNVMapper implements FeatureMapper { - final int identBefore; - final int identAfter; + final int surroundBefore; + final int surroundAfter; final int minCigarElementLength; final LevenshteinDistance levDistance = new LevenshteinDistance(); final Integer smqSize; final Integer smqSizeMean; + final boolean ignoreSurround; + final int spanBefore; + final int spanAfter; + + final FlowFeatureMapperArgumentCollection fmArgs; + public SNVMapper(FlowFeatureMapperArgumentCollection fmArgs) { - identBefore = fmArgs.snvIdenticalBases; - identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore; - minCigarElementLength = identBefore + 1 + identAfter; + surroundBefore = fmArgs.snvIdenticalBases; + surroundAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : surroundBefore; smqSize = fmArgs.surroundingMediaQualitySize; smqSizeMean = fmArgs.surroundingMeanQualitySize; + this.fmArgs = fmArgs; + + // ignore surround + ignoreSurround = fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff; + spanBefore = ignoreSurround ? 0 : surroundBefore; + spanAfter = ignoreSurround ? 
0 : surroundAfter; + minCigarElementLength = spanBefore + 1 + spanAfter; // adjust minimal read length - FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter); + FlowBasedRead.setMinimalReadLength(1 + 1 + spanAfter); } @Override @@ -93,30 +105,44 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons if ( length >= minCigarElementLength && cigarElement.getOperator().consumesReadBases() && cigarElement.getOperator().consumesReferenceBases() ) { - readOfs += identBefore; - refOfs += identBefore; - for ( int ofs = identBefore ; ofs < length - identAfter ; ofs++, readOfs++, refOfs++ ) { + readOfs += spanBefore; + refOfs += spanBefore; + for ( int ofs = spanBefore ; ofs < length - spanAfter ; ofs++, readOfs++, refOfs++ ) { - if ( ref[refOfs] != 'N' && bases[readOfs] != ref[refOfs] ) { + if ( ref[refOfs] != 'N' && (fmArgs.reportAllAlts || (bases[readOfs] != ref[refOfs])) ) { // check that this is really a SNV (must be surrounded by identical ref) boolean surrounded = true; - for ( int i = 0 ; i < identBefore && surrounded ; i++ ) { - if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) { + for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) { + final int bIndex = readOfs-1-i; + final int rIndex = refOfs-1-i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - for ( int i = 0 ; i < identAfter && surrounded ; i++ ) { - if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) { + for (int i = 0; i < surroundAfter && surrounded ; i++ ) { + final int bIndex = readOfs+1+i; + final int rIndex = refOfs+1+i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - if ( !surrounded ) { + if ( (!fmArgs.reportAllAlts && !fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) { continue; } // add this feature FlowFeatureMapper.MappedFeature feature = FlowFeatureMapper.MappedFeature.makeSNV(read, readOfs, ref[refOfs], referenceContext.getStart() + refOfs, readOfs - refOfs); + if ( (fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) + feature.adjacentRefDiff = true; feature.nonIdentMBasesOnRead = nonIdentMBases; feature.refEditDistance = refEditDistance; if ( !read.isReverseStrand() ) @@ -166,8 +192,8 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons features.add(feature); } } - readOfs += identAfter; - refOfs += identAfter; + readOfs += spanAfter; + refOfs += spanAfter; } else { @@ -243,8 +269,8 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen cigarElement.getOperator().consumesReferenceBases() ) { // break out if not enough clearing - if ( (start < referenceContext.getStart() + refOfs + identBefore) || - (start >= referenceContext.getStart() + refOfs + length - identAfter) ) + if ( (start < referenceContext.getStart() + refOfs + spanBefore) || + (start >= referenceContext.getStart() + refOfs + length - spanAfter) ) return FilterStatus.Filtered; int delta = start - (referenceContext.getStart() + refOfs); @@ -255,13 +281,25 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen // check that this is really a SNV (must be surrounded by identical ref) boolean surrounded = true; - for ( int i = 0 ; i < identBefore && surrounded ; i++ ) { - if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) { + for ( int i = 0 ; i < 
surroundBefore && surrounded ; i++ ) { + final int bIndex = readOfs-1-i; + final int rIndex = refOfs-1-i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - for ( int i = 0 ; i < identAfter && surrounded ; i++ ) { - if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) { + for (int i = 0; i < surroundAfter && surrounded ; i++ ) { + final int bIndex = readOfs+1+i; + final int rIndex = refOfs+1+i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java index dd6a2e93838..a2642f20150 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java @@ -114,6 +114,7 @@ public final class VariantFiltration extends VariantWalker { public static final String CLUSTER_WINDOW_SIZE_LONG_NAME = "cluster-window-size"; public static final String MASK_EXTENSION_LONG_NAME = "mask-extension"; public static final String MASK_NAME_LONG_NAME = "mask-name"; + public static final String MASK_DESCRIPTION_LONG_NAME = "mask-description"; public static final String FILTER_NOT_IN_MASK_LONG_NAME = "filter-not-in-mask"; public static final String MISSING_VAL_LONG_NAME = "missing-values-evaluate-as-failing"; public static final String INVERT_LONG_NAME = "invert-filter-expression"; @@ -238,6 +239,14 @@ public final class VariantFiltration extends VariantWalker { @Argument(fullName=ALLELE_SPECIFIC_LONG_NAME, optional=true, doc="Set mask at the allele level. This option is not compatible with clustering.") public boolean applyForAllele = false; + /** + * If a mask interval list is provided, then set the description of the filter in the VCF header to this String. + * Note that if spaces are needed, then the entire description should be enclosed in quotes. Also note that if + * --filter-not-in-mask is used, the description should be adapted to reflect the reverse logic. 
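+     *
+     * A minimal usage sketch (the file names and mask name below are illustrative, not part of this change):
+     * gatk VariantFiltration \
+     *   -V input.vcf.gz \
+     *   --mask repeats.interval_list \
+     *   --mask-name RepeatMask \
+     *   --mask-description "Overlaps the project repeat interval list" \
+     *   -O filtered.vcf.gz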
+ */ + @Argument(fullName=MASK_DESCRIPTION_LONG_NAME, optional=true, doc="Description to add to the FILTER field in VCF header for the mask filter.") + public String maskDescription; + // JEXL expressions for the filters private List filterExps; private List genotypeFilterExps; @@ -305,7 +314,9 @@ private void initializeVcfWriter() { } if ( mask != null ) { - if (filterRecordsNotInMask) { + if (maskDescription != null) { + hInfo.add(new VCFFilterHeaderLine(maskName, maskDescription)); + } else if (filterRecordsNotInMask) { hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask")); } else { hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask")); @@ -331,6 +342,9 @@ public void onTraversalStart() { if (filterRecordsNotInMask && mask == null) { throw new CommandLineException.BadArgumentValue(FILTER_NOT_IN_MASK_LONG_NAME, "argument not allowed if mask argument is not provided"); } + if (maskDescription != null && mask == null) { + throw new CommandLineException.BadArgumentValue(MASK_DESCRIPTION_LONG_NAME, "argument not allowed if mask argument is not provided"); + } filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions); genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions); howToTreatMissingValues = failMissingValues ? JexlMissingValueTreatment.TREAT_AS_MATCH : JexlMissingValueTreatment.TREAT_AS_MISMATCH; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java index 60f1ef7b205..aede6c33fc2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java @@ -130,7 +130,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, } gb.PL(newLikelihoods); - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc); + GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g, gpc); // restrict SAC to the new allele subset if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { @@ -139,7 +139,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, } // restrict AD to the new allele subset - if(g.hasAD()) { + if(g.hasAD() && gb.makeWithShallowCopy().hasAD()) { final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD()); gb.AD(newAD); // if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java index bbc9cb888aa..eaa9bad7033 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java @@ -56,6 +56,7 @@ public enum GenotypeAssignmentMethod { /** * Use PLs unless they are unavailable, in which case use best match to original + * GQ0 hom-refs will be converted to no-calls */ PREFER_PLS } diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java index 7efb5687ac3..9b6c5030497 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java @@ -4,7 +4,7 @@ import com.google.common.primitives.Ints; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.walkers.annotator.*; @@ -349,6 +349,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis //If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration), // then we need to actually find the GT from PLs makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles); + } else if (g.hasGQ() && g.getGQ() == 0) { + makeGenotypeCall(g, genotypeBuilder, null, targetAlleles); //null likelihoods for reblocked hom-ref that we want to no-call } final Map attrs = new HashMap<>(g.getExtendedAttributes()); attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY); @@ -391,14 +393,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb, final double[] genotypeLikelihoods, final List allelesToUse) { - if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) { - //gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ(); - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL, - genotypeLikelihoods, allelesToUse, null); - } else { - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, - genotypeLikelihoods, allelesToUse, null); - } + GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, + genotypeLikelihoods, allelesToUse, g, null); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java index a0f804ffc11..bd8cf62e851 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java @@ -239,6 +239,25 @@ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, fi return result; } + /** + * The following functions estimate the error probability for an hmer. specifically two error + * probability values are generated: one for the first base of the hmer and another for the + * rest of its bases. + * + * The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability + * It iterates over the possible valid combinations of errors and sums them up. 
+ * + * @param key - key (hmer length) in flow space + * @param errorProbBands - for each flow (position in the key) three error probabilities are provided: + * [0] - for the hmer being one base shorter + * [1] - for the hmer to be at its length + * [2] - for the hmer to be one base longer + * @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length) + * @param flowOrderLength - the cycle length of of the flow order (usually 4) + * @return an array of two probabilities: + * [0] - probability for the first base of the hmer + * [1] - probability for the rest of the bases of the hmer + */ @VisibleForTesting protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java index cee262d277c..74434c89cbe 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java @@ -5,7 +5,7 @@ import htsjdk.samtools.util.Locatable; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.Tuple; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -858,9 +858,9 @@ else if ( hasQ || hasZ ) { cols.put("ReadName", read.getName()); // haplotypes and reference scores - cols.put("PaternalHaplotypeScore", paternal.score); - cols.put("MaternalHaplotypeScore", maternal.score); - cols.put("RefHaplotypeScore", refScore); + cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score)); + cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score)); + cols.put("RefHaplotypeScore", String.format("%.6f", refScore)); // build haplotype keys final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java new file mode 100644 index 00000000000..3cee7b801bb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java @@ -0,0 +1,926 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import com.opencsv.CSVReader; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.math3.util.Precision; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import 
org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerUtils; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentLikelihoodEngine; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LikelihoodEngineArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadLikelihoodCalculationEngine; +import org.broadinstitute.hellbender.utils.BaseUtils; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.clipping.ReadClipper; +import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype; +import org.broadinstitute.hellbender.utils.haplotype.Haplotype; +import org.broadinstitute.hellbender.utils.read.*; +import org.broadinstitute.hellbender.utils.report.GATKReport; +import org.broadinstitute.hellbender.utils.report.GATKReportTable; + +import java.io.*; +import java.text.DecimalFormat; +import java.util.*; +import java.util.zip.GZIPOutputStream; + +/** + * Converts Ultima reads into flow-based annotation, and provides some general statistics regarding + * quality and errors relative to the reference. Ultima data is flow-based, and thus the original computed + * quality refers to each flow, rather than each base. In the Ultima cram/bam, there is a per-base representation + * of the original flow qualities, where the original quality is distributed along each flow (homopolymer). + * In order to reconstitute the original flow information, the tool incorporates the information encoded + * in the Ultima cram/bam, and outputs both the read in flow space, as well as a conversion of the aligned + * reference portion into flow space, and an alignment score. + * + * Input
+ *     - Ultima aligned SAM/BAM/CRAM
+ *
+ * Output
+ *     - Per read ground truth information CSV and a ground truth scoring quality report, in GATK report format
+ *
+ * CSV Output Description
+ * A CSV with the read representation in flow space. The CSV includes the following columns:
+ *     - ReadName
+ *     - ReadKey : The signal of the read at each flow according to the flow order
+ *     - ReadIsReversed : Whether the read is reversed in the alignment
+ *     - ReadMQ : The mapping quality of the read
+ *     - ReadRQ : The read rq value
+ *     - GroundTruthKey : The aligned reference section, translated into per-flow signals
+ *     - ReadSequence
+ *     - Score : A flow-based alignment score. Since the alignment is per-flow, in the case of a cycle skip the read and reference flow signals will not be aligned, and the score will therefore be inaccurate.
+ *     - NormalizedScore : A flow-based normalized alignment score
+ *     - ErrorProbability : The error of each flow (corresponds to the signals in ReadKey)
+ *     - ReadKeyLength
+ *     - GroundTruthKeyLength
+ *     - CycleSkipStatus : One of NS (Non Skip), PCS (Possible Cycle Skip) or CS (Cycle Skip)
+ *     - Cigar
+ *     - LowestQBaseTP
+ *
+ * GATK Report Description
+ * In the quality report (optional), the following tables are included:
+ *
+ * qualReport : error rate per qual. The error rate for each quality. Columns:
+ *     - qual: The encoded quality
+ *     - count: The number of times the quality was observed
+ *     - error: The error rate of the flows with this qual
+ *     - phred: The error translated into a phred score
+ *
+ * qual_hmerReport : error rate per qual by hmer. The error rate for each quality and hmer combination. Columns:
+ *     - qual: The encoded quality
+ *     - hmer: The hmer length
+ *     - count: The number of times the quality was observed
+ *     - error: The error rate of the flows with this qual
+ *
+ * qual_hmer_deviation_base_Report : error rate per qual by hmer and deviation. The count of errors for each qual, hmer, deviation and base. Columns:
+ *     - qual: The encoded quality
+ *     - hmer: The hmer length
+ *     - deviation: The deviation (difference in signal, relative to the reference)
+ *     - base: The base
+ *     - count: The number of times the deviation was observed
+ *
+ * Phred/qual statistics per flow position report. Various statistics for each flow position in relation to the observed quality value. Columns:
+ *     - flow: flow position
+ *     - count: count of observations
+ *     - min: minimal observed quality
+ *     - max: maximal observed quality
+ *     - mean: mean observed value
+ *     - median: median observed value
+ *     - std: standard deviation
+ *     - p1...pN: percentile columns, according to the --quality-percentiles parameter
+ *
+ * Usage examples
+ + */ + private static class BooleanAccumulator { + long falseCount; + long trueCount; + BooleanAccumulator[] bins; + + // add an observation to this accumulator + void add(final boolean b) { + if (b) { + trueCount++; + } else { + falseCount++; + } + } + + // add an observation to this accumulator and to one of the bins in the level under it (1 deep) + void add(final boolean b, final int bin) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b); + } + } + + // add an observation to this accumulator and to two levels of bins under it (2 deep) + void add(final boolean b, final int bin, final int bin2) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b, bin2); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2); + } + } + + // add an observation to this accumulator and to three levels of bins under it (3 deep) + void add(final boolean b, final int bin, final int bin2, final int bin3) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b, bin2, bin3); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2, bin3); + } + } + + // get observation count of this accumulator + long getCount() { + return falseCount + trueCount; + } + + // get the false rate/ration for this accumulator + double getFalseRate() { + return (getCount() == 0) ? 0.0 : ((double)falseCount / getCount()); + } + + // create a set of accumulators with 3-deep bin nesting + static BooleanAccumulator[] newReport(final int size, final int binCount, final int binCount2, final int binCount3) { + BooleanAccumulator[] report = new BooleanAccumulator[size]; + for ( byte i = 0 ; i < report.length ; i++ ) { + report[i] = new BooleanAccumulator(); + if ( binCount != 0 ) { + report[i].bins = new BooleanAccumulator[binCount]; + for ( int j = 0 ; j < report[i].bins.length ; j++ ) { + report[i].bins[j] = new BooleanAccumulator(); + if ( binCount2 != 0 ) { + report[i].bins[j].bins = new BooleanAccumulator[binCount2]; + for ( int k = 0 ; k < report[i].bins[j].bins.length ; k++ ) { + report[i].bins[j].bins[k] = new BooleanAccumulator(); + if (binCount3 != 0) { + report[i].bins[j].bins[k].bins = new BooleanAccumulator[binCount3]; + for (int m = 0; m < report[i].bins[j].bins[k].bins.length; m++) { + report[i].bins[j].bins[k].bins[m] = new BooleanAccumulator(); + } + } + } + } + } + } + } + return report; + } + + // create a GATK report from a set of accumulators without nesting into their bins + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name, final double probThreshold, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name + "Report", "error rate per " + name, 4); + table.addColumn(name, "%d"); + table.addColumn("count", "%d"); + table.addColumn("error", "%f"); + table.addColumn("phred", "%d"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + if ( omitZeros && i != 0 && report[i].getCount() == 0 ) + continue; + else { + final double rate = report[i].getFalseRate(); + final double phredRate = (rate == 0.0 && report[i].getCount() != 0 && probThreshold != 0.0) ? 
probThreshold : rate; + + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, report[i].getCount()); + table.set(rowIndex, 2, rate); + table.set(rowIndex, 3, phredRate != 0 ? (int) Math.ceil(-10.0 * Math.log10(phredRate)) : 0); + rowIndex++; + } + } + return table; + } + + // create a GATK report from a set of accumulators while nesting into one level of their bins (1 deep) + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "Report", "error rate per " + name1 + " by " + name2, 4); + table.addColumn(name1, "%d"); + table.addColumn(name2, "%d"); + table.addColumn("count", "%d"); + table.addColumn("error", "%f"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + for ( int j = 0; j < report[i].bins.length ; j++ ) { + if ( omitZeros && (i != 0 || j != 0) && report[i].bins[j].getCount() == 0 ) + continue; + else { + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, j); + table.set(rowIndex, 2, report[i].bins[j].getCount()); + table.set(rowIndex, 3, report[i].bins[j].getFalseRate()); + rowIndex++; + } + } + } + return table; + } + + // create a GATK report from a set of accumulators while nesting into two levels of their bins (2 deep) + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final String name3, final String name4, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "_" + name3 + "_" + name4 + "_Report", "error rate per " + name1 + " by " + name2 + " and " + name3, 5); + table.addColumn(name1, "%d"); + table.addColumn(name2, "%d"); + table.addColumn(name3, "%s"); + table.addColumn(name4, "%s"); + table.addColumn("count", "%d"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + for ( int j = 0; j < report[i].bins.length ; j++ ) { + for ( int k = 0; k < report[i].bins[j].bins.length ; k++ ) { + for ( int m = 0; m < report[i].bins[j].bins[k].bins.length ; m++ ) { + if ( omitZeros && (i != 0 || j != 0 || k != 0 || m != 0) && report[i].bins[j].bins[k].bins[m].getCount() == 0 ) + continue; + else { + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, j); + table.set(rowIndex, 2, binToDeviation(k)); + table.set(rowIndex, 3, String.format("%c", binToBase(m))); + table.set(rowIndex, 4, report[i].bins[j].bins[k].bins[m].getCount()); + rowIndex++; + } + } + } + } + } + return table; + } + } + + private static class PercentileReport extends SeriesStats { + + static GATKReportTable newReportTable(final Vectorreport, String qualityPercentiles) { + String[] qp = qualityPercentiles.split(","); + final GATKReportTable table = new GATKReportTable("PhredBinAccumulator", "PhredBinAccumulator", 8 + qp.length); + table.addColumn("flow", "%d"); + table.addColumn("count", "%d"); + table.addColumn("min", "%f"); + table.addColumn("max", "%f"); + table.addColumn("mean", "%f"); + table.addColumn("median", "%f"); + table.addColumn("std", "%f"); + for ( final String p : qp ) { + table.addColumn("p" + p, "%f"); + } + int rowIndex = 0; + for ( final PercentileReport r : report ) { + int col = 0; + table.set(rowIndex, col++, rowIndex); + table.set(rowIndex, col++, r.getCount()); + table.set(rowIndex, col++, r.getMin()); + table.set(rowIndex, col++, r.getMax()); + table.set(rowIndex, col++, r.getMean()); + table.set(rowIndex, col++, r.getMedian()); + table.set(rowIndex, col++, r.getStd()); + for ( String p : qp ) 
{ + table.set(rowIndex, col++, r.getPercentile(Double.parseDouble(p))); + } + rowIndex++; + } + return table; + } + + void addProb(double p) { + super.add(-10 * Math.log10(p)); + } + } + + @Argument(fullName = OUTPUT_CSV_LONG_NAME, doc="main CSV output file. supported file extensions: .csv, .csv.gz.") + public GATKPath outputCsvPath = null; + + @Argument(fullName = REPORT_FILE_LONG_NAME, doc="report output file.", optional = true) + public GATKPath reportFilePath = null; + + @ArgumentCollection + public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection(); + + @ArgumentCollection + public FlowBasedAlignmentArgumentCollection fbargs = new FlowBasedAlignmentArgumentCollection(); + + @Argument(fullName = USE_SOFTCLIPPED_BASES_LONG_NAME, doc="", optional = true) + public boolean useSoftclippedBases; + + @Argument(fullName = GENOME_PRIOR_LONG_NAME, doc="CSV input file containing genome-prior (one line per base with hmer frequencies).", optional = true) + public GATKPath genomePriorPath; + + @Argument(fullName = FEATURES_FILE_LONG_NAME, doc="A VCF file containing features to be used as a use for filtering reads.", optional = true) + public FeatureDataSource features; + + @Argument(fullName = NORMALIZED_SCORE_THRESHOLD_LONG_NAME, doc="threshold for normalized score, below which reads are ignored", optional = true) + public double normalizedScoreThreshold = NORMALIZED_SCORE_THRESHOLD_DEFAULT; + + @Argument(fullName = ADD_MEAN_CALL_LONG_NAME, doc="Add ReadMeanCall and ReadProbs columns to output", optional = true) + public boolean addMeanCalll; + + @Argument(fullName = GT_NO_OUTPUT_LONG_NAME, doc = "do not generate output records", optional = true) + public boolean noOutput = false; + + @Argument(fullName = OMIT_ZEROS_FROM_REPORT, doc = "omit zero values from output report", optional = true) + public boolean omitZerosFromReport = false; + + @Argument(fullName = QUALITY_PERCENTILES, doc = "list of quality percentiles, defaults to 10,25,50,75,90", optional = true) + public String qualityPercentiles = "10,25,50,75,90"; + + @Argument(fullName = EXCLUDE_ZERO_FLOWS, doc = "should flows with a call of zero be included in the percentile report?", optional = true) + public boolean excludeZeroFlows = false; + + // locals + private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine; + private PrintWriter outputCsv; + private DecimalFormat doubleFormat = new DecimalFormat("0.0#####"); + private GenomePriorDB genomePriorDB; + private BooleanAccumulator[] qualReport; + private String[] csvFieldOrder; + private Vector percentileReports; + + // static/const + static final private String[] CSV_FIELD_ORDER_BASIC = { + "ReadName", "ReadKey", "ReadIsReversed", "ReadMQ", "ReadRQ", "GroundTruthKey", "ReadSequence", + "Score", "NormalizedScore", "ErrorProbability", + "ReadKeyLength", "GroundTruthKeyLength", "CycleSkipStatus", "Cigar", "LowestQBaseTP" + }; + static final private String[] CSV_FIELD_ORDER_MEAN_CALL = { + "ReadProbs", "ReadMeanCall" + }; + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + + // establish csv fields + List order = new LinkedList<>(Arrays.asList(CSV_FIELD_ORDER_BASIC)); + if ( addMeanCalll ) { + order.addAll(Arrays.asList(CSV_FIELD_ORDER_MEAN_CALL)); + } + csvFieldOrder = order.toArray(new String[0]); + + // create likelihood engine + final ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false); + if ( engine instanceof 
FlowBasedAlignmentLikelihoodEngine ) { + likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine; + } else { + throw new GATKException("must use a flow based likelihood calculation engine"); + } + + // open genome prior if provided + if ( genomePriorPath != null ) { + try { + genomePriorDB = new GenomePriorDB(genomePriorPath); + } catch (IOException e) { + throw new GATKException("failed to open genome-prior file: " + genomePriorPath); + } + } + + // open output, write header + try { + if (outputCsvPath.toPath().toString().endsWith(".gz")) { + outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream())); + } else { + outputCsv = new PrintWriter(outputCsvPath.getOutputStream()); + } + } catch (IOException e) { + throw new GATKException("failed to open csv output: " + outputCsvPath, e); + } + emitCsvHeaders(); + + // initialize reports + if ( reportFilePath != null ) { + qualReport = BooleanAccumulator.newReport(QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1); + + // establish max hmer size of flow input + int maxClass = FlowBasedRead.MAX_CLASS; + final SAMFileHeader header = getHeaderForReads(); + if ( header != null ) { + for ( SAMReadGroupRecord rg : header.getReadGroups() ) { + if ( rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG) != null ) { + maxClass = Math.max(maxClass, Integer.parseInt(rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG))); + } + } + } + percentileReports = new Vector<>(); + } + } + + @Override + public void closeTool() { + + // close main output csv + if ( outputCsv != null ) { + outputCsv.close(); + } + + // write reports + if ( reportFilePath != null ) { + final GATKReport report = new GATKReport( + BooleanAccumulator.newReportTable(qualReport, "qual", DEFAULT_RATIO_THRESHOLD, omitZerosFromReport), + BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport), + BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport), + PercentileReport.newReportTable(percentileReports, qualityPercentiles) + ); + try ( final PrintStream ps = new PrintStream(reportFilePath.getOutputStream()) ) { + report.print(ps); + } + } + + super.closeTool(); + } + + @Override + public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { + + // working with unmapped reads is really not practical, as we need the reference to work out ground truth + if ( read.isUnmapped() ) + return; + if ( referenceContext.getWindow() == null ) + return; + + // handle read clipping + final GATKRead clippedRead; + if (isSoftClipped(read) ) { + if (useSoftclippedBases) { + referenceContext.setWindow(read.getStart() - read.getUnclippedStart(), read.getUnclippedEnd() - read.getEnd());; + clippedRead = ReadClipper.revertSoftClippedBases(read); + } else { + clippedRead = ReadClipper.hardClipSoftClippedBases(read); + } + } else { + clippedRead = read; + } + + // filter? 
+ if ( (features != null) && !filter(clippedRead, referenceContext) ) { + return; + } + + // create flow read/haplotype + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), clippedRead); + final FlowBasedRead flowRead = new FlowBasedRead(clippedRead, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true); + final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, rgInfo.flowOrder); + + // is this really needed? + if ( !flowRead.isValid() ) { + return; + } + + // compute score + final int hapKeyLength = flowHaplotype.getKeyLength(); + final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false); + final double normalizedScore = score / flowRead.getKeyLength(); + if ( normalizedScore < normalizedScoreThreshold ) + return; + + // compute error probability + final double[] errorProb = computeErrorProb(flowRead, genomePriorDB); + + // cycle skip + final FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus = FlowBasedReadUtils.getCycleSkipStatus(flowRead, referenceContext); + + // accumulate reports + if ( cycleSkipStatus != FlowBasedReadUtils.CycleSkipStatus.CS && qualReport != null ) { + addToQualReport(flowRead, referenceContext, errorProb); + } + + // emit + try { + emit(flowRead, flowHaplotype, score, normalizedScore, errorProb, read, cycleSkipStatus); + } catch (IOException e) { + throw new GATKException("failed to write output record", e); + } + } + + private boolean filter(final GATKRead read, final ReferenceContext referenceContext) { + + // loop on features contained in the read, check that they are in agreement with read data + Iterator iter = features.query(new SimpleInterval(read)); + byte[] ref = referenceContext.getBases(); + while ( iter.hasNext() ) { + final VariantContext vc = iter.next(); + for ( int refCoord = vc.getStart() ; refCoord <= vc.getEnd() ; refCoord++ ) { + + // get byte from read + Optional readByte = ReadUtils.getReadBaseAtReferenceCoordinate(read, refCoord); + if ( !readByte.isPresent() ) { + return false; + } + + // get byte from reference + byte refByte = ref[refCoord - referenceContext.getWindow().getStart()]; + + // compare + if ( refByte != readByte.get() ) { + return false; + } + } + } + + // if here, no interference detected + return true; + } + + /* + * compute error probability vector for a read + * + * The vector has one element for each flow key, representing the probability complementing the call-probability to 1 + * This is further complicated by the optional presence of a genome-prior database, which provides factoring for + * each hmer length (on a base basis) + */ + private double[] computeErrorProb(final FlowBasedRead flowRead, final GenomePriorDB genomePriorDB) { + + final int[] key = flowRead.getKey(); + final byte[] flowOrder = flowRead.getFlowOrderArray(); + + final double[] probCol = new double[flowRead.getMaxHmer() + 1]; + double[] result = new double[key.length]; + + for ( int i = 0 ; i < key.length ; i++ ) { + + // step 1 - extract column & sum + double sum = 0; + for ( int j = 0 ; j < probCol.length ; j++ ) { + sum += (probCol[j] = flowRead.getProb(i, j)); + } + + // step 2 - normalize column + if ( sum != 0.0 ) { + for (int j = 0; j < probCol.length; j++) { + probCol[j] /= sum; + } + } + + // step 3 - scale according to prior genome? 
+ if ( genomePriorDB != null ) { + + long[] prior = genomePriorDB.getPriorForBase(flowOrder[i]); + sum = 0; + if ( prior != null ) { + for (int j = 0; j < probCol.length; j++) { + sum += (probCol[j] *= prior[j]); + } + } + + // assign normalized result + if ( sum != 0.0 ) { + result[i] = 1 - (probCol[Math.min(probCol.length - 1, Math.min(key[i], flowRead.getMaxHmer()))] / sum); + } else { + // revert to non-prior normalization + result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())]; + } + } else { + + // assign normalized result + result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())]; + } + + // accumulate error probabilities + if ( percentileReports != null ) { + if ( key[i] != 0 || !excludeZeroFlows ) { + while ( percentileReports.size() < (i + 1) ) { + percentileReports.add(new PercentileReport()); + } + percentileReports.get(i).addProb(result[i]); + } + } + } + + return result; + } + + /* + * compute lowest-quality-base-tp-value vector for a read + * + * The vector has one element for each flow key, representing the value of the tp for the hmer base + * which has the lowest quality value + * + * example: + * Bases: TTTTT + * Qs: ABCBA + * Tp: -1 1 0 1 -1 + * Output: -1 + */ + private byte[] computeLowestQBaseTP(final FlowBasedRead flowRead) { + + final int[] key = flowRead.getKey(); + byte[] result = new byte[key.length]; + final byte[] tp = flowRead.getAttributeAsByteArray("tp"); + final byte[] qual = flowRead.getBaseQualitiesNoCopy(); + + // loop om key + int seq_i = 0; + for ( int i = 0 ; i < key.length ; i++ ) { + + // extract hmer length, zero is easy + int hmer = key[i]; + if ( hmer == 0 ) { + result[i] = 0; + continue; + } + + // scan qualities for the lowest value, start with first + // as qualities and tp are symetric, we can scan up to the middle + // when finding the middle (offset) account for even/odd hmers + result[i] = tp[seq_i]; + byte lowestQ = qual[seq_i]; + int hmer_scan_length = (hmer + 1) / 2; + for ( int j = 1 ; j < hmer_scan_length ; j++ ) { + if ( qual[seq_i + j] < lowestQ ) { + result[i] = tp[seq_i + j]; + lowestQ = qual[seq_i + j]; + } + } + + // advance + seq_i += hmer; + } + + return result; + } + + private void emitCsvHeaders() { + + outputCsv.println(StringUtils.join(csvFieldOrder, ",")); + } + + private void emit(final FlowBasedRead flowRead, final FlowBasedHaplotype refHaplotype, double score, final double normalizedScore, final double[] errorProb, + GATKRead read, + FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus) throws IOException { + + // build line columns + final Map cols = new LinkedHashMap<>(); + + // read info + cols.put("ReadName", flowRead.getName()); + cols.put("ReadIsReversed", flowRead.isReverseStrand() ? 
1 : 0); + cols.put("ReadMQ", flowRead.getMappingQuality()); + cols.put("ReadRQ", flowRead.getAttributeAsFloat("rq")); + cols.put("CycleSkipStatus", cycleSkipStatus); + cols.put("Cigar", read.getCigar().toString()); + + + // keys, seq, etc + cols.put("ReadKey", "\"" + StringUtils.join(flowRead.getKey(), ',') + "\""); + cols.put("GroundTruthKey", "\"" + StringUtils.join(refHaplotype.getKey(), ',') + "\""); + cols.put("ReadSequence", flowRead.getBasesString()); + cols.put("ReadKeyLength", flowRead.getKeyLength()); + cols.put("GroundTruthKeyLength", refHaplotype.getKeyLength()); + + // scores + cols.put("Score", score); + cols.put("NormalizedScore", normalizedScore); + cols.put("ErrorProbability", "\"" + StringUtils.join( + Arrays.stream(errorProb).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + + // lowest q base tp + final byte[] lowestQBaseTP = computeLowestQBaseTP(flowRead); + cols.put("LowestQBaseTP", "\"" + StringUtils.join(lowestQBaseTP, ',') + "\""); + + // add read probabilities + if ( addMeanCalll ) { + double[][] readProbsAndMeanCall = collectReadProbs(flowRead); + cols.put("ReadProbs", "\"" + StringUtils.join( + Arrays.stream(readProbsAndMeanCall[0]).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + cols.put("ReadMeanCall", "\"" + StringUtils.join( + Arrays.stream(readProbsAndMeanCall[1]).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + } + + // construct line + StringBuilder sb = new StringBuilder(); + int colIndex = 0; + for ( String field : csvFieldOrder ) { + if ( colIndex++ > 0 ) { + sb.append(','); + } + if ( !cols.containsKey(field) ) { + throw new GATKException("column missing from csv line: " + field); + } + sb.append(cols.get(field)); + cols.remove(field); + } + if ( cols.size() > 0 ) { + throw new GATKException("invalid columns on csv line: " + cols.keySet()); + } + + // output line + if ( !noOutput ) { + outputCsv.println(sb); + } + } + + private double[][] collectReadProbs(final FlowBasedRead read) { + + final int keyLength = read.getKeyLength(); + final int maxHmer = read.getMaxHmer(); + final double[] probs = new double[keyLength * (maxHmer + 1)]; + final double[] meanCall = new double[keyLength]; + + // retrieve probs + int pos = 0; + for ( int flow = 0 ; flow < keyLength ; flow++ ) { + double mc = 0; + int mc_sum = 0; + for ( int hmer = 0 ; hmer <= maxHmer ; hmer++ ) { + final double p = read.getProb(flow, hmer); + probs[pos++] = p; + mc += (p * hmer); + mc_sum += hmer; + } + meanCall[flow] = mc / mc_sum; + } + + return new double[][] {probs, meanCall}; + } + + static private class GenomePriorDB { + + final private Map db = new LinkedHashMap<>(); + + GenomePriorDB(GATKPath path) throws IOException { + + final CSVReader csvReader = new CSVReader(new InputStreamReader(path.getInputStream())); + String[] line; + while ( (line = csvReader.readNext()) != null ) { + long[] prior = new long[HMER_VALUE_MAX + 1]; + Byte base = line[0].getBytes()[0]; + for ( int i = 0 ; i < prior.length ; i++ ) { + if ( i == 0 ){ + prior[i] = Long.parseLong(line[i+1]); + } else { + prior[i] = Long.parseLong(line[i]); + } + } + db.put(base, prior); + } + } + + long[] getPriorForBase(byte base) { + return db.get(base); + } + } + + private void addToQualReport(FlowBasedRead flowRead, ReferenceContext referenceContext, final double[] errorProb) { + + // convert reference to key space + final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true); + final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, 
flowRead.getFlowOrder()); + + // access keys & flow order + final int[] readKey = flowRead.getKey(); + final int[] hapKey = flowHaplotype.getKey(); + final byte[] flowOrder = flowRead.getFlowOrder().getBytes(); + + // loop on key positions + if ( readKey.length != hapKey.length ) { + return; + } + for ( int flow = 0 ; flow < readKey.length ; flow++ ) { + + // determine quality + final double prob = Precision.round(errorProb[flow], (QUAL_VALUE_MAX / 10) + 1); + final int qual = (int)Math.ceil(-10 * Math.log10(prob)); + + // determine if matches reference + final int deviation = readKey[flow] - hapKey[flow]; + final boolean same = (deviation == 0); + + // accumulate + if ( qual < qualReport.length ) { + int baseBin = baseToBin(flowOrder[flow % flowOrder.length], flowRead.isReverseStrand()); + qualReport[qual].add(same, readKey[flow], deviationToBin(deviation), baseBin); + } + } + } + + static private int baseToBin(byte base, boolean isReverseStrand) { + final byte trueBase = !isReverseStrand ? base : BaseUtils.simpleComplement(base); + return FlowBasedRead.DEFAULT_FLOW_ORDER.indexOf(trueBase); + } + + static private byte binToBase(int bin) { + return (byte)FlowBasedRead.DEFAULT_FLOW_ORDER.charAt(bin); + } + + // 0,-1,1,-2,2... -> 0,1,2,3,4... + static private int deviationToBin(final int deviation) { + if ( deviation >= 0 ) { + return deviation * 2; + } else { + return (-deviation * 2) - 1; + } + } + + // 0,1,2,3,4... -> 0,-1,1,-2,2... + static private String binToDeviation(final int bin) { + if ( bin == 0 ) { + return "0"; + } else if ( (bin % 2) == 0 ) { + return String.format("+%d", bin / 2); + } else { + return String.format("%d", -((bin+1) / 2)); + } + } + + static private boolean isSoftClipped( final GATKRead read ) { + if ( read.isUnmapped() ) + return false; + if ( read.getCigar().getFirstCigarElement() == null ) + return false; + final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator(); + final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator(); + return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) || + (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java new file mode 100644 index 00000000000..2ba594b7058 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java @@ -0,0 +1,160 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicInteger; + +public class SeriesStats { + + private static final Logger logger = LogManager.getLogger(SeriesStats.class); + + // local state + private double last = Double.NaN; + private int count = 0; + private double sum = 0; + private double min = Double.NaN; + private double max = Double.NaN; + private SortedMap bins = new TreeMap<>(); + private int intCount = 0; + private Map auxBins = new LinkedHashMap<>(); + + public void csvWrite(final String path) throws IOException { + logger.info("Writing SeriesStats " + toDigest() + " into " + path); + PrintWriter pw = new 
PrintWriter(path); + pw.println("value,count"); + boolean intKeys = isIntKeys(); + for (Map.Entry entry : bins.entrySet() ) { + if ( intKeys ) { + pw.println(String.format("%d,%d", entry.getKey().intValue(), entry.getValue().get())); + } else { + pw.println(String.format("%f,%d", entry.getKey(), entry.getValue().get())); + } + } + pw.close(); + } + + public void add(int v) { + add((double)v); + intCount++; + } + + public void add(double v) { + + // save in simple values + last = v; + sum += v; + if ( count > 0 ) { + min = Math.min(min, v); + max = Math.max(max, v); + } else { + min = max = v; + } + count++; + + // save in bins + final Double key = v; + if ( bins.containsKey(key) ) { + bins.get(key).incrementAndGet(); + } else { + bins.put(key, new AtomicInteger(1)); + } + } + + public double getLast() { + return last; + } + + public int getCount() { + return count; + } + + public double getMin() { + return (count != 0) ? min : Double.NaN; + } + + public double getMax() { + return (count != 0) ? max : Double.NaN; + } + + public int getUniq() { + return bins.size(); + } + + public double getMean() { + return (count != 0) ? (sum / count) : Double.NaN; + } + + public double getMedian() { + return getPercentile(50); + } + + public double getPercentile(double precentile) { + if ( count == 0 ) { + return Double.NaN; + } else if ( count == 1 ) { + return last; + } else { + + int percentileIndex = (int)(count * precentile / 100); + int index = 0; + for (Map.Entry entry : bins.entrySet() ) { + int binSize = entry.getValue().get(); + if ( percentileIndex >= index && (percentileIndex < (index + binSize)) ) { + return entry.getKey(); + } + index += binSize; + } + + // if here, we need the highest entry + return bins.lastKey(); + } + } + + public double getStd() { + + if (count == 0) { + return Double.NaN; + } + + // calculate mean + double mean = getMean(); + + // calculate sum of sqr(deviations) + double vsum = 0; + for (Map.Entry entry : bins.entrySet()) { + int binSize = entry.getValue().get(); + vsum += (Math.pow(entry.getKey() - mean, 2) * binSize); + } + + // calculate variance and std + double variance = vsum / count; + return Math.sqrt(variance); + } + + public Map getBins() { + return this.bins; + } + + public Map getAuxBins() { + return this.auxBins; + } + + public String toDigest() { + if ( isIntKeys() ) { + return String.format("count=%d, min=%d, max=%d, median=%d, bin.count=%d", getCount(), (int)getMin(), (int)getMax(), (int)getMedian(), getBins().size()); + } else { + return String.format("count=%d, min=%f, max=%f, median=%f, bin.count=%d", getCount(), getMin(), getMax(), getMedian(), getBins().size()); + } + } + + private boolean isIntKeys() { + return (count == intCount); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java index 17728fa6035..3a350228128 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java @@ -29,6 +29,8 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import com.google.common.annotations.VisibleForTesting; + /** * Filtering haplotypes that contribute weak alleles to the genotyping. 
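Stepping back to the new SeriesStats class added just above: percentiles and the standard deviation are computed from the TreeMap of per-value bins rather than from a stored list of samples. The following is a minimal, self-contained sketch of that binned-percentile idea, written for this note as a simplified equivalent of the bin walk in getPercentile; it is not the class's exact code.

```java
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

// Sketch: count occurrences per distinct value in a sorted map, then find a percentile
// by accumulating bin counts in key order until the target rank is reached.
public class BinnedPercentileSketch {
    private final TreeMap<Double, AtomicInteger> bins = new TreeMap<>();
    private int count = 0;

    void add(double v) {
        bins.computeIfAbsent(v, k -> new AtomicInteger()).incrementAndGet();
        count++;
    }

    double percentile(double p) {
        if (count == 0) return Double.NaN;
        final int targetRank = (int) (count * p / 100); // same index convention as the diff
        int seen = 0;
        for (Map.Entry<Double, AtomicInteger> e : bins.entrySet()) {
            seen += e.getValue().get();
            if (targetRank < seen) {
                return e.getKey();
            }
        }
        return bins.lastKey(); // target rank beyond all accumulated bins
    }

    public static void main(String[] args) {
        BinnedPercentileSketch s = new BinnedPercentileSketch();
        for (double v : new double[]{1, 1, 2, 3, 10}) {
            s.add(v);
        }
        System.out.println(s.percentile(50)); // prints 2.0 for this toy series
    }
}
```

getStd in the diff works the same way, weighting each squared deviation by its bin count instead of iterating individual samples.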
* @@ -278,7 +280,8 @@ private AlleleLikelihoods subsetHaplotypesByAlleles(final A * @param sorThreshold only variants with SOR above threshold will be considered * @return list of alleles that can be removed */ - private List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, + @VisibleForTesting + List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, final List alleles, final double qualThreshold, final double sorThreshold) { @@ -303,9 +306,11 @@ private List identifyBadAlleles(final List collectedRPLs, final //we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD //we will first filter the one with the lowest QUAL. logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount)); - for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) { - if (!result.contains(alleles.get(sorIndices[i]))) { - result.add(alleles.get(sorIndices[i])); + for (int i = sorIndices.length-1 ; (i >= 0) ; i--) { + if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){ + if (!result.contains(alleles.get(sorIndices[i]))) { + result.add(alleles.get(sorIndices[i])); + } } } logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount)); @@ -340,7 +345,7 @@ private AlleleLikelihoods getAlleleLikelihoodMatrix(final Alle .forEach(alleleHaplotypeMap.get(notAllele)::add); final AlleleLikelihoods alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap); - logger.debug(() -> String.format("GALM: %s %d %d", allele.toString(), alleleHaplotypeMap.get(allele).size(), alleleHaplotypeMap.get(notAllele).size())); + logger.debug(() -> String.format("GALM: %s %d %d", allele, alleleHaplotypeMap.get(allele.altAllele()).size(), alleleHaplotypeMap.get(notAllele).size())); return alleleLikelihoods; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java index 2788cc21575..010a9e21398 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java @@ -11,7 +11,10 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import java.io.OutputStreamWriter; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Filtering haplotypes that contribute weak alleles to the genotyping. 
This is a version that determines if allele is weak using @@ -57,14 +60,16 @@ int getAlleleLikelihoodVsInverse(final AlleleLikelihoods allel final GenotypingLikelihoods genotypingLikelihoods = genotypesModel.calculateLikelihoods(alleleList, genotypingData, null, 0, null); - AFCalculationResult af = afCalc.fastCalculateDiploidBasedOnGLs(genotypingLikelihoods, genotypingEngine.getPloidyModel().totalPloidy()); - final double log10Confidence = af.log10ProbOnlyRefAlleleExists(); - final double phredScaledConfidence = (10.0 * log10Confidence) + 0.0; - final int[] asPL = genotypingLikelihoods.sampleLikelihoods(0).getAsPLs(); - - logger.debug(() -> String.format("GAL:: %s: %d %d %d", allele.toString(), asPL[0], asPL[1], asPL[2])); - return Math.min(asPL[1]-asPL[0], asPL[2]-asPL[0]); + List perSamplePLs = new ArrayList<>(); + for (int i = 0; i < genotypingLikelihoods.numberOfSamples(); i++) { + final int[] pls = genotypingLikelihoods.sampleLikelihoods(i).getAsPLs(); + perSamplePLs.add(Math.min(pls[1] - pls[0], pls[2] - pls[0])); + final int finalI = i; + logger.debug(() -> String.format("GAL (%s):: %s: %d %d %d", + genotypingLikelihoods.getSample(finalI), allele.toString(), pls[0], pls[1], pls[2])); + } + return Collections.min(perSamplePLs); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java index 1d7eecc0fe8..fa172768608 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java @@ -133,12 +133,12 @@ public AlleleLikelihoods computeReadLikelihoods(final List< @Override public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) { final double log10ErrorRate = Math.log10(expectedErrorRate); - final double catastrophicErrorRate = fbargs.fillingValue; - final double log10catastrophicErrorRate = Math.log10(fbargs.fillingValue); + final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors. + final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate); return read -> { final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate); - final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * fbargs.fillingValue)) : - Math.ceil(read.getLength() * fbargs.fillingValue); + final double maxCatastrophicErrorsForRead = capLikelihoods ? 
Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) : + Math.ceil(read.getLength() * largeEventErrorRate); return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate; }; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java index 700659297c9..27c5f0eb875 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java @@ -132,12 +132,13 @@ public AlleleLikelihoods computeReadLikelihoods(final List< @Override public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) { final double log10ErrorRate = Math.log10(expectedErrorRate); - final double catastrophicErrorRate = Math.log10(fbargs.fillingValue); + final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors. + final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate); return read -> { final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate)); - final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * fbargs.fillingValue)); - return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*catastrophicErrorRate; + final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate)); + return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate; }; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java index f504d400b13..557e7f0add9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -127,6 +127,21 @@ * argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause * performance challenges including excessive slowness. We are working on resolving these limitations. * + * For having variable ploidy in different regions, like making haploid calls outside the PAR on chrX or chrY, + * see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions + * should be a .bed or .interval_list with "name" column containing the desired ploidy to use in that region + * when genotyping. Note that variants near the boundary may not have the matching ploidy since the ploidy used will + * be determined using the following precedence:
+ *+ *
+ * + *- ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant + * (with ties broken by using the largest ploidy); note that a ploidy-regions interval need only overlap the active region to determine + * the ploidy of your variant, even if the variant's written end coordinate lies outside the given interval;
+ *- ploidy given via global -ploidy flag;
+ *- ploidy determined by the default global built-in constant for humans (2).
+ *Coordinates for the PAR for GRCh38 can be found here.
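To make the precedence above concrete, here is a hypothetical --ploidy-regions input; the file name, contents, and coordinates are illustrative only and are not part of this patch. In a .bed file the fourth ("name") column carries the desired ploidy:

```
# ploidy.bed -- hypothetical example: call chrX and chrY as haploid outside the PARs
chrX	2700000	155700000	1
chrY	2700000	56800000	1
```

Run with, say, -ploidy 2 and --ploidy-regions ploidy.bed, an active region overlapping chrX:3,000,000 would be genotyped as haploid, a region on chr1 would fall back to the global ploidy of 2, and if two supplied intervals with different ploidies overlapped the same active region, the larger ploidy would win.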
+ * *Additional Notes
**
- When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).
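The precedence reduces to "largest overlapping --ploidy-regions value, else the -ploidy value, else the built-in default of 2". Below is a self-contained illustration of that resolution rule, written for this note under simplifying assumptions: it checks a single position rather than a whole active region, and it is not the engine code, which instead pre-builds one genotyping engine per distinct ploidy and looks it up by the resolved value.

```java
import java.util.List;

// Illustrative stand-in for the interval+ploidy entries parsed from --ploidy-regions
// (the real engine wraps them in SimpleCount and an OverlapDetector).
record PloidyRegion(String contig, int start, int end, int ploidy) {
    boolean overlaps(String c, int pos) {
        return contig.equals(c) && start <= pos && pos <= end;
    }
}

public class PloidyPrecedenceSketch {
    static final int DEFAULT_HUMAN_PLOIDY = 2;

    // Largest ploidy among overlapping --ploidy-regions entries wins; otherwise the
    // global -ploidy value (callers pass DEFAULT_HUMAN_PLOIDY when -ploidy is unset).
    static int resolvePloidy(List<PloidyRegion> regions, String contig, int pos, int globalPloidy) {
        return regions.stream()
                .filter(r -> r.overlaps(contig, pos))
                .mapToInt(PloidyRegion::ploidy)
                .max()
                .orElse(globalPloidy);
    }

    public static void main(String[] args) {
        // Coordinates are illustrative only.
        List<PloidyRegion> regions = List.of(new PloidyRegion("chrX", 2_700_000, 155_700_000, 1));
        System.out.println(resolvePloidy(regions, "chrX", 3_000_000, DEFAULT_HUMAN_PLOIDY)); // 1
        System.out.println(resolvePloidy(regions, "chr1", 1_000_000, DEFAULT_HUMAN_PLOIDY)); // 2
    }
}
```

Later in this patch the same idea appears in the engine as getPloidyToUseAtThisSite, which returns -1 as a sentinel so callers can fall back to the default genotyping engines.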
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index 7ab58d73264..f818659bb5e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -1,7 +1,10 @@ package org.broadinstitute.hellbender.tools.walkers.haplotypecaller; +import htsjdk.tribble.NamedFeature; import htsjdk.variant.variantcontext.VariantContext; +import org.apache.arrow.util.VisibleForTesting; import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.SerializationUtils; import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; @@ -9,6 +12,7 @@ import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection; +import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection; @@ -23,9 +27,11 @@ /** * Set of arguments for the {@link HaplotypeCallerEngine} */ -public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{ +public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable { private static final long serialVersionUID = 1L; + public static final String PLOIDY_REGIONS_NAME = "ploidy-regions"; + public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands"; public static final String GQ_BAND_SHORT_NAME = "GQB"; public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality"; @@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu return new HaplotypeCallerReadThreadingAssemblerArgumentCollection(); } + @Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true) + public FeatureInputploidyRegions = null; + /** * You can use this argument to specify that HC should process a single sample out of a multisample BAM file. 
This * is especially useful if your samples are all in the same file but you need to run them individually through HC @@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() { return flowMode != FlowMode.NONE; } + // Copy method used to create new hcArgs with same fields except input ploidy model + public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) { + HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this); + newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy; + return newArgsWithNewPloidy; + } /** * the different flow modes, in terms of their parameters and their values diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 71f6d64ede7..36918f1fb91 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -3,7 +3,10 @@ import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.OverlapDetector; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.NamedFeature; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; @@ -20,12 +23,15 @@ import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.tools.walkers.annotator.*; import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod; +import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingEngine; import org.broadinstitute.hellbender.tools.walkers.genotyper.MinimalGenotypingEngine; import org.broadinstitute.hellbender.tools.walkers.genotyper.OutputMode; import org.broadinstitute.hellbender.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; +import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.hellbender.utils.haplotype.Event; import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles; @@ -83,8 +89,31 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { protected final OutputStreamWriter assemblyDebugOutStream; - // the genotyping engine for the isActive() determination - private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null; + /** + * List of interval file entries including regions and custom ploidy values to apply in that region. + */ + protected final List ploidyRegions = new ArrayList<>(); + + /** + * An OverlapDetector object for checking whether region overlaps given ploidyRegions. 
+ */ + protected final OverlapDetector ploidyRegionsOverlapDetector; + + /** + * List of all custom ploidies provided by user + */ + private final LinkedHashSet allCustomPloidies; + + /** + * The default genotyping engine for the isActive() determination + */ + private MinimalGenotypingEngine defaultActiveRegionEvaluationGenotyperEngine = null; + + /** + * Map of user-provided ploidy values to corresponding active region genotyper. Values are checked as valid Integers during + * initialization, but Strings are used as keys to avoid parsing repeatedly during runtime. + */ + private final HashMap ploidyToActiveEvaluationGenotyper = new HashMap<>(); protected ReadThreadingAssembler assemblyEngine = null; @@ -93,7 +122,16 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { // If we are in PDHMM mode we need to hold onto two likelihoods engines for the fallback private ReadLikelihoodCalculationEngine pdhmmLikelihoodCalculationEngine = null; - protected HaplotypeCallerGenotypingEngine genotypingEngine = null; + /** + * The default genotyping engine to use for actual variant calling and genotyping in an active region. + */ + protected HaplotypeCallerGenotypingEngine defaultGenotypingEngine = null; + + /** + * Map of user-provided ploidy values to corresponding genotyper. Values are checked as valid Integers during + * initialization, but Strings are used as keys to avoid parsing repeatedly during runtime. + */ + protected final HashMap ploidyToGenotyperMap = new HashMap<>(); private VariantAnnotatorEngine annotationEngine = null; @@ -163,7 +201,7 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { /** * Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header, * and a reference file - * @param hcArgs command-line arguments for the HaplotypeCaller + * @param hcArgs command-line arguments for the HaplotypeCaller * @param assemblyRegionArgs * @param createBamOutIndex true to create an index file for the bamout * @param createBamOutMD5 true to create an md5 file for the bamout @@ -196,6 +234,21 @@ public HaplotypeCallerEngine(final HaplotypeCallerArgumentCollection hcArgs, Ass HaplotypeCallerGenotypingDebugger.initialize(hcArgs.genotyperDebugOutStream); } + // Parse the user provided custom ploidy regions into ploidyRegions object containing SimpleCounts + if (this.hcArgs.ploidyRegions != null) { + FeatureDataSource ploidyDataSource = new FeatureDataSource<>(this.hcArgs.ploidyRegions, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, NamedFeature.class); + ploidyDataSource.forEach(r -> this.ploidyRegions.add(new SimpleCount(r))); + } + + for (SimpleCount region : this.ploidyRegions) { + if (!IntervalUtils.intervalIsOnDictionaryContig(region.getInterval(), readsHeader.getSequenceDictionary())) { + throw new UserException("Invalid region provided for --ploidy-regions at " + region.getContig() + ":" + region.getStart() + "-" + region.getEnd() + ". 
Contig name or endpoint doesn't match read sequence dictionary."); + } + } + + this.ploidyRegionsOverlapDetector = OverlapDetector.create(this.ploidyRegions); + this.allCustomPloidies = this.ploidyRegions.stream().map(SimpleCount::getCount).collect(Collectors.toCollection(LinkedHashSet::new)); + trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, readsHeader.getSequenceDictionary()); initialize(createBamOutIndex, createBamOutMD5); } @@ -242,8 +295,16 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5 initializeActiveRegionEvaluationGenotyperEngine(); - genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); - genotypingEngine.setAnnotationEngine(annotationEngine); + defaultGenotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); + defaultGenotypingEngine.setAnnotationEngine(annotationEngine); + + // Create other custom genotyping engines if user provided ploidyRegions + for (final int ploidy : this.allCustomPloidies) { + HaplotypeCallerArgumentCollection newPloidyHcArgs = hcArgs.copyWithNewPloidy(ploidy); + HaplotypeCallerGenotypingEngine newGenotypingEngine = new HaplotypeCallerGenotypingEngine(newPloidyHcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); + newGenotypingEngine.setAnnotationEngine(annotationEngine); + this.ploidyToGenotyperMap.put(ploidy, newGenotypingEngine); + } boolean isFlowBased = (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBased) || (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBasedHMM); @@ -381,8 +442,18 @@ private void initializeActiveRegionEvaluationGenotyperEngine() { // Seems that at least with some test data we can lose genuine haploid variation if we use ploidy == 1 activeRegionArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, hcArgs.standardArgs.genotypeArgs.samplePloidy); - activeRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList); - activeRegionEvaluationGenotyperEngine.setLogger(logger); + defaultActiveRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList); + defaultActiveRegionEvaluationGenotyperEngine.setLogger(logger); + + // If custom ploidyRegions provided, create corresponding map for active region determination genotyper + for (final int ploidy : this.allCustomPloidies) { + StandardCallerArgumentCollection newPloidyActiveArgs = new StandardCallerArgumentCollection(); + newPloidyActiveArgs.copyStandardCallerArgsFrom(activeRegionArgs); + newPloidyActiveArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, ploidy); + MinimalGenotypingEngine newActiveGenotypingEngine = new MinimalGenotypingEngine(newPloidyActiveArgs, samplesList); + newActiveGenotypingEngine.setLogger(logger); + this.ploidyToActiveEvaluationGenotyper.put(ploidy, newActiveGenotypingEngine); + } } /** @@ -455,7 +526,7 @@ public VCFHeader makeVCFHeader( final SAMSequenceDictionary sequenceDictionary, final Set headerInfo = new HashSet<>(); headerInfo.addAll(defaultToolHeaderLines); - headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); + headerInfo.addAll(defaultGenotypingEngine.getAppropriateVCFInfoHeaders()); // all annotation fields from VariantAnnotatorEngine 
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions(emitReferenceConfidence())); // all callers need to add these standard annotation header lines @@ -523,6 +594,57 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence vcfWriter.writeHeader(makeVCFHeader(sequenceDictionary, defaultToolHeaderLines)); } + /** + * Determines the appropriate ploidy to use at given site for different genotyping engines. + * @param region Current region of interest + * @return Ploidy value to use here given user inputs, or -1 if fall back to default + */ + private int getPloidyToUseAtThisSite(Locatable region) { + Set overlaps = this.ploidyRegionsOverlapDetector.getOverlaps(region); + // Return first engine for interval overlapping this region + if (!overlaps.isEmpty()) { + Iterator intervals = overlaps.iterator(); + int max = intervals.next().getCount(); + while (intervals.hasNext()) { + int next = intervals.next().getCount(); + if (next > max) { + max = next; + } + } + return max; + } else { + return -1; // Sentinel value to fall back to default genotyper + } + } + + /** + * Selects appropriate active region genotyping engine for given region + * @param region Current region of interest + * @return Active genotyping engine with correct ploidy setting for given region + */ + private MinimalGenotypingEngine getLocalActiveGenotyper(Locatable region) { + int currentPloidy = getPloidyToUseAtThisSite(region); + if (currentPloidy == -1) { + return this.defaultActiveRegionEvaluationGenotyperEngine; + } else { + return this.ploidyToActiveEvaluationGenotyper.get(currentPloidy); + } + } + + /** + * Selects appropriate genotyping engine for given region. + * @param region Current region of interest, e.g. AssemblyRegion + * @return Genotyping engine with correct ploidy setting for given region + */ + protected HaplotypeCallerGenotypingEngine getLocalGenotypingEngine(Locatable region) { + int currentPloidy = getPloidyToUseAtThisSite(region); + if (currentPloidy == -1) { + return this.defaultGenotypingEngine; + } else { + return this.ploidyToGenotyperMap.get(currentPloidy); + } + } + /** * Given a pileup, returns an ActivityProfileState containing the probability (0.0 to 1.0) that it's an "active" site. 
* @@ -537,6 +659,8 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence */ @Override public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext features) { + MinimalGenotypingEngine localActiveGenotypingEngine = getLocalActiveGenotyper(ref); + if (forceCallingAllelesPresent && features.getValues(hcArgs.alleles, ref).stream().anyMatch(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered())) { return new ActivityProfileState(ref.getInterval(), 1.0); } @@ -546,7 +670,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer return new ActivityProfileState(ref.getInterval(), 0.0); } - final int ploidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy; + final int ploidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy; final List noCall = GATKVariantContextUtils.noCallAlleles(ploidy); // used to noCall all genotypes until the exact model is applied final Map splitContexts; @@ -565,7 +689,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer sample.getValue().getBasePileup().forEach(p -> PileupBasedAlleles.addMismatchPercentageToRead(p.getRead(), readsHeader, ref)); } // The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not. - final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy; + final int activeRegionDetectionHackishSamplePloidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy; final double[] genotypeLikelihoods = ((RefVsAnyResult) referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny( activeRegionDetectionHackishSamplePloidy, sample.getValue().getBasePileup(), ref.getBase(), @@ -579,9 +703,9 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer if (genotypes.size() == 1) { // Faster implementation using exact marginalization instead of iteration - isActiveProb = activeRegionEvaluationGenotyperEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector()); + isActiveProb = localActiveGenotypingEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector()); } else { - final VariantContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make()); + final VariantContext vcOut = localActiveGenotypingEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make()); isActiveProb = vcOut == null ? 
0.0 : QualityUtils.qualToProb(vcOut.getPhredScaledQual()); } @@ -607,6 +731,8 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer * @return List of variants discovered in the region (may be empty) */ public List callRegion(final AssemblyRegion region, final FeatureContext features, final ReferenceContext referenceContext) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region); + if ( hcArgs.justDetermineActiveRegions ) { // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return NO_CALLS; @@ -632,6 +758,7 @@ public List callRegion(final AssemblyRegion region, final Featur final List givenAlleles = features.getValues(hcArgs.alleles).stream() .filter(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered()) .flatMap(vc -> GATKVariantContextUtils.splitVariantContextToEvents(vc, false, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, false).stream()) + .filter(event -> event.getStart() >= region.getSpan().getStart()) // filter out events that do not start within the region. This approach works because events that begin upstream of the calling window cannot be called by this region calling code in the frist place. .collect(Collectors.toList()); if( givenAlleles.isEmpty() && region.size() == 0 ) { @@ -799,7 +926,7 @@ public List callRegion(final AssemblyRegion region, final Featur if (hcArgs.filterAlleles) { logger.debug("Filtering alleles"); - AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine); + AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine); //need to update haplotypes to find the alleles EventMap.buildEventMapsForHaplotypes(readLikelihoods.alleles(), assemblyResult.getFullReferenceWithPadding(), @@ -849,7 +976,7 @@ public List callRegion(final AssemblyRegion region, final Featur } } - final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( + final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods( haplotypes, subsettedReadLikelihoodsFinal, perSampleFilteredReadList, @@ -890,7 +1017,7 @@ public List callRegion(final AssemblyRegion region, final Featur result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - subsettedReadLikelihoodsFinal, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, + subsettedReadLikelihoodsFinal, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors)); trimmingResult.nonVariantRightFlankRegion().ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, VCpriors))); @@ -948,6 +1075,7 @@ protected boolean containsCalls(final CalledHaplotypes calledHaplotypes) { * @return a list of variant contexts (can be empty) to emit for this ref region */ protected List referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized, final List VCpriors) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region); if ( emitReferenceConfidence() ) { if ( needsToBeFinalized ) { AssemblyBasedCallerUtils.finalizeRegion(region, @@ -967,7 +1095,7 @@ protected List referenceModelForNoVariation(final 
AssemblyRegion final List haplotypes = Collections.singletonList(refHaplotype); return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, paddedLoc, region, AssemblyBasedCallerUtils.createDummyStratifiedReadMap(refHaplotype, samplesList, readsHeader, region), - genotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors); + localGenotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors); } else { return NO_CALLS; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java index b5b5c2cfa88..e47245c9f91 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java @@ -460,6 +460,7 @@ private void uncollapse(final CallRegionContext context) { } private void filter(final CallRegionContext context) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region); try { // no need for this step? @@ -479,7 +480,7 @@ private void filter(final CallRegionContext context) { context.suspiciousLocations = new HashSet<>(); if (hcArgs.filterAlleles) { logger.debug("Filtering alleles"); - AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine); + AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine); //need to update haplotypes to find the alleles EventMap.buildEventMapsForHaplotypes(context.readLikelihoods.alleles(), context.assemblyResult.getFullReferenceWithPadding(), @@ -520,6 +521,7 @@ private void filter(final CallRegionContext context) { } private void genotype(final CallRegionContext context) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region); // no need for this step? if ( context.regionVariants != null ) { @@ -542,7 +544,7 @@ private void genotype(final CallRegionContext context) { // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included // in the genotyping, but we lose information if we select down to a few haplotypes. 
[EB] List haplotypes = context.readLikelihoods.alleles(); - final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( + final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods( haplotypes, context.readLikelihoods, perSampleFilteredReadList, @@ -582,7 +584,7 @@ private void genotype(final CallRegionContext context) { result.addAll(referenceConfidenceModel.calculateRefConfidence(context.assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), context.assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - context.readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, + context.readLikelihoods, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, context.VCpriors)); context.nonVariantRightFlankRegion.ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, context.VCpriors))); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java index 095e2dab8ea..dd02b8e8d4b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java @@ -2,7 +2,7 @@ import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java new file mode 100644 index 00000000000..c166b061784 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java @@ -0,0 +1,203 @@ +package org.broadinstitute.hellbender.tools.walkers.mutect; + + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; +import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality; +import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; +import org.broadinstitute.hellbender.utils.haplotype.Haplotype; +import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine; +import org.broadinstitute.hellbender.utils.pileup.PileupElement; +import org.broadinstitute.hellbender.utils.read.AlignmentUtils; +import org.broadinstitute.hellbender.utils.read.Fragment; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; +import 
org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * For each sample and for each allele a list feature vectors of supporting reads + * In order to reduce the number of delimiter characters, we flatten featurized reads. For example, suppose allele 1 has featurized reads + * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is + * 1,2,3,4|5,6,7,8 + */ +public class FeaturizedReadSets { + private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class); + + public static final int DEFAULT_BASE_QUALITY = 25; + + private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA); + private static final int FEATURES_PER_RANGE = 5; + private static final List RANGES = List.of(5, 10, 25, 50); + public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size(); + private static final int VERY_BAD_QUAL_THRESHOLD = 10; + private static final int BAD_QUAL_THRESHOLD = 20; + + private FeaturizedReadSets() { } + + public static List >> getReadVectors(final VariantContext vc, + final Collection
samples, + final AlleleLikelihoods likelihoods, + final AlleleLikelihoods haplotypeLikelihoods, + final int refDownsample, + final int altDownsample, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode, readGroupIndices); + } + + // returns Lists (in allele order) of lists of read vectors supporting each allele + public static List >> getReadVectors(final VariantContext vc, + final Collection
samples, + final AlleleLikelihoods likelihoods, + final AlleleLikelihoods haplotypeLikelihoods, + final int refDownsample, + final int altDownsample, + final Map altDownsampleMap, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + final Map > readsByAllele = likelihoods.alleles().stream() + .collect(Collectors.toMap(a -> a, a -> new ArrayList<>())); + + samples.stream().flatMap(s -> likelihoods.bestAllelesBreakingTies(s).stream()) + .filter(ba -> ba.isInformative()) + .forEach(ba -> readsByAllele.get(ba.allele).add(ba.evidence)); + + // downsample if necessary + final Allele refAllele = likelihoods.alleles().stream().filter(Allele::isReference).findFirst().get(); + for (final Allele allele : likelihoods.alleles()) { + final int downsample = allele.isReference() ? refDownsample : altDownsampleMap.getOrDefault(allele, altDownsample); + if (readsByAllele.get(allele).size() > downsample) { + Collections.shuffle(readsByAllele.get(allele)); + readsByAllele.put(allele, readsByAllele.get(allele).subList(0, downsample)); + } + } + + final Map bestHaplotypes = new HashMap<>(); + samples.stream().flatMap(s -> haplotypeLikelihoods.bestAllelesBreakingTies(s).stream()) + .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele))); + + return vc.getAlleles().stream() + .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode, readGroupIndices)).collect(Collectors.toList())) + .collect(Collectors.toList()); + } + + + private static List featurize(final GATKRead read, final VariantContext vc, + final Map bestHaplotypes, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + final List result = new ArrayList<>(); + result.add(readGroupIndices.get(read.getReadGroup())); // this is read group metadata rather than part of the tensor + result.add(read.getMappingQuality()); + result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY)); + result.add(read.isFirstOfPair() ? 1 : 0); + result.add(read.isReverseStrand() ? 
1 : 0); + + // distances from ends of read + final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0); + result.add(readPositionOfVariantStart); + result.add(read.getLength() - readPositionOfVariantStart); + + + result.add(Math.abs(read.getFragmentLength())); + + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) { + // distances from ends of fragment + final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); + final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); + result.add(vc.getStart() - fragmentStart); + result.add(fragmentEnd - vc.getEnd()); + } + + // Ultima-specific read tags + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) { + result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s + result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float from 0 and 1, so we multiply by 1000 and round + } + + // mismatches versus best haplotype + final Haplotype haplotype = bestHaplotypes.get(read); + + // TODO: fix this + // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina + if (!bestHaplotypes.containsKey(read)) { + logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(), + vc.getContig(), vc.getStart())); + result.add(3); + result.add(2); + + for (int n = 0; n < NUM_RANGED_FEATURES; n++) { + result.add(0); + } + } else { + byte[] haplotypeBases = haplotype.getBases(); + final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + final GATKRead copy = read.copy(); + copy.setCigar(readToHaplotypeAlignment.getCigar()); + final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; + result.add(mismatchCount); + + final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); + result.add((int) indelsVsBestHaplotype); + + final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset(); + final AlignmentStateMachine asm = new AlignmentStateMachine(copy); + asm.stepForwardOnGenome(); + final List rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList(); + + while (!asm.isRightEdge()) { + final PileupElement pe = asm.makePileupElement(); + final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart); + + // pick which array's features we are accounting. 
If the ranges are 5, 10, 25, 50 and the distance is, say 8, then the '<= 10' range is relevant + final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst(); + if (relevantRange.isPresent()) { + final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt()); + if (pe.isAfterInsertion()) { + featuresToAddTo[0]++; + } + + if (pe.isDeletion()) { + featuresToAddTo[1]++; + } else { + final byte base = pe.getBase(); + final byte qual = pe.getQual(); + final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype]; + + if (base != haplotypeBase) { + featuresToAddTo[2]++; + } + + if (qual < VERY_BAD_QUAL_THRESHOLD) { + featuresToAddTo[3]++; + } else if (qual < BAD_QUAL_THRESHOLD) { + featuresToAddTo[4]++; + } + } + } + asm.stepForwardOnGenome(); + } + + for (final int[] featuresToAdd : rangedFeatures) { + for (final int val : featuresToAdd) { + result.add(val); + } + } + } + // the +1 is for the read group index that comes before the features + Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures() + 1, "Wrong number of features"); + + return result; + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index d25c7a33fc5..a6c619f1b17 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample"; public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset"; public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth"; + public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode"; public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10; public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20; - public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20; + public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1; @Override protected int getDefaultMaxMnpDistance() { return 1; } @@ -205,10 +206,33 @@ public double getDefaultAlleleFrequency() { @Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection") public File mutect3Dataset; + @Advanced + @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.") + public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA; + + public enum Mutect3DatasetMode { + ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES), + ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES); + + final private int numReadFeatures; + + Mutect3DatasetMode(final int numReadFeatures) { + this.numReadFeatures = numReadFeatures; + } + + public int getNumReadFeatures() { + return numReadFeatures; + } + } + /** * VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field) * contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors. * If this VCF is not given the dataset is generated with an weak-labelling strategy based on allele fractions. 
+     *
+     * Although the normal use of this input is in generating training data, it can also be used when generating test data
+     * for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as
+     * usual, and Permutect uses the labels to analyze the quality of its results.
      */
     @Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
     public FeatureInput<VariantContext> mutect3TrainingTruth;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
index f9c3733acca..0120fa9dae3 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
@@ -1,5 +1,6 @@
 package org.broadinstitute.hellbender.tools.walkers.mutect;
 
+import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.variant.variantcontext.writer.VariantContextWriter;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.ArgumentCollection;
@@ -262,7 +263,8 @@ public boolean shouldTrackPileupsForAssemblyRegions() {
     @Override
     public void onTraversalStart() {
         VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), null, Collections.emptyList(), false, false);
-        m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), referenceArguments.getReferenceSpecifier(), annotatorEngine);
+        m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(),
+                getBestAvailableSequenceDictionary(), referenceArguments.getReferenceSpecifier(), annotatorEngine);
         vcfWriter = createVCFWriter(outputVCF);
         if (m2Engine.emitReferenceConfidence()) {
             logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions.");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
index 1d2096f3c2b..c831a4169e0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.hellbender.tools.walkers.mutect;
 
 import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.util.Locatable;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
@@ -72,7 +73,7 @@
 public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseable {
 
     private static final List<String> STANDARD_MUTECT_INFO_FIELDS = Arrays.asList(GATKVCFConstants.NORMAL_LOG_10_ODDS_KEY, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, GATKVCFConstants.NORMAL_ARTIFACT_LOG_10_ODDS_KEY,
-            GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
+            GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
             GATKVCFConstants.GERMLINE_QUAL_KEY, GATKVCFConstants.CONTAMINATION_QUAL_KEY, GATKVCFConstants.SEQUENCING_QUAL_KEY, GATKVCFConstants.POLYMERASE_SLIPPAGE_QUAL_KEY,
             GATKVCFConstants.READ_ORIENTATION_QUAL_KEY, GATKVCFConstants.STRAND_QUAL_KEY, GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, GATKVCFConstants.N_COUNT_KEY, GATKVCFConstants.AS_UNIQUE_ALT_READ_SET_COUNT_KEY);
 
@@ -94,6 +95,7 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab
 
     final private M2ArgumentCollection MTAC;
     private SAMFileHeader header;
+    private SAMSequenceDictionary sequenceDictionary;
     private final int minCallableDepth;
 
     public static final String CALLABLE_SITES_NAME = "callable";
@@ -136,9 +138,12 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab
      * @param referenceSpec reference specifier for the reference
      * @param annotatorEngine annotator engine built with desired annotations
      */
-    public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs, final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) {
+    public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs,
+                         final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header,
+                         final SAMSequenceDictionary sequenceDictionary, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) {
         this.MTAC = Utils.nonNull(MTAC);
         this.header = Utils.nonNull(header);
+        this.sequenceDictionary = sequenceDictionary;
         minCallableDepth = MTAC.callableDepth;
         referenceReader = ReferenceUtils.createReferenceReader(Utils.nonNull(referenceSpec));
         aligner = SmithWatermanAligner.getAligner(MTAC.smithWatermanImplementation);
@@ -162,7 +167,7 @@ public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentColl
         annotationEngine = Utils.nonNull(annotatorEngine);
         assemblyEngine = MTAC.createReadThreadingAssembler();
         likelihoodCalculationEngine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(MTAC.likelihoodArgs, MTAC.fbargs, true, MTAC.likelihoodArgs.likelihoodEngineImplementation);
-        genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine);
+        genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine, header, sequenceDictionary);
         haplotypeBAMWriter = AssemblyBasedCallerUtils.createBamWriter(MTAC, createBamOutIndex, createBamOutMD5, header);
         trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, header.getSequenceDictionary());
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
index e6a080cd14f..ad708c8dac8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
@@ -1,21 +1,22 @@
 package org.broadinstitute.hellbender.tools.walkers.mutect;
 
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.SAMSequenceRecord;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
 import org.apache.commons.lang3.tuple.Triple;
-import org.apache.commons.math3.util.CombinatoricsUtils;
 import org.apache.commons.math3.util.FastMath;
-import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
 import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
-import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets;
 import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases;
 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
 import org.broadinstitute.hellbender.utils.IndexRange;
 import org.broadinstitute.hellbender.utils.MathUtils;
-import org.broadinstitute.hellbender.utils.NaturalLogUtils;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
 import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
@@ -46,8 +47,9 @@ private enum Label {
         ARTIFACT, VARIANT, UNLABELED, IGNORE
     }
 
-    // number of features for each vectorized read
-    private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ;
+    private final SAMSequenceDictionary sequenceDictionary;
+
+    private final Map<String, Integer> readGroupIndices = new HashMap<>();
 
     // number of additional variant features for assembly complexity, reference context
     private static final int NUM_EXTRA_FEATURES = 9;
@@ -71,6 +73,8 @@ private enum Label {
     private static final int MIN_REF = 5;
 
     private final PrintWriter printWriter;
+    private final PrintWriter contigPrintWriter;
+    private final PrintWriter readGroupPrintWriter;
 
     // number of nonartifact data to keep for each artifact datum
     private final int nonArtifactPerArtifact;
@@ -85,9 +89,15 @@ private enum Label {
 
     private final EnumMap<VariantType, ArrayBlockingQueue<Integer>> unmatchedArtifactAltCounts;
 
-    public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount, final int maxAltCount, final int nonArtifactPerArtifact, final Set<String> normalSamples) {
+    public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount,
+                                final int maxAltCount, final int nonArtifactPerArtifact, final Set<String> normalSamples,
+                                final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) {
         try {
             printWriter = new PrintWriter(new FileWriter(Utils.nonNull(datasetFile)));
+            final File contigTableFile = datasetFile.toPath().resolveSibling("contigs.table").toFile();
+            final File readGroupTableFile = datasetFile.toPath().resolveSibling("read-groups.table").toFile();
+            contigPrintWriter = new PrintWriter(new FileWriter(contigTableFile));
+            readGroupPrintWriter = new PrintWriter(new FileWriter(readGroupTableFile));
         } catch (IOException ex) {
             throw new UserException.BadInput("Could not create dataset file writer");
         }
@@ -98,6 +108,12 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
         this.maxRefCount = maxRefCount;
         this.maxAltCount = maxAltCount;
 
+        this.sequenceDictionary = sequenceDictionary;
+        final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
+        for (int n = 0; n < readGroups.size(); n++) {
+            readGroupIndices.put(readGroups.get(n).getReadGroupId(), n);
+        }
+
         unmatchedArtifactAltCounts = new EnumMap<>(VariantType.class);
         for (final VariantType type : VariantType.values()) {
             unmatchedArtifactAltCounts.put(type, new ArrayBlockingQueue<>(CAPACITY));
@@ -108,10 +124,11 @@
     public void addData(final ReferenceContext ref, final VariantContext vc, Optional<List<VariantContext>> truthVCs,
                         final AlleleLikelihoods likelihoods,
                         final AlleleLikelihoods logFragmentLikelihoods,
-                        final AlleleLikelihoods logFragmentAlleleLikelihoods) {
+                        final AlleleLikelihoods logFragmentAlleleLikelihoods,
+                        final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
         final String refBases = ReferenceBases.annotate(ref, vc);
         final String refAllele = vc.getReference().getBaseString();
-        final String contig = vc.getContig();
+        final int contigIndex = sequenceDictionary.getSequenceIndex(vc.getContig());
         final int position = vc.getStart();
         final Set<String> tumorSamples = likelihoods.samples().stream().filter(sample -> !normalSamples.contains(sample)).collect(Collectors.toSet());
         final int numAlt = vc.getNAlleles() - 1;
@@ -132,6 +149,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
         final int normalDepth = (int) MathUtils.sum(normalADs);
         final boolean hasNormal = normalDepth > 0;
 
+        final List<Allele> allRefAlleles = new ArrayList<>();
+        allRefAlleles.add(vc.getReference());
+        truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference())));
+        final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get();
+
+        // skip(1) excludes the reference allele
+        final List<Allele> remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream()
+                .skip(1).toList();
+
+        final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
+                .filter(truthVC -> ! truthVC.isFiltered())
+                .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream())
+                .collect(Collectors.toSet());
+        final List
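Outside the patch itself, a minimal, self-contained sketch of the index-based encoding the dataset engine introduces above: contig and read-group names are written as integer indices, with sidecar tables ("contigs.table" and "read-groups.table") recording the mapping. The class name, method names, and the tab-separated name/index layout below are assumptions for illustration only; the patch does not show the exact format the engine writes.

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical helper, not part of GATK: demonstrates the name-to-index convention,
// where list/dictionary order defines the index.
public final class IndexTableSketch {

    // Read group ID -> index, mirroring the loop in the Mutect3DatasetEngine constructor above.
    public static Map<String, Integer> readGroupIndices(final SAMFileHeader header) {
        final Map<String, Integer> indices = new HashMap<>();
        final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
        for (int n = 0; n < readGroups.size(); n++) {
            indices.put(readGroups.get(n).getReadGroupId(), n);
        }
        return indices;
    }

    // Write a contig index table; the "name<TAB>index" layout is an assumption for this sketch.
    public static void writeContigTable(final SAMSequenceDictionary dict, final File table) throws IOException {
        try (final PrintWriter writer = new PrintWriter(new FileWriter(table))) {
            for (final SAMSequenceRecord contig : dict.getSequences()) {
                writer.println(contig.getSequenceName() + "\t" + dict.getSequenceIndex(contig.getSequenceName()));
            }
        }
    }
}

With tables like these, a record that stores contigIndex (as in the new addData code) and a read-group index can be decoded back to names downstream, which is presumably why the engine emits them alongside the dataset file.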