diff --git a/.dockstore.yml b/.dockstore.yml index 72dd5485eab..a042e3a1bb4 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -309,8 +309,7 @@ workflows: branches: - master - ah_var_store - - vs_1396_bq_query_audit - - gg_VS-1395_Have_CreateVATFromVDS_MoreGoodness + - vs_1178_merge_master_to_ah_var_store_again tags: - /.*/ - name: GvsIngestTieout @@ -439,7 +438,7 @@ workflows: - master - ah_var_store - EchoCallset - - vs_1412_pgen_pvars_not_compressed + - vs_1178_merge_master_to_ah_var_store_again - name: MergePgenHierarchicalWdl subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/MergePgenHierarchical.wdl diff --git a/.github/actions/upload-gatk-test-results/action.yml b/.github/actions/upload-gatk-test-results/action.yml index b210216d2f0..e2c267e2c75 100644 --- a/.github/actions/upload-gatk-test-results/action.yml +++ b/.github/actions/upload-gatk-test-results/action.yml @@ -40,16 +40,17 @@ runs: name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }} path: build/reports/tests - - name: Upload to codecov - run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash) - shell: bash + # Disabling codecov because it is timing out and failing builds that otherwise succeed. + ## - name: Upload to codecov + ## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash) + ## shell: bash - name: Upload Reports if: ${{ inputs.only-artifact != 'true' }} id: uploadreports run: | gsutil -m cp -z html -z js -z xml -z css -r build/reports/tests gs:/${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/; - VIEW_URL=https://storage.googleapis.com${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/tests/testOnPackagedReleaseJar/index.html + VIEW_URL=https://storage.googleapis.com${{ env.HELLBENDER_TEST_LOGS }}${{ inputs.repo-path }}/tests/test/index.html echo "See the test report at ${VIEW_URL}"; echo view_url="${VIEW_URL}" >> $GITHUB_OUTPUT @@ -91,4 +92,4 @@ runs: run: | pip install --user PyGithub; python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }}; - shell: bash \ No newline at end of file + shell: bash diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml index 2f97d560b65..76fa7a9d07e 100644 --- a/.github/workflows/gatk-tests.yml +++ b/.github/workflows/gatk-tests.yml @@ -116,7 +116,7 @@ jobs: - name: 'Set up Cloud SDK' if: needs.check-secrets.outputs.google-credentials == 'true' - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/setup-gcloud@v2 - name: pull lfs files run: git lfs pull @@ -188,7 +188,7 @@ jobs: - name: 'Set up Cloud SDK' if: needs.check-secrets.outputs.google-credentials == 'true' - uses: google-github-actions/setup-gcloud@v0 + uses: google-github-actions/setup-gcloud@v2 - name: build test jars run: ./gradlew clean shadowTestClassJar shadowTestJar @@ -230,10 +230,6 @@ jobs: bash --init-file /gatk/gatkenv.rc /root/run_unit_tests.sh; TEST_EXIT_VALUE=$?; $( exit ${TEST_EXIT_VALUE} ); - sudo chmod -R a+w build/reports/; - mkdir build/reports/tests/test \ - && cp -rp build/reports/tests/testOnPackagedReleaseJar/* build/reports/tests/test \ - && rm -r build/reports/tests/testOnPackagedReleaseJar; - uses: ./.github/actions/upload-gatk-test-results if: always() diff --git a/.gitignore b/.gitignore index e95e5ab6094..a73810708a6 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ funcotator_tmp #Test generated 
dot files test*.dot +.vscode/ diff --git a/Dockerfile b/Dockerfile index d39a6ce95da..ceec398a9d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,18 @@ -ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.1.0 +ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0 # stage 1 for constructing the GATK zip FROM ${BASE_DOCKER} AS gradleBuild LABEL stage=gatkIntermediateBuildImage ARG RELEASE=false -RUN ls . + ADD . /gatk WORKDIR /gatk # Get an updated gcloud signing key, in case the one in the base image has expired -RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \ +#Download only resources required for the build, not for testing +RUN ls . && \ + rm /etc/apt/sources.list.d/google-cloud-sdk.list && \ apt update &&\ apt-key list && \ curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ @@ -19,16 +21,13 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \ apt-get -y clean && \ apt-get -y autoclean && \ apt-get -y autoremove && \ - rm -rf /var/lib/apt/lists/* -RUN git lfs install --force - -#Download only resources required for the build, not for testing -RUN git lfs pull --include src/main/resources/large - -RUN export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE -RUN cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ -RUN unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts -RUN chmod -R a+rw /gatk/unzippedJar + rm -rf /var/lib/apt/lists/* && \ + git lfs install --force && \ + git lfs pull --include src/main/resources/large && \ + export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE && \ + cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ && \ + unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts && \ + chmod -R a+rw /gatk/unzippedJar FROM ${BASE_DOCKER} @@ -47,17 +46,17 @@ RUN chmod -R a+rw /gatk COPY --from=gradleBuild /gatk/unzippedJar . #Setup linked jars that may be needed for running gatk -RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar -RUN ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar -RUN ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar +RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar && \ + ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar && \ + ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar WORKDIR /root # Make sure we can see a help message -RUN java -jar gatk.jar -h -RUN mkdir /gatkCloneMountPoint -RUN mkdir /jars -RUN mkdir .gradle +RUN java -jar gatk.jar -h && \ + mkdir /gatkCloneMountPoint && \ + mkdir /jars && \ + mkdir .gradle WORKDIR /gatk @@ -80,28 +79,16 @@ RUN echo "source activate gatk" > /root/run_unit_tests.sh && \ echo "ln -s /gatkCloneMountPoint/build/ /gatkCloneMountPoint/scripts/docker/build" >> /root/run_unit_tests.sh && \ echo "cd /gatk/ && /gatkCloneMountPoint/gradlew -Dfile.encoding=UTF-8 -b /gatkCloneMountPoint/dockertest.gradle testOnPackagedReleaseJar jacocoTestReportOnPackagedReleaseJar -a -p /gatkCloneMountPoint" >> /root/run_unit_tests.sh -# TODO: Determine whether we actually need this. For now it seems to be required because the version of libstdc++ on -# TODO: the gatk base docker is out of date (maybe?) 
-RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test && \ - apt-get update && \ - apt-get -y upgrade libstdc++6 && \ - apt-get -y dist-upgrade - -WORKDIR /root -RUN cp -r /root/run_unit_tests.sh /gatk -RUN cp -r gatk.jar /gatk -ENV CLASSPATH /gatk/gatk.jar:$CLASSPATH +RUN cp -r /root/run_unit_tests.sh /gatk && \ + cp -r /root/gatk.jar /gatk +ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH # Start GATK Python environment -WORKDIR /gatk -ENV PATH $CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \ echo "source activate gatk" >> /gatk/gatkenv.rc && \ echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \ conda clean -afy && \ - find /opt/miniconda/ -follow -type f -name '*.a' -delete && \ - find /opt/miniconda/ -follow -type f -name '*.pyc' -delete && \ rm -rf /root/.cache/pip CMD ["bash", "--init-file", "/gatk/gatkenv.rc"] diff --git a/README.md b/README.md index 1dc2bb366b8..26e731c26db 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ releases of the toolkit. * [Requirements](#requirements) * [Quick Start Guide](#quickstart) * [Downloading GATK4](#downloading) + * [Tools Included in Docker Image](#dockerSoftware) * [Building GATK4](#building) * [Running GATK4](#running) * [Passing JVM options to gatk](#jvmoptions) @@ -115,6 +116,34 @@ You can download and run pre-built versions of GATK4 from the following places: * You can download a GATK4 docker image from [our dockerhub repository](https://hub.docker.com/r/broadinstitute/gatk/). We also host unstable nightly development builds on [this dockerhub repository](https://hub.docker.com/r/broadinstitute/gatk-nightly/). * Within the docker image, run gatk commands as usual from the default startup directory (/gatk). +### Tools Included in Docker Image + +Our docker image contains the following bioinformatics tools, which can be run by invoking the tool name from the command line: +* bedtools (v2.30.0) +* samtools (1.13) +* bcftools (1.13) +* tabix (1.13+ds) + +We also include an installation of Python3 (3.6.10) with the following popular packages included: +* numpy +* scipy +* tensorflow +* pymc3 +* keras +* scikit-learn +* matplotlib +* pandas +* biopython +* pyvcf +* pysam + +We also include an installation of R (3.6.2) with the following popular packages included: +* data.table +* dplyr +* ggplot2 + +For more details on system packages, see the GATK [Base Dockerfile](scripts/docker/gatkbase/Dockerfile) and for more details on the Python3/R packages, see the [Conda environment setup file](scripts/gatkcondaenv.yml.template). Versions for the Python3/R packages can be found there. 
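A quick way to sanity-check the bundled tools from inside the container (a minimal sketch; the image tag and the exact versions printed will vary by release):

```bash
# Start an interactive shell in the GATK image (default working directory is /gatk)
docker run -it broadinstitute/gatk:latest

# The bundled bioinformatics tools are on the PATH:
samtools --version | head -1
bcftools --version | head -1
bedtools --version

# The conda-managed Python3 environment is active by default:
python -c "import numpy, scipy, pysam; print(numpy.__version__)"
```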
+ ## Building GATK4 * **To do a full build of GATK4, first clone the GATK repository using "git clone", then run:** diff --git a/build.gradle b/build.gradle index a41a1364aef..a8b3c976f64 100644 --- a/build.gradle +++ b/build.gradle @@ -16,6 +16,7 @@ plugins { id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating id 'com.palantir.git-version' version '0.5.1' //version helper + id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies } @@ -56,20 +57,20 @@ repositories { mavenLocal() } -final htsjdkVersion = System.getProperty('htsjdk.version','3.0.5') -final picardVersion = System.getProperty('picard.version','3.1.0') +final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0') +final picardVersion = System.getProperty('picard.version','3.1.1') final barclayVersion = System.getProperty('barclay.version','5.0.0') -final sparkVersion = System.getProperty('spark.version', '3.3.1') -final hadoopVersion = System.getProperty('hadoop.version', '3.3.1') -final disqVersion = System.getProperty('disq.version','0.3.6') -final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.0') -final bigQueryVersion = System.getProperty('bigQuery.version', '2.31.0') -final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.41.0') -final guavaVersion = System.getProperty('guava.version', '32.1.2-jre') +final sparkVersion = System.getProperty('spark.version', '3.5.0') +final hadoopVersion = System.getProperty('hadoop.version', '3.3.6') +final disqVersion = System.getProperty('disq.version','0.3.8') +final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.3') +final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0') +final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0') +final guavaVersion = System.getProperty('guava.version', '32.1.3-jre') final log4j2Version = System.getProperty('log4j2Version', '2.17.1') final testNGVersion = '7.0.0' -final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.0' +final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8' final baseJarName = 'gatk' final secondaryBaseJarName = 'hellbender' @@ -267,12 +268,12 @@ dependencies { // are routed to log4j implementation 'org.apache.logging.log4j:log4j-jcl:' + log4j2Version - implementation 'org.apache.commons:commons-lang3:3.5' - implementation 'org.apache.commons:commons-math3:3.5' + implementation 'org.apache.commons:commons-lang3:3.14.0' + implementation 'org.apache.commons:commons-math3:3.6.1' implementation 'org.hipparchus:hipparchus-stat:2.0' - implementation 'org.apache.commons:commons-collections4:4.1' - implementation 'org.apache.commons:commons-vfs2:2.0' - implementation 'org.apache.commons:commons-configuration2:2.4' + implementation 'org.apache.commons:commons-collections4:4.4' + implementation 'org.apache.commons:commons-vfs2:2.9.0' + implementation 'org.apache.commons:commons-configuration2:2.9.0' constraints { implementation('org.apache.commons:commons-text') { version { @@ -287,7 +288,7 @@ dependencies { implementation 'commons-io:commons-io:2.5' implementation 'org.reflections:reflections:0.9.10' - implementation 'it.unimi.dsi:fastutil:7.0.6' + implementation 'it.unimi.dsi:fastutil:7.0.13' implementation 'org.broadinstitute:hdf5-java-bindings:1.1.0-hdf5_2.11.0' implementation 
'org.broadinstitute:gatk-native-bindings:1.0.0' @@ -297,8 +298,8 @@ dependencies { exclude group: 'org.apache.commons' } - //there is no mllib_2.12.15:3.3.0, so stay use 2.12:3.3.0 - implementation ('org.apache.spark:spark-mllib_2.12:3.3.0') { + // TODO: migrate to mllib_2.12.15? + implementation ('org.apache.spark:spark-mllib_2.12:' + sparkVersion) { // JUL is used by Google Dataflow as the backend logger, so exclude jul-to-slf4j to avoid a loop exclude module: 'jul-to-slf4j' exclude module: 'javax.servlet' @@ -306,15 +307,6 @@ dependencies { } implementation 'com.thoughtworks.paranamer:paranamer:2.8' - implementation 'org.bdgenomics.bdg-formats:bdg-formats:0.5.0' - implementation('org.bdgenomics.adam:adam-core-spark2_2.12:0.28.0') { - exclude group: 'org.slf4j' - exclude group: 'org.apache.hadoop' - exclude group: 'org.scala-lang' - exclude module: 'kryo' - exclude module: 'hadoop-bam' - } - implementation 'org.jgrapht:jgrapht-core:1.1.0' implementation 'org.jgrapht:jgrapht-io:1.1.0' @@ -344,10 +336,10 @@ dependencies { implementation 'org.broadinstitute:gatk-bwamem-jni:1.0.4' implementation 'org.broadinstitute:gatk-fermilite-jni:1.2.0' - implementation 'org.broadinstitute:http-nio:0.1.0-rc1' + implementation 'org.broadinstitute:http-nio:1.1.0' // Required for COSMIC Funcotator data source: - implementation 'org.xerial:sqlite-jdbc:3.36.0.3' + implementation 'org.xerial:sqlite-jdbc:3.44.1.0' // natural sort implementation('net.grey-panther:natural-comparator:1.1') @@ -986,6 +978,18 @@ task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) { } } +// scan-gradle-plugin security vulnerability scan +ossIndexAudit { + allConfigurations = false // if true includes the dependencies in all resolvable configurations. By default is false, meaning only 'compileClasspath', 'runtimeClasspath', 'releaseCompileClasspath' and 'releaseRuntimeClasspath' are considered + useCache = true // true by default + outputFormat = 'DEFAULT' // Optional, other values are: 'DEPENDENCY_GRAPH' prints dependency graph showing direct/transitive dependencies, 'JSON_CYCLONE_DX_1_4' prints a CycloneDX 1.4 SBOM in JSON format. + showAll = false // if true prints all dependencies. By default is false, meaning only dependencies with vulnerabilities will be printed. + printBanner = true // if true will print ASCII text banner. By default is true. + + // ossIndexAudit can be configured to exclude vulnerabilities from matching + // excludeVulnerabilityIds = ['39d74cc8-457a-4e57-89ef-a258420138c5'] // list containing ids of vulnerabilities to be ignored + // excludeCoordinates = ['commons-fileupload:commons-fileupload:1.3'] // list containing coordinate of components which if vulnerable should be ignored +} /** *This specifies what artifacts will be built and uploaded when performing a maven upload. diff --git a/build_docker_remote.sh b/build_docker_remote.sh index 72534a02b03..7eada9565c3 100755 --- a/build_docker_remote.sh +++ b/build_docker_remote.sh @@ -1,12 +1,26 @@ #!/usr/bin/env bash # -# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built in the cloud rather than locally. Pushing to dockerhub is not supported by this script. +# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built +# in the cloud rather than locally. Pushing to dockerhub is not supported by this script. 
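As an aside on the build.gradle changes above: the new `org.sonatype.gradle.plugins.scan` plugin is what provides the `ossIndexAudit` task configured near the end of that file. A minimal sketch of running the scan locally, assuming the public unauthenticated OSS Index endpoint suffices:

```bash
# Run the Sonatype OSS Index dependency vulnerability scan configured in build.gradle.
# With showAll = false, only dependencies with reported vulnerabilities are printed.
./gradlew ossIndexAudit
```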
# -# If you are pushing an image to our release repositories, be sure that you've followed -# the setup instructions here: -# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_docker -# and here: -# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_gcloud +# By default the images are pushed to the following GCR repository: +# +# us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds +# +# with a name like "${YOUR_USERNAME}-${GITHUB_TAG}-${GIT_HASH_FOR_TAG}" +# +# This script should not be used to push to our release repositories. After +# you've built and staged an image, run the scripts/docker/release_prebuilt_docker_image.sh +# script to push to the release repositories. +# +# Usage: build_docker_remote.sh -e -d [-str] +# +# where is the github tag (or hash when -s is used) to use in building the docker image (e.g. 4.2.6.1) +# and is a directory in which to clone the repo and stage the build (DO NOT SPECIFY YOUR WORKING DIRECTORY) +# Optional arguments: +# -s The GITHUB_TAG (-e parameter) is actually a github hash, not tag. +# -r Build this image with the release flag set to true, causing the version number to not end with SNAPSHOT +# -t The tag to assign image once it is finished constructing. NOTE: currently this MUST be on either GCR or the Google Artifact Registry. # # Have script stop if there is an error @@ -20,24 +34,35 @@ STAGING_CLONE_DIR=${PROJECT}_staging_temp ################################################# # Parsing arguments ################################################# -while getopts "e:sd:t:" option; do +while getopts "e:sd:t:r" option; do case "$option" in e) GITHUB_TAG="$OPTARG" ;; s) IS_HASH=true ;; d) STAGING_DIR="$OPTARG" ;; t) DOCKER_IMAGE_TAG="$OPTARG" ;; + r) RELEASE=true ;; esac done -if [ -z "$GITHUB_TAG" ]; then - printf "Option -e requires an argument.\n \ -Usage: %s: -e [-sdt] \n \ -where is the github tag (or hash when -s is used) to use in building the docker image\n \ -(e.g. bash build_docker_remote.sh -e 4.2.6.1 )\n \ +function usage() { + MESSAGE=$1 + printf "%s\n \ +Usage: build_docker_remote.sh -e -d [-str] \n \ +where is the github tag (or hash when -s is used) to use in building the docker image (e.g. 4.2.6.1)\n \ +and is a directory in which to clone the repo and stage the build (DO NOT SPECIFY YOUR WORKING DIRECTORY)\n \ Optional arguments: \n \ -s \t The GITHUB_TAG (-e parameter) is actually a github hash, not tag. \n \ --d \t staging directory to grab code from repo and build the docker image. If unspecified, then use whatever is in current dir (do not go to the repo). NEVER SPECIFY YOUR WORKING DIR \n \ --t \t The tag to assign image once it is finished constructing. NOTE: currently this MUST be on either GCR or the Google Artifact Registry. \n" $0 +-r \t Build this image with the release flag set to true, causing the version number to not end with SNAPSHOT \n \ +-t \t The tag to assign image once it is finished constructing. NOTE: currently this MUST be on either GCR or the Google Artifact Registry. \n" "$MESSAGE" +} + +if [ -z "$GITHUB_TAG" ]; then + usage "Option -e (github tag) requires an argument." + exit 1 +fi + +if [ -z "$STAGING_DIR" ]; then + usage "Option -d (staging directory) requires an argument." 
exit 1 fi @@ -49,6 +74,7 @@ echo "Other options (Blank is false)" echo "---------------" echo "This is a git hash: ${IS_HASH}" echo "Staging directory: ${STAGING_DIR}" +echo "Release: ${RELEASE}" ORIGINAL_WORKING_DIRECTORY=$(pwd) @@ -81,14 +107,26 @@ if [ -z "$DOCKER_IMAGE_TAG" ]; then DOCKER_IMAGE_TAG=${GCR_REPO}:$(whoami)-${GITHUB_TAG}-${GIT_HASH_FOR_TAG} fi +echo "steps:" >> cloudbuild.yaml +echo "- name: 'gcr.io/cloud-builders/docker'" >> cloudbuild.yaml +if [ -n "$RELEASE" ]; then + echo " args: [ 'build', '-t', '${DOCKER_IMAGE_TAG}', '--build-arg', 'RELEASE=true', '.' ]" >> cloudbuild.yaml +else + echo " args: [ 'build', '-t', '${DOCKER_IMAGE_TAG}', '.' ]" >> cloudbuild.yaml +fi +echo "- name: 'gcr.io/cloud-builders/docker'" >> cloudbuild.yaml +echo " args: [ 'push', '${DOCKER_IMAGE_TAG}' ]" >> cloudbuild.yaml + echo "Building image with the tag ${DOCKER_IMAGE_TAG}..." -SUBMIT_COMMAND="gcloud builds submit --tag ${DOCKER_IMAGE_TAG} --timeout=24h --machine-type n1_highcpu_8" +SUBMIT_COMMAND="gcloud builds submit --config cloudbuild.yaml --timeout=24h --machine-type n1_highcpu_32" echo "running the following gcloud command: ${SUBMIT_COMMAND}" ## We need to override the default .gcloudignore to preserve the .git directory which we need in order to download LFS files in the remote build. echo -n "" >> .gcloudignore ${SUBMIT_COMMAND} +echo "Image successfully built and pushed to ${DOCKER_IMAGE_TAG}" + cd ${ORIGINAL_WORKING_DIRECTORY} if [ -n "$STAGING_DIR" ] ; then rm -Rf ${STAGING_DIR}/${STAGING_CLONE_DIR} diff --git a/codecov.yml b/codecov.yml index 734831e7d4e..aa2f0e4e323 100644 --- a/codecov.yml +++ b/codecov.yml @@ -25,9 +25,15 @@ coverage: branches: null informational: true +ignore: + - "src/test" comment: layout: "header, diff, changes, sunburst, uncovered, tree" branches: null behavior: default after_n_builds: 8 # Wait until all 8 of the test suite jacoco test builds have been uploaded before writing a comment to the pr (no more incomplete coverage report emails) + +# Disable line-by-line codecov comments in the github diff page +github_checks: + annotations: false diff --git a/scripts/docker/README.md b/scripts/docker/README.md index 01e4bdb0f65..c61b71424f0 100644 --- a/scripts/docker/README.md +++ b/scripts/docker/README.md @@ -21,7 +21,7 @@ This repo contains the scripts for creating and pushing two docker images: - gatkbase -- a basic docker image that we do not expect to change very often. The GATK4 docker image uses this one (``FROM``) - gatk4 -- the official docker image for GATK4. The instructions in this document pertain to this image, unless otherwise stated. -``scripts/docker/gatkbase/build_docker_base.sh`` is a script to create the gatkbase docker image. +``scripts/docker/gatkbase/build_docker_base_cloud.sh`` and ``scripts/docker/gatkbase/build_docker_base_locally.sh`` are scripts to create the gatkbase docker image. ``build_docker.sh`` is a script to create the full gatk4 docker image. ``build_docker_remote.sh`` is a script to create the full gatk4 docker image using google cloud build remotely. This is useful if you can't build the docker image locally (for example if you have an M1 Macbook) NOTE: this requires the user first specify their project with the command `gcloud config set project ` to a project that has access to google cloud build. @@ -148,19 +148,24 @@ exit This is a base image that does not require any files in the gatk repo (except the build script and Dockerfile, obviously). GATK docker images are dependent on this one. 
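Tying the `build_docker_remote.sh` changes above together, a hypothetical invocation (the project name and staging directory are placeholders):

```bash
# One-time setup: point gcloud at a project that can use Google Cloud Build
gcloud config set project my-cloud-build-project

# Build the 4.2.6.1 tag remotely, staging the clone in a scratch directory,
# with -r so the version string does not end in SNAPSHOT
./build_docker_remote.sh -e 4.2.6.1 -d /tmp/gatk-docker-staging -r
```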
**IMPORTANT** -- The gatkbase build script should be run from the ``scripts/docker/gatkbase`` directory. -- If you want to create a new version, you must modify the ``build_docker_base.sh`` script directly. Any changes should be committed to the repo. +- The gatkbase build scripts should be run from the ``scripts/docker/gatkbase`` directory. -#### Create gatkbase docker image and push it to docker hub +#### Build the gatkbase docker image on your local machine: ```bash -build_docker_base.sh -p +build_docker_base_locally.sh ``` -#### Create gatkbase docker image and do not push it to docker hub +#### Build the gatkbase docker image remotely using Google Cloud Build: + +```bash +build_docker_base_cloud.sh +``` + +#### Release a pre-built gatkbase image to the official repositories, after testing it: ```bash -build_docker_base.sh -p +release_prebuilt_base_image.sh ``` diff --git a/scripts/docker/dockertest.gradle b/scripts/docker/dockertest.gradle index bbbe4780d8e..692bc945db9 100644 --- a/scripts/docker/dockertest.gradle +++ b/scripts/docker/dockertest.gradle @@ -144,6 +144,9 @@ task testOnPackagedReleaseJar(type: Test){ jvmArgs = getJVMArgs(runtimeAddOpens, testAddOpens) + //Set this to match the name of the normal test output in order to simplify finding / uploading it + reports.html.outputLocation = file("$buildDir/reports/tests/test") + classpath = files( gatkJar, testDependencyJar, testClassesJar) testClassesDirs = files(testClassesUnpacked) } diff --git a/scripts/docker/gatkbase/Dockerfile b/scripts/docker/gatkbase/Dockerfile index 7f4d8cb6a7b..a4ba3ac557a 100644 --- a/scripts/docker/gatkbase/Dockerfile +++ b/scripts/docker/gatkbase/Dockerfile @@ -1,12 +1,21 @@ # Using OpenJDK 17 # This Dockerfile does not require any files that are in the GATK4 repo. -FROM ubuntu:18.04 +# NOTE: If you update the ubuntu version make sure to update the samtools/bcftools/bedtools versions in the README +FROM ubuntu:22.04 -#### Basic image utilities -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y --no-install-recommends \ +# Set environment variables. +# Avoid interactive prompts during apt installs/upgrades +ENV DEBIAN_FRONTEND="noninteractive" HOME="/root" JAVA_LIBRARY_PATH="/usr/lib/jni" DOWNLOAD_DIR="/downloads" CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh" CONDA_SHA256="c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a" CONDA_PATH="/opt/miniconda" PATH="/opt/miniconda/bin:$PATH" + +# Define working directory. 
+WORKDIR /root + +#### Basic image utilities, google cloud support, and miniconda +RUN apt update && \ + apt full-upgrade -y && \ + apt install -y --no-install-recommends \ python3 \ + python3-pip \ wget \ curl \ bc \ @@ -15,56 +24,41 @@ RUN apt-get update && \ less \ bedtools \ samtools \ + bcftools \ tabix \ + jq \ + git \ gpg-agent \ build-essential \ openjdk-17-jdk \ + vim \ software-properties-common && \ - apt-get -y clean && \ - apt-get -y autoclean && \ - apt-get -y autoremove && \ - rm -rf /var/lib/apt/lists/* - -RUN java -version - -#### Specific for google cloud support -RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ + apt -y clean && \ + apt -y autoclean && \ + apt -y autoremove && \ + rm -rf /var/lib/apt/lists/* && \ + java -version && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \ - apt-get update -y && \ - apt-get install -y --no-install-recommends google-cloud-cli && \ - apt-get -y clean && \ - apt-get -y autoclean && \ - apt-get -y autoremove && \ + apt update -y && \ + apt install -y --no-install-recommends google-cloud-cli && \ + apt -y clean && \ + apt -y autoclean && \ + apt -y autoremove && \ rm -rf /var/lib/apt/lists/* && \ # Remove the anthos cli tool and related files since they are very large and we / anyone using the docker are unlikely to use them # Remove the bundled python because we have python installed separately rm -rf /usr/lib/google-cloud-sdk/bin/anthoscli /usr/lib/google-cloud-sdk/platform/anthoscli_licenses /usr/lib/google-cloud-sdk/platform/bundledpythonunix && \ - find / -wholename "*__pycache__/*.pyc" -exec rm {} + - -# Set environment variables. -ENV HOME /root - -# Define working directory. -WORKDIR /root - -# Define default command. -CMD ["bash"] - -ENV JAVA_LIBRARY_PATH /usr/lib/jni - -# Install miniconda -ENV DOWNLOAD_DIR /downloads -ENV CONDA_URL https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -ENV CONDA_MD5 = "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787" -ENV CONDA_PATH /opt/miniconda -ENV PATH $CONDA_PATH/bin:$PATH -RUN mkdir $DOWNLOAD_DIR && \ + find / -wholename "*__pycache__/*.pyc" -exec rm {} + && \ + mkdir $DOWNLOAD_DIR && \ wget -nv -O $DOWNLOAD_DIR/miniconda.sh $CONDA_URL && \ - test "`md5sum $DOWNLOAD_DIR/miniconda.sh | awk -v FS=' ' '{print $1}'` = $CONDA_MD5" && \ + test "$(sha256sum $DOWNLOAD_DIR/miniconda.sh | awk -v FS=' ' -v ORS='' '{print $1}')" = "$CONDA_SHA256" && \ bash $DOWNLOAD_DIR/miniconda.sh -p $CONDA_PATH -b && \ rm $DOWNLOAD_DIR/miniconda.sh && \ conda clean -afy && \ - find /opt/miniconda/ -follow -type f -name '*.a' -delete && \ - find /opt/miniconda/ -follow -type f -name '*.pyc' -delete && \ - rm -rf /root/.cache/pip + conda config --set auto_update_conda false && \ + conda config --set solver libmamba && \ + rm -rf /root/.cache/pip + +CMD ["bash"] \ No newline at end of file diff --git a/scripts/docker/gatkbase/README.md b/scripts/docker/gatkbase/README.md index f020175ae78..9d8b2e81a8f 100644 --- a/scripts/docker/gatkbase/README.md +++ b/scripts/docker/gatkbase/README.md @@ -1,16 +1,15 @@ # How to update the GATK base docker image: -1. 
choose a new version number for the base image and manually update the version in `scripts/docker/gatkbase/build_docker_base.sh` -2. build the gatkbase image using that script and upload it to the [gatk-dev docker repo](https://hub.docker.com/r/broadinstitute/gatk-dev/) or [gcr-gatk-snapshots](us.gcr.io/broad-dsde-methods/broad-gatk-snapshots) - * cd to scripts/docker/gatkbase - * run `./build_docker_base.sh` - * `docker tag broadinstitute/gatk-dev:your-version-rc1` or whatever the correct tag is for where you want it uploaded - * `docker push tagname` -3. update the Dockerfile in the main gatk to use the image you pushed -4. commit the changes to the two docker files and to a new pull request -5. wait for the tests to pass and show it to a reviewer -6. push the base image to the official [gatk repo](https://hub.docker.com/r/broadinstitute/gatk) with the right name -7. update the main docker to point to the official version you just released -8. wait for tests to pass in travis and merge +1. In a branch, make whatever updates you need to the base image Dockerfile in `scripts/docker/gatkbase` +2. Choose a new version number for the new base image +3. Build the GATK base image by running either `build_docker_base_cloud.sh` (to build in the cloud using Google Cloud Build) or `build_docker_base_locally.sh` (to build on your local machine) from within the `scripts/docker/gatkbase` directory in your GATK clone. + * Both scripts take the version of the new image as their only argument. E.g., `build_docker_base_cloud.sh 3.0.0rc1` will create the remote image `us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area:gatkbase-3.0.0rc1` +4. If you built the image locally, you'll need to manually push it to the staging area at `us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area` +5. Test the new image using the GATK branch with your changes to the base image Dockerfile, by modifying the FROM clause at the top of the main GATK Dockerfile to point to the base image you staged, and submitting a PR for the branch to let the test suite run. +6. Once tests pass and everything looks good, push the new base image to the release repositories using the `release_prebuilt_base_image.sh` script + * The release script takes as arguments the prebuilt image you've tested, and a final version number for release. For example: `release_prebuilt_base_image.sh us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area:gatkbase-3.0.0rc1 3.0.0` + * The release script looks for the image locally first, and if not found locally it pulls it from the remote repository before releasing. +7. Update the FROM clause in the main GATK Dockerfile in your GATK PR to point to the officially-released image, and merge it once tests pass.
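Condensed into commands, the staging-and-release workflow described above looks roughly like this (the version numbers are the same examples used in the README steps):

```bash
cd scripts/docker/gatkbase

# Step 3: build the candidate base image remotely and stage it
./build_docker_base_cloud.sh 3.0.0rc1
# -> us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area:gatkbase-3.0.0rc1

# Steps 5-6: once the staged image passes the GATK test suite, release it
./release_prebuilt_base_image.sh \
    us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area:gatkbase-3.0.0rc1 3.0.0
```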
+ diff --git a/scripts/docker/gatkbase/build_docker_base.sh b/scripts/docker/gatkbase/build_docker_base.sh deleted file mode 100755 index ca6440c6880..00000000000 --- a/scripts/docker/gatkbase/build_docker_base.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# Have script stop if there is an error -set -e - - -REPO=broadinstitute -PROJECT=gatk -VERSION=2.3.0 -FULL_PATH=${REPO}/${PROJECT}:gatkbase-${VERSION} - -################################################# -# Parsing arguments -################################################# -while getopts "ph" option; do - case "$option" in - h) IS_HELP=true ;; - p) IS_PUSH=true ;; - esac -done - -if [ -n "$IS_HELP" ]; then - printf "Build the GATK4 base image.\n \ -Usage: %s: [-p] [-h] \n \ -Optional arguments: \n \ --p \t push image to docker hub once complete. \n \n" $0 - exit 1 -fi - -# Output the parameters -echo -e "\n" -echo -e "docker hub repo, project, and tag: ${FULL_PATH}\n\n" -echo "Other options (Blank is false)" -echo "---------------" -echo "Push to dockerhub: ${IS_PUSH}" - -# Login to dockerhub -if [ -n "${IS_PUSH}" ]; then - echo "Please login to dockerhub" - docker login -fi - -# Build -echo "Building image to tag ${FULL_PATH}..." -docker build --squash -t ${FULL_PATH} . - -## Push -if [ -n "${IS_PUSH}" ]; then - docker push ${FULL_PATH} -else - echo "Not pushing to dockerhub" -fi - diff --git a/scripts/docker/gatkbase/build_docker_base_cloud.sh b/scripts/docker/gatkbase/build_docker_base_cloud.sh new file mode 100755 index 00000000000..897b4b2a753 --- /dev/null +++ b/scripts/docker/gatkbase/build_docker_base_cloud.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# +# A script that builds the GATK base image using Google Cloud Build, and pushes it to +# a staging location at us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area +# +# Usage: build_docker_base_cloud.sh +# +# After staging the image, you should test it with GATK before actually releasing it +# using the release_prebuilt_base_image.sh script. You can test it by modifying the +# main GATK Dockerfile FROM clause to point temporarily at the staged base image, and +# submitting that as a PR to trigger a test suite run. +# + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +IMAGE_VERSION=$1 +IMAGE_NAME="us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area" +DOCKER_IMAGE_TAG="${IMAGE_NAME}:gatkbase-${IMAGE_VERSION}" + +gcloud builds submit --tag ${DOCKER_IMAGE_TAG} --timeout=24h --machine-type n1_highcpu_32 + +if [ $? -ne 0 ]; then + echo "gcloud builds submit failed" + exit 1 +fi + +echo "Successfully published image to staging area at ${DOCKER_IMAGE_TAG}" +exit 0 \ No newline at end of file diff --git a/scripts/docker/gatkbase/build_docker_base_locally.sh b/scripts/docker/gatkbase/build_docker_base_locally.sh new file mode 100755 index 00000000000..bc6acce2011 --- /dev/null +++ b/scripts/docker/gatkbase/build_docker_base_locally.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# A script that builds the GATK base image locally (but does not push it anywhere). +# +# Usage: build_docker_base_locally.sh +# +# After building the image, you should test it with GATK before actually releasing it +# using the release_prebuilt_base_image.sh script. You can test it by modifying the +# main GATK Dockerfile FROM clause to point temporarily at the candidate base image, and +# submitting that as a PR to trigger a test suite run. 
+# + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +IMAGE_VERSION=$1 +IMAGE_REPO="us.gcr.io/broad-dsde-methods/gatk-base-image-staging-area" +IMAGE_FULL_TAG="${IMAGE_REPO}:gatkbase-${IMAGE_VERSION}" + +# Build +echo "Building image to tag ${IMAGE_FULL_TAG}..." +docker build --squash -t "${IMAGE_FULL_TAG}" . + +if [ $? -ne 0 ]; then + echo "docker build failed" + exit 1 +fi + +echo "docker build succeeded" +exit 0 + diff --git a/scripts/docker/gatkbase/release_prebuilt_base_image.sh b/scripts/docker/gatkbase/release_prebuilt_base_image.sh new file mode 100755 index 00000000000..1647be93810 --- /dev/null +++ b/scripts/docker/gatkbase/release_prebuilt_base_image.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# +# A script that takes a prebuilt GATK base image, and pushes it to the GATK release repositories on +# dockerhub and GCR. Use the build_docker_base_cloud.sh or build_docker_base_locally.sh scripts to +# build the image before running this script. +# +# Usage: release_prebuilt_base_image.sh +# +# If the prebuilt image exists locally, this script will push the local version. Otherwise, +# it will pull the image from the remote repository before pushing it to the GATK release +# repositories. +# +# prebuilt_image: The pre-built image you want to release (make sure you've tested it!) +# May be either local or remote +# version_tag_for_release: The version of the GATK base image you're releasing (eg., 3.1.0) +# + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +PREBUILT_IMAGE="$1" +VERSION="$2" +DOCKERHUB_REPO="broadinstitute/gatk" +GCR_REPO="us.gcr.io/broad-gatk/gatk" +BASE_IMAGE_FULL_TAG="gatkbase-${VERSION}" + +function fatal_error() { + echo "$1" 1>&2 + exit 1 +} + +function docker_push() { + echo "Pushing to ${1}" + docker push "${1}" + if [ $? -ne 0 ]; then + fatal_error "Failed to push to ${1}" + fi +} + +# Test if the prebuilt image exists locally, and pull it if it doesn't +docker image inspect "${PREBUILT_IMAGE}" > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "Image ${PREBUILT_IMAGE} not found locally: attempting to pull it now" + docker pull "${PREBUILT_IMAGE}" + if [ $? -ne 0 ]; then + fatal_error "Failed to pull pre-built image ${PREBUILT_IMAGE}" + fi +else + echo "Image ${PREBUILT_IMAGE} found locally: pushing it to the release repositories" +fi + +docker tag "${PREBUILT_IMAGE}" "${DOCKERHUB_REPO}:${BASE_IMAGE_FULL_TAG}" +docker tag "${PREBUILT_IMAGE}" "${GCR_REPO}:${BASE_IMAGE_FULL_TAG}" + +docker_push "${DOCKERHUB_REPO}:${BASE_IMAGE_FULL_TAG}" +docker_push "${GCR_REPO}:${BASE_IMAGE_FULL_TAG}" + +exit 0 \ No newline at end of file diff --git a/scripts/docker/release_prebuilt_docker_image.sh b/scripts/docker/release_prebuilt_docker_image.sh new file mode 100644 index 00000000000..ca4b65b6c56 --- /dev/null +++ b/scripts/docker/release_prebuilt_docker_image.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# +# A script that takes a prebuilt docker image, and pushes it to the GATK release repositories on +# dockerhub and GCR +# +# Usage: release_prebuilt_docker_image.sh +# +# If the prebuilt image exists locally, this script will push the local version. Otherwise, +# it will pull the image from the remote repository before pushing it to the GATK release +# repositories. +# +# prebuilt_image: The pre-built image you want to release (make sure you've tested it!) 
+# May be either local or remote +# version_tag_for_release: The version of GATK you're releasing (eg., 4.0.0.0) +# + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +PREBUILT_IMAGE="$1" +VERSION="$2" +DOCKERHUB_REPO="broadinstitute/gatk" +GCR_REPO="us.gcr.io/broad-gatk/gatk" + +function fatal_error() { + echo "$1" 1>&2 + exit 1 +} + +function docker_push() { + echo "Pushing to ${1}" + docker push "${1}" + if [ $? -ne 0 ]; then + fatal_error "Failed to push to ${1}" + fi +} + +# Test if the prebuilt image exists locally, and pull it if it doesn't +docker image inspect "${PREBUILT_IMAGE}" > /dev/null 2>&1 +if [ $? -ne 0 ]; then + echo "Image ${PREBUILT_IMAGE} not found locally: attempting to pull it now" + docker pull "${PREBUILT_IMAGE}" + if [ $? -ne 0 ]; then + fatal_error "Failed to pull pre-built image ${PREBUILT_IMAGE}" + fi +else + echo "Image ${PREBUILT_IMAGE} found locally: pushing it to the release repositories" +fi + +docker tag "${PREBUILT_IMAGE}" "${DOCKERHUB_REPO}:${VERSION}" +docker tag "${DOCKERHUB_REPO}:${VERSION}" "${DOCKERHUB_REPO}:latest" +docker tag "${PREBUILT_IMAGE}" "${GCR_REPO}:${VERSION}" +docker tag "${GCR_REPO}:${VERSION}" "${GCR_REPO}:latest" + +docker_push "${DOCKERHUB_REPO}:${VERSION}" +docker_push "${DOCKERHUB_REPO}:latest" +docker_push "${GCR_REPO}:${VERSION}" +docker_push "${GCR_REPO}:latest" + +exit 0 \ No newline at end of file diff --git a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py index 8964c9dc294..826795e92b5 100755 --- a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py +++ b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py @@ -90,7 +90,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict): tsvReader = GenericTsvReader(inputFilename) headers = tsvReader.getFieldNames() print('Found headers (input): ' + str(headers)) - if "Translocation Name" not in headers: + if "TRANSLOCATION_NAME" not in headers: raise NotImplementedError("Could not find Translocation Name column in the input file.") outputHeaders = ['gene', 'fusion_genes', 'fusion_id'] @@ -99,7 +99,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict): fusionGeneDict = OrderedDict() last_i = 0 for i, line in enumerate(tsvReader): - fusion_gene_description = line['Translocation Name'] + fusion_gene_description = line['TRANSLOCATION_NAME'] if len(fusion_gene_description.strip()) == 0: # blank diff --git a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh index 800242de91a..b7a756a7932 100755 --- a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh +++ b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh @@ -18,6 +18,7 @@ set -e COSMIC_FILE=CosmicCompleteTargetedScreensMutantExport.tsv OUT_DB_FILE="Cosmic.db" +OUT_TMP_FOLDER="~/tmp" ################################################################################ @@ -29,6 +30,10 @@ if [[ $# -gt 1 ]] ; then OUT_DB_FILE=$2 fi +if [[ $# -gt 2 ]] ; then + OUT_TMP_FOLDER=$3 +fi + if [ ! 
-f ${COSMIC_FILE} ] ; then echo "ERROR: Given COSMIC file does not exist: ${COSMIC_FILE}" 1>&2 exit 1 @@ -42,6 +47,7 @@ sqlite3 ${OUT_DB_FILE} < cosmic/metadata.txt echo "User: ${EMAIL}" >> cosmic/metadata.txt diff --git a/scripts/funcotator/data_sources/downloadHgncDataSource.sh b/scripts/funcotator/data_sources/downloadHgncDataSource.sh index aa3d120d67d..b3e86a90b81 100755 --- a/scripts/funcotator/data_sources/downloadHgncDataSource.sh +++ b/scripts/funcotator/data_sources/downloadHgncDataSource.sh @@ -2,5 +2,5 @@ # Downloads the HGNC data source from the HGNC website. -curl 'https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv +curl 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submi' > hgnc_download_$(date +%b%d%Y).tsv diff --git a/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh new file mode 100644 index 00000000000..5f124ef6a67 --- /dev/null +++ b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +#NOTE: This script has been checked in to aid in the release process for future Funcotator datasource bundles. + +echo "Making Tarballs of each Datasource Directory..." 
+ +tar -zcvf funcotator_dataSources.v1.8.hg38.20230908s.tar.gz funcotator_dataSources.v1.8.hg38.20230908s +tar -zcvf funcotator_dataSources.v1.8.hg38.20230908g.tar.gz funcotator_dataSources.v1.8.hg38.20230908g +tar -zcvf funcotator_dataSources.v1.8.hg19.20230908s.tar.gz funcotator_dataSources.v1.8.hg19.20230908s +tar -zcvf funcotator_dataSources.v1.8.hg19.20230908g.tar.gz funcotator_dataSources.v1.8.hg19.20230908g + +echo "Making the various hashfiles for release" + +find funcotator_dataSources.v1.8.hg38.20230908s -type f | xargs md5sum > funcotator_dataSources.v1.8.hg38.20230908s.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg38.20230908s.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg38.20230908s.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg38.20230908s.tar.gz > funcotator_dataSources.v1.8.hg38.20230908s.sha256 + +find funcotator_dataSources.v1.8.hg38.20230908g -type f | xargs md5sum > funcotator_dataSources.v1.8.hg38.20230908g.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg38.20230908g.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg38.20230908g.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg38.20230908g.tar.gz > funcotator_dataSources.v1.8.hg38.20230908g.sha256 + +find funcotator_dataSources.v1.8.hg19.20230908s -type f | xargs md5sum > funcotator_dataSources.v1.8.hg19.20230908s.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg19.20230908s.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg19.20230908s.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg19.20230908s.tar.gz > funcotator_dataSources.v1.8.hg19.20230908s.sha256 + +find funcotator_dataSources.v1.8.hg19.20230908g -type f | xargs md5sum > funcotator_dataSources.v1.8.hg19.20230908g.dir.long.md5sum +md5sum funcotator_dataSources.v1.8.hg19.20230908g.tar.gz | awk '{print $1}' > funcotator_dataSources.v1.8.hg19.20230908g.dir.md5sum +sha256sum funcotator_dataSources.v1.8.hg19.20230908g.tar.gz > funcotator_dataSources.v1.8.hg19.20230908g.sha256 \ No newline at end of file diff --git a/scripts/funcotator/data_sources/getGencode.sh b/scripts/funcotator/data_sources/getGencode.sh index 7ab0316787b..aaf159add43 100755 --- a/scripts/funcotator/data_sources/getGencode.sh +++ b/scripts/funcotator/data_sources/getGencode.sh @@ -15,7 +15,7 @@ MAXARGS=0 # Latest release numbers for our references. # Update these numbers when a new Gencode is released. 
-LATEST_RELEASE=34 +LATEST_RELEASE=43 DATA_SOURCE_NAME="Gencode" OUT_DIR_NAME='gencode' diff --git a/scripts/funcotator/data_sources/getGencodeXHGNC.sh b/scripts/funcotator/data_sources/getGencodeXHGNC.sh index 656e6c8bc45..cb4ada59b1f 100755 --- a/scripts/funcotator/data_sources/getGencodeXHGNC.sh +++ b/scripts/funcotator/data_sources/getGencodeXHGNC.sh @@ -8,10 +8,10 @@ outFileBaseName="gencode_xhgnc" outExt=".tsv" hg19db="homo_sapiens_core_75_37" -hg38db="homo_sapiens_core_90_38" +hg38db="homo_sapiens_core_110_38" hg19FileName=${outFileBaseName}_v75_37.hg19${outExt} -hg38FileName=${outFileBaseName}_v90_38.hg38${outExt} +hg38FileName=${outFileBaseName}_v110_38.hg38${outExt} ################################################################################ diff --git a/scripts/funcotator/data_sources/getGencodeXRefseq.sh b/scripts/funcotator/data_sources/getGencodeXRefseq.sh index 58c06455e4b..809fe22fcbb 100755 --- a/scripts/funcotator/data_sources/getGencodeXRefseq.sh +++ b/scripts/funcotator/data_sources/getGencodeXRefseq.sh @@ -8,10 +8,10 @@ outFileBaseName="gencode_xrefseq" outExt=".tsv" hg19db="homo_sapiens_core_75_37" -hg38db="homo_sapiens_core_90_38" +hg38db="homo_sapiens_core_110_38" hg19FileName=${outFileBaseName}_v75_37.hg19${outExt} -hg38FileName=${outFileBaseName}_v90_38.hg38${outExt} +hg38FileName=${outFileBaseName}_v110_38.hg38${outExt} ################################################################################ diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template index fa115de3325..75284b829e8 100644 --- a/scripts/gatkcondaenv.yml.template +++ b/scripts/gatkcondaenv.yml.template @@ -10,6 +10,8 @@ # used by the testGATKPythonEnvironmentPackagePresent test in PythonEnvironmentIntegrationTest needs to be updated # to reflect the changes. # +# NOTE: If you update any of the packages below, please make sure the main README is up to date with the latest package install information. +# name: $condaEnvName channels: # if channels other than conda-forge are added and the channel order is changed (note that conda channel_priority is currently set to flexible), @@ -20,7 +22,7 @@ dependencies: # core python dependencies - conda-forge::python=3.6.10 # do not update -- pip=20.0.2 # specifying channel may cause a warning to be emitted by conda +- conda-forge::pip=21.3.1 - conda-forge::mkl=2019.5 # MKL typically provides dramatic performance increases for theano, tensorflow, and other key dependencies - conda-forge::mkl-service=2.3.0 - conda-forge::joblib=1.1.1 # must pin joblib - versions after 1.1.1 no longer support python 3.6 diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 9ff90cedef8..18f7004419f 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -95,6 +95,7 @@ workflow Mutect2 { # additional modes and outputs File? realignment_index_bundle String? realignment_extra_args + File? dragstr_model Boolean run_orientation_bias_mixture_model_filter = false Boolean make_bamout = false Boolean compress_vcfs = false @@ -178,6 +179,7 @@ workflow Mutect2 { getpileupsummaries_extra_args = getpileupsummaries_extra_args, variants_for_contamination = variants_for_contamination, variants_for_contamination_idx = variants_for_contamination_idx, + dragstr_model = dragstr_model, make_bamout = make_bamout, run_ob_filter = run_orientation_bias_mixture_model_filter, compress_vcfs = compress_vcfs, @@ -312,6 +314,8 @@ workflow Mutect2 { File? maf_segments = CalculateContamination.maf_segments File? 
read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table File? m3_dataset = Concatenate.concatenated + File permutect_contigs_table = select_first(M2.permutect_contigs_table) + File permutect_read_groups_table = select_first(M2.permutect_read_groups_table) } } @@ -380,6 +384,7 @@ task M2 { File? gga_vcf_idx File? variants_for_contamination File? variants_for_contamination_idx + File? dragstr_model File? gatk_override @@ -439,26 +444,27 @@ task M2 { touch bamout.bam touch f1r2.tar.gz touch dataset.txt - echo "" > normal_name.txt - - gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_reads} -O tumor_name.txt -encode \ - ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - tumor_command_line="-I ~{tumor_reads} -tumor `cat tumor_name.txt`" + touch contigs.table + touch read-groups.table if [[ ! -z "~{normal_reads}" ]]; then - gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_name.txt -encode \ + gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_names.txt -encode \ ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - normal_command_line="-I ~{normal_reads} -normal `cat normal_name.txt`" + # add "-normal " to the start of each line and " " to the end, then remove newlines + # to get -normal sample1 -normal sample2 etc + normal_sample_line=`awk '{ print "-normal", $0 }' normal_names.txt | tr '\n' ' '` fi gatk --java-options "-Xmx~{command_mem}m" Mutect2 \ -R ~{ref_fasta} \ - $tumor_command_line \ - $normal_command_line \ + -I ~{tumor_reads} \ + ~{"-I " + normal_reads} \ + $normal_sample_line \ ~{"--germline-resource " + gnomad} \ ~{"-pon " + pon} \ ~{"-L " + intervals} \ ~{"--alleles " + gga_vcf} \ + ~{"--dragstr-params-path " + dragstr_model} \ -O "~{output_vcf}" \ ~{true='--bam-output bamout.bam' false='' make_bamout} \ ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ @@ -474,7 +480,7 @@ task M2 { # If the variants for contamination and the intervals for this scatter don't intersect, GetPileupSummaries # throws an error. However, there is nothing wrong with an empty intersection for our purposes; it simply doesn't - # contribute to the merged pileup summaries that we create downstream. We implement this by with array outputs. + # contribute to the merged pileup summaries that we create downstream. We implement this via array outputs. # If the tool errors, no table is created and the glob yields an empty array. 
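# Editor's sketch (illustrative only, not part of the task command): the awk/tr pipeline above in
# this task's command block expands a multi-line normal_names.txt into repeated -normal arguments.
# With hypothetical sample names:
#   printf 'NA12891\nNA12892\n' > normal_names.txt
#   awk '{ print "-normal", $0 }' normal_names.txt | tr '\n' ' '
#   # yields: -normal NA12891 -normal NA12892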
set +e @@ -509,13 +515,13 @@ task M2 { File unfiltered_vcf = "~{output_vcf}" File unfiltered_vcf_idx = "~{output_vcf_idx}" File output_bamOut = "bamout.bam" - String tumor_sample = read_string("tumor_name.txt") - String normal_sample = read_string("normal_name.txt") File stats = "~{output_stats}" File f1r2_counts = "f1r2.tar.gz" Array[File] tumor_pileups = glob("*tumor-pileups.table") Array[File] normal_pileups = glob("*normal-pileups.table") File m3_dataset = "dataset.txt" + File permutect_contigs_table = "contigs.table" + File permutect_read_groups_table = "read-groups.table" } } diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 025c0897be2..06bec7fc38a 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -72,9 +72,9 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handlful of tasks that require the larger GNU libc-based `slim`. String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-06-04-alpine-504f661f6b0d" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-06-20-alpine-b2d6c7230b2f" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" - String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-06-17-gatkbase-37738272d538" + String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-06-20-gatkbase-a0cb5fba8c33" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/DefaultGATKReadFilterArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/DefaultGATKReadFilterArgumentCollection.java index 1ab72090f15..1e17765d341 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/DefaultGATKReadFilterArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/DefaultGATKReadFilterArgumentCollection.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.cmdline.GATKPlugin; +import com.google.common.collect.Lists; import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; @@ -23,6 +24,11 @@ public class DefaultGATKReadFilterArgumentCollection extends GATKReadFilterArgum doc="Read filters to be applied before analysis", optional=true, common = true) public final List userEnabledReadFilterNames = new ArrayList<>(); // preserve order + @Argument(fullName = ReadFilterArgumentDefinitions.INVERTED_READ_FILTER_LONG_NAME, + shortName = ReadFilterArgumentDefinitions.INVERTED_READ_FILTER_SHORT_NAME, + doc="Inverted (with flipped acceptance/failure conditions) read filters applied before analysis (after regular read filters).", optional=true, common = true) + public final List userEnabledInvertedReadFilterNames = new ArrayList<>(); // preserve order + @Argument(fullName = ReadFilterArgumentDefinitions.DISABLE_READ_FILTER_LONG_NAME, shortName 
= ReadFilterArgumentDefinitions.DISABLE_READ_FILTER_SHORT_NAME, doc="Read filters to be disabled before analysis", optional=true, common = true) @@ -34,12 +40,28 @@ public class DefaultGATKReadFilterArgumentCollection extends GATKReadFilterArgum doc = "Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)", common = true, optional = true) public boolean disableToolDefaultReadFilters = false; + private List fullUserEnabledReadFilterNames = new ArrayList<>(); + /** Returns the list with the union of inverted and non-inverted read filters provided by the user. Includes user default filters followed by the user inverted filters. */ + public List getAllUserEnabledReadFilterNames() { + if (fullUserEnabledReadFilterNames.isEmpty()) { + fullUserEnabledReadFilterNames.addAll(userEnabledReadFilterNames); + fullUserEnabledReadFilterNames.addAll(userEnabledInvertedReadFilterNames); + } + return fullUserEnabledReadFilterNames; + } + /** Returns the list with the read filters provided by the user, preserving the order. */ @Override public List getUserEnabledReadFilterNames() { return userEnabledReadFilterNames; } + /** Returns the list with the inverted read filters provided by the user, preserving the order. */ + @Override + public List getUserEnabledInvertedReadFilterNames() { + return userEnabledInvertedReadFilterNames; + } + /** Returns the set of filters disabled by the user. */ @Override public List getUserDisabledReadFilterNames() { diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterArgumentCollection.java index b9fa58bded8..2027371780f 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterArgumentCollection.java @@ -19,6 +19,16 @@ public abstract class GATKReadFilterArgumentCollection implements Serializable { */ public abstract List getUserEnabledReadFilterNames(); + /** + * Returns the inverted enabled read filters. Applied after the regular read filters. Order should be honored. + */ + public abstract List getUserEnabledInvertedReadFilterNames(); + + /** + * Returns the union of all of the user enabled read filters and the user enabled inverted read filters. + */ + public abstract List getAllUserEnabledReadFilterNames(); + /** * Returns the disabled read filter names. Order should be honored. */ diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterPluginDescriptor.java b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterPluginDescriptor.java index ff6806b7279..726576626b3 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterPluginDescriptor.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKReadFilterPluginDescriptor.java @@ -175,7 +175,7 @@ public boolean isDependentArgumentAllowed(final Class predecessorClass) { // for the user to subsequently disable the required predecessor. That case is caught during final // validation done by the validateArguments method. 
String predecessorName = predecessorClass.getSimpleName(); - boolean isAllowed = userArgs.getUserEnabledReadFilterNames().contains(predecessorName) + boolean isAllowed = userArgs.getAllUserEnabledReadFilterNames().contains(predecessorName) || (toolDefaultReadFilters.get(predecessorName) != null); if (isAllowed) { // Keep track of the ones we allow so we can validate later that they weren't subsequently disabled @@ -225,6 +229,10 @@ private List getUserEnabledInstances() { ReadFilter rf = allDiscoveredReadFilters.get(s); filters.add(rf); }); + userArgs.getUserEnabledInvertedReadFilterNames().forEach(s -> { + ReadFilter rf = allDiscoveredReadFilters.get(s).negate(); + filters.add(rf); + }); return filters; } @@ -294,7 +298,7 @@ public void validateAndResolvePlugins() { } // throw if a filter is both enabled *and* disabled by the user - final Set enabledAndDisabled = new HashSet<>(userArgs.getUserEnabledReadFilterNames()); + final Set enabledAndDisabled = new HashSet<>(userArgs.getAllUserEnabledReadFilterNames()); enabledAndDisabled.retainAll(userArgs.getUserDisabledReadFilterNames()); if (!enabledAndDisabled.isEmpty()) { final String badFiltersList = Utils.join(", ", enabledAndDisabled); @@ -302,6 +306,15 @@ public void validateAndResolvePlugins() { String.format("The read filter(s): %s are both enabled and disabled", badFiltersList)); } + // throw if a filter is enabled and inverted by the user + final Set enabledAndInverted = new HashSet<>(userArgs.getUserEnabledReadFilterNames()); + enabledAndInverted.retainAll(userArgs.getUserEnabledInvertedReadFilterNames()); + if (!enabledAndInverted.isEmpty()) { + final String badFiltersList = Utils.join(", ", enabledAndInverted); + throw new CommandLineException( + String.format("The read filter(s): %s are both enabled and inverted", badFiltersList)); + } + // throw if a disabled filter doesn't exist; warn if it wasn't enabled by the tool in the first place userArgs.getUserDisabledReadFilterNames().forEach(s -> { if (!allDiscoveredReadFilters.containsKey(s)) { @@ -319,6 +332,14 @@ logger.warn(String.format("Redundant enabled filter (%s) is enabled for this tool by default", s)); }); + // throw if a filter is both default and inverted by the user as we do not want to inadvertently filter all reads from the input + final Set redundantInv = new HashSet<>(toolDefaultReadFilters.keySet()); + redundantInv.retainAll(userArgs.getUserEnabledInvertedReadFilterNames()); + if (!userArgs.getDisableToolDefaultReadFilters() && redundantInv.size() > 0) { + throw new CommandLineException( + String.format("The read filter(s): %s exist as tool default filters and were inverted; disable tool default read filters in order to use these filters", redundantInv)); + } + // Throw if args were specified for a filter that was also disabled, or that was not enabled by the // tool by default.
// @@ -346,7 +367,7 @@ public void validateAndResolvePlugins() { // throw if a filter name was specified that has no corresponding instance final Map requestedReadFilters = new HashMap<>(); - userArgs.getUserEnabledReadFilterNames().forEach(s -> { + userArgs.getAllUserEnabledReadFilterNames().forEach(s -> { ReadFilter trf = allDiscoveredReadFilters.get(s); if (null == trf) { if (!toolDefaultReadFilters.containsKey(s)) { @@ -370,7 +391,7 @@ public void validateAndResolvePlugins() { */ public boolean isDisabledFilter(final String filterName) { return userArgs.getUserDisabledReadFilterNames().contains(filterName) - || (userArgs.getDisableToolDefaultReadFilters() && !userArgs.getUserEnabledReadFilterNames().contains(filterName)); + || (userArgs.getDisableToolDefaultReadFilters() && !userArgs.getAllUserEnabledReadFilterNames().contains(filterName)); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java index 9f1e27f74ad..9b8d016e14d 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java @@ -9,9 +9,11 @@ private ReadFilterArgumentDefinitions(){} // GATKReadFilterPluginDescriptor arguments public static final String READ_FILTER_LONG_NAME = "read-filter"; + public static final String INVERTED_READ_FILTER_LONG_NAME = "inverted-read-filter"; public static final String DISABLE_READ_FILTER_LONG_NAME = "disable-read-filter"; public static final String DISABLE_TOOL_DEFAULT_READ_FILTERS = "disable-tool-default-read-filters"; public static final String READ_FILTER_SHORT_NAME = "RF"; + public static final String INVERTED_READ_FILTER_SHORT_NAME = "XRF"; public static final String DISABLE_READ_FILTER_SHORT_NAME = "DF"; // ReadFilter arguments diff --git a/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java index e59aac64532..0e729576172 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java @@ -132,11 +132,17 @@ public void traverse() { .forEachOrdered(variant -> { final SimpleInterval variantInterval = new SimpleInterval(variant); - apply(variant, - Collections.singletonList(variant), - new ReadsContext(reads, variantInterval, readFilter), - new ReferenceContext(reference, variantInterval), - new FeatureContext(features, variantInterval)); + + try{ + apply(variant, + Collections.singletonList(variant), + new ReadsContext(reads, variantInterval, readFilter), + new ReferenceContext(reference, variantInterval), + new FeatureContext(features, variantInterval)); + } catch (final IllegalStateException e) { + throw new GATKException("Exception thrown at " + variant.getContig() + ":" + variant.getStart() + + " " + variant.toString(), e); + } progressMeter.update(variantInterval); }); @@ -158,11 +164,16 @@ public void traverse() { postTransformer) .collect(Collectors.toList()); if (!filteredVariants.isEmpty()) { - apply(locus, - filteredVariants, - new ReadsContext(reads, locus, readFilter), - new ReferenceContext(reference, locus), - new FeatureContext(features, locus)); + try { + apply(locus, + filteredVariants, + new ReadsContext(reads, locus, readFilter), + new ReferenceContext(reference, locus), + new FeatureContext(features, 
locus)); + } catch (final IllegalStateException e) { + throw new GATKException("Exception thrown at first variant start " + filteredVariants.get(0).getContig() + ":" + filteredVariants.get(0).getStart() + + " " + filteredVariants.get(0).toString(), e); + } progressMeter.update(locus); } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/CountingReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/CountingReadFilter.java index cdaf4c0ddbc..9318c750b89 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/filters/CountingReadFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/CountingReadFilter.java @@ -76,7 +76,7 @@ public void resetFilteredCount() { filteredCount = 0; } - public String getName() {return delegateFilter.getClass().getSimpleName();} + public String getName() {return delegateFilter.getName();} // Returns a summary line with filter counts organized by level public String getSummaryLine() {return getSummaryLineForLevel(0);} diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java index f3003b00cd0..56981866260 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadFilter.java @@ -30,7 +30,8 @@ public abstract class ReadFilter implements Predicate, Serializable { public void setHeader(SAMFileHeader samHeader) { this.samHeader = samHeader; } - private static class ReadFilterNegate extends ReadFilter { + @VisibleForTesting + public static class ReadFilterNegate extends ReadFilter { private static final long serialVersionUID = 1L; private final ReadFilter delegate; @@ -44,6 +45,11 @@ protected ReadFilterNegate(ReadFilter delegate) { public boolean test( GATKRead read ) { return !delegate.test(read); } + + @Override + public String getName() { + return "NOT " + delegate.getName(); + } } protected abstract static class ReadFilterBinOp extends ReadFilter { @@ -80,6 +86,11 @@ protected static class ReadFilterAnd extends ReadFilterBinOp { @Override public boolean test( GATKRead read ) { return lhs.test(read) && rhs.test(read); } + + @Override + public String getName() { + return lhs.getName() + " AND " + rhs.getName(); + } } private static class ReadFilterOr extends ReadFilterBinOp { @@ -89,6 +100,11 @@ private static class ReadFilterOr extends ReadFilterBinOp { @Override public boolean test( GATKRead read ) { return lhs.test(read) || rhs.test(read);} + + @Override + public String getName() { + return lhs.getName() + " OR " + rhs.getName(); + } } // It turns out, this is necessary. Please don't remove it. 
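
Note on the getName() additions above: negate()/and()/or() now produce filters whose display names are composed from their operands ("NOT X", "A AND B", "A OR B"), and CountingReadFilter reports these composed names in its filter summary. Below is a minimal sketch of the expected naming, using two hypothetical filters invented purely for illustration (the sketch lives in the filters package because the base getName() is package-private):

    package org.broadinstitute.hellbender.engine.filters;

    import org.broadinstitute.hellbender.utils.read.GATKRead;

    // Illustrative only: two toy filters used to show how composed display names are built.
    public final class FilterNameCompositionSketch {

        static final class MappedExample extends ReadFilter {
            private static final long serialVersionUID = 1L;
            @Override
            public boolean test(final GATKRead read) { return !read.isUnmapped(); }
        }

        static final class LongReadExample extends ReadFilter {
            private static final long serialVersionUID = 1L;
            @Override
            public boolean test(final GATKRead read) { return read.getLength() >= 50; }
        }

        public static void main(final String[] args) {
            // and()/negate() build ReadFilterAnd/ReadFilterNegate, whose getName() methods
            // concatenate the operand names.
            final ReadFilter composite = new MappedExample().and(new LongReadExample().negate());
            System.out.println(composite.getName()); // prints "MappedExample AND NOT LongReadExample"
        }
    }

The negate() path in this sketch is the same one exercised by the new --inverted-read-filter (-XRF) argument, whose filters are validated and then applied after the regular read filters.
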
@@ -144,4 +160,11 @@ public ReadFilter or( ReadFilter other ) { @Override public abstract boolean test( GATKRead read ); + + /** + * Returns the display name used in logs for this filter + */ + String getName() { + return this.getClass().getSimpleName(); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java index 48f8099e620..ec576c9dea1 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKRegistrator.java @@ -13,7 +13,6 @@ import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.spark.serializer.KryoRegistrator; -import org.bdgenomics.adam.serialization.ADAMKryoRegistrator; import org.broadinstitute.hellbender.tools.funcotator.FuncotationMap; import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation; import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata; @@ -35,8 +34,6 @@ */ public class GATKRegistrator implements KryoRegistrator { - private final ADAMKryoRegistrator ADAMregistrator = new ADAMKryoRegistrator(); - public GATKRegistrator() {} /** @@ -76,32 +73,6 @@ public Allele newInstance() { @Override public void registerClasses(Kryo kryo) { - - registerGATKClasses(kryo); - - - // register the ADAM data types using Avro serialization, including: - // AlignmentRecord - // Genotype - // Variant - // DatabaseVariantAnnotation - // NucleotideContigFragment - // Contig - // StructuralVariant - // VariantCallingAnnotations - // VariantEffect - // DatabaseVariantAnnotation - // Dbxref - // Feature - // ReferencePosition - // ReferencePositionPair - // SingleReadBucket - // IndelRealignmentTarget - // TargetSet - // ZippedTargetSet - ADAMregistrator.registerClasses(kryo); - - //do this before and after ADAM to try and force our registrations to win out registerGATKClasses(kryo); } diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java index 00474a6d38d..8e5bc3a9fbe 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/GATKSparkTool.java @@ -334,22 +334,11 @@ public JavaRDD getUnfilteredReads() { } protected JavaRDD getGatkReadJavaRDD(TraversalParameters traversalParameters, ReadsSparkSource source, GATKPath inputSpecifier) { - JavaRDD output; - // TODO: This if statement is a temporary hack until #959 gets resolve - if (inputSpecifier.hasExtension(".adam")) { - try { - output = source.getADAMReads(inputSpecifier, traversalParameters, getHeaderForReads()); - } catch (IOException e) { - throw new UserException("Failed to read ADAM file " + inputSpecifier, e); - } - - } else { - if (hasCramInput() && !hasReference()){ - throw UserException.MISSING_REFERENCE_FOR_CRAM; - } - output = source.getParallelReads(inputSpecifier, referenceArguments.getReferenceSpecifier(), traversalParameters, bamPartitionSplitSize, useNio); + if (hasCramInput() && !hasReference()) { + throw UserException.MISSING_REFERENCE_FOR_CRAM; } - return output; + + return source.getParallelReads(inputSpecifier, referenceArguments.getReferenceSpecifier(), traversalParameters, bamPartitionSplitSize, useNio); } /** diff --git
a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/DirectFullByteArrayByteAccess.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/DirectFullByteArrayByteAccess.java deleted file mode 100644 index 4570e298fdd..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/DirectFullByteArrayByteAccess.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.hellbender.engine.spark.datasources; - -import org.bdgenomics.utils.io.ByteArrayByteAccess; - -/** - * A version of org.bdgenomics.utils.io.ByteArrayByteAccess that makes no copies of byte array used for initialization. - * However DirectFullByteArrayByteAccess.readFully can only return a reference to the full underlying byte array. - * Therefore, the user should exercise caution that the underlying data does not get mutated. - */ -class DirectFullByteArrayByteAccess extends ByteArrayByteAccess { - private static final long serialVersionUID = 1L; - - DirectFullByteArrayByteAccess(byte[] bytes) { - super(bytes); - } - - @Override - public byte[] readFully(long offset, int length) { - if ((offset != 0) || (length != this.length())) { - throw new IllegalArgumentException("readFully can only return a reference to the full underlying byte array"); - } - return this.bytes(); - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java index 96e932f5c0c..764faeeced5 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSink.java @@ -6,28 +6,20 @@ import htsjdk.samtools.util.FileExtensions; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.parquet.avro.AvroParquetOutputFormat; -import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.broadcast.Broadcast; -import org.bdgenomics.adam.models.ReadGroupDictionary; -import org.bdgenomics.adam.models.SequenceDictionary; -import org.bdgenomics.formats.avro.AlignmentRecord; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.gcs.BucketUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.GATKReadToBDGAlignmentRecordConverter; import org.broadinstitute.hellbender.utils.read.HeaderlessSAMRecordCoordinateComparator; import org.broadinstitute.hellbender.utils.read.ReadsWriteFormat; import org.broadinstitute.hellbender.utils.spark.SparkUtils; import org.disq_bio.disq.*; -import scala.Tuple2; import java.io.IOException; import java.util.Comparator; @@ -131,11 +123,6 @@ public static void writeReads( ReadsFormatWriteOption formatWriteOption = ReadsFormatWriteOption.BAM; // use BAM if output file is a directory FileCardinalityWriteOption fileCardinalityWriteOption = FileCardinalityWriteOption.MULTIPLE; writeReads(ctx, absoluteOutputFile, referencePathSpecifier, readsToOutput, header, splittingIndexGranularity, formatWriteOption, fileCardinalityWriteOption); - } else if 
(format == ReadsWriteFormat.ADAM) { - if (outputPartsDir!=null) { - throw new GATKException(String.format("You specified the bam output parts directory %s, but requested an ADAM output format which does not use this option",outputPartsDir)); - } - writeReadsADAM(ctx, absoluteOutputFile, readsToOutput, header); } } @@ -155,29 +142,6 @@ private static void writeReads( .write(htsjdkReadsRdd, outputFile, writeOptions); } - private static void writeReadsADAM( - final JavaSparkContext ctx, final String outputFile, final JavaRDD reads, - final SAMFileHeader header) throws IOException { - final SequenceDictionary seqDict = SequenceDictionary.fromSAMSequenceDictionary(header.getSequenceDictionary()); - final ReadGroupDictionary readGroups = ReadGroupDictionary.fromSAMHeader(header); - final JavaPairRDD rddAlignmentRecords = - reads.map(read -> { - read.setHeaderStrict(header); - AlignmentRecord alignmentRecord = GATKReadToBDGAlignmentRecordConverter.convert(read, seqDict, readGroups); - read.setHeaderStrict(null); // Restore the header to its previous state so as not to surprise the caller - return alignmentRecord; - }).mapToPair(alignmentRecord -> new Tuple2<>(null, alignmentRecord)); - // instantiating a Job is necessary here in order to set the Hadoop Configuration... - final Job job = Job.getInstance(ctx.hadoopConfiguration()); - // ...here, which sets a config property that the AvroParquetOutputFormat needs when writing data. Specifically, - // we are writing the Avro schema to the Configuration as a JSON string. The AvroParquetOutputFormat class knows - // how to translate objects in the Avro data model to the Parquet primitives that get written. - AvroParquetOutputFormat.setSchema(job, AlignmentRecord.getClassSchema()); - deleteHadoopFile(outputFile, ctx.hadoopConfiguration()); - rddAlignmentRecords.saveAsNewAPIHadoopFile( - outputFile, Void.class, AlignmentRecord.class, AvroParquetOutputFormat.class, job.getConfiguration()); - } - private static void deleteHadoopFile(String fileToObliterate, Configuration conf) throws IOException { final Path pathToDelete = new Path(fileToObliterate); pathToDelete.getFileSystem(conf).delete(pathToDelete, true); diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java index 7795015208b..7b2ef39789f 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReadsSparkSource.java @@ -4,12 +4,8 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; -import org.apache.hadoop.mapreduce.Job; -import org.apache.parquet.avro.AvroParquetInputFormat; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.broadcast.Broadcast; -import org.bdgenomics.formats.avro.AlignmentRecord; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.ReadsPathDataSource; @@ -136,30 +132,6 @@ public JavaRDD getParallelReads(final GATKPath readsPathSpecifier, fin return getParallelReads(readsPathSpecifier, referencePathSpecifier, null /* all reads */, splitSize); } - /** - * Loads ADAM reads stored as Parquet. 
- * @param inputPathSpecifier path to the Parquet data - * @return RDD of (ADAM-backed) GATKReads from the file. - */ - public JavaRDD getADAMReads(final GATKPath inputPathSpecifier, final TraversalParameters traversalParameters, final SAMFileHeader header) throws IOException { - Job job = Job.getInstance(ctx.hadoopConfiguration()); - AvroParquetInputFormat.setAvroReadSchema(job, AlignmentRecord.getClassSchema()); - Broadcast bHeader; - if (header == null) { - bHeader= ctx.broadcast(null); - } else { - bHeader = ctx.broadcast(header); - } - @SuppressWarnings("unchecked") - JavaRDD recordsRdd = ctx.newAPIHadoopFile( - inputPathSpecifier.getRawInputString(), AvroParquetInputFormat.class, Void.class, AlignmentRecord.class, job.getConfiguration()) - .values(); - JavaRDD readsRdd = recordsRdd.map(record -> new BDGAlignmentRecordToGATKReadAdapter(record, bHeader.getValue())); - JavaRDD filteredRdd = readsRdd.filter(record -> samRecordOverlaps(record.convertToSAMRecord(header), traversalParameters)); - - return fixPartitionsIfQueryGrouped(ctx, header, filteredRdd); - } - /** * Loads the header using Hadoop-BAM. * @param filePathSpecifier path to the bam. diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReferenceTwoBitSparkSource.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReferenceTwoBitSparkSource.java index dd68491a7bc..7d3ab8e3e30 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReferenceTwoBitSparkSource.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/datasources/ReferenceTwoBitSparkSource.java @@ -1,27 +1,17 @@ package org.broadinstitute.hellbender.engine.spark.datasources; -import com.google.common.io.ByteStreams; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; -import org.bdgenomics.adam.models.ReferenceRegion; -import org.bdgenomics.adam.util.TwoBitFile; -import org.bdgenomics.adam.util.TwoBitRecord; -import org.bdgenomics.formats.avro.Strand; -import org.bdgenomics.utils.io.ByteAccess; +import htsjdk.samtools.reference.ReferenceSequence; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.gcs.BucketUtils; import org.broadinstitute.hellbender.utils.reference.ReferenceBases; -import scala.Tuple2; -import scala.collection.JavaConversions; +import org.broadinstitute.hellbender.utils.reference.TwoBitReference; import java.io.IOException; import java.io.Serializable; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; + /** * A ReferenceSource impl that is backed by a .2bit representation of a reference genome. This loads an entire .2bit @@ -34,8 +24,7 @@ public class ReferenceTwoBitSparkSource implements ReferenceSparkSource, Seriali public static final String TWO_BIT_EXTENSION = ".2bit"; private final String referenceURL; - private final TwoBitFile twoBitFile; - private final Map twoBitSeqEntries; + private final TwoBitReference twoBitFile; public ReferenceTwoBitSparkSource( GATKPath referencePathSpecifier) throws IOException { // It would simplify this class if we could cache the GATKPath, but ReferenceFileSparkSource @@ -43,13 +32,7 @@ public ReferenceTwoBitSparkSource( GATKPath referencePathSpecifier) throws IOExc // issue during broadcast with the Java 11 GATK build. 
See https://issues.apache.org/jira/browse/SPARK-26963. this.referenceURL = referencePathSpecifier.getRawInputString(); Utils.validateArg(isTwoBit(referencePathSpecifier), "ReferenceTwoBitSource can only take .2bit files"); - byte[] bytes = ByteStreams.toByteArray(BucketUtils.openFile(this.referenceURL)); - ByteAccess byteAccess = new DirectFullByteArrayByteAccess(bytes); - this.twoBitFile = new TwoBitFile(byteAccess); - this.twoBitSeqEntries = new LinkedHashMap<>(); - for (Tuple2 pair: JavaConversions.seqAsJavaList(twoBitFile.seqRecords())) { - twoBitSeqEntries.put(pair._1, pair._2); - } + this.twoBitFile = new TwoBitReference(new GATKPath(referenceURL)); } /** @@ -63,36 +46,24 @@ public ReferenceTwoBitSparkSource( GATKPath referencePathSpecifier) throws IOExc @Override public ReferenceBases getReferenceBases(SimpleInterval interval) throws IOException { final SimpleInterval queryInterval = cropIntervalAtContigEnd(interval); - final String bases = twoBitFile.extract(simpleIntervalToReferenceRegion(queryInterval)); - return new ReferenceBases(bases.getBytes(), queryInterval); + final ReferenceSequence result = twoBitFile.getReferenceBases(queryInterval); + return new ReferenceBases(result.getBases(), queryInterval); } @Override public SAMSequenceDictionary getReferenceSequenceDictionary(SAMSequenceDictionary optReadSequenceDictionaryToMatch) throws IOException { - List records = twoBitSeqEntries.entrySet().stream() - .map(pair -> new SAMSequenceRecord(pair.getKey(), pair.getValue().dnaSize())) - .collect(Collectors.toList()); - return new SAMSequenceDictionary(records); + return twoBitFile.getSequenceDictionary(); } public static boolean isTwoBit(final GATKPath referenceSpecifier) { return referenceSpecifier.getURI().getPath().endsWith(TWO_BIT_EXTENSION); } - private static ReferenceRegion simpleIntervalToReferenceRegion(SimpleInterval interval) { - // ADAM and GA4GH both use zero-based half-open intervals for genome coordinates - String contig = interval.getContig(); - long start = interval.getGA4GHStart(); - long end = interval.getGA4GHEnd(); - return new ReferenceRegion(contig, start, end, Strand.UNKNOWN); - } - private SimpleInterval cropIntervalAtContigEnd( final SimpleInterval interval ) { // The 2bit query API does not support queries beyond the ends of contigs, so we need // to truncate our interval at the contig end if necessary. 
- final TwoBitRecord contigRecord = twoBitSeqEntries.get(interval.getContig()); + final SAMSequenceRecord contigRecord = twoBitFile.getSequenceDictionary().getSequence(interval.getContig()); Utils.nonNull(contigRecord, () -> "Contig " + interval.getContig() + " not found in reference dictionary"); - return new SimpleInterval(interval.getContig(), interval.getStart(), Math.min(interval.getEnd(), contigRecord.dnaSize())); + return new SimpleInterval(interval.getContig(), interval.getStart(), Math.min(interval.getEnd(), contigRecord.getSequenceLength())); } - } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java b/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java index 007918fb234..45373fc1efd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/AnalyzeSaturationMutagenesis.java @@ -131,6 +131,9 @@ public final class AnalyzeSaturationMutagenesis extends GATKTool { @Argument(doc = "paired mode evaluation of variants (combine mates, when possible)", fullName = "paired-mode") private static boolean pairedMode = true; + @Argument(doc = "don't discard disjoint mates (i.e., combine variants from both reads)", fullName = "dont-ignore-disjoint-pairs") + private static boolean noIgnoreDisjointPairs = false; + @Argument(doc = "write BAM of rejected reads", fullName = "write-rejected-reads") private static boolean writeRejectedReads = false; @@ -1948,7 +1951,11 @@ private static Interval calculateShortFragmentTrim( final GATKRead read, final I read2.setAttribute(ReportType.REPORT_TYPE_ATTRIBUTE_KEY, reportType.attributeValue); rejectedReadsBAMWriter.addRead(read2); } - } else { // mates are disjoint + } else if (noIgnoreDisjointPairs) { // mates are disjoint, process both + final ReadReport combinedReport = new ReadReport(report1, report2); + final ReportType reportType = combinedReport.updateCounts(codonTracker, variationCounts, reference); + disjointPairCounts.bumpCount(reportType); + } else { // mates are disjoint, use the first one final ReportType ignoredMate = ReportType.IGNORED_MATE; if ( read1.isFirstOfPair() ) { processReport(read1, report1, disjointPairCounts); @@ -1968,6 +1975,7 @@ private static Interval calculateShortFragmentTrim( final GATKRead read, final I } } + private static void processReport( final GATKRead read, final ReadReport readReport, final ReportTypeCounts counts ) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java index 3fa41b46a04..695d1385e4c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java @@ -10,7 +10,6 @@ public class FlowBasedArgumentCollection implements Serializable { private static final long serialVersionUID = 0; public static final String FLOW_USE_T0_TAG = "flow-use-t0-tag"; - public static final String PROBABILITY_RATIO_THRESHOLD_LONG_NAME = "flow-probability-threshold"; public static final String REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME = "flow-remove-non-single-base-pair-indels"; public static final String REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME = "flow-remove-one-zero-probs"; public static final String NUMBER_OF_POSSIBLE_PROBS_LONG_NAME = "flow-quantization-bins"; @@ -27,8 +26,7 @@ public class FlowBasedArgumentCollection 
implements Serializable { - private static final double DEFAULT_RATIO_THRESHOLD = 0.003; - private static final double DEFAULT_FILLING_VALUE = 0.001; + public static final double DEFAULT_FILLING_VALUE = 0.001; private static final boolean DEFAULT_REMOVE_LONGER_INDELS = false; private static final boolean DEFAULT_REMOVE_ONE_TO_ZERO = false; private static final boolean DEFAULT_SYMMETRIC_INDELS = false; @@ -45,10 +43,6 @@ public class FlowBasedArgumentCollection implements Serializable { @Argument(fullName = FLOW_USE_T0_TAG, doc = "Use t0 tag if exists in the read to create flow matrix", optional = true) public boolean useT0Tag = DEFAULT_FLOW_USE_T0_TAG; - @Advanced - @Argument(fullName = PROBABILITY_RATIO_THRESHOLD_LONG_NAME, doc = "Lowest probability ratio to be used as an option", optional = true) - public double probabilityRatioThreshold = DEFAULT_RATIO_THRESHOLD; - @Advanced @Argument(fullName = REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME, doc = "Should the probabilities of more then 1 indel be used", optional = true) public boolean removeLongerThanOneIndels = DEFAULT_REMOVE_LONGER_INDELS; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/GatherVcfsCloud.java b/src/main/java/org/broadinstitute/hellbender/tools/GatherVcfsCloud.java index fcfbc926e7a..8f4b2ba5037 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/GatherVcfsCloud.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/GatherVcfsCloud.java @@ -102,7 +102,6 @@ oneLineSummary = "Gathers multiple VCF files from a scatter operation into a single VCF file", programGroup = VariantManipulationProgramGroup.class ) -@BetaFeature public final class GatherVcfsCloud extends CommandLineProgram { public static final String IGNORE_SAFETY_CHECKS_LONG_NAME = "ignore-safety-checks"; @@ -131,7 +130,7 @@ public final class GatherVcfsCloud extends CommandLineProgram { @Argument(fullName = "progress-logger-frequency", doc = "The frequency with which the progress is logged to output (i.e. every N records)") - public static Integer progressLoggerFrequency = 10000; + public Integer progressLoggerFrequency = 10000; @Advanced @Argument(fullName = IGNORE_SAFETY_CHECKS_LONG_NAME, doc = "Disable sanity checks to improve performance, may result in silently creating corrupted outputs data") @@ -192,7 +191,7 @@ protected Object doWork() { break; case CONVENTIONAL: log.info("Gathering by conventional means."); - gatherConventionally(sequenceDictionary, createIndex, inputPaths, output, cloudPrefetchBuffer, disableContigOrderingCheck); + gatherConventionally(sequenceDictionary, createIndex, inputPaths, output, cloudPrefetchBuffer, disableContigOrderingCheck, progressLoggerFrequency); break; default: throw new GATKException.ShouldNeverReachHereException("Invalid gather type: " + gatherType + ". 
Please report this bug to the developers."); @@ -303,7 +302,8 @@ private static void gatherConventionally(final SAMSequenceDictionary sequenceDic final List inputFiles, final File outputFile, final int cloudPrefetchBuffer, - final boolean disableContigOrderingCheck) { + final boolean disableContigOrderingCheck, + final int progressLoggerFrequency) { final EnumSet options = EnumSet.copyOf(VariantContextWriterBuilder.DEFAULT_OPTIONS); if (createIndex) options.add(Options.INDEX_ON_THE_FLY); else options.remove(Options.INDEX_ON_THE_FLY); try (final VariantContextWriter out = new VariantContextWriterBuilder().setOutputFile(outputFile) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java b/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java index f66556f539d..6aa20a9dc5c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java @@ -86,18 +86,16 @@ public void onTraversalStart() { } final List sampleNames = getHeaderForReads().getReadGroups().stream().map(s -> s.getSample()).distinct().collect(Collectors.toList()); - if (sampleNames.size() > 1) { - throw new UserException.BadInput("The given input bam has more than one unique sample name: " + StringUtils.join(sampleNames, ", ")); - } if (sampleNames.size() == 0) { throw new UserException.BadInput("The given bam input has no sample names."); } try (final OutputStreamWriter fileWriter = new OutputStreamWriter(outputSampleNameFile.getOutputStream())) { - final String rawSample = sampleNames.get(0); - final String outputSample = urlEncode ? IOUtils.urlEncode(rawSample) : rawSample; - fileWriter.write(outputSample); + final String outputSamplesOnSeparateLines = sampleNames.stream() + .map(rawSample -> urlEncode ? IOUtils.urlEncode(rawSample) : rawSample) + .collect(Collectors.joining("\n")); + fileWriter.write(outputSamplesOnSeparateLines); } catch (final IOException ioe) { throw new UserException(String.format("Could not write to output file %s.", outputSampleNameFile), ioe); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java b/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java new file mode 100644 index 00000000000..76a0410c777 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java @@ -0,0 +1,79 @@ +package org.broadinstitute.hellbender.tools; + +import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.tools.filediagnostics.HTSAnalyzer; +import org.broadinstitute.hellbender.tools.filediagnostics.HTSAnalyzerFactory; +import picard.cmdline.programgroups.OtherProgramGroup; + +import java.io.File; + +/** + * A diagnostic tool that prints meta information about a GATK input file. + * + * Works on files ending in .cram, .crai, and .bai. 
+ * + * Sample Usage: + * + * gatk PrintFileDiagnostics \ + * -I input.cram \ + * -count-limit 10 + */ +@ExperimentalFeature +@WorkflowProperties +@CommandLineProgramProperties( + summary = "Print diagnostic information about a genomics file to stdout", + oneLineSummary = "Print diagnostic information about a genomics file to stdout", + programGroup = OtherProgramGroup.class +) +public class PrintFileDiagnostics extends CommandLineProgram { + + @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, + doc = "Input path for diagnostics", + optional = false, + common = true) + @WorkflowInput + public GATKPath inputPath; + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output file for diagnostics (must be a local file)", + optional = false, + common = true) + @WorkflowInput + public File outputFile; + + @Argument(shortName="count-limit", + fullName="count-limit", + doc="Limit on how much output to emit (.cram only)") + private long countLimit = 1000; + + private HTSAnalyzer htsAnalyzer; + + @Override + protected void onStartup() { + super.onStartup(); + htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputFile, countLimit); + } + + @Override + protected Object doWork() { + htsAnalyzer.analyze(); + return 0; + } + + @Override + protected void onShutdown() { + if ( htsAnalyzer != null ) { + try { + htsAnalyzer.close(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java index 5d6521850b5..bdf70dc5ccf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java @@ -2,6 +2,8 @@ import htsjdk.samtools.util.Locatable; import htsjdk.tribble.Feature; +import htsjdk.tribble.NamedFeature; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.param.ParamUtils; @@ -19,7 +21,17 @@ public class SimpleCount implements Locatable, Feature { public SimpleCount(final SimpleInterval interval, final int count) { this.interval = Utils.nonNull(interval); - this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count."); + this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd() + "."); + } + + public SimpleCount(final NamedFeature namedFeature) { + try { + int count = Integer.parseInt(namedFeature.getName()); + this.interval = new SimpleInterval(namedFeature); + this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + "."); + } catch (NumberFormatException e) { + throw new IllegalArgumentException("Error parsing name into integer for feature at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + "."); + } } @Override diff --git
a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java index 421dda1ec1d..3c34b34bf43 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java @@ -95,7 +95,7 @@ public void composeVariantContextHeader(final SAMSequenceDictionary sequenceDict /* header lines related to genotype formatting */ result.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, - VCFHeaderLineType.Integer, "Genotype")); + VCFHeaderLineType.String, "Genotype")); result.addMetaDataLine(new VCFFormatHeaderLine(CN, 1, VCFHeaderLineType.Integer, "Copy number maximum a posteriori value")); result.addMetaDataLine(new VCFFormatHeaderLine(CNLP, VCFHeaderLineCount.UNBOUNDED, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/DragstrParametersEstimator.java b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/DragstrParametersEstimator.java index 1cc37c6abe6..cac8afcae93 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/dragstr/DragstrParametersEstimator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/dragstr/DragstrParametersEstimator.java @@ -1,6 +1,6 @@ package org.broadinstitute.hellbender.tools.dragstr; -import org.apache.commons.lang.math.IntRange; +import org.apache.commons.lang3.Range; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.dragstr.DragstrParams; @@ -143,38 +143,38 @@ private void estimatePeriod(final int period, final DragstrParamsBuilder destina } } - final ArrayDeque pending = new ArrayDeque<>(hyperParameters.maxRepeatLength); + final ArrayDeque> pending = new ArrayDeque<>(hyperParameters.maxRepeatLength); if (rightFlank >= leftFlank) { // We fill 'pending' with the repeat-length groups that will be analyze: // [1 .. leftFlank], leftFlank + 1, leftFlank + 2, ... , [rightFlank .. maxRepeats+] - pending.add(new IntRange(1, leftFlank)); + pending.add(Range.of(1, leftFlank)); for (leftFlank++; leftFlank <= rightFlank; leftFlank++) { - pending.add(new IntRange(leftFlank)); + pending.add(Range.is(leftFlank)); } - pending.add(new IntRange(++rightFlank, hyperParameters.maxRepeatLength)); + pending.add(Range.of(++rightFlank, hyperParameters.maxRepeatLength)); } else { // no enough data we simply put all repeat lengths is one single group: - pending.add(new IntRange(1, hyperParameters.maxRepeatLength)); + pending.add(Range.of(1, hyperParameters.maxRepeatLength)); } - IntRange last = null; + Range last = null; // Done will contain the ranges already processed. 
- final ArrayDeque done = new ArrayDeque<>(hyperParameters.maxRepeatLength); + final ArrayDeque> done = new ArrayDeque<>(hyperParameters.maxRepeatLength); do { - final IntRange next = pending.pop(); + final Range next = pending.pop(); estimatePeriodRepeatInterval(period, next, destination, cases); - final double gp1 = destination.gp(period, next.getMinimumInteger()); - final double api1 = destination.api(period, next.getMinimumInteger()); + final double gp1 = destination.gp(period, next.getMinimum()); + final double api1 = destination.api(period, next.getMinimum()); // if GP and API are "decreasing" with respect those from smaller repeat length // then we accepted them: - if (last == null || (destination.gp(period, last.getMaximumInteger()) >= gp1 && - destination.api(period, last.getMaximumInteger()) + hyperParameters.apiMonothresh >= api1)) { + if (last == null || (destination.gp(period, last.getMaximum()) >= gp1 && + destination.api(period, last.getMaximum()) + hyperParameters.apiMonothresh >= api1)) { done.addLast(last = next); // if not, the we group back this repeat-length group/range with last one and re-estimate (next-loop). } else { - pending.push(new IntRange(last.getMinimumNumber(), next.getMaximumNumber())); + pending.push(Range.of(last.getMinimum(), next.getMaximum())); done.removeLast(); last = !done.isEmpty() ? done.getLast() : null; } @@ -183,14 +183,14 @@ private void estimatePeriod(final int period, final DragstrParamsBuilder destina // Given a observed het/hom ratio the total number of reads, the variable ones and the length of the STR in bases. // calculate the optimal gp and api that maximizes the likelihood. - private void estimatePeriodRepeatInterval(final int period, final IntRange repeatRange, + private void estimatePeriodRepeatInterval(final int period, final Range repeatRange, final DragstrParamsBuilder builder, final StratifiedDragstrLocusCases cases) { int maxApiIdx = -1; int maxGpIdx = -1; double maxLog10Prob = Double.NEGATIVE_INFINITY; - final int minRepeat = repeatRange.getMinimumInteger(); - final int maxRepeat = repeatRange.getMaximumInteger(); + final int minRepeat = repeatRange.getMinimum(); + final int maxRepeat = repeatRange.getMaximum(); final int periodIdx = period - 1; final double maxLog10PHet = log10HetOverHomVar - Math.log10(1 + hyperParameters.hetToHomRatio); for (int i = 0; i < log10ApiValues.length; i++) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java new file mode 100644 index 00000000000..1de153092ae --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java @@ -0,0 +1,31 @@ +package org.broadinstitute.hellbender.tools.filediagnostics; + +import htsjdk.samtools.BAMIndexer; +import org.broadinstitute.hellbender.engine.GATKPath; + +import java.io.File; +import java.io.IOException; + +/** + * Analyzer for BAI files. + */ +public class BAIAnalyzer extends HTSAnalyzer { + + public BAIAnalyzer(final GATKPath inputPath, final File outputFile) { + super(inputPath, outputFile); + } + + /** + * Run the analyzer for the file. 
+ */ + protected void doAnalysis() { + System.out.println(String.format("\nOutput written to %s\n", outputFile)); + BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), outputFile, true); + } + + @Override + public void close() throws IOException { + } + +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java new file mode 100644 index 00000000000..dcbe8109938 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java @@ -0,0 +1,59 @@ +package org.broadinstitute.hellbender.tools.filediagnostics; + +import htsjdk.samtools.CRAMCRAIIndexer; +import htsjdk.samtools.cram.CRAIIndex; +import htsjdk.samtools.util.RuntimeIOException; +import org.broadinstitute.hellbender.engine.GATKPath; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Analyzer for CRAM (.crai) index files. + */ +public class CRAIAnalyzer extends HTSAnalyzer { + + final FileOutputStream fos; + + public CRAIAnalyzer(final GATKPath inputPath, final File outputFile) { + super(inputPath, outputFile); + try { + fos = new FileOutputStream(outputFile); + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + } + + protected void emitln(final String s) { + try { + fos.write(s.getBytes()); + fos.write('\n'); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Run the analyzer for the file. + */ + protected void doAnalysis() { + try (final InputStream is = inputPath.getInputStream()) { + final CRAIIndex craiIndex = CRAMCRAIIndexer.readIndex(is); + emitln("\nSeqId AlignmentStart AlignmentSpan ContainerOffset SliceOffset SliceSize\n"); + craiIndex.getCRAIEntries().stream().forEach(e -> emitln(e.toString())); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws IOException { + if (fos != null) { + fos.close(); + } + } + +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java new file mode 100644 index 00000000000..76933277e99 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java @@ -0,0 +1,308 @@ +package org.broadinstitute.hellbender.tools.filediagnostics; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.cram.build.CramIO; +import htsjdk.samtools.cram.encoding.CRAMEncoding; +import htsjdk.samtools.cram.encoding.EncodingFactory; +import htsjdk.samtools.cram.io.ITF8; +import htsjdk.samtools.cram.structure.*; +import htsjdk.samtools.seekablestream.SeekablePathStream; +import htsjdk.samtools.util.RuntimeIOException; +import org.broadinstitute.hellbender.engine.GATKPath; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Base64; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Analyzer for CRAM files. Displays metadata for each CRAM container, + * slice, and block. + * + * Note: the analyzer does not require a reference for the since it only + * enumerates metadata and doesn't attempt to dereference the reads. 
+ */ +public class CRAMAnalyzer extends HTSAnalyzer { + + final Map externalDataSeriesDataSizes = new LinkedHashMap<>(); + final Map externalTagDataSizes = new LinkedHashMap<>(); + long coreBlocksDataSize = 0L; + long recordCount = 0; + final long countLimit; + final FileOutputStream fos; + + public CRAMAnalyzer(final GATKPath inputPathName, final File outputFile, final long countLimit) { + super(inputPathName, outputFile); + this.countLimit = countLimit; + try { + fos = new FileOutputStream(outputFile); + } catch (final IOException e) { + throw new RuntimeIOException(e); + } + } + + @Override + public void close() throws IOException { + if (fos != null) { + fos.close(); + } + } + + protected void emitln(final String s) { + try { + fos.write(s.getBytes()); + fos.write('\n'); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Run the analyzer for the file. + */ + protected void doAnalysis() { + int containerCount = 0; + try (final SeekablePathStream seekableStream = new SeekablePathStream(this.inputPath.toPath())) { + final CramHeader cramHeader = analyzeCRAMHeader(seekableStream); + boolean isEOF = false; + while (!isEOF && containerCount < countLimit) { + long containerOffset = seekableStream.position(); + final Container container = new Container( + cramHeader.getCRAMVersion(), + seekableStream, + containerOffset); + isEOF = analyzeContainer(container, ++containerCount) ; + } + } + catch (IOException e) { + throw new RuntimeIOException(e); + } + + emitln("\nTotal Record Count: " + recordCount); + emitDataDistribution(); + } + + /** + * Display metadata for a CRAM file header. + * + */ + public CramHeader analyzeCRAMHeader(InputStream is) { + final CramHeader cramHeader = CramIO.readCramHeader(is); + emitln("\nCRAM File: " + inputPath); + emitln("CRAM Version: " + cramHeader.getCRAMVersion().toString()); + emitln("CRAM ID Contents: " + String.format("%s", Base64.getEncoder().encodeToString(cramHeader.getId()))); + + final SAMFileHeader samHeader = Container.readSAMFileHeaderContainer(cramHeader.getCRAMVersion(), is, inputPath.getRawInputString()); + emitln("\n" + samHeader.toString()); + final SAMSequenceDictionary dict = samHeader.getSequenceDictionary(); + emitln(dict.toString()); + dict.getSequences().forEach(e -> emitln(e.toString())); + return cramHeader; + } + + /** + * Display metadata for a CRAM file container. + * return true if container is EOF container + */ + public boolean analyzeContainer(Container container, int containerCount) { + final ContainerHeader containerHeader = container.getContainerHeader(); + emitln(String.format( + "\n***Container #:%d %s byteOffset=%d", + containerCount, containerHeader.toString(), container.getContainerByteOffset())); + if (container.isEOF()) { + return true; + } + // contentID to DataSeries (excludes tags ?) 
+ final Map dataSeriesByContentID = analyzeCompressionHeader(container.getCompressionHeader()); + int sliceCount = 0; + for (final Slice slice : container.getSlices()) { + analyzeSlice(slice, ++sliceCount, dataSeriesByContentID); + } + return false; + } + + public Map analyzeCompressionHeader(final CompressionHeader compressionHeader) { + //preservation map, data series encoding map, and tag encoding map + analyzePreservationMap(compressionHeader); + final Map dataSeriesByContentID = analyzeDataSeriesEncodingMap(compressionHeader); + analyzeTagEncodingMap(compressionHeader); + return dataSeriesByContentID; + } + + public void analyzePreservationMap(final CompressionHeader compressionHeader) { + emitln(String.format( + "Requires reference (%b); Preserved read names (%b); APDelta (%b)", + compressionHeader.isReferenceRequired(), + compressionHeader.isPreserveReadNames(), + compressionHeader.isAPDelta())); + } + + public Map analyzeDataSeriesEncodingMap(final CompressionHeader compressionHeader) { + // Since each container can in theory use a different set of ids for the data series in that + // container, we need to reconstruct the mapping for each container, and return it so that as + // we process Slice blocks and track data, we can calculate the data series to which we should + // accrue that block's data + final Map dataSeriesByContentID = new LinkedHashMap<>(); + + emitln("\nData Series Encodings:\n"); + final CompressionHeaderEncodingMap encodingMap = compressionHeader.getEncodingMap(); + for (final DataSeries dataSeries: DataSeries.values()) { + final EncodingDescriptor encodingDescriptor = encodingMap.getEncodingDescriptorForDataSeries(dataSeries); + if (encodingDescriptor == null) { + emitln(String.format("%-50s not present", + String.format("DataSeries (%s/%s)", + dataSeries.getCanonicalName(), + dataSeries.name()))); + } else { + // the encoding map has an encoding descriptor for this data series; determine if its external + // and if so, update the contentID to data series map so we can track how much data is used for + // the blocks for this data series + final Integer externalContentID = contentIDFromExternalDescriptor(dataSeries.getType(), encodingDescriptor); + if (externalContentID != null) { + dataSeriesByContentID.put(externalContentID, dataSeries); + } + emitln(String.format("%-50s %s", + String.format("DataSeries (%s/%s)", + dataSeries.getCanonicalName(), + dataSeries.name()), + encodingDescriptorAsString( + dataSeries.getType(), + encodingDescriptor))); + } + } + return dataSeriesByContentID; + } + + public void analyzeTagEncodingMap(final CompressionHeader compressionHeader) { + emitln("\nTag Encodings:"); + for (final Map.Entry entry : compressionHeader.getTagEncodingMap().entrySet()) { + final Integer contentID = entry.getKey(); // is this content ID ? + final EncodingDescriptor ep = entry.getValue(); + emitln(String.format("%-50s %s", + String.format("Content ID/Tag (%s/%s)", + contentID, + decomposeTagNameAndType(contentID)), + encodingDescriptorAsString(DataSeriesType.BYTE_ARRAY, ep))); + } + } + + /** + * Display metadata for a CRAM container slice. 
+ * + */ + public void analyzeSlice(final Slice slice, final int sliceCount, final Map dataSeriesByContentID) { + emitln(String.format("\n******Slice #: %d %s", + sliceCount, + slice.toString())); + emitln(String.format("%-50s %s", + "Header block ", + slice.getSliceHeaderBlock())); + emitln(String.format("%-50s %s", + "Core block ", + slice.getSliceBlocks().getCoreBlock())); + + if (slice.getEmbeddedReferenceContentID() != Slice.EMBEDDED_REFERENCE_ABSENT_CONTENT_ID) { + emitln(String.format("Embedded reference block ID %d", slice.getEmbeddedReferenceContentID())); + } + + slice.getSliceBlocks().getExternalContentIDs().forEach((id) -> { + final String blockID = dataSeriesByContentID.get(id) == null ? + Integer.toString(id) : // not a fixed data series (i.e., tag block) + dataSeriesByContentID.get(id).getCanonicalName(); + emitln(String.format("%-50s %s", + String.format("External Block (%s):", blockID), + slice.getSliceBlocks().getExternalBlock(id).toString())); + }); + + updateDataDistribution(slice, dataSeriesByContentID); + recordCount += slice.getNumberOfRecords(); + } + + final void updateDataDistribution(final Slice slice, final Map dataSeriesByContentID) { + final SliceBlocks sliceBlocks = slice.getSliceBlocks(); + + coreBlocksDataSize += sliceBlocks.getCoreBlock().getCompressedContentSize(); + final Map tagContentIDs = slice.getCompressionHeader().getTagEncodingMap(); + for (final Integer contentID : sliceBlocks.getExternalContentIDs()) { + //if (tagContentIDs.containsKey(contentID)) { + final DataSeries ds = dataSeriesByContentID.get(contentID); + if (ds == null) { + // accrue to tag data + externalTagDataSizes.merge( + contentID, + Long.valueOf(slice.getSliceBlocks().getExternalBlock(contentID).getCompressedContentSize()), + (oldValue, increment) -> oldValue + increment); + } else { + // accrue to fixed DataSeries ID + externalDataSeriesDataSizes.merge( + ds, + Long.valueOf(sliceBlocks.getExternalBlock(contentID).getCompressedContentSize()), + (oldValue, increment) -> oldValue + increment); + } + } + } + + public void emitDataDistribution() { + emitln("\nCore Block(s) Total: " + String.format("%,d\n", coreBlocksDataSize)); + emitln("External Data Series Totals (external block resolution - all core data encodings accrue to the core block):\n"); + for (final DataSeries ds : externalDataSeriesDataSizes.keySet()) { + emitln(String.format("%s: %,d", ds, externalDataSeriesDataSizes.get(ds))); + } + + emitln("\nTag Series Distribution:\n"); + for (final Map.Entry externalEntry : externalTagDataSizes.entrySet()) { + final Integer contentID = externalEntry.getKey(); + final String tagName = String.format("%d (%s)", contentID, decomposeTagNameAndType(externalEntry.getKey())); + emitln(String.format("%s: %,d", tagName, externalEntry.getValue())); + } + } + + private String encodingDescriptorAsString(final DataSeriesType dsType, final EncodingDescriptor descriptor) { + final String encodingIDString = EncodingID.values()[descriptor.getEncodingID().getId()].toString(); + final CRAMEncoding encoding = EncodingFactory.createCRAMEncoding( + dsType, descriptor.getEncodingID(), descriptor.getEncodingParameters()); + return String.format("%s (%s)", encodingIDString, encoding.toString()); + } + + private Integer contentIDFromExternalDescriptor(final DataSeriesType dsType, final EncodingDescriptor descriptor) { + final CRAMEncoding encoding = EncodingFactory.createCRAMEncoding( + dsType, descriptor.getEncodingID(), descriptor.getEncodingParameters()); + switch (descriptor.getEncodingID()) { + case 
EXTERNAL: + return ITF8.readUnsignedITF8(encoding.toSerializedEncodingParams()); + + case BYTE_ARRAY_STOP: + final ByteBuffer buf = ByteBuffer.wrap(encoding.toSerializedEncodingParams()); + buf.order(ByteOrder.LITTLE_ENDIAN); + final byte stopByte = buf.get(); // discard it + return ITF8.readUnsignedITF8(buf); + + // Everything else is either not external, or is hybrid (BYTE_ARRAY_LEN), so we can't + // track the data for these at the block level since the data foes either partially or + // fully into the core block + case BYTE_ARRAY_LEN: + case NULL: + case GOLOMB: + case HUFFMAN: + case BETA: + case SUBEXPONENTIAL: + case GOLOMB_RICE: + case GAMMA: + default: + return null; + } + } + + private String decomposeTagNameAndType(final int contentID) { + return ReadTag.intToNameType4Bytes(contentID); + } + +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java new file mode 100644 index 00000000000..8339dfaccc4 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java @@ -0,0 +1,46 @@ +package org.broadinstitute.hellbender.tools.filediagnostics; + +import htsjdk.samtools.util.Log; +import org.broadinstitute.hellbender.engine.GATKPath; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; + +/** + * Base class for alignment file analyzers. + */ +public abstract class HTSAnalyzer implements Closeable { + + protected GATKPath inputPath; + protected File outputFile; + + public HTSAnalyzer(final GATKPath filePath, final File outputFile) { + this.inputPath = filePath; + this.outputFile = outputFile; + } + + /** + * Run the analyzer for the file specified by fileName. + */ + public void analyze() { + // set and then reset the global log level + Log.setGlobalLogLevel(Log.LogLevel.ERROR); + doAnalysis(); + emitln(""); + Log.setGlobalLogLevel(Log.LogLevel.DEBUG); + } + + /** + * Emit a string followed by a newline to the output destination. + * + * @param s + */ + protected void emitln(final String s) { + System.out.println(s); + } + + protected abstract void doAnalysis(); + + public abstract void close() throws IOException; +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java new file mode 100644 index 00000000000..d58438a7b61 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java @@ -0,0 +1,25 @@ +package org.broadinstitute.hellbender.tools.filediagnostics; + +import htsjdk.samtools.util.FileExtensions; +import org.broadinstitute.hellbender.engine.GATKPath; + +import java.io.File; + +/** + * Class for creating an analyzer based on an alignment file type. 
+ */ +public class HTSAnalyzerFactory { + + public static HTSAnalyzer getFileAnalyzer(final GATKPath inputPath, final File outputFile, final long countLimit) { + System.out.println(inputPath.getRawInputString()); + if (inputPath.isCram()) { + return new CRAMAnalyzer(inputPath, outputFile, countLimit); + } else if (inputPath.hasExtension(FileExtensions.CRAM_INDEX)) { + return new CRAIAnalyzer(inputPath, outputFile); + } else if (inputPath.hasExtension(FileExtensions.BAI_INDEX)) { + return new BAIAnalyzer(inputPath, outputFile); + } else { + throw new RuntimeException("Unsupported diagnostic file type: " + inputPath.getRawInputString()); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java index 74aec83c89e..12843950a48 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java @@ -874,7 +874,6 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, // Get the correct reference for B37/HG19 compliance: // This is necessary because of the variant transformation that gets applied in VariantWalkerBase::apply. final ReferenceContext correctReferenceContext = funcotatorEngine.getCorrectReferenceContext(variant, referenceContext); - // Place the variant on our queue to be funcotated: enqueueAndHandleVariant(variant, correctReferenceContext, featureContext); } @@ -924,7 +923,11 @@ protected void enqueueAndHandleVariant(final VariantContext variant, final Refer final FuncotationMap funcotationMap = funcotatorEngine.createFuncotationMapForVariant(variant, referenceContext, featureContext); + // This is necessary because we want to revert the variant contig name change if it was applied in the FuncotatorEngine::getCorrectVariantContextForReference method before outputting the vcf. + // NOTE: this will only revert the variantContext if it was originally changed (only for B37 VCFs) + final VariantContext variantContextForOutput = funcotatorEngine.getCorrectVariantContextForOutput(variant); + // At this point there is only one transcript ID in the funcotation map if canonical or best effect are selected - outputRenderer.write(variant, funcotationMap); + outputRenderer.write(variantContextForOutput, funcotationMap); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java index 8468594d506..80b7a5bce40 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java @@ -32,8 +32,8 @@ *

 * To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
 * <ul>
- *     <li>For somatic data sources: {@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}</li>
- *     <li>For germline data sources: {@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}</li>
+ *     <li>For somatic data sources: {@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}</li>
+ *     <li>For germline data sources: {@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}</li>
 * </ul>
 *
* @@ -63,6 +63,8 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { public static final String GERMLINE_ARG_LONG_NAME = "germline"; public static final String OVERWRITE_ARG_LONG_NAME = "overwrite-output-file"; public static final String EXTRACT_AFTER_DOWNLOAD = "extract-after-download"; + public static final String HG38_ARG_LONG_NAME = "hg38"; + public static final String HG19_ARG_LONG_NAME = "hg19"; //================================================================================================================== // Private Static Members: @@ -73,18 +75,27 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { // Private Static Members: // Set to always get the latest version of the data sources: - private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + - DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(); + private static final String HG38_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(38); + private static final String HG19_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH + + DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(19); + + private static final String HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; + private static final String HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; - private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER; @VisibleForTesting - static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); - private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + private static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + private static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); - public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; + public static final String HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; + public static final String HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;; - public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); - private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + public static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + public static final Path 
HG19_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION); + private static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); + private static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION); //================================================================================================================== // Private Members: @@ -129,6 +140,23 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram { optional = true) protected boolean extractDataSourcesAfterDownload = false; + @Argument( + shortName = HG38_ARG_LONG_NAME, + fullName = HG38_ARG_LONG_NAME, + mutex = {HG19_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG}, + doc = "If set, will extract data from the HG38 data sources bucket.", + optional = true) + protected boolean getHg38Datasources = false; + + @Argument( + //TODO should these be MUTEX or should one be allowed to download either? + shortName = HG19_ARG_LONG_NAME, + fullName = HG19_ARG_LONG_NAME, + mutex = {HG38_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG}, + doc = "If set, will extract data from the HG19 data sources bucket.", + optional = true) + protected boolean getHg19Datasources = false; + // Testing arguments: @Hidden @Advanced @@ -164,6 +192,11 @@ protected void onStartup() { throw new UserException("Must select either somatic or germline datasources."); } + // Make sure the user specified at least one reference source to download: + if ((!getHg38Datasources) && (!getHg19Datasources) && (testingOverrideDataSourcesPath == null)) { + throw new UserException("Must select either HG19 or HG38 datasources."); + } + // Make sure the testing inputs are correct: if ( ((testingOverrideDataSourcesPath == null) && (testingOverrideDataSourcesSha256Path != null)) || ((testingOverrideDataSourcesSha256Path == null) && (testingOverrideDataSourcesPath != null)) ) { @@ -184,14 +217,26 @@ protected Object doWork() { // Get the correct data source: if ( getSomaticDataSources ) { - dataSourceDescription = "Somatic"; - dataSourcesPath = SOMATIC_GCLOUD_DATASOURCES_PATH; - dataSourcesSha256Path = SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + if (getHg38Datasources) { + dataSourceDescription = "HG38_Somatic"; + dataSourcesPath = HG38_SOMATIC_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + } else { // Okay because HG38 and HG19 datasources are currently MUTEX and at least one is required + dataSourceDescription = "HG19_Somatic"; + dataSourcesPath = HG19_SOMATIC_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH; + } } else if ( getGermlineDataSources ) { - dataSourceDescription = "Germline"; - dataSourcesPath = GERMLINE_GCLOUD_DATASOURCES_PATH; - dataSourcesSha256Path = GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + if (getHg38Datasources) { + dataSourceDescription = "HG38_Germline"; + dataSourcesPath = HG38_GERMLINE_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + } else { + dataSourceDescription = "HG19_Germline"; + dataSourcesPath = HG19_GERMLINE_GCLOUD_DATASOURCES_PATH; + dataSourcesSha256Path = HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH; + } } else { // Test case: diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java index a82313adf84..73a256a794c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java @@ -79,6 +79,12 @@ public final class FuncotatorEngine implements AutoCloseable { */ private final boolean mustConvertInputContigsToHg19; + /** + * Whether the output variant contigs must be converted back to B37 from hg19 before being returned. + * (NOTE: this means that the output contigs will continue to use B37 contig names even if internally we converted them to hg19) + */ + private boolean mustRevertVariantContigsFromHg19ToB37 = false; + /** * Whether this {@link FuncotatorEngine} has only produced annotations on variants that have been labeled by the * {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotationFactory} as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#IGR}. @@ -327,6 +333,22 @@ private VariantContext getCorrectVariantContextForReference(final VariantContext } } + /** + * Create a new {@link VariantContext} which will match the given Reference if there is a mismatch for input between the B37 reference and the HG19 reference. + * @param variant A {@link VariantContext} object containing the variant to convert. + * @return A {@link VariantContext} whose contig has been transformed to HG19 if requested by the user. Otherwise, an identical variant. + */ + VariantContext getCorrectVariantContextForOutput(final VariantContext variant) { + if ( mustRevertVariantContigsFromHg19ToB37 ) { + final VariantContextBuilder vcb = new VariantContextBuilder(variant); + vcb.chr(FuncotatorUtils.convertHG19ContigToB37Contig(variant.getContig())); + return vcb.make(); + } + else { + return variant; + } + } + /** * @return The default {@link VariantTransformer} which will automatically convert from the B37 reference standard to the HG19 reference standard for contig names. */ @@ -483,7 +505,7 @@ private boolean determineReferenceAndDatasourceCompatibility() { } else if ( funcotatorArgs.referenceVersion.equals(BaseFuncotatorArgumentCollection.FuncotatorReferenceVersionHg19) && FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants) ) { - logger.info("VCF sequence dictionary detected as B37 in HG19 annotation mode. Performing conversion."); + logger.info("VCF sequence dictionary detected as B37 in HG19 annotation mode. Performing conversion. (NOTE: the output VCF will still be B37)"); mustConvertInputContigsToHg19 = true; } else { @@ -505,6 +527,11 @@ else if ( funcotatorArgs.referenceVersion.equals(BaseFuncotatorArgumentCollectio "There MAY be some errors (e.g. 
in the Y chromosome, but possibly in other places as well) due to changes between the two references."); } + // Record whether we need to revert the contigs back to B37 after annotation: + if (mustConvertInputContigsToHg19 && FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants)) { + this.mustRevertVariantContigsFromHg19ToB37 = true; + } + return mustConvertInputContigsToHg19; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java index 48dd6560bee..01791264a01 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java @@ -53,7 +53,9 @@ private DataSourceUtils() {} private static final String MANIFEST_SOURCE_LINE_START = "Source:"; private static final String MANIFEST_ALT_SOURCE_LINE_START = "Alternate Source:"; @VisibleForTesting - static final Pattern VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + static final Pattern OLD_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + static final Pattern NEW_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.hg(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)"); + private static final Pattern SOURCE_PATTERN = Pattern.compile(MANIFEST_SOURCE_LINE_START + "\\s+(ftp.*)"); private static final Pattern ALT_SOURCE_PATTERN = Pattern.compile(MANIFEST_ALT_SOURCE_LINE_START + "\\s+(gs.*)"); @@ -69,9 +71,9 @@ private DataSourceUtils() {} @VisibleForTesting static final int MAX_MAJOR_VERSION_NUMBER = 1; @VisibleForTesting - static final int MAX_MINOR_VERSION_NUMBER = 7; + static final int MAX_MINOR_VERSION_NUMBER = 8; @VisibleForTesting - static final LocalDate MAX_DATE = LocalDate.of(2020, Month.MAY, 21); + static final LocalDate MAX_DATE = LocalDate.of(2023, Month.SEPTEMBER, 8); //================================================================================================================== // Public Static Members: @@ -80,7 +82,7 @@ private DataSourceUtils() {} public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = getDataSourceMinVersionString(); /** The maximum supported version of the data sources for funcotator to run. 
*/ - public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString(); + public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString(38); public static final String MANIFEST_FILE_NAME = "MANIFEST.txt"; public static final String DATA_SOURCES_FTP_PATH = "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/"; @@ -88,6 +90,8 @@ private DataSourceUtils() {} public static final String DATA_SOURCES_NAME_PREFIX = "funcotator_dataSources"; public static final String DS_SOMATIC_NAME_MODIFIER = "s"; public static final String DS_GERMLINE_NAME_MODIFIER = "g"; + public static final String DS_HG38_NAME_MODIFIER = "hg38"; + public static final String DS_HG19_NAME_MODIFIER = "hg19"; public static final String DS_EXTENSION = ".tar.gz"; public static final String DS_CHECKSUM_EXTENSION = ".sha256"; @@ -137,8 +141,8 @@ public static String getDataSourceMinVersionString() { * {@link #MAX_DATE} * @return A {@link String} representing the Max version information as it would appear in the data sources file name. */ - public static String getDataSourceMaxVersionString() { - return getDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, MAX_DATE); + public static String getDataSourceMaxVersionString(final int ref) { + return getNewDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, ref, MAX_DATE); } @@ -159,6 +163,25 @@ public static String getDataSourceVersionString(final int major, final int minor date.getDayOfMonth() ); } + /** + * Get the string representing the given version information for funcotator as it would be written in the data sources + * release files. + * @param major {@code int} representing the major version of the data sources to use. + * @param minor {@code int} representing the minor version of the data sources to use. + * @param ref {@code int} representing the hg reference number of the data sources to use. + * @param date {@link LocalDate} representing the date of the data sources to use. + * @return A {@link String} representing the given version information as it would appear in the data sources file name. + */ + public static String getNewDataSourceVersionString(final int major, final int minor, final int ref, final LocalDate date) { + return String.format("v%d.%d.hg%d.%d%02d%02d", + major, + minor, + ref, + date.getYear(), + date.getMonthValue(), + date.getDayOfMonth() + ); + } /** * Initializes the data sources for {@link Funcotator}. 
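For reference, a minimal sketch of the reference-specific version-string format that getNewDataSourceVersionString encodes, using the MAX_* constants in this diff (major 1, minor 8, date 2023-09-08). This sketch is not part of the patch; the class and method names below are hypothetical.

import java.time.LocalDate;

// Hypothetical standalone sketch of the "v<major>.<minor>.hg<ref>.<yyyyMMdd>" bundle name format.
public class DataSourceVersionStringSketch {
    static String newVersionString(final int major, final int minor, final int ref, final LocalDate date) {
        // Mirrors the format string used by getNewDataSourceVersionString in this diff.
        return String.format("v%d.%d.hg%d.%d%02d%02d",
                major, minor, ref, date.getYear(), date.getMonthValue(), date.getDayOfMonth());
    }

    public static void main(final String[] args) {
        // With the MAX_* values from this diff and the hg38 reference:
        System.out.println(newVersionString(1, 8, 38, LocalDate.of(2023, 9, 8))); // prints v1.8.hg38.20230908
    }
}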
@@ -704,7 +727,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) { while ((line != null) && ((version == null) || (source == null) || (alternateSource == null))) { if (version == null && line.startsWith(MANIFEST_VERSION_LINE_START)) { - final Matcher matcher = VERSION_PATTERN.matcher(line); + final Matcher matcher = NEW_VERSION_PATTERN.matcher(line); if ( matcher.matches() ) { versionMajor = Integer.valueOf(matcher.group(1)); versionMinor = Integer.valueOf(matcher.group(2)); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRenderer.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRenderer.java index 1ddaee9245a..44f0f879880 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRenderer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRenderer.java @@ -6,7 +6,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.GATKException; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java index 359ab701153..9a7be220c34 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java @@ -178,19 +178,20 @@ public class MafOutputRendererConstants { static final Map VariantClassificationMapInverse; // Output Field Name Map Defaults: - static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "gene", "Gene"); + //TODO these are hardcoded to gencode versions and should be updated to generalize to any version of gencode (see https://github.com/broadinstitute/gatk/issues/8482) + static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "Gencode_43_hugoSymbol", "gene", "Gene"); static final List OutputFieldNameMap_Entrez_Gene_Id = Arrays.asList(FieldName_Entrez_Gene_Id, "HGNC_Entrez_Gene_ID", "HGNC_Entrez Gene ID", "HGNC_Entrez_Gene_ID(supplied_by_NCBI)", "HGNC_Entrez Gene ID(supplied by NCBI)", "entrez_id", "gene_id"); static final List OutputFieldNameMap_Center = Arrays.asList(FieldName_Center, "center"); - static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "ncbi_build"); - static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom"); - static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "start", "Start", 
"start_pos", "pos"); - static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "end", "End", "end_pos"); + static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "Gencode_43_ncbiBuild", "ncbi_build"); + static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "Gencode_43_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom"); + static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "Gencode_43_start", "start", "Start", "start_pos", "pos"); + static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "Gencode_43_end", "end", "End", "end_pos"); static final List OutputFieldNameMap_Strand = Collections.singletonList(FieldName_Strand); - static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "variant_classification"); - static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "variant_type"); - static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "ref", "ref_allele", "reference_allele"); - static final List OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "ref", "ref_allele", "reference_allele"); - static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1"); + static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "Gencode_43_variantClassification", "variant_classification"); + static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "Gencode_43_variantType", "variant_type"); + static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "Gencode_43_refAllele", "ref", "ref_allele", "reference_allele"); + static final List 
OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "Gencode_43_tumorSeqAllele1", "ref", "ref_allele", "reference_allele"); + static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "Gencode_43_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1"); static final List OutputFieldNameMap_dbSNP_RS = Arrays.asList(FieldName_dbSNP_RS, "dbsnp_rs", "dbSNP_RSPOS"); static final List OutputFieldNameMap_dbSNP_Val_Status = Arrays.asList(FieldName_dbSNP_Val_Status, MAF_DBSNP_VAL_STATUS_FIELD, "dbsnp_val_status", DBSNP_VLD_NAME); static final List OutputFieldNameMap_Tumor_Sample_Barcode = Arrays.asList(FieldName_Tumor_Sample_Barcode, "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name"); @@ -212,15 +213,15 @@ public class MafOutputRendererConstants { static final List OutputFieldNameMap_Sequencer = Arrays.asList(FieldName_Sequencer, "sequencer", "platform"); static final List OutputFieldNameMap_Tumor_Sample_UUID = Arrays.asList(FieldName_Tumor_Sample_UUID, "tumor_uuid", "case_uuid", "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name", "Tumor_Sample_Barcode"); static final List OutputFieldNameMap_Matched_Norm_Sample_UUID = Arrays.asList(FieldName_Matched_Norm_Sample_UUID, "normal_uuid", "control_uuid", "normal_barcode", "normal_id", "control_barcode", "control_id", "normal_name", "sample_name", "Matched_Norm_Sample_Barcode"); - static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "genome_change"); - static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "annotation_transcript", "transcript_id"); - static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "transcript_strand"); - static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "transcript_exon"); - static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "transcript_position"); - static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "transcript_change"); - static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "codon_change"); - static final List OutputFieldNameMap_Protein_Change = 
Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "protein_change"); - static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "other_transcripts"); + static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "Gencode_43_genomeChange", "genome_change"); + static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "Gencode_43_annotationTranscript", "annotation_transcript", "transcript_id"); + static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "Gencode_43_transcriptStrand", "transcript_strand"); + static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "Gencode_43_transcriptExon", "transcript_exon"); + static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "Gencode_43_transcriptPos", "transcript_position"); + static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "Gencode_43_cDnaChange", "transcript_change"); + static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "Gencode_43_codonChange", "codon_change"); + static final List OutputFieldNameMap_Protein_Change = Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "Gencode_43_proteinChange", "protein_change"); + static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "Gencode_43_otherTranscripts", "other_transcripts"); static final List OutputFieldNameMap_Refseq_mRNA_Id = Arrays.asList(FieldName_Refseq_mRNA_Id, "Gencode_XRefSeq_mRNA_id", "gencode_xref_refseq_mRNA_id", "ENSEMBL_RefSeq_mRNA_accession", "RefSeq_mRNA_Id", "HGNC_RefSeq IDs"); static final List OutputFieldNameMap_Refseq_prot_Id = Arrays.asList(FieldName_Refseq_prot_Id, "Gencode_XRefSeq_prot_acc", "gencode_xref_refseq_prot_acc", "ENSEMBL_RefSeq_protein_accession", "RefSeq_prot_Id"); static final List OutputFieldNameMap_SwissProt_acc_Id = Arrays.asList(FieldName_SwissProt_acc_Id, "Simple_Uniprot_uniprot_accession", "uniprot_accession", "UniProt_uniprot_accession"); @@ -243,8 +244,8 @@ public class MafOutputRendererConstants { static final List 
OutputFieldNameMap_TCGAscape_Amplification_Peaks = Arrays.asList(FieldName_TCGAscape_Amplification_Peaks, "TCGAScape_Amplification_Peaks"); static final List OutputFieldNameMap_TCGAscape_Deletion_Peaks = Arrays.asList(FieldName_TCGAscape_Deletion_Peaks, "TCGAScape_Deletion_Peaks"); static final List OutputFieldNameMap_DrugBank = Arrays.asList(FieldName_DrugBank, "Simple_Uniprot_DrugBank", "UniProt_DrugBank"); - static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "ref_context"); - static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "gc_content"); + static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "Gencode_43_referenceContext", "ref_context"); + static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "Gencode_43_gcContent", "gc_content"); static final List OutputFieldNameMap_CCLE_ONCOMAP_overlapping_mutations = Arrays.asList(FieldName_CCLE_ONCOMAP_overlapping_mutations, "CCLE_By_GP_overlapping_mutations"); static final List OutputFieldNameMap_CCLE_ONCOMAP_total_mutations_in_gene = Arrays.asList(FieldName_CCLE_ONCOMAP_total_mutations_in_gene, "CCLE_By_Gene_total_mutations_in_gene"); static final List OutputFieldNameMap_CGC_Mutation_Type = Arrays.asList(FieldName_CGC_Mutation_Type, "CGC_Mutation Type"); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/simpletsvoutput/SimpleTsvOutputRenderer.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/simpletsvoutput/SimpleTsvOutputRenderer.java index b9459f5a9ad..9cb98f130ef 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/simpletsvoutput/SimpleTsvOutputRenderer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/simpletsvoutput/SimpleTsvOutputRenderer.java @@ -18,7 +18,6 @@ import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; import org.broadinstitute.hellbender.utils.tsv.TableUtils; import org.broadinstitute.hellbender.utils.tsv.TableWriter; -import org.codehaus.plexus.util.StringUtils; import java.io.IOException; import java.nio.file.Path; @@ -356,7 +355,15 @@ excludedOutputFields, createColumnNameToAliasesMap(Resource.getResourceContentsA @VisibleForTesting static String[] splitAndTrim(final String text, final String separator ) { - return Stream.of(StringUtils.split(text, separator)).map(String::trim).toArray(String[]::new); + final String[] tokens = Utils.split(text, separator).stream().map(String::trim).toArray(String[]::new); + + // If we get a single empty token after trimming, return an empty array, as this is what + // the tests for this class expect: + if ( tokens.length == 1 && tokens[0].isEmpty() ) { + return new String[]{}; + } + + return tokens; } @VisibleForTesting diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/vcfOutput/VcfOutputRenderer.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/vcfOutput/VcfOutputRenderer.java index 06efb116fb7..25cc4a4c804 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/vcfOutput/VcfOutputRenderer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/vcfOutput/VcfOutputRenderer.java @@ -197,7 +197,8 @@ public void write(final VariantContext variant, final FuncotationMap txToFuncota variantContextOutputBuilder.genotypes( variant.getGenotypes() ); // Render and add our VCF line: - vcfWriter.add( variantContextOutputBuilder.make() ); + VariantContext out = variantContextOutputBuilder.make(); + vcfWriter.add( out ); } private Funcotation createManualAnnotationFuncotation(final Allele altAllele) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java index bf5b3a61198..fd58587abd4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genomicsdb/GenomicsDBImport.java @@ -22,6 +22,7 @@ import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.GATKTool; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; @@ -36,29 +37,29 @@ import org.genomicsdb.importer.GenomicsDBImporter; import org.genomicsdb.model.BatchCompletionCallbackFunctionArgument; import org.genomicsdb.model.Coordinates; -import org.genomicsdb.model.GenomicsDBCallsetsMapProto; import org.genomicsdb.model.GenomicsDBImportConfiguration; import org.genomicsdb.model.GenomicsDBVidMapProto; import org.genomicsdb.model.ImportConfig; import java.io.File; import java.io.IOException; +import java.io.Serial; import java.net.URI; import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Objects; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; -import java.util.Arrays; import java.util.concurrent.CompletionException; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -67,9 +68,6 @@ import java.util.concurrent.ThreadFactory; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBGetAbsolutePath; -import static org.broadinstitute.hellbender.tools.genomicsdb.GATKGenomicsDBUtils.genomicsDBApppendPaths; - /** * Import single-sample GVCFs into GenomicsDB before joint genotyping. * @@ -146,7 +144,7 @@ * * * It is also possible to specify an explicit index for only a subset of the samples: - * + * *
  *  sample1      sample1.vcf.gz
  *  sample2      sample2.vcf.gz      sample2.vcf.gz.tbi
@@ -218,12 +216,14 @@ public final class GenomicsDBImport extends GATKTool {
     public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel";
     public static final String MERGE_CONTIGS_INTO_NUM_PARTITIONS = "merge-contigs-into-num-partitions";
     public static final String BYPASS_FEATURE_READER = "bypass-feature-reader";
+    public static final String VCF_HEADER_OVERRIDE = "header";
     public static final int INTERVAL_LIST_SIZE_WARNING_THRESHOLD = 100;
     public static final int ARRAY_COLUMN_BOUNDS_START = 0;
     public static final int ARRAY_COLUMN_BOUNDS_END = 1;
 
     public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS;
     public static final String USE_GCS_HDFS_CONNECTOR = GenomicsDBArgumentCollection.USE_GCS_HDFS_CONNECTOR;
+    public static final String AVOID_NIO = "avoid-nio";
 
     @Argument(fullName = WORKSPACE_ARG_LONG_NAME,
               doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " +
@@ -239,7 +239,7 @@ public final class GenomicsDBImport extends GATKTool {
                     "when using the "+INTERVAL_LIST_LONG_NAME+" option. " +
                     "Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " +
                     "Must point to an existing workspace.",
-              mutex = {WORKSPACE_ARG_LONG_NAME})
+              mutex = {WORKSPACE_ARG_LONG_NAME, VCF_HEADER_OVERRIDE})
     private String incrementalImportWorkspace;
 
     @Argument(fullName = SEGMENT_SIZE_ARG_LONG_NAME,
@@ -254,7 +254,7 @@ public final class GenomicsDBImport extends GATKTool {
                     " data for only a single sample. Either this or " + SAMPLE_NAME_MAP_LONG_NAME +
                     " must be specified.",
               optional = true,
-              mutex = {SAMPLE_NAME_MAP_LONG_NAME})
+              mutex = {SAMPLE_NAME_MAP_LONG_NAME, AVOID_NIO})
     private List<String> variantPaths;
 
     @Argument(fullName = VCF_BUFFER_SIZE_ARG_NAME,
@@ -364,6 +364,13 @@ public final class GenomicsDBImport extends GATKTool {
             optional = true)
     private boolean sharedPosixFSOptimizations = false;
 
+    @Argument(fullName = VCF_HEADER_OVERRIDE,
+        doc = "Specify a vcf file to use instead of reading and combining headers from the input vcfs",
+        optional = true,
+        mutex ={INCREMENTAL_WORKSPACE_ARG_LONG_NAME}
+    )
+    private FeatureInput<VariantContext> headerOverride = null;
+
     @Argument(fullName = BYPASS_FEATURE_READER,
             doc = "Use htslib to read input VCFs instead of GATK's FeatureReader. This will reduce memory usage and potentially speed up " +
                   "the import. Lower memory requirements may also enable parallelism through " + MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL +
@@ -371,6 +378,15 @@ public final class GenomicsDBImport extends GATKTool {
             optional = true)
     private boolean bypassFeatureReader = false;
 
+    @Argument(fullName = AVOID_NIO,
+                doc = "Do not attempt to open the input vcf file paths in java.  This can only be used with " + BYPASS_FEATURE_READER
+            + ".  It allows operating on file systems which GenomicsDB understands how to open but GATK does not.  This will disable "
+            + "many of the sanity checks.",
+            mutex = {StandardArgumentDefinitions.VARIANT_LONG_NAME}
+    )
+    @Advanced
+    private boolean avoidNio = false;
+
     @Argument(fullName = USE_GCS_HDFS_CONNECTOR,
             doc = "Use the GCS HDFS Connector instead of the native GCS SDK client with gs:// URLs.",
             optional = true)
@@ -440,10 +456,6 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
     // Path to combined VCF header file to be written by GenomicsDBImporter
     private String vcfHeaderFile;
 
-    // GenomicsDB callset map protobuf structure containing all callset names
-    // used to write the callset json file on traversal success
-    private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB;
-
     //in-progress batchCount
     private int batchCount = 1;
 
@@ -463,11 +475,14 @@ public void onStartup() {
         initializeWorkspaceAndToolMode();
         assertVariantPathsOrSampleNameFileWasSpecified();
         assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive();
+        assertAvoidNioConditionsAreValid();
         initializeHeaderAndSampleMappings();
         initializeIntervals();
         super.onStartup();
     }
 
+
+
     private void initializeWorkspaceAndToolMode() {
         if (incrementalImportWorkspace != null && !incrementalImportWorkspace.isEmpty()) {
             doIncrementalImport = true;
@@ -495,6 +510,24 @@ private void assertVariantPathsOrSampleNameFileWasSpecified(){
         }
     }
 
+    private void assertAvoidNioConditionsAreValid() {
+        if (avoidNio && (!bypassFeatureReader || headerOverride == null) ){
+            final List<String> missing = new ArrayList<>();
+            if(!bypassFeatureReader){
+                missing.add(BYPASS_FEATURE_READER);
+            }
+            if(headerOverride == null){
+                missing.add(VCF_HEADER_OVERRIDE);
+            }
+            final String missingArgs = String.join(" and ", missing);
+
+            // this potentially produces an exception with bad grammar but that's probably ok
+            throw new CommandLineException.MissingArgument(missingArgs, "If --" +AVOID_NIO + " is set then --" + BYPASS_FEATURE_READER
+                    + " and --" + VCF_HEADER_OVERRIDE + " must also be specified.");
+
+        }
+    }
+
     private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importer,
                                                           List intervals) {
         GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping();
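As a reading aid, here is a minimal, self-contained sketch of the argument rule that assertAvoidNioConditionsAreValid enforces above: --avoid-nio is only accepted when both --bypass-feature-reader and --header are also supplied. The sketch is not part of the patch; the class name and booleans are hypothetical stand-ins for the parsed arguments.

// Hypothetical sketch of the flag-combination rule checked by the new code.
public class AvoidNioRuleSketch {
    static boolean isAcceptable(final boolean avoidNio, final boolean bypassFeatureReader, final boolean hasHeaderOverride) {
        // The rule only constrains runs where --avoid-nio is set.
        return !avoidNio || (bypassFeatureReader && hasHeaderOverride);
    }

    public static void main(final String[] args) {
        System.out.println(isAcceptable(true, true, true));    // true: both required flags present
        System.out.println(isAcceptable(true, true, false));   // false: --header missing
        System.out.println(isAcceptable(false, false, false)); // true: rule does not apply without --avoid-nio
    }
}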
@@ -523,32 +556,37 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe
      */
     private void initializeHeaderAndSampleMappings() {
         // Only one of -V and --sampleNameMapFile may be specified
-        if (variantPaths != null && variantPaths.size() > 0) {
+        if (variantPaths != null && !variantPaths.isEmpty()) {
             // -V was specified
             final List<VCFHeader> headers = new ArrayList<>(variantPaths.size());
             sampleNameMap = new SampleNameMap();
-            for (final String variantPathString : variantPaths) {
-                final Path variantPath = IOUtils.getPath(variantPathString);
-                if (bypassFeatureReader) {
-                    GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
-                }
-                final  VCFHeader header = getHeaderFromPath(variantPath, null);
-                Utils.validate(header != null, "Null header was found in " + variantPath + ".");
-                assertGVCFHasOnlyOneSample(variantPathString, header);
-                headers.add(header);
 
-                final String sampleName = header.getGenotypeSamples().get(0);
-                try {
-                    sampleNameMap.addSample(sampleName, new URI(variantPathString));
-                }
-                catch(final URISyntaxException e) {
-                    throw new UserException("Malformed URI "+e.toString(), e);
+            if(headerOverride == null) {
+                for (final String variantPathString : variantPaths) {
+                    final Path variantPath = IOUtils.getPath(variantPathString);
+                    if (bypassFeatureReader) { // avoid-nio can't be set here because it requires headerOverride
+                        GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
+                    }
+                    final VCFHeader header = getHeaderFromPath(variantPath);
+                    Utils.validate(header != null, "Null header was found in " + variantPath + ".");
+                    assertGVCFHasOnlyOneSample(variantPathString, header);
+                    headers.add(header);
+
+                    final String sampleName = header.getGenotypeSamples().get(0);
+                    try {
+                        sampleNameMap.addSample(sampleName, new URI(variantPathString));
+                    } catch (final URISyntaxException e) {
+                        throw new UserException("Malformed URI " + e.getMessage(), e);
+                    }
                 }
+                mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
+                mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
+            } else {
+                final VCFHeader header = getHeaderFromPath(headerOverride.toPath());
+                mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
+                mergedHeaderSequenceDictionary = header.getSequenceDictionary();
             }
-            mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
-            mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
             mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
-
         } else if (sampleNameMapFile != null) {
             // --sampleNameMap was specified
 
@@ -556,31 +594,34 @@ private void initializeHeaderAndSampleMappings() {
             //the resulting database will have incorrect sample names
             //see https://github.com/broadinstitute/gatk/issues/3682 for more information
             // The SampleNameMap class guarantees that the samples will be sorted correctly.
-            sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader);
+            sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile),
+                    bypassFeatureReader && !avoidNio);
 
             final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey();
-            final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
-            final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample);
-            final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath);
 
+            final VCFHeader header;
+            if(headerOverride == null){
+                final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
+                header = getHeaderFromPath(firstVCFPath);
+            } else {
+                header = getHeaderFromPath(headerOverride.toPath());
+            }
             //getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
-            mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder());
+            mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
             mergedHeaderSequenceDictionary = header.getSequenceDictionary();
             mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
-        }
-        else if (getIntervalsFromExistingWorkspace){
+        } else if (getIntervalsFromExistingWorkspace){
             final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
             IOUtils.assertPathsAreReadable(vcfHeader);
             final String header = GenomicsDBUtils.readEntireFile(vcfHeader);
             try {
                 File tempHeader = IOUtils.createTempFile("tempheader", ".vcf");
-                Files.write(tempHeader.toPath(), header.getBytes(StandardCharsets.UTF_8));
+                Files.writeString(tempHeader.toPath(), header);
                 mergedHeaderSequenceDictionary = VCFFileReader.getSequenceDictionary(tempHeader);
             } catch (final IOException e) {
-                throw new UserException("Unable to create temporary header file to get sequence dictionary");
+                throw new UserException("Unable to create temporary header file to get sequence dictionary", e);
             }
-        }
-        else {
+        } else {
             throw new UserException(StandardArgumentDefinitions.VARIANT_LONG_NAME+" or "+
                     SAMPLE_NAME_MAP_LONG_NAME+" must be specified unless "+
                     INTERVAL_LIST_LONG_NAME+" is specified");
@@ -599,8 +640,12 @@ else if (getIntervalsFromExistingWorkspace){
         }
     }
 
-    private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) {
-        try(final FeatureReader<VariantContext> reader = getReaderFromPath(variantPath, variantIndexPath)) {
+    private VCFHeader getHeaderFromPath(final Path variantPath) {
+        //TODO make this mangling unnecessary
+        final String variantURI = variantPath.toAbsolutePath().toUri().toString();
+        try(final FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
+                BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
+                BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer))) {
             return (VCFHeader) reader.getHeader();
         } catch (final IOException e) {
             throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e);
@@ -633,9 +678,9 @@ private void writeIntervalListToFile() {
     @Override
     public void onTraversalStart() {
         String workspaceDir = overwriteCreateOrCheckWorkspace();
-        vidMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
-        callsetMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
-        vcfHeaderFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
+        vidMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
+        callsetMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
+        vcfHeaderFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
         if (getIntervalsFromExistingWorkspace) {
             // intervals may be null if merge-contigs-into-num-partitions was used to create the workspace
             // if so, we need to wait for vid to be generated before writing out the interval list
@@ -775,7 +820,7 @@ private List generateIntervalListFromWorkspace() {
             final int start = Integer.parseInt(partitionInfo[1]);
             final int end = Integer.parseInt(partitionInfo[2]);
             return new SimpleInterval(contig, start, end);
-        }).filter(o -> o != null).collect(Collectors.toList());
+        }).filter(Objects::nonNull).collect(Collectors.toList());
     }
 
     private ImportConfig createImportConfig(final int batchSize) {
@@ -785,7 +830,7 @@ private ImportConfig createImportConfig(final int batchSize) {
                 GenomicsDBImportConfiguration.ImportConfiguration.newBuilder();
         importConfigurationBuilder.addAllColumnPartitions(partitions);
         importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample);
-        importConfigurationBuilder.setFailIfUpdating(true && !doIncrementalImport);
+        importConfigurationBuilder.setFailIfUpdating(!doIncrementalImport);
         importConfigurationBuilder.setSegmentSize(segmentSize);
         importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
         importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations);
@@ -936,7 +981,7 @@ private FeatureReader getReaderFromPath(final Path variantPath,
             /* Anonymous FeatureReader subclass that wraps returned iterators to ensure that the GVCFs do not
              * contain MNPs.
              */
-            return new FeatureReader<VariantContext>() {
+            return new FeatureReader<>() {
                 /** Iterator that asserts that variants are not MNPs. */
                 class NoMnpIterator implements CloseableTribbleIterator {
                     private final CloseableTribbleIterator inner;
@@ -971,7 +1016,8 @@ public VariantContext next() {
                     return new NoMnpIterator(reader.query(chr, start, end));
                 }
 
-                @Override public CloseableTribbleIterator iterator() throws IOException {
+                @Override
+                public CloseableTribbleIterator iterator() throws IOException {
                     return new NoMnpIterator(reader.iterator());
                 }
             };
@@ -990,7 +1036,7 @@ public VariantContext next() {
      * @return  The workspace directory
      */
     private String overwriteCreateOrCheckWorkspace() {
-        String workspaceDir = genomicsDBGetAbsolutePath(workspace);
+        String workspaceDir = GATKGenomicsDBUtils.genomicsDBGetAbsolutePath(workspace);
         // From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid
         //   returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first.
         //   returnCode = -1 : path was not a directory
@@ -1016,7 +1062,7 @@ private String overwriteCreateOrCheckWorkspace() {
     }
 
     static class UnableToCreateGenomicsDBWorkspace extends UserException {
-        private static final long serialVersionUID = 1L;
+        @Serial private static final long serialVersionUID = 1L;
 
         UnableToCreateGenomicsDBWorkspace(final String message){
             super(message);
@@ -1028,7 +1074,7 @@ static class UnableToCreateGenomicsDBWorkspace extends UserException {
      * dictionary (as returned by {@link #getBestAvailableSequenceDictionary})
      * to parse/verify them. Does nothing if no intervals were specified.
      */
-    protected void initializeIntervals() {
+    void initializeIntervals() {
         if (intervalArgumentCollection.intervalsSpecified()) {
             if (getIntervalsFromExistingWorkspace || doIncrementalImport) {
                 logger.warn(INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" was set, so ignoring specified intervals." +
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
index 234fee68b3b..83f1048f535 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
@@ -3,7 +3,7 @@
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
 
 import java.util.ArrayList;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
index 935a52a159f..6cd7246fe92 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
@@ -5,8 +5,8 @@
 import htsjdk.variant.vcf.VCFConstants;
 import htsjdk.variant.vcf.VCFHeader;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang.math.LongRange;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.LongRange;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.engine.FeatureContext;
@@ -39,10 +39,10 @@ public class ExtractCohortEngine {
     // a better, ref-dependent different manner if/when we get around to supporting other references
     // See: https://en.wikipedia.org/wiki/Pseudoautosomal_region
     private final List<LongRange> PARRegions = List.of(
-            new LongRange(23000000010001L, 23000002781479L),    // PAR1, X
-            new LongRange(24000000010001L, 24000002781479L),    // PAR1, Y
-            new LongRange(23000155701383L, 23000156030895L),    // PAR2, X
-            new LongRange(24000056887903L, 24000057217415L));   // PAR2, Y
+            LongRange.of(23000000010001L, 23000002781479L),    // PAR1, X
+            LongRange.of(24000000010001L, 24000002781479L),    // PAR1, Y
+            LongRange.of(23000155701383L, 23000156030895L),    // PAR2, X
+            LongRange.of(24000056887903L, 24000057217415L));   // PAR2, Y
 
     private final boolean printDebugInformation;
     private final int localSortMaxRecordsInRam;
@@ -679,7 +679,7 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI
 
     private boolean isInPAR(final long location) {
         for (LongRange region : PARRegions) {
-            if (region.containsLong(location)) {
+            if (region.contains(location)) {
                 return true;
             }
         }
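
Illustrative sketch, not part of the patch: the hunks above migrate from the old commons-lang 2.x LongRange to org.apache.commons.lang3.LongRange (available in commons-lang3 3.13.0 and later, which this change assumes), where the constructor is replaced by the LongRange.of factory and containsLong by the inherited Range#contains. The class and method names below are made up for illustration; the region value is copied from the PAR1-X entry above.

    import org.apache.commons.lang3.LongRange;

    class ParRangeSketch {
        // same encoded PAR1-X region used in ExtractCohortEngine above
        private static final LongRange PAR1_X = LongRange.of(23000000010001L, 23000002781479L);

        static boolean isInPar1X(final long location) {
            // lang3 Range#contains(Long) replaces the old lang 2.x LongRange#containsLong(long)
            return PAR1_X.contains(location);
        }
    }
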
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
index a81a57d8be3..8ab3b6b9d6f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
@@ -2,7 +2,7 @@
 
 import htsjdk.samtools.util.Locatable;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
 import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
 
 public class ExtractCohortFilterRecord implements Locatable {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
index ffa1e44e77e..623c0678902 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
@@ -2,7 +2,7 @@
 
 import htsjdk.samtools.util.Locatable;
 import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
 import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
 
 import java.util.Objects;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
index 8f929262b7a..e5cda0ef47e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
@@ -3,7 +3,7 @@
 import htsjdk.samtools.util.Locatable;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
 import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
 
 public class ReferenceRecord implements Locatable, Comparable {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
index a82ad989d26..6df437f01e2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
@@ -4,7 +4,7 @@
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.exceptions.UserException;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
index de6a914e42c..f1f71a6eb7f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
@@ -1,5 +1,6 @@
 package org.broadinstitute.hellbender.tools.spark;
 
+import com.google.common.io.Files;
 import htsjdk.samtools.*;
 import htsjdk.samtools.BAMSBIIndexer;
 import htsjdk.samtools.seekablestream.SeekableFileStream;
@@ -18,7 +19,6 @@
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.read.ReadConstants;
-import org.codehaus.plexus.util.FileUtils;
 import picard.cmdline.programgroups.OtherProgramGroup;
 
 import java.io.*;
@@ -166,7 +166,7 @@ private static void assertBamIsCoordinateSorted(final SAMFileHeader header) {
     private static void assertIsBam(final File inputBam) {
         if(!BamFileIoUtils.isBamFile(inputBam)) {
             throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a "
-                    + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified.");
+                    + "file with extension "+ Files.getFileExtension(inputBam.getName()) + " was specified.");
         }
     }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 95bb083f03b..de91be5e5b6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -89,7 +89,10 @@ public enum ComplexVariantSubtype {
         piDUP_RF,
         dDUP,
         dDUP_iDEL,
-        INS_iDEL
+        INS_iDEL,
+        CTX_PP_QQ,
+        CTX_PQ_QP,
+        CTX_INV
     }
 
     // not defined in output vcf header but used in internal id that is currently output in the ID column
@@ -163,6 +166,7 @@ public enum ComplexVariantSubtype {
     public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT";
     public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS";
     public static final String TSS_DUP = "PREDICTED_TSS_DUP";
+    public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP";
 
     // SVTYPE classes
     public enum StructuralVariantAnnotationType {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
index 6c016f84436..2f51a7df114 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
@@ -4,7 +4,7 @@
 import com.esotericsoftware.kryo.Kryo;
 import com.esotericsoftware.kryo.io.Input;
 import com.esotericsoftware.kryo.io.Output;
-import com.netflix.servo.util.VisibleForTesting;
+import com.google.common.annotations.VisibleForTesting;
 import htsjdk.samtools.*;
 import htsjdk.samtools.util.Locatable;
 import htsjdk.samtools.util.SequenceUtil;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 8edcd595881..0f95878b389 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
         Utils.nonNull(dictionary);
         validatePosition(contigA, positionA, dictionary);
         validatePosition(contigB, positionB, dictionary);
-        Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
-                "End coordinate cannot precede start");
+        // CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
+        // B references the source sequence.
+        if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+            Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
+                    "End coordinate cannot precede start");
+        }
     }
 
     private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
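
Illustrative note, not part of the patch: the relaxed check above only skips the A-before-B ordering requirement for CPX records. A hypothetical dispersed duplication (made-up coordinates) shows why:

    // positionA = chr1:5,000,000  -> insertion point of the duplicated copy
    // positionB = chr1:1,000,000  -> start of the source sequence being duplicated
    // B precedes A here; the old validation rejected this, the new one accepts it for CPX only.
    // For every other SV type the original rule still applies: A must not follow B.
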
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
index da4e66f3fb6..9ca5056a679 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
@@ -31,6 +31,7 @@
  */
 public class ClosestSVFinder {
 
+    protected final boolean sortOutput;
     protected final Map truthIdToItemMap;
     protected final Map idToClusterMap;
     private final SVConcordanceLinkage linkage;
@@ -49,7 +50,9 @@ public class ClosestSVFinder {
      */
     public ClosestSVFinder(final SVConcordanceLinkage linkage,
                            final Function collapser,
+                           final boolean sortOutput,
                            final SAMSequenceDictionary dictionary) {
+        this.sortOutput = sortOutput;
         this.linkage = Utils.nonNull(linkage);
         this.collapser = Utils.nonNull(collapser);
         outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary));
@@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage,
         lastItemContig = null;
     }
 
+    /**
+     * Sorts output by default
+     */
+    public ClosestSVFinder(final SVConcordanceLinkage linkage,
+                           final Function collapser,
+                           final SAMSequenceDictionary dictionary) {
+        this(linkage, collapser, true, dictionary);
+    }
+
     /**
      * Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered.
      * @param force flushes all variants in the output buffer regardless of position
@@ -70,8 +82,12 @@ public List flush(final boolean force) {
                 .map(c -> new ClosestPair(c.getItem(), c.getClosest()))
                 .map(collapser)
                 .collect(Collectors.toList());
-        outputBuffer.addAll(collapsedRecords);
-        return flushBuffer(force);
+        if (sortOutput) {
+            outputBuffer.addAll(collapsedRecords);
+            return flushBuffer(force);
+        } else {
+            return collapsedRecords;
+        }
     }
 
     /**
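
A minimal usage sketch, not part of the patch: how the two constructors above differ. The linkage, collapser, and dictionary variables stand in for objects that callers already construct; their exact generic types are elided here.

    // previous behavior, still the default: flushed records pass through the sorting output buffer
    final ClosestSVFinder sorted = new ClosestSVFinder(linkage, collapser, dictionary);

    // new option: skip the PriorityQueue and let flush(force) return records in collapse order
    final ClosestSVFinder unsorted = new ClosestSVFinder(linkage, collapser, false, dictionary);
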
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java
index 50c98a43f3d..23942237008 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/CombineGVCFs.java
@@ -461,7 +461,7 @@ private VariantContext referenceBlockMerge(final List vcs, final
                 final GenotypeBuilder gBuilder = new GenotypeBuilder(g);
                 GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
                             gBuilder, assignmentMethod,
-                            g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g.getAlleles(), null);
+                            g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g, null);
                 genotypes.add(gBuilder.make());
             }
         }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
index 312a90872e7..9036ecdd44d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
@@ -150,8 +150,9 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
 
         final VariantContext result;
 
+        // only re-genotype polymorphic sites
         if ( originalVC.isVariant()  && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) {
-            // only re-genotype polymorphic sites
+            // note that the calculateGenotypes method also calculates the QUAL score
             final VariantContext regenotypedVC = calculateGenotypes(originalVC, includeNonVariants);
             if (regenotypedVC == null) {
                 return null;
@@ -186,7 +187,7 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
             //don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
             vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
             VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
-            return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false, keepSB)).make();
+            return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(annotated, false, keepSB)).make();
         } else if (includeNonVariants) {
             // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
             VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true, false)).make();
@@ -461,24 +462,29 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final
                 attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, GenotypeGVCFs.PHASED_HOM_VAR_STRING);
             }
 
-            // create AD if it's not there
-            if ( !oldGT.hasAD() && vc.isVariant() ) {
+            // create AD if it's not there, but only if there's data
+            if ( !oldGT.hasAD() && vc.isVariant() && depth > 0) {
                 final int[] AD = new int[vc.getNAlleles()];
                 AD[0] = depth;
                 builder.AD(AD);
             }
 
             if ( createRefGTs ) {
-                // move the GQ to RGQ
-                if (oldGT.hasGQ()) {
+                 //keep 0 depth samples and 0 GQ samples as no-call
+                if (depth > 0 && oldGT.hasGQ()) {
+                    if (oldGT.getGQ() > 0) {
+                        final List<Allele> refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
+                        builder.alleles(refAlleles);
+                    } else {
+                        builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
+                    }
+
+                    // move the GQ to RGQ
                     builder.noGQ();
                     attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ());
-                }
-
-                //keep 0 depth samples and 0 GQ samples as no-call
-                if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) {
-                    final List<Allele> refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
-                    builder.alleles(refAlleles);
+                } else {
+                    builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
+                    builder.noGQ().noDP();
                 }
 
                 // also, the PLs are technically no longer usable
@@ -494,8 +500,8 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final
      *  Does this genotype look like it has no reads and should be excluded from annotations?
      */
     static boolean excludeFromAnnotations(Genotype oldGT) {
-        return oldGT.isHomRef() && !oldGT.hasPL()
-                && ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP())
+        return (oldGT.isHomRef() || oldGT.isNoCall())
+                && (!oldGT.hasDP() || oldGT.getDP() == 0)
                 && oldGT.hasGQ() && oldGT.getGQ() == 0;
     }
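
Illustrative sketch, not part of the patch: the cleanup above now renders hom-ref genotypes with zero depth or zero GQ as no-calls, so the revised predicate also accepts no-call genotypes and still drops them from cohort-level annotations such as AN. A hypothetical genotype built with htsjdk's GenotypeBuilder (sample name made up):

    final Genotype noReads = new GenotypeBuilder("sample1", Collections.nCopies(2, Allele.NO_CALL))
            .DP(0).GQ(0).make();
    // Under the revised predicate above this genotype is excluded from annotations:
    // (isNoCall) && (DP == 0) && (hasGQ && GQ == 0)  ->  true
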
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
index 6c36592b6ae..16b5142fb84 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
@@ -4,7 +4,7 @@
 import htsjdk.samtools.CigarElement;
 import htsjdk.samtools.CigarOperator;
 import org.aeonbits.owner.util.Collections;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.argparser.ExperimentalFeature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
index f57d50a7cad..9484da25a9a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
@@ -439,7 +439,8 @@ protected static void removeStaleAttributesAfterMerge(final Map
         attributes.remove(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
         attributes.remove(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
         attributes.remove(VCFConstants.END_KEY);
-        attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
+        attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
+        attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
     }
 
     /**
@@ -579,6 +580,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
             final int ploidy = g.getPloidy();
             final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
             if (!doSomaticMerge) {
+                //do attribute subsetting
                 if (g.hasPL() || g.hasAD()) {
                     int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
                     if (g.hasPL()) {
@@ -590,8 +592,10 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
                     if (g.hasAD()) {
                         genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
                     }
+
+               }
                 //clean up low confidence hom refs for better annotations later
-                } else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
+                if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
                     genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
                 }
             }
@@ -658,7 +662,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
             GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
                     genotypeBuilder, assignmentMethod,
                     g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
-                    targetAlleles, originalGTAlleles, null);
+                    targetAlleles, new GenotypeBuilder(g).alleles(originalGTAlleles).make(), null);
             mergedGenotypes.add(genotypeBuilder.make());
         }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
index b0ff4985cec..7ba0a0baa24 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
@@ -3,14 +3,19 @@
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
+import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
 
 import java.util.*;
 
 public final class AnnotationUtils {
+
+    public static final String ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX = "AS_";
     public static final String ALLELE_SPECIFIC_RAW_DELIM = "|";
     public static final String ALLELE_SPECIFIC_REDUCED_DELIM = ",";
     public static final String ALLELE_SPECIFIC_SPLIT_REGEX = "\\|"; //String.split takes a regex, so we need to escape the pipe
@@ -74,16 +79,23 @@ public static List decodeAnyASList( final String somethingList) {
      * @param annotation the annotation to be tested
      * @return true if the annotation is expected to have values per-allele
      */
-    public static boolean isAlleleSpecific(final InfoFieldAnnotation annotation) {
+    public static boolean isAlleleSpecific(final VariantAnnotation annotation) {
         return annotation instanceof AlleleSpecificAnnotation;
     }
 
+    public static boolean isAlleleSpecificGatkKey(final String annotationKey) {
+        final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey);
+        return header.getCountType().equals(VCFHeaderLineCount.A) ||
+                header.getCountType().equals(VCFHeaderLineCount.R) ||
+                annotationKey.startsWith(ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX);
+    }
+
     /**
-     * Handles all the Java and htsjdk parsing shenanigans
-     * @param rawDataString should not have surrounding brackets
+     * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
+     * @param rawDataString may have surrounding brackets, with raw delimiter
+     * @return the entries of the list, one per allele
      */
-    public static List<String> getAlleleLengthListOfString(String rawDataString) {
+    public static List<String> getAlleleLengthListOfStringFromRawData(String rawDataString) {
         if (rawDataString == null) {
             return Collections.emptyList();
         }
@@ -93,6 +105,21 @@ public static List getAlleleLengthListOfString(String rawDataString) {
         return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data
     }
 
+    /**
+     * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
+     * @param dataString may have surrounding brackets, with reduced delimiter
+     * @return the entries of the list, one per allele
+     */
+    public static List<String> getAlleleLengthListOfString(String dataString) {
+        if (dataString == null) {
+            return Collections.emptyList();
+        }
+        if (dataString.startsWith("[")) {
+            dataString = dataString.substring(1, dataString.length() - 1).replaceAll("\\s", "");
+        }
+        return Arrays.asList(dataString.split(ALLELE_SPECIFIC_REDUCED_DELIM, -1)); //-1 to keep empty data
+    }
+
     static public String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods likelihoods) {
         final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
                 " and possibly subsequent");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
index 931a6bb5256..8ea5771b78e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
@@ -4,12 +4,13 @@
 import htsjdk.samtools.util.Locatable;
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.apache.commons.lang3.tuple.Triple;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotation;
 import org.broadinstitute.hellbender.utils.MathUtils;
 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
 import org.broadinstitute.hellbender.utils.haplotype.Event;
@@ -27,7 +28,7 @@
  */
 @DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
         summary="Describe the complexity of an assembly region")
-public class AssemblyComplexity implements JumboInfoAnnotation {
+public class AssemblyComplexity implements JumboInfoAnnotation, AlleleSpecificAnnotation {
 
     @Argument(fullName = "assembly-complexity-reference-mode",
             doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes",
@@ -189,5 +190,4 @@ private static int uniqueVariants(final Haplotype hap1, final Haplotype hap2, fi
     private static int editDistance(final Haplotype hap1, final Haplotype hap2, final int excludedPosition) {
         return uniqueVariants(hap1, hap2, excludedPosition) + uniqueVariants(hap2, hap1, excludedPosition);
     }
-
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
deleted file mode 100644
index c870d9693ee..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
+++ /dev/null
@@ -1,175 +0,0 @@
-package org.broadinstitute.hellbender.tools.walkers.annotator;
-
-
-import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeBuilder;
-import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
-import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
-import org.broadinstitute.hellbender.engine.FeatureContext;
-import org.broadinstitute.hellbender.engine.ReferenceContext;
-import org.broadinstitute.hellbender.utils.Utils;
-import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
-import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
-import org.broadinstitute.hellbender.utils.help.HelpConstants;
-import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
-import org.broadinstitute.hellbender.utils.read.Fragment;
-import org.broadinstitute.hellbender.utils.read.GATKRead;
-import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
-import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
-import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
-
-import java.util.*;
-import java.util.stream.Collectors;
-
-/**
- * For each sample and for each allele a list feature vectors of supporting reads
- * In order to reduce the number of delimiter characters, we flatten featurized reads.  For example, suppose allele 1 has featurized reads
- * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
- * 1,2,3,4|5,6,7,8
- */
-@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
-        summary="Featurized read sets for Mutect3 training data")
-public class FeaturizedReadSets implements JumboGenotypeAnnotation {
-    public static final int DEFAULT_BASE_QUALITY = 25;
-
-    private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
-
-    public static final int FEATURES_PER_READ = 11;
-
-    private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
-
-    // downsample ref reads to this count if needed
-    private final int maxRefCount;
-
-    public FeaturizedReadSets(final int maxRefCount) {
-        this.maxRefCount = maxRefCount;
-    }
-
-    public FeaturizedReadSets() {
-        this(DEFAULT_MAX_REF_COUNT);
-    }
-
-    @Override
-    public void annotate(final ReferenceContext ref,
-                         final FeatureContext features,
-                         final VariantContext vc,
-                         final Genotype g,
-                         final GenotypeBuilder gb,
-                         final AlleleLikelihoods likelihoods,
-                         final AlleleLikelihoods fragmentLikelihoods,
-                         final AlleleLikelihoods haplotypeLikelihoods) {
-        Utils.nonNull(vc);
-        Utils.nonNull(g);
-        Utils.nonNull(gb);
-
-        if ( likelihoods == null) {
-            return;
-        }
-
-        final List<List<List<Integer>>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
-                likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);
-
-        // flatten twice: over all reads supporting each allele and over all alleles
-        // we can partition by allele with the countsInAlleleOrder annotation
-        // and by read using the constant feature vector size
-        final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
-                .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
-                .mapToInt(n -> n)
-                .toArray();
-
-        final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();
-
-        gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
-        gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
-    }
-
-    public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
-                                                           final Collection<String> samples,
-                                                           final AlleleLikelihoods likelihoods,
-                                                           final AlleleLikelihoods haplotypeLikelihoods,
-                                                           final int refDownsample,
-                                                           final int altDownsample) {
-        return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
-    }
-
-    // returns Lists (in allele order) of lists of read vectors supporting each allele
-    public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
-                                                           final Collection<String> samples,
-                                                           final AlleleLikelihoods likelihoods,
-                                                           final AlleleLikelihoods haplotypeLikelihoods,
-                                                           final int refDownsample,
-                                                           final int altDownsample,
-                                                           final Map altDownsampleMap) {
-        final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
-                .collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));
-
-        samples.stream().flatMap(s -> likelihoods.bestAllelesBreakingTies(s).stream())
-                .filter(ba -> ba.isInformative())
-                .forEach(ba -> readsByAllele.get(ba.allele).add(ba.evidence));
-
-        // downsample if necessary
-        final Allele refAllele = likelihoods.alleles().stream().filter(Allele::isReference).findFirst().get();
-        for (final Allele allele : likelihoods.alleles()) {
-            final int downsample = allele.isReference() ? refDownsample : altDownsampleMap.getOrDefault(allele, altDownsample);
-            if (readsByAllele.get(allele).size() > downsample) {
-                Collections.shuffle(readsByAllele.get(allele));
-                readsByAllele.put(allele, readsByAllele.get(allele).subList(0, downsample));
-            }
-        }
-
-        final Map bestHaplotypes = new HashMap<>();
-        samples.stream().flatMap(s -> haplotypeLikelihoods.bestAllelesBreakingTies(s).stream())
-                .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));
-
-        return vc.getAlleles().stream()
-                .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
-                .collect(Collectors.toList());
-    }
-
-
-    @Override
-    public List getKeyNames() {
-        return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
-    }
-
-    private static List featurize(final GATKRead read, final VariantContext vc, final Map bestHaplotypes) {
-        final List result = new ArrayList<>();
-        result.add(read.getMappingQuality());
-        result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
-        result.add(read.isFirstOfPair() ? 1 : 0);
-        result.add(read.isReverseStrand() ? 1 : 0);
-
-        // distances from ends of read
-        final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
-        result.add(readPosition);
-        result.add(read.getLength() - readPosition);
-
-
-        result.add(Math.abs(read.getFragmentLength()));
-
-        // distances from ends of fragment
-        final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
-        final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
-        result.add(vc.getStart() - fragmentStart);
-        result.add(fragmentEnd - vc.getEnd());
-
-        // mismatches versus best haplotype
-        final Haplotype haplotype = bestHaplotypes.get(read);
-        final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
-        final GATKRead copy = read.copy();
-        copy.setCigar(readToHaplotypeAlignment.getCigar());
-        final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
-        result.add(mismatchCount);
-
-        final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
-        result.add((int) indelsVsBestHaplotype);
-        Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");
-
-        return result;
-    }
-
-}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
index addbd29dfd7..96a749de50c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
@@ -4,7 +4,7 @@
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.GenotypeBuilder;
 import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
index 9c2d99b97b7..3ef5ade7148 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
@@ -4,7 +4,7 @@
 import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.vcf.VCFHeaderLineType;
 import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
index e83be35f50f..930a6a51033 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
@@ -145,6 +145,10 @@ public List getInfoAnnotations() {
         return Collections.unmodifiableList(infoAnnotations);
     }
 
+    public List<JumboInfoAnnotation> getJumboInfoAnnotations() {
+        return Collections.unmodifiableList(jumboInfoAnnotations);
+    }
+
     /**
      *
      * @param infoAnnotationClassName   the name of the Java class, NOT the annotation VCF key
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
index 9d78efd6d1b..f8bb3b3cea9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
@@ -7,7 +7,7 @@
 import htsjdk.variant.vcf.VCFCompoundHeaderLine;
 import htsjdk.variant.vcf.VCFHeaderLine;
 import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.broadinstitute.barclay.help.DocumentedFeature;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
index 5b4af7ec0ff..ef01d6fbf62 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
@@ -5,7 +5,7 @@
 import htsjdk.variant.variantcontext.GenotypesContext;
 import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
index d8466d0d997..b521f157d28 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
@@ -153,7 +153,7 @@ public  Map finalizeRawData(final VariantContext vc, final Varia
     }
 
     protected void parseRawDataString(ReducibleAnnotationData<List<Integer>> myData) {
-        List<String> values = AnnotationUtils.getAlleleLengthListOfString(myData.getRawData());
+        List<String> values = AnnotationUtils.getAlleleLengthListOfStringFromRawData(myData.getRawData());
         if (values.size() != myData.getAlleles().size()) {
             throw new IllegalStateException("Number of alleles and number of allele-specific entries do not match.  " +
                     "Allele-specific annotations should have an entry for each allele including the reference.");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
index 0c4a07f015a..11890925065 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
@@ -4,6 +4,7 @@
 import htsjdk.variant.variantcontext.VariantContext;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.InfoFieldAnnotation;
@@ -33,7 +34,6 @@
  * are accumulated as well.
  */
 public abstract class FlowAnnotatorBase implements InfoFieldAnnotation {
-    private final static Logger logger = LogManager.getLogger(FlowAnnotatorBase.class);
     protected final OneShotLogger flowMissingOneShotLogger = new OneShotLogger(FlowAnnotatorBase.class);
 
 
@@ -149,7 +149,7 @@ private String establishReadGroupFlowOrder(final LocalContext localContext, fina
         // if here, no flow order was found. may we use a default?
         if ( isActualFlowOrderRequired() ) {
             localContext.generateAnnotation = false;
-            flowMissingOneShotLogger.warn("this.getClass().getSimpleName() + \" annotation will not be calculated, no '\" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + \"' argument provided\"");
+            flowMissingOneShotLogger.warn(this.getClass().getSimpleName() + " annotation will not be calculated, no '" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + "' argument provided");
         }
 
         return FlowBasedRead.DEFAULT_FLOW_ORDER;
@@ -201,7 +201,7 @@ protected void variantType(final VariantContext vc, final LocalContext localCont
             if (isSpecial(alleles.get(i))){
                 continue;
             }
-            if ((localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
+            if ((localContext.hmerIndelLength==null) || (localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
                 isHmer=false;
             }
         }
@@ -249,7 +249,12 @@ protected void isHmerIndel(final VariantContext vc, final LocalContext localCont
             // get byte before and after
             final byte        before = getReferenceNucleotide(localContext, vc.getStart() - 1);
             final byte[]      after = getReferenceHmerPlus(localContext, vc.getEnd() + 1, MOTIF_SIZE);
-
+            if (after.length==0){
+                flowMissingOneShotLogger.warn("Failed to get hmer from reference, probably because the " +
+                        "variant is very close to the end of the chromosome, isHmerIndel and RightMotif annotations will not be calculated. " +
+                        "Variant position: " + vc.getContig() + ":" + vc.getEnd() + 1);
+                return;
+            }
             // build two haplotypes. add byte before and after
             final byte[]      refHap = buildHaplotype(before, ref.getBases(), after);
             final byte[]      altHap = buildHaplotype(before, alt.getBases(), after);
@@ -337,6 +342,12 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
             }
 
             String  motif = getRefMotif(localContext, vc.getStart() - MOTIF_SIZE, MOTIF_SIZE);
+            if (motif.length() != MOTIF_SIZE){
+                flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because the variant is very close to the " +
+                        "start of the chromosome. LeftMotif annotation will not be calculated. " +
+                        "Variant position: " + vc.getContig() + ":" + vc.getStart());
+                return;
+            }
             if ( a.length() != refLength ) {
                 motif = motif.substring(1) + vc.getReference().getBaseString().substring(0, 1);
             }
@@ -349,8 +360,13 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
     protected void getRightMotif(final VariantContext vc, final LocalContext localContext) {
 
         final int         refLength = vc.getReference().length();
-        final String      motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
-
+        String      motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
+        if (motif.length() != MOTIF_SIZE){
+            flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because " +
+                    "the variant is close to the end of the chromosome. RightMotif annotation will not be calculated. " +
+                    "Variant position: " + vc.getContig() + ":" + vc.getStart());
+            return;
+        }
         // fill empty entries (non indel alelles)
         for ( int i = 0 ; i < localContext.rightMotif.size() ; i++ ) {
             if ( localContext.rightMotif.get(i) == null ) {
@@ -365,6 +381,11 @@ protected void gcContent(final VariantContext vc, final LocalContext localContex
 
         final int         begin = vc.getStart() - (GC_CONTENT_SIZE / 2);
         final String      seq = getRefMotif(localContext, begin + 1, GC_CONTENT_SIZE);
+        if ( seq.length() != GC_CONTENT_SIZE ) {
+            flowMissingOneShotLogger.warn("gcContent will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
+                    " because the variant is too close to the edge of the chromosome");
+            return;
+        }
         int         gcCount = 0;
         for ( byte b : seq.getBytes() ) {
             if ( b == 'G' || b == 'C' ) {
@@ -423,11 +444,11 @@ protected void cycleSkip(final VariantContext vc, final LocalContext localContex
         localContext.attributes.put(GATKVCFConstants.FLOW_CYCLESKIP_STATUS, css);
     }
 
-    // get a single nucleoid from reference
+    // get a single nucleotide from reference
     private byte getReferenceNucleotide(final LocalContext localContext, final int start) {
         final int         index = start - localContext.ref.getWindow().getStart();
         final byte[]      bases = localContext.ref.getBases();
-        Utils.validIndex(index, bases.length);
+        Utils.validIndex(index, bases.length); // do not catch: if the location is outside of the reference at this point, there is a problem!
         return bases[index];
     }
 
@@ -435,7 +456,13 @@ private byte getReferenceNucleotide(final LocalContext localContext, final int s
     private byte[] getReferenceHmerPlus(final LocalContext localContext, final int start, final int additional) {
         int               index = start - localContext.ref.getWindow().getStart();
         final byte[]      bases = localContext.ref.getBases();
-        Utils.validIndex(index, bases.length);
+        try {
+            Utils.validIndex(index, bases.length);
+        } catch (IllegalArgumentException e) {
+            flowMissingOneShotLogger.warn("Failed to get hmer from reference, too close to the edge. " +
+                    "Position: " + localContext.ref.getContig() + ":" + index);
+            return new byte[0];
+        }
 
         // get hmer
         final StringBuilder sb = new StringBuilder();
@@ -457,8 +484,12 @@ private String getRefMotif(final LocalContext localContext, final int start, fin
         final byte[]      bases = localContext.ref.getBases();
         final int         startIndex = start - localContext.ref.getWindow().getStart();
         final int         endIndex = startIndex + length;
-        Utils.validIndex(startIndex, bases.length);
-        Utils.validIndex(endIndex-1, bases.length);
+        try {
+            Utils.validIndex(startIndex, bases.length);
+            Utils.validIndex(endIndex-1, bases.length);
+        } catch (IllegalArgumentException e) {
+            return "";
+        }
         return new String(Arrays.copyOfRange(bases, startIndex, endIndex));
     }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
index 44b07f6fcfe..93420133913 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
@@ -1,7 +1,7 @@
 package org.broadinstitute.hellbender.tools.walkers.contamination;
 
 import htsjdk.samtools.util.OverlapDetector;
-import org.apache.commons.lang.mutable.MutableDouble;
+import org.apache.commons.lang3.mutable.MutableDouble;
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.math3.optim.univariate.UnivariatePointValuePair;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
index 1a023488270..607f6691c58 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
@@ -13,7 +13,7 @@
 import java.util.stream.IntStream;
 
 public class ContaminationSegmenter {
-    public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.between(0.1, 0.9);
+    public static final Range ALT_FRACTIONS_FOR_SEGMENTATION = Range.of(0.1, 0.9);
     public static final double KERNEL_SEGMENTER_LINEAR_COST = 1.0;
     public static final double KERNEL_SEGMENTER_LOG_LINEAR_COST = 1.0;
     public static final int KERNEL_SEGMENTER_DIMENSION = 100;
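Both contamination changes above belong to the migration from the end-of-life commons-lang to commons-lang3: `MutableDouble` now comes from `org.apache.commons.lang3.mutable`, and the deprecated `Range.between` factory is replaced with `Range.of`. The following is a small sketch of the post-migration API, assuming a commons-lang3 version recent enough to ship `Range.of` (roughly 3.12+).

```java
import org.apache.commons.lang3.Range;
import org.apache.commons.lang3.mutable.MutableDouble;

// Sketch of the commons-lang3 API in use after the migration above.
public final class Lang3MigrationSketch {
    public static void main(String[] args) {
        // Range.of replaces the deprecated Range.between factory
        final Range<Double> altFractions = Range.of(0.1, 0.9);
        System.out.println(altFractions.contains(0.5)); // true

        // MutableDouble now lives in org.apache.commons.lang3.mutable
        final MutableDouble acc = new MutableDouble(0.0);
        acc.add(0.25);
        System.out.println(acc.doubleValue()); // 0.25
    }
}
```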
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
index 9c4f09d1bcd..9d6c40a691c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
@@ -9,6 +9,7 @@
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
 import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
 import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
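The newly imported `MappingQualityReadFilter` takes over from the tool's removed `--min-mapping-quality` argument: further down in this file's (flattened) hunks, the per-site `makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality)` call is dropped and a `MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY)` is added to the default read filters, so the threshold (50) is applied once at the read-filter level and can be adjusted via `--minimum-mapping-quality`, as the javadoc added below notes. A condensed sketch of the resulting default-filter list follows; the class name and surrounding constant are illustrative, the filter additions mirror the diff.

```java
import java.util.ArrayList;
import java.util.List;

import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;

// Sketch of the default read filters after the change: the mapping-quality
// threshold is enforced as a read filter rather than by filtering each pileup.
final class DefaultFilterSketch {
    private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 50; // value quoted in the javadoc below

    static List<ReadFilter> defaultReadFilters() {
        final List<ReadFilter> filters = new ArrayList<>();
        filters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
        filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE);
        filters.add(ReadFilterLibrary.MAPPING_QUALITY_NOT_ZERO);
        filters.add(ReadFilterLibrary.MAPPED);
        return filters;
    }
}
```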
@@ -100,6 +101,13 @@
  * file that have AF of 0.01 or more.
 * 
+ * <p>
+ * Finally, for those using mappers other than bwa mem or dragen-os, the {@code --minimum-mapping-quality} threshold is
+ * set to 50, which limits the usable reads that the tool considers for generating pileups. Certain mappers are known to
+ * assign scores less than this threshold even for unique mappings. If you observe all-empty results in your
+ * summary file, please adjust the {@code --minimum-mapping-quality} parameter according to your input files.
+ * </p>
+ * */ @CommandLineProgramProperties( summary = "Tabulates pileup metrics for inferring contamination", @@ -112,8 +120,7 @@ public class GetPileupSummaries extends LocusWalker { public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency"; public static final String MAX_SITE_AF_SHORT_NAME = "max-af"; public static final String MIN_SITE_AF_SHORT_NAME = "min-af"; - public static final String MIN_MAPPING_QUALITY_LONG_NAME = "min-mapping-quality"; - public static final String MIN_MAPPING_QUALITY_SHORT_NAME = "mmq"; + private static final double DEFAULT_MIN_POPULATION_AF = 0.01; private static final double DEFAULT_MAX_POPULATION_AF = 0.2; @@ -137,9 +144,6 @@ public class GetPileupSummaries extends LocusWalker { doc = "Maximum population allele frequency of sites to consider.", optional = true) private double maxPopulationAlleleFrequency = DEFAULT_MAX_POPULATION_AF; - @Argument(fullName = MIN_MAPPING_QUALITY_LONG_NAME, shortName = MIN_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum read mapping quality", optional = true) - private int minMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY; - private boolean sawVariantsWithoutAlleleFrequency = false; private boolean sawVariantsWithAlleleFrequency = false; @@ -168,6 +172,7 @@ public boolean requiresFeatures() { @Override public List getDefaultReadFilters() { final List filters = new ArrayList<>(); + filters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY)); filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE); filters.add(ReadFilterLibrary.MAPPING_QUALITY_NOT_ZERO); filters.add(ReadFilterLibrary.MAPPED); @@ -208,8 +213,7 @@ public void apply(AlignmentContext alignmentContext, ReferenceContext referenceC final VariantContext vc = vcs.get(0); if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) { - final ReadPileup pileup = alignmentContext.getBasePileup() - .makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality); + final ReadPileup pileup = alignmentContext.getBasePileup(); try { writer.writeRecord(new PileupSummary(vc, pileup)); } catch (final IOException ex) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java new file mode 100644 index 00000000000..83ac040011c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java @@ -0,0 +1,626 @@ +package org.broadinstitute.hellbender.tools.walkers.featuremapping; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMUtils; +import org.apache.commons.lang3.StringUtils; +import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.engine.ReadWalker; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.groundtruth.SeriesStats; +import org.broadinstitute.hellbender.utils.read.*; + +import 
java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +@CommandLineProgramProperties( + summary = "This program converts the flow qualities that Ultima Genomics CRAM reports to more conventional base qualities. " + + "Specifically, the generated quality will report the probability that a specific base is a sequencing error mismatch, " + + "while auxilary tags qa, qt, qg, qc report specific probability that a specific base X is a A->X error. " + + "Since mismatch error in flow-based chemistries can only occur as a result of several indel errors, " + + "we implemented several strategies to estimate the probability of a mismatch which can be specified" + + "using the svnq-mode parameter: " + + "Legacy - the quality value from flow matrix is used. " + + "Optimistic - assuming that the probability of the indel errors are p1 and p2, then snvq=p1*p2 - assuming they always coincide. " + + "Pessimistic - snvq=(1-p1)*(1-p2) - assuming they never coincide. " + + "Geometric - snvq=sqrt(Optimistic*Pessimistic) - i.e. the geometric mean of the optimistic and Pessimistic modes. " + + "The Geometric is set as the default mode", + oneLineSummary = "Add SNV Quality to the flow-based CRAM", + programGroup = FlowBasedProgramGroup.class +) + +@DocumentedFeature +@ExperimentalFeature +public final class AddFlowSNVQuality extends ReadWalker { + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "File to which reads should be written") + @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) + public GATKPath output = null; + private SAMFileGATKReadWriter outputWriter; + + @ArgumentCollection + public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); + + @ArgumentCollection + public AddFlowSNVQualityArgumentCollection aqArgs = new AddFlowSNVQualityArgumentCollection(); + + public static final char PHRED_ASCII_BASE = '!'; + + public static final int ERROR_PROB_BAND_1LESS = 0; + public static final int ERROR_PROB_BAND_KEY = 1; + public static final int ERROR_PROB_BAND_1MORE = 2; + public static final int ERROR_PROB_BANDS = 3; + + public double minLikelihoodProbRate = 1e-6; + public int maxQualityScore = 60; + + // locals + private SeriesStats inputQualStats = new SeriesStats(); + private SeriesStats outputBQStats = new SeriesStats(); + private SeriesStats outputQAltStats = new SeriesStats(); + private SeriesStats outputQCalledStats = new SeriesStats(); + private SeriesStats outputSumPStats = new SeriesStats(); + + // private class to hold the base probabilities and SNVQ probabilties for a read + class ReadProbs { + double[] baseProbs; + double[][] snvqProbs; // length of first dimension is flow order length + } + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + outputWriter = createSAMWriter(output, true); + } + + @Override + public void closeTool() { + super.closeTool(); + if ( outputWriter != null ) { + outputWriter.close(); + } + + try { + if ( aqArgs.debugCollectStatsInto != null ) + printStats(aqArgs.debugCollectStatsInto); + } catch (IOException e) { + throw new GATKException("", e); + } + } + + @Override + public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { + + // include supplementary alignments? + if ( read.isSupplementaryAlignment() && !aqArgs.keepSupplementaryAlignments ) { + return; + } + + // include qc-failed reads? 
+ if ( read.failsVendorQualityCheck() && !aqArgs.includeQcFailedReads ) { + return; + } + + // collect input stats + if ( aqArgs.debugCollectStatsInto != null ) { + collectInputStats(read); + } + + // add SNVQ attributes + addBaseQuality(read, getHeaderForReads(), aqArgs.maxPhredScore, fbargs); + + // collect output stats + if ( aqArgs.debugCollectStatsInto != null ) { + collectOutputStats(read); + if ( aqArgs.debugReadName.size() != 0 && aqArgs.debugReadName.contains(read.getName()) ) { + dumpOutputRead(read); + } + } + + // write read to output + outputWriter.addRead(read); + } + + private void collectInputStats(GATKRead read) { + for ( byte q : read.getBaseQualitiesNoCopy() ) { + inputQualStats.add(q); + } + } + + private void collectOutputStats(GATKRead read) { + if ( aqArgs.outputQualityAttribute != null ) { + if (read.hasAttribute(aqArgs.outputQualityAttribute)) { + for (byte q : read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes()) { + outputBQStats.add(SAMUtils.fastqToPhred((char) q)); + } + } + } else { + for (byte q : read.getBaseQualitiesNoCopy()) { + outputBQStats.add(q); + } + } + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); + final byte[] bases = read.getBasesNoCopy(); + final double[] sumP = new double[bases.length]; + for ( int i = 0 ; i < 4 ; i++ ) { + byte altBase = rgInfo.flowOrder.getBytes()[i]; + String attrValue = read.getAttributeAsString(attrNameForNonCalledBase(altBase)); + int ofs = 0; + for ( byte q : attrValue.getBytes() ) { + if ( bases[ofs] != altBase ) { + outputQAltStats.add(SAMUtils.fastqToPhred((char)q)); + } else { + outputQCalledStats.add(SAMUtils.fastqToPhred((char)q)); + } + sumP[ofs] += Math.pow(10.0, SAMUtils.fastqToPhred((char)q) / -10.0); + ofs++; + + } + } + for ( double p : sumP ) { + outputSumPStats.add(p); + } + } + + // dump read as a csv for easier analysis + private void dumpOutputRead(GATKRead read) { + + try { + // open file + final String fname = aqArgs.debugCollectStatsInto + "." + read.getName() + ".csv"; + logger.info("dumping read into: " + fname); + final PrintWriter pw = new PrintWriter(fname); + + // write header + final StringBuilder hdr = new StringBuilder(); + hdr.append("pos,base,qual,tp,t0,bq"); + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); + for (int i = 0; i < 4; i++) { + hdr.append(","); + hdr.append(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i))); + } + hdr.append(",qCalled"); + pw.println(hdr); + + // access data + final byte[] bases = read.getBasesNoCopy(); + final byte[] quals = read.getBaseQualitiesNoCopy(); + final byte[] tp = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_TAG_NAME); + final byte[] t0 = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_T0_TAG_NAME); + final byte[] bq = (aqArgs.outputQualityAttribute != null) + ? 
read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes() + : null; + final byte[][] qX = new byte[4][]; + for (int i = 0; i < 4; i++) { + qX[i] = read.getAttributeAsString(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i))).getBytes(); + } + + // write lines + List line = new LinkedList<>(); + for (int pos = 0; pos < bases.length; pos++) { + line.clear(); + + // position + line.add(Integer.toString(pos)); + + // base, qual + line.add(Character.toString(bases[pos])); + line.add(Integer.toString(quals[pos])); + + // tp,t0,bq + line.add(Integer.toString(tp[pos])); + line.add(Integer.toString(SAMUtils.fastqToPhred((char)t0[pos]))); + if ( bq != null ) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char) bq[pos]))); + } + + // qX + int calledIndex = -1; + for (int i = 0; i < 4; i++) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[i][pos]))); + if ( bases[pos] == rgInfo.flowOrder.charAt(i) ) { + calledIndex = i; + } + } + + // qCalled + if ( calledIndex >= 0 ) { + line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[calledIndex][pos]))); + } else { + line.add("-1"); + } + + // write the line + pw.println(StringUtils.join(line, ',')); + } + + // close file + pw.close(); + } catch (IOException e) { + throw new GATKException("", e); + } + } + + private void printStats(final String fname) throws IOException { + + inputQualStats.csvWrite(fname + ".inputQual.csv"); + outputBQStats.csvWrite(fname + ".outputBQ.csv"); + outputQAltStats.csvWrite(fname + ".outputQAlt.csv"); + outputQCalledStats.csvWrite(fname + ".outputQCalled.csv"); + outputSumPStats.csvWrite(fname + ".outputSumP.csv"); + } + + static public String attrNameForNonCalledBase(byte nonCalledBase) { + return attrNameForNonCalledBase((char)nonCalledBase); + } + + static public String attrNameForNonCalledBase(char nonCalledBase) { + return "q" + Character.toLowerCase(nonCalledBase); + } + + public void addBaseQuality(final GATKRead read, final SAMFileHeader hdr, double maxPhredScore, FlowBasedArgumentCollection fbargs) { + + // take in phred score limit + if ( !Double.isNaN(maxPhredScore) ) { + maxQualityScore = (int)maxPhredScore; + minLikelihoodProbRate = Math.pow(10, -maxPhredScore / 10.0); + } + + // convert to a flow base read + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(hdr, read); + final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final int flowOrderLength = FlowBasedReadUtils.calcFlowOrderLength(rgInfo.flowOrder); + + // generate base and snvq probabilities for the read + final ReadProbs readProbs = generateFlowReadBaseAndSNVQErrorProbabilities(fbRead, flowOrderLength, rgInfo.flowOrder.getBytes()); + + // install in read + if ( aqArgs.outputQualityAttribute != null ) { + read.setAttribute(aqArgs.outputQualityAttribute, new String(convertErrorProbToFastq(readProbs.baseProbs))); + } else { + read.setBaseQualities(convertErrorProbToPhred(readProbs.baseProbs)); + } + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + final String name = AddFlowSNVQuality.attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i)); + read.setAttribute(name, new String(convertErrorProbToFastq(readProbs.snvqProbs[i]))); + } + } + + // Not using SamUtils function since normally an error probability can not be zero. + // still, this method is called to convert base quality as well as snvq, which is computed. + // the following check is a safety, in case snvq produces a zero. 
+ private char[] convertErrorProbToFastq(double[] errorProb) { + + byte[] phred = convertErrorProbToPhred(errorProb); + return SAMUtils.phredToFastq(phred).toCharArray(); + } + + // Not using SamUtils function since normally an error probability can not be zero. + // still, this method is called to convert base quality as well as snvq, which is computed. + // the following check is a safety, in case snvq produces a zero. + private byte[] convertErrorProbToPhred(double[] errorProb) { + + final byte[] phred = new byte[errorProb.length]; + for ( int i = 0 ; i < errorProb.length ; i++ ) { + + if ( errorProb[i] == 0 ) { + phred[i] = (byte)maxQualityScore; + } else { + final double p = errorProb[i]; + phred[i] = (byte)Math.round(-10 * Math.log10(p)); + } + } + return phred; + } + + /** + * generate base and snvq probabilties for a read. + * + * @param fbRead a flow based read + * @param flowOrderLength number of bases in flow order (essentially number of valid base values) + * @param flowOrder the flow order itself (which can be the size of flowOrderLength or a repeat of it + * + * @return an instance of a private class containing the base probabilities as well as the snvq probabilities + */ + private ReadProbs generateFlowReadBaseAndSNVQErrorProbabilities(final FlowBasedRead fbRead, final int flowOrderLength, byte[] flowOrder) { + + /** + * access key and error probabilities + * for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix} + */ + final int[] key = fbRead.getKey(); + final double[][] errorProbBands = extractErrorProbBands(fbRead, minLikelihoodProbRate); + + // allocate returned prob arrays + final double[] baseProbs = new double[fbRead.getBasesNoCopy().length]; + final double[][] snvqProbs = new double[flowOrderLength][]; + for ( int i = 0 ; i < snvqProbs.length ; i++ ) { + snvqProbs[i] = new double[baseProbs.length]; + } + + // loop over hmers via flow key + int base = 0; + Map allBaseProb0 = new LinkedHashMap<>(); + Map allBaseProb1 = new LinkedHashMap<>(); + + for ( int flow = 0 ; flow < key.length ; flow++ ) { + if ( key[flow] != 0 ) { + + // establish initial stat + allBaseProb0.clear(); + allBaseProb1.clear(); + int flow_i = (flow % flowOrderLength); + + // establish hmer quality + final int hmerLength = key[flow]; + final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength, flowOrder, allBaseProb0, allBaseProb1); + + // install value in first byte of the hmer + baseProbs[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + if ( allBaseProb0.containsKey(flowOrder[i]) ) { + snvqProbs[i][base - 1] = allBaseProb0.get(flowOrder[i]); + } else if ( i != flow_i ) { + snvqProbs[i][base - 1] = minLikelihoodProbRate; + } + } + + // for hmers longer than 1 + if ( hmerLength > 1 ) { + + // skip all but last (leave with zero error probability) + base += (hmerLength - 2); + + // fill last base from computed error probability + baseProbs[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1 + + for ( int i = 0 ; i < flowOrderLength ; i++ ) { + if ( allBaseProb1.containsKey(flowOrder[i]) ) { + final double p = allBaseProb1.get(flowOrder[i]); + for ( int j = 0 ; j < hmerLength - 1 ; j++ ) { + snvqProbs[i][base - 1 - j] = (j == 0) ? 
p : minLikelihoodProbRate; // all but last get the min prob + } + } else if ( i != flow_i ) { + for ( int j = 0 ; j < hmerLength - 1 ; j++ ) { + snvqProbs[i][base - 1 - j] = minLikelihoodProbRate; + } + } + } + } + + // override result for the last base with the original hmer error probability + if ( base == baseProbs.length ) { + baseProbs[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; + } + } + } + + // adjust probability of called bases so that sum will be 1, also enforce min prob + final byte[] bases = fbRead.getBasesNoCopy(); + for ( int ofs = 0 ; ofs < bases.length ; ofs++ ) { + + // go through alt bases and accumulate p, find out index of called bases (in flow order) + final byte calledBase = bases[ofs]; + double altP = 0; + int calledIndex = -1; + for (int i = 0; i < flowOrderLength; i++) { + if ( calledBase != flowOrder[i] ) { + snvqProbs[i][ofs] = Math.max(minLikelihoodProbRate, snvqProbs[i][ofs]); + altP += snvqProbs[i][ofs]; + } else { + calledIndex = i; + } + } + if ( calledBase < 0 ) { + throw new GATKException(String.format("failed to locate called base %c in flow order %s", (char)calledBase, flowOrder)); + } + + // install probability in called base slot + snvqProbs[calledIndex][ofs] = Math.max(0, 1 - altP); + + // at this point, bq becomes trivial (?) + baseProbs[ofs] = 1 - snvqProbs[calledIndex][ofs]; + } + + // build return value + ReadProbs readProbs = new ReadProbs(); + readProbs.baseProbs = baseProbs; + readProbs.snvqProbs = snvqProbs; + return readProbs; + } + + // extract error probability bands. middle (1) band is the key prob. + // lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths + private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) { + + // access key + final int[] key = flowRead.getKey(); + + // allocate result + double[][] result = new double[ERROR_PROB_BANDS][]; + for ( int i = 0 ; i < result.length ; i++ ) { + result[i] = new double[key.length]; + } + + for ( int i = 0 ; i < key.length ; i++ ) { + + // extract key probability + result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue); + + // -1 + if ( key[i] > 0 ) { + result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue); + } else { + result[ERROR_PROB_BAND_1LESS][i] = minValue; + } + + // +1 + if ( key[i] < flowRead.getMaxHmer() ) { + result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue); + } else { + result[ERROR_PROB_BAND_1MORE][i] = minValue; + } + } + + return result; + } + + @VisibleForTesting + protected double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, + final int flowOrderLength, byte[] flowOrder, + Map allBaseProb0, Map allBaseProb1) { + + // result is left/right error probabilities + final double[] errorProbs = new double[2]; + final int hmerLength = key[flow]; + + errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength, flowOrder, allBaseProb0); + if ( hmerLength != 1 ) { + errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength, flowOrder, allBaseProb1); + } + + return errorProbs; + } + + private double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, + final int flowOrderLength, final byte[] flowOrder, final Map allBaseProb) { + + // create a key slice of the area around the flow/hmer. 
+ final int minIndex = Math.max(flow - (flowOrderLength - 1), 0); + final int maxIndex = Math.min(flow + (flowOrderLength - 1), key.length - 1); + final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1); + final int hmerLength = key[flow]; + + // walk the flows towards the side until (and including) the first non-zero key + // on hmers of length 1 we walk both sides + final class SliceInfo { + int[] slice; + byte altByte; + int sideFlow; + } + final List slices = new LinkedList<>(); + final int[] incrs = (hmerLength != 1) + ? new int[] { sideIncr } + : new int[] { sideIncr, -sideIncr}; + for (int incr : incrs) { + for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) { + + // side flow can no overflow the slice + if ( sideFlow < minIndex || sideFlow > maxIndex ) { + break; + } + + // create a alternative key slice by incrementing sideFlow and decrementing flow + final int[] altSlice = Arrays.copyOf(slice, slice.length); + altSlice[sideFlow - minIndex] += 1; + altSlice[flow - minIndex] -= 1; + if ( sliceIsValidForConsideration(altSlice, flowOrderLength) ) { + SliceInfo si = new SliceInfo(); + si.slice = altSlice; + si.altByte = flowOrder[sideFlow % flowOrderLength]; + si.sideFlow = sideFlow; + slices.add(si); + } + + // is the sideFlow (the first encountered) non-zero? if so, break + if (key[sideFlow] != 0) { + break; + } + } + } + + // at this point, we have a list of valid slices. figure out the error probability for each of them + // and compute the base quality + final double keyP = sliceProbs(slice, minIndex, key, errorProbBands, flow, flow)[0]; + double sumP = keyP; + for ( final SliceInfo si : slices ) { + final double[] sliceP = sliceProbs(si.slice, minIndex, key, errorProbBands, flow, si.sideFlow); + if ( allBaseProb != null ) { + allBaseProb.put(si.altByte, getSnvq(sliceP[0], sliceP[1], sliceP[2], aqArgs.snvMode)); + } + sumP += sliceP[0]; + } + final double ep = 1 - (keyP / sumP); + + return ep; + } + + static double getSnvq(final double sliceP, final double p1, final double p2, AddFlowSNVQualityArgumentCollection.SnvqModeEnum snvMode) { + if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Legacy ) { + return sliceP; + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Optimistic ) { + return (p1 * p2); + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Pessimistic ) { + return (1 - (1 - p1) * (1 - p2)); + } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Geometric ) { + return Math.sqrt((p1 * p2) * (1 - (1 - p1) * (1 - p2))); + } else { + throw new GATKException("unknown snvqMode: " + snvMode); + } + } + + // compute probability for a slice + private static double[] sliceProbs(final int[] slice, final int minIndex, final int[] key, final double[][] errorProbBands, + final int flow, final int sideFlow) { + + double accumulatedP = 1.0; + double p1 = 0.0; + double p2 = 0.0; + int key_i = minIndex; + for ( int i = 0 ; i < slice.length ; i++, key_i++ ) { + final int hmer = key[key_i]; + final int band; + if ( slice[i] == (hmer - 1) ) { + band = ERROR_PROB_BAND_1LESS; + } else if ( slice[i] == (hmer + 1) ) { + band = ERROR_PROB_BAND_1MORE; + } else if ( slice[i] == hmer ){ + band = ERROR_PROB_BAND_KEY; + } else { + throw new GATKException("slice[i] and hmer are too far apart: " + slice[i] + " " + hmer); + } + final double p = errorProbBands[band][key_i]; + accumulatedP *= p; + + // collect p1/p2 (flow and sideFlow probs) + if ( key_i == flow ) { + p1 = p; + 
} + if ( key_i == sideFlow ) { + p2 = p; + } + } + + return new double[] {accumulatedP, p1, p2}; + } + + static boolean sliceIsValidForConsideration(final int[] slice, final int flowOrderLength) { + + // look for strings of consecutive zeros in length of flowOrderLength - 1 + int consecutiveZeros = 0; + for ( int key : slice ) { + if ( key != 0 ) { + consecutiveZeros = 0; + } else { + consecutiveZeros++; + if ( consecutiveZeros >= (flowOrderLength - 1) ) { + return false; + } + } + } + + // if here, not found -> valid + return true; + } +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java new file mode 100644 index 00000000000..4c20b14e59c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java @@ -0,0 +1,70 @@ +package org.broadinstitute.hellbender.tools.walkers.featuremapping; + +import org.broadinstitute.barclay.argparser.Advanced; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.Hidden; + +import java.io.Serializable; +import java.util.List; + +/** + * Set of arguments for the {@link AddFlowSNVQuality} + */ +public class AddFlowSNVQualityArgumentCollection implements Serializable{ + private static final long serialVersionUID = 1L; + public static final String MAX_PHRED_SCORE_FULL_NAME = "max-phred-score"; + public static final String KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME = "keep-supplementary-alignments"; + public static final String INCLUDE_QC_FAILED_READ_FULL_NAME = "include-qc-failed-read"; + public static final String SNVQ_MODE_FULL_NAME = "snvq-mode"; + public static final String OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME = "output-quality-attribute"; + public static final String DEBUG_READ_NAME_FULL_NAME = "debug-read-name"; + public static final String DEBUG_COLLECT_STATS_INTO_FULL_NAME = "debug-collect-stats-into"; + + public enum SnvqModeEnum { + Legacy, + Optimistic, + Pessimistic, + Geometric + }; + + /** + * maximum value for + * delta in score + **/ + @Argument(fullName = MAX_PHRED_SCORE_FULL_NAME, doc = "Limit value for phred scores", optional = true) + public double maxPhredScore = Double.NaN; + + /** + * keep supplementary alignments? + **/ + @Argument(fullName = KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME, doc = "keep supplementary alignments ?", optional = true) + public boolean keepSupplementaryAlignments = true; + + @Advanced + @Argument(fullName= INCLUDE_QC_FAILED_READ_FULL_NAME, doc = "include reads with QC failed flag", optional = true) + public boolean includeQcFailedReads = true; + + /** + * snvq computation mode + */ + @Argument(fullName = SNVQ_MODE_FULL_NAME, doc = "snvq calculation mode.", optional = true) + public SnvqModeEnum snvMode = SnvqModeEnum.Geometric; + + /** + * By default this tool overwrites the QUAL field with the new qualities. Setting this argument saves the original qualities in the specified SAM tag. + */ + @Argument(fullName = OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME, doc = "alternate SAM tag to put original quality scores instead of overwriting the QUAL field. If not used, QUAL will be overwritten.", optional = true) + public String outputQualityAttribute = null; + + /** + * debug read names? + **/ + @Hidden + @Argument(fullName = DEBUG_READ_NAME_FULL_NAME, doc = "Read names of reads to output details of as part of debugging. 
", optional = true) + public List debugReadName = null; + + @Advanced + @Hidden + @Argument(fullName= DEBUG_COLLECT_STATS_INTO_FULL_NAME, doc = "Statistics about the reads will be output to given filename.", optional = true) + public String debugCollectStatsInto = null; +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java index b3ddc3e21b6..2014ce3030e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java @@ -2,12 +2,14 @@ import htsjdk.samtools.*; import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.SequenceUtil; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.math3.util.Precision; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -81,6 +83,19 @@ @ExperimentalFeature public final class FlowFeatureMapper extends ReadWalker { + static class CopyAttrInfo { + public final String name; + public final VCFHeaderLineType type; + public final String desc; + + public CopyAttrInfo(final String spec) { + final String[] toks = spec.split(","); + name = toks[0]; + type = toks.length > 1 ? VCFHeaderLineType.valueOf(toks[1]) : VCFHeaderLineType.String; + desc = toks.length > 2 ? StringUtils.join(Arrays.copyOfRange(toks, 2, toks.length), ",") : ("copy-attr: " + name); + } + } + private static final Logger logger = LogManager.getLogger(FlowFeatureMapper.class); private static final String VCB_SOURCE = "fm"; @@ -101,8 +116,18 @@ public final class FlowFeatureMapper extends ReadWalker { private static final String VCF_SMQ_RIGHT = "X_SMQ_RIGHT"; private static final String VCF_SMQ_LEFT_MEAN = "X_SMQ_LEFT_MEAN"; private static final String VCF_SMQ_RIGHT_MEAN = "X_SMQ_RIGHT_MEAN"; + private static final String VCF_ADJACENT_REF_DIFF = "X_ADJACENT_REF_DIFF"; + private static final Double LOWEST_PROB = 0.0001; + private static final int VENDOR_QUALITY_CHECK_FLAG = 0x200; + + private static final String INCLUDE_QC_FAILED_READS_FULL_NAME = "include-qc-failed-reads"; + + final private List copyAttrInfo = new LinkedList<>(); + + // order here is according to SequenceUtil.VALID_BASES_UPPER + final private String scoreForBaseNames[] = new String[SequenceUtil.VALID_BASES_UPPER.length]; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -131,6 +156,10 @@ public final class FlowFeatureMapper extends ReadWalker { @Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true) public boolean floorBlocks = false; + @Advanced + @Argument(fullName=INCLUDE_QC_FAILED_READS_FULL_NAME, doc = "include reads with QC failed flag", optional = true) + public boolean includeQcFailedReads = false; + @ArgumentCollection public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); @@ -175,6 +204,9 @@ protected static class MappedFeature 
implements Comparable { int smqLeftMean; int smqRightMean; + double scoreForBase[]; + boolean adjacentRefDiff; + public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases, byte[] refBases, int readBasesOffset, int start, int offsetDelta) { this.read = read; @@ -315,8 +347,20 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the right of the feature")); headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the left of the feature")); headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the right of the feature")); - for ( String name : fmArgs.copyAttr ) { - headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name)); + for ( String spec : fmArgs.copyAttr ) { + final CopyAttrInfo info = new CopyAttrInfo(spec); + headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + info.name, 1, info.type, info.desc)); + copyAttrInfo.add(info); + } + + // validation mode? + if ( fmArgs.reportAllAlts ) { + for ( int baseIndex = 0 ; baseIndex < SequenceUtil.VALID_BASES_UPPER.length ; baseIndex++ ) { + headerInfo.add(new VCFInfoHeaderLine(scoreNameForBase(baseIndex), 1, VCFHeaderLineType.Float, "Base specific mapping score")); + } + } + if ( fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff ) { + headerInfo.add(new VCFInfoHeaderLine(VCF_ADJACENT_REF_DIFF, 1, VCFHeaderLineType.Flag, "Adjacent base filter indication: indel in the adjacent 5 bases to the considered base on the read")); } final VCFHeader vcfHeader = new VCFHeader(headerInfo); @@ -324,6 +368,13 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f return vcfHeader; } + private String scoreNameForBase(int baseIndex) { + if ( scoreForBaseNames[baseIndex] == null ) { + scoreForBaseNames[baseIndex] = VCF_SCORE + "_" + new String(new byte[]{SequenceUtil.VALID_BASES_UPPER[baseIndex]}); + } + return scoreForBaseNames[baseIndex]; + } + @Override public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { @@ -337,6 +388,11 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext, return; } + // include qc-failed reads? + if ( ((read.getFlags() & VENDOR_QUALITY_CHECK_FLAG) != 0) && !includeQcFailedReads ) { + return; + } + // flush qeues up to this read flushQueue(read, referenceContext); @@ -349,6 +405,20 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext, // score the feature fr.score = scoreFeature(fr); + // score for validation mode? 
+ if ( fmArgs.reportAllAlts) { + fr.scoreForBase = new double[SequenceUtil.VALID_BASES_UPPER.length]; + for ( int baseIndex = 0 ; baseIndex < fr.scoreForBase.length ; baseIndex++ ) { + final byte base = SequenceUtil.VALID_BASES_UPPER[baseIndex]; + boolean incl = base != fr.readBases[0]; + if ( incl ) { + fr.scoreForBase[baseIndex] = scoreFeature(fr, base); + } else { + fr.scoreForBase[baseIndex] = Double.NaN; + } + } + } + // emit feature if filters in if ( filterFeature(fr) ) { featureQueue.add(fr); @@ -413,14 +483,17 @@ private void enrichFeature(final MappedFeature fr) { } private double scoreFeature(final MappedFeature fr) { + return scoreFeature(fr, (byte)0); + } + private double scoreFeature(final MappedFeature fr, byte altBase) { // build haplotypes final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), fr.read); - final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder); + final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder, altBase); // create flow read - final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, - rgInfo.maxClass, fbargs); + final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final int diffLeft = haplotypes[0].getStart() - flowRead.getStart() + fr.offsetDelta; final int diffRight = flowRead.getEnd() - haplotypes[0].getEnd(); flowRead.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), false); @@ -524,16 +597,24 @@ public static double computeLikelihoodLocal(final FlowBasedRead read, final Flow return result; } - private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder) { + private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder, byte altBase) { // build bases for flow haplotypes // NOTE!!!: this code assumes length of feature on read and reference is the same // this is true for SNP but not for INDELs - it will have to be re-written! // TODO: write for INDEL - final byte[] bases = fr.read.getBasesNoCopy(); + byte[] bases = fr.read.getBasesNoCopy(); int offset = fr.readBasesOffset; int refStart = fr.start; int refModOfs = 0; + + // install alt base? 
+ byte orgBase = 0; + if ( altBase != 0 ) { + orgBase = fr.refBases[0]; + fr.refBases[0] = altBase; + } + if ( offset > 0 ) { // reach into hmer before offset--; @@ -569,6 +650,11 @@ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final Strin new FlowBasedHaplotype(refHaplotype, flowOrder) }; + // restore changes + if ( altBase != 0 ) { + fr.refBases[0] = orgBase; + } + // return return result; } @@ -590,7 +676,11 @@ private void emitFeature(final MappedFeature fr) { // create alleles final Collection alleles = new LinkedList<>(); - alleles.add(Allele.create(fr.readBases, false)); + if ( fmArgs.reportAllAlts && Arrays.equals(fr.readBases, fr.refBases) ) { + alleles.add(Allele.create("*".getBytes(), false)); + } else { + alleles.add(Allele.create(fr.readBases, false)); + } alleles.add(Allele.create(fr.refBases, true)); // create variant context builder @@ -625,11 +715,34 @@ private void emitFeature(final MappedFeature fr) { vcb.attribute(VCF_SMQ_RIGHT_MEAN, fr.smqRightMean); } - for ( String name : fmArgs.copyAttr ) { - if ( fr.read.hasAttribute(name) ) { - vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name)); + for ( CopyAttrInfo info : copyAttrInfo ) { + if ( fr.read.hasAttribute(info.name) ) { + final String attrName = fmArgs.copyAttrPrefix + info.name; + if ( info.type == VCFHeaderLineType.Integer ) { + vcb.attribute(attrName, fr.read.getAttributeAsInteger(info.name)); + } else if ( info.type == VCFHeaderLineType.Float ) { + vcb.attribute(attrName, fr.read.getAttributeAsFloat(info.name)); + } else { + vcb.attribute(attrName, fr.read.getAttributeAsString(info.name)); + } + } + } + + // validation mode? + if ( fmArgs.reportAllAlts ) { + if ( fr.scoreForBase != null ) { + for (int baseIndex = 0; baseIndex < SequenceUtil.VALID_BASES_UPPER.length; baseIndex++) { + if (!Double.isNaN(fr.scoreForBase[baseIndex])) { + vcb.attribute(scoreNameForBase(baseIndex), String.format("%.5f", fr.scoreForBase[baseIndex])); + } + } } } + if ( fr.adjacentRefDiff ) { + vcb.attribute(VCF_ADJACENT_REF_DIFF, true); + } + + // build it! final VariantContext vc = vcb.make(); // write to file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java index 22c13b00878..03660746d2b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java @@ -32,7 +32,7 @@ enum MappingFeatureEnum { /** * attributes to copy from bam **/ - @Argument(fullName = "copy-attr", doc = "attributes to copy from bam", optional = true) + @Argument(fullName = "copy-attr", doc = "attributes to copy from bam. format ,,. 
types: Integer, Float, String, Character, Flag", optional = true) public List copyAttr = new LinkedList<>(); /** @@ -116,4 +116,18 @@ enum MappingFeatureEnum { @Hidden @Argument(fullName = "surrounding-mean-quality-size", doc = "number of bases around the feature to calculate surrounding mean quality", optional = true) public Integer surroundingMeanQualitySize = null; + + /** + * validation mode - if not specified, this feature is off + **/ + @Hidden + @Argument(fullName = "report-all-alts", doc = "In this mode (aka validation mode), every base of every read in the input CRAM and interval is reported, and an X_SCORE value is calculated for all 3 possible alts", optional = true) + public boolean reportAllAlts = false; + + /** + * adjacent-ref-diff mode - if not specified, this feature is off + **/ + @Hidden + @Argument(fullName = "tag-bases-with-adjacent-ref-diff", doc = "In this mode bases that have an adjacent difference from the reference on the same read are not discarded, and tagged with X_ADJACENT_REF_DIFFm", optional = true) + public boolean tagBasesWithAdjacentRefDiff = false; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java index 05c952c317a..3f23cb5a6e6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java @@ -1,7 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.featuremapping; import htsjdk.samtools.CigarElement; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.text.similarity.LevenshteinDistance; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -21,22 +21,34 @@ public class SNVMapper implements FeatureMapper { - final int identBefore; - final int identAfter; + final int surroundBefore; + final int surroundAfter; final int minCigarElementLength; final LevenshteinDistance levDistance = new LevenshteinDistance(); final Integer smqSize; final Integer smqSizeMean; + final boolean ignoreSurround; + final int spanBefore; + final int spanAfter; + + final FlowFeatureMapperArgumentCollection fmArgs; + public SNVMapper(FlowFeatureMapperArgumentCollection fmArgs) { - identBefore = fmArgs.snvIdenticalBases; - identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore; - minCigarElementLength = identBefore + 1 + identAfter; + surroundBefore = fmArgs.snvIdenticalBases; + surroundAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : surroundBefore; smqSize = fmArgs.surroundingMediaQualitySize; smqSizeMean = fmArgs.surroundingMeanQualitySize; + this.fmArgs = fmArgs; + + // ignore surround + ignoreSurround = fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff; + spanBefore = ignoreSurround ? 0 : surroundBefore; + spanAfter = ignoreSurround ? 
0 : surroundAfter; + minCigarElementLength = spanBefore + 1 + spanAfter; // adjust minimal read length - FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter); + FlowBasedRead.setMinimalReadLength(1 + 1 + spanAfter); } @Override @@ -93,30 +105,44 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons if ( length >= minCigarElementLength && cigarElement.getOperator().consumesReadBases() && cigarElement.getOperator().consumesReferenceBases() ) { - readOfs += identBefore; - refOfs += identBefore; - for ( int ofs = identBefore ; ofs < length - identAfter ; ofs++, readOfs++, refOfs++ ) { + readOfs += spanBefore; + refOfs += spanBefore; + for ( int ofs = spanBefore ; ofs < length - spanAfter ; ofs++, readOfs++, refOfs++ ) { - if ( ref[refOfs] != 'N' && bases[readOfs] != ref[refOfs] ) { + if ( ref[refOfs] != 'N' && (fmArgs.reportAllAlts || (bases[readOfs] != ref[refOfs])) ) { // check that this is really a SNV (must be surrounded by identical ref) boolean surrounded = true; - for ( int i = 0 ; i < identBefore && surrounded ; i++ ) { - if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) { + for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) { + final int bIndex = readOfs-1-i; + final int rIndex = refOfs-1-i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - for ( int i = 0 ; i < identAfter && surrounded ; i++ ) { - if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) { + for (int i = 0; i < surroundAfter && surrounded ; i++ ) { + final int bIndex = readOfs+1+i; + final int rIndex = refOfs+1+i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - if ( !surrounded ) { + if ( (!fmArgs.reportAllAlts && !fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) { continue; } // add this feature FlowFeatureMapper.MappedFeature feature = FlowFeatureMapper.MappedFeature.makeSNV(read, readOfs, ref[refOfs], referenceContext.getStart() + refOfs, readOfs - refOfs); + if ( (fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) + feature.adjacentRefDiff = true; feature.nonIdentMBasesOnRead = nonIdentMBases; feature.refEditDistance = refEditDistance; if ( !read.isReverseStrand() ) @@ -166,8 +192,8 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons features.add(feature); } } - readOfs += identAfter; - refOfs += identAfter; + readOfs += spanAfter; + refOfs += spanAfter; } else { @@ -243,8 +269,8 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen cigarElement.getOperator().consumesReferenceBases() ) { // break out if not enough clearing - if ( (start < referenceContext.getStart() + refOfs + identBefore) || - (start >= referenceContext.getStart() + refOfs + length - identAfter) ) + if ( (start < referenceContext.getStart() + refOfs + spanBefore) || + (start >= referenceContext.getStart() + refOfs + length - spanAfter) ) return FilterStatus.Filtered; int delta = start - (referenceContext.getStart() + refOfs); @@ -255,13 +281,25 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen // check that this is really a SNV (must be surrounded by identical ref) boolean surrounded = true; - for ( int i = 0 ; i < identBefore && surrounded ; i++ ) { - if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) { + for ( int i = 0 ; i < 
surroundBefore && surrounded ; i++ ) { + final int bIndex = readOfs-1-i; + final int rIndex = refOfs-1-i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } - for ( int i = 0 ; i < identAfter && surrounded ; i++ ) { - if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) { + for (int i = 0; i < surroundAfter && surrounded ; i++ ) { + final int bIndex = readOfs+1+i; + final int rIndex = refOfs+1+i; + if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) { + surrounded = false; + continue; + } + if ( bases[bIndex] != ref[rIndex] ) { surrounded = false; } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java index dd6a2e93838..a2642f20150 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java @@ -114,6 +114,7 @@ public final class VariantFiltration extends VariantWalker { public static final String CLUSTER_WINDOW_SIZE_LONG_NAME = "cluster-window-size"; public static final String MASK_EXTENSION_LONG_NAME = "mask-extension"; public static final String MASK_NAME_LONG_NAME = "mask-name"; + public static final String MASK_DESCRIPTION_LONG_NAME = "mask-description"; public static final String FILTER_NOT_IN_MASK_LONG_NAME = "filter-not-in-mask"; public static final String MISSING_VAL_LONG_NAME = "missing-values-evaluate-as-failing"; public static final String INVERT_LONG_NAME = "invert-filter-expression"; @@ -238,6 +239,14 @@ public final class VariantFiltration extends VariantWalker { @Argument(fullName=ALLELE_SPECIFIC_LONG_NAME, optional=true, doc="Set mask at the allele level. This option is not compatible with clustering.") public boolean applyForAllele = false; + /** + * If a mask interval list is provided, then set the description of the filter in the VCF header to this String. + * Note that if spaces are needed, then the entire description should be enclosed in quotes. Also note that if + * --filter-not-in-mask is used, the description should be adapted to reflect the reverse logic. 
+ */ + @Argument(fullName=MASK_DESCRIPTION_LONG_NAME, optional=true, doc="Description to add to the FILTER field in VCF header for the mask filter.") + public String maskDescription; + // JEXL expressions for the filters private List filterExps; private List genotypeFilterExps; @@ -305,7 +314,9 @@ private void initializeVcfWriter() { } if ( mask != null ) { - if (filterRecordsNotInMask) { + if (maskDescription != null) { + hInfo.add(new VCFFilterHeaderLine(maskName, maskDescription)); + } else if (filterRecordsNotInMask) { hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask")); } else { hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask")); @@ -331,6 +342,9 @@ public void onTraversalStart() { if (filterRecordsNotInMask && mask == null) { throw new CommandLineException.BadArgumentValue(FILTER_NOT_IN_MASK_LONG_NAME, "argument not allowed if mask argument is not provided"); } + if (maskDescription != null && mask == null) { + throw new CommandLineException.BadArgumentValue(MASK_DESCRIPTION_LONG_NAME, "argument not allowed if mask argument is not provided"); + } filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions); genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions); howToTreatMissingValues = failMissingValues ? JexlMissingValueTreatment.TREAT_AS_MATCH : JexlMissingValueTreatment.TREAT_AS_MISMATCH; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java index 60f1ef7b205..aede6c33fc2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java @@ -130,7 +130,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, } gb.PL(newLikelihoods); - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc); + GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g, gpc); // restrict SAC to the new allele subset if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) { @@ -139,7 +139,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, } // restrict AD to the new allele subset - if(g.hasAD()) { + if(g.hasAD() && gb.makeWithShallowCopy().hasAD()) { final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD()); gb.AD(newAD); // if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java index bbc9cb888aa..eaa9bad7033 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypeAssignmentMethod.java @@ -56,6 +56,7 @@ public enum GenotypeAssignmentMethod { /** * Use PLs unless they are unavailable, in which case use best match to original + * GQ0 hom-refs will be converted to no-calls */ PREFER_PLS } diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java index 7efb5687ac3..9b6c5030497 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java @@ -4,7 +4,7 @@ import com.google.common.primitives.Ints; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.walkers.annotator.*; @@ -349,6 +349,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis //If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration), // then we need to actually find the GT from PLs makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles); + } else if (g.hasGQ() && g.getGQ() == 0) { + makeGenotypeCall(g, genotypeBuilder, null, targetAlleles); //null likelihoods for reblocked hom-ref that we want to no-call } final Map attrs = new HashMap<>(g.getExtendedAttributes()); attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY); @@ -391,14 +393,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb, final double[] genotypeLikelihoods, final List allelesToUse) { - if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) { - //gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ(); - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL, - genotypeLikelihoods, allelesToUse, null); - } else { - GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, - genotypeLikelihoods, allelesToUse, null); - } + GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, + genotypeLikelihoods, allelesToUse, g, null); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java index a0f804ffc11..bd8cf62e851 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java @@ -239,6 +239,25 @@ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, fi return result; } + /** + * The following functions estimate the error probability for an hmer. specifically two error + * probability values are generated: one for the first base of the hmer and another for the + * rest of its bases. + * + * The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability + * It iterates over the possible valid combinations of errors and sums them up. 
+ * + * @param key - key (hmer length) in flow space + * @param errorProbBands - for each flow (position in the key) three error probabilities are provided: + * [0] - for the hmer being one base shorter + * [1] - for the hmer to be at its length + * [2] - for the hmer to be one base longer + * @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length) + * @param flowOrderLength - the cycle length of of the flow order (usually 4) + * @return an array of two probabilities: + * [0] - probability for the first base of the hmer + * [1] - probability for the rest of the bases of the hmer + */ @VisibleForTesting protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java index cee262d277c..74434c89cbe 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java @@ -5,7 +5,7 @@ import htsjdk.samtools.util.Locatable; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.Tuple; -import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -858,9 +858,9 @@ else if ( hasQ || hasZ ) { cols.put("ReadName", read.getName()); // haplotypes and reference scores - cols.put("PaternalHaplotypeScore", paternal.score); - cols.put("MaternalHaplotypeScore", maternal.score); - cols.put("RefHaplotypeScore", refScore); + cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score)); + cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score)); + cols.put("RefHaplotypeScore", String.format("%.6f", refScore)); // build haplotype keys final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java new file mode 100644 index 00000000000..3cee7b801bb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java @@ -0,0 +1,926 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import com.opencsv.CSVReader; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.math3.util.Precision; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import 
org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerUtils; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentLikelihoodEngine; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LikelihoodEngineArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadLikelihoodCalculationEngine; +import org.broadinstitute.hellbender.utils.BaseUtils; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.clipping.ReadClipper; +import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype; +import org.broadinstitute.hellbender.utils.haplotype.Haplotype; +import org.broadinstitute.hellbender.utils.read.*; +import org.broadinstitute.hellbender.utils.report.GATKReport; +import org.broadinstitute.hellbender.utils.report.GATKReportTable; + +import java.io.*; +import java.text.DecimalFormat; +import java.util.*; +import java.util.zip.GZIPOutputStream; + +/** + * Converts Ultima reads into flow-based annotation, and provides some general statistics regarding + * quality and errors relative to the reference. Ultima data is flow-based, and thus the original computed + * quality refers to each flow, rather than each base. In the Ultima cram/bam, there is a per-base representation + * of the original flow qualities, where the original quality is distributed along each flow (homopolymer). + * In order to reconstitute the original flow information, the tool incorporates the information encoded + * in the Ultima cram/bam, and outputs both the read in flow space, as well as a conversion of the aligned + * reference portion into flow space, and an alignment score. + * + *

+ * <h3>Input</h3>
+ * <ul>
+ *     <li>Ultima aligned SAM/BAM/CRAM</li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ * <ul>
+ *     <li>Per-read ground truth information CSV and a ground truth scoring quality report, in GATK report format</li>
+ * </ul>
+ *
+ * <h3>CSV Output Description</h3>
+ * CSV with the read representation in flow space, containing the following columns:
+ * <ul>
+ *     <li>ReadName</li>
+ *     <li>ReadKey : The signal of the read at each flow, according to the flow order</li>
+ *     <li>ReadIsReversed : Whether the read is reversed in the alignment</li>
+ *     <li>ReadMQ : The mapping quality of the read</li>
+ *     <li>ReadRQ : The read rq value</li>
+ *     <li>GroundTruthKey : The aligned reference section, translated into per-flow signals</li>
+ *     <li>ReadSequence</li>
+ *     <li>Score : A flow-based alignment score. Since the alignment is per flow, the read and reference flow signals will not be aligned when there is a cycle skip, and the score will therefore be inaccurate.</li>
+ *     <li>NormalizedScore : A flow-based normalized alignment score</li>
+ *     <li>ErrorProbability : The error of each flow (corresponds to the signals in ReadKey)</li>
+ *     <li>ReadKeyLength</li>
+ *     <li>GroundTruthKeyLength</li>
+ *     <li>CycleSkipStatus : One of NS (Non Skip), PCS (Possible Cycle Skip), or CS (Cycle Skip)</li>
+ *     <li>Cigar</li>
+ *     <li>LowestQBaseTP</li>
+ * </ul>
+ *
+ * <h3>GATK Report Description</h3>
+ * The quality report (optional) includes the following tables:
+ * <ul>
+ *     <li>qualReport:error rate per qual : The error rate for each quality. Columns:
+ *     <ul>
+ *         <li>qual : The encoded quality</li>
+ *         <li>count : The number of times the quality was observed</li>
+ *         <li>error : The error rate of the flows with this qual</li>
+ *         <li>phred : The error translated into a Phred score</li>
+ *     </ul></li>
+ *     <li>qual_hmerReport:error rate per qual by hmer : The error rate for each quality and hmer combination. Columns:
+ *     <ul>
+ *         <li>qual : The encoded quality</li>
+ *         <li>hmer : The hmer length</li>
+ *         <li>count : The number of times the quality was observed</li>
+ *         <li>error : The error rate of the flows with this qual</li>
+ *     </ul></li>
+ *     <li>qual_hmer_deviation_base_Report:error rate per qual by hmer and deviation : The count of errors for each qual, hmer, deviation and base. Columns:
+ *     <ul>
+ *         <li>qual : The encoded quality</li>
+ *         <li>hmer : The hmer length</li>
+ *         <li>deviation : The deviation (difference in signal relative to the reference)</li>
+ *         <li>base : The base</li>
+ *         <li>count : The number of times the deviation was observed</li>
+ *     </ul></li>
+ *     <li>Phred/qual statistics per flow position report : Various statistics for each flow position in relation to the observed quality value. Columns:
+ *     <ul>
+ *         <li>flow : The flow position</li>
+ *         <li>count : The count of observations</li>
+ *         <li>min : The minimal observed quality</li>
+ *         <li>max : The maximal observed quality</li>
+ *         <li>mean : The mean observed value</li>
+ *         <li>median : The median observed value</li>
+ *         <li>std : The standard deviation</li>
+ *         <li>p1...pN : Percentile columns, according to the --quality-percentiles parameter</li>
+ *     </ul></li>
+ * </ul>
+ *
+ * <h3>Usage examples</h3>
+ * <pre>
+ * gatk GroundTruthScorer \
+ *   -I input.bam \
+ *   -R reference.fasta.gz \
+ *   -L chr20 \
+ *   --output-csv output.csv \
+ *   --report-file report.txt \
+ *   --omit-zeros-from-report \ (optional)
+ *   --features-file dbsnp.chr9.vcf.gz \ (optional)
+ *   --genome-prior genome_prior.csv (optional)
+ * </pre>
    + * + * {@GATK.walkertype ReadWalker} + */ + +@CommandLineProgramProperties( + summary = "Ground Truth Scorer", + oneLineSummary = "Score reads against a reference/ground truth", + programGroup = FlowBasedProgramGroup.class +) +@DocumentedFeature +@ExperimentalFeature +public class GroundTruthScorer extends ReadWalker { + private static final Logger logger = LogManager.getLogger(GroundTruthScorer.class); + + public static final String OUTPUT_CSV_LONG_NAME = "output-csv"; + public static final String REPORT_FILE_LONG_NAME = "report-file"; + public static final String USE_SOFTCLIPPED_BASES_LONG_NAME = "use-softclipped-bases"; + public static final String GENOME_PRIOR_LONG_NAME = "genome-prior"; + public static final String FEATURES_FILE_LONG_NAME = "features-file"; + public static final String NORMALIZED_SCORE_THRESHOLD_LONG_NAME = "normalized-score-threshold"; + public static final String ADD_MEAN_CALL_LONG_NAME = "add-mean-call"; + public static final String GT_NO_OUTPUT_LONG_NAME = "gt-no-output"; + public static final String OMIT_ZEROS_FROM_REPORT = "omit-zeros-from-report"; + public static final String QUALITY_PERCENTILES = "quality-percentiles"; + public static final String EXCLUDE_ZERO_FLOWS = "exclude-zero-flows"; + + private static final int QUAL_VALUE_MAX = 60; + private static final int HMER_VALUE_MAX = 100; //TODO: This should become a parameter + private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1; + + private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1; + private static final double DEFAULT_RATIO_THRESHOLD = 0.003; + + /* + Private accumulator class for counting false/true observations (hence Boolean). + + Observations are counted at a top level and are also optionally classified into a set of bins (the + number of which is fixed upon construction). The bins themselves are also BooleanAccumulator objects, + resulting in a tree like multi-level accumulator. + + + GroundTruthScores builds a four level deep accumulation tree, which can support observations of a + boolean event with 3-deep context (bin1,bin2,bin3). + + Once accumulation is done, the instance is able to generate a suitable GATKReportTable for any given + bin depth (1, 2 or 3). 
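+
+      An illustrative (hypothetical) snippet of the intended nesting; the bin sizes are this tool's own
+      constants and the calls match the methods defined in this class:
+
+          BooleanAccumulator[] byQual = BooleanAccumulator.newReport(
+                  QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1);
+          byQual[30].add(false, 5, deviationToBin(-1), 0);  // a mismatch observed at qual 30, hmer 5, deviation -1, base bin 0
+          double rate = byQual[30].getFalseRate();          // fraction of false observations recorded at qual 30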
+ + */ + private static class BooleanAccumulator { + long falseCount; + long trueCount; + BooleanAccumulator[] bins; + + // add an observation to this accumulator + void add(final boolean b) { + if (b) { + trueCount++; + } else { + falseCount++; + } + } + + // add an observation to this accumulator and to one of the bins in the level under it (1 deep) + void add(final boolean b, final int bin) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b); + } + } + + // add an observation to this accumulator and to two levels of bins under it (2 deep) + void add(final boolean b, final int bin, final int bin2) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b, bin2); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2); + } + } + + // add an observation to this accumulator and to three levels of bins under it (3 deep) + void add(final boolean b, final int bin, final int bin2, final int bin3) { + add(b); + if ( bins != null && bin >= 0 && bin < bins.length ) { + bins[bin].add(b, bin2, bin3); + } else { + logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped"); + bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2, bin3); + } + } + + // get observation count of this accumulator + long getCount() { + return falseCount + trueCount; + } + + // get the false rate/ration for this accumulator + double getFalseRate() { + return (getCount() == 0) ? 0.0 : ((double)falseCount / getCount()); + } + + // create a set of accumulators with 3-deep bin nesting + static BooleanAccumulator[] newReport(final int size, final int binCount, final int binCount2, final int binCount3) { + BooleanAccumulator[] report = new BooleanAccumulator[size]; + for ( byte i = 0 ; i < report.length ; i++ ) { + report[i] = new BooleanAccumulator(); + if ( binCount != 0 ) { + report[i].bins = new BooleanAccumulator[binCount]; + for ( int j = 0 ; j < report[i].bins.length ; j++ ) { + report[i].bins[j] = new BooleanAccumulator(); + if ( binCount2 != 0 ) { + report[i].bins[j].bins = new BooleanAccumulator[binCount2]; + for ( int k = 0 ; k < report[i].bins[j].bins.length ; k++ ) { + report[i].bins[j].bins[k] = new BooleanAccumulator(); + if (binCount3 != 0) { + report[i].bins[j].bins[k].bins = new BooleanAccumulator[binCount3]; + for (int m = 0; m < report[i].bins[j].bins[k].bins.length; m++) { + report[i].bins[j].bins[k].bins[m] = new BooleanAccumulator(); + } + } + } + } + } + } + } + return report; + } + + // create a GATK report from a set of accumulators without nesting into their bins + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name, final double probThreshold, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name + "Report", "error rate per " + name, 4); + table.addColumn(name, "%d"); + table.addColumn("count", "%d"); + table.addColumn("error", "%f"); + table.addColumn("phred", "%d"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + if ( omitZeros && i != 0 && report[i].getCount() == 0 ) + continue; + else { + final double rate = report[i].getFalseRate(); + final double phredRate = (rate == 0.0 && report[i].getCount() != 0 && probThreshold != 0.0) ? 
probThreshold : rate; + + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, report[i].getCount()); + table.set(rowIndex, 2, rate); + table.set(rowIndex, 3, phredRate != 0 ? (int) Math.ceil(-10.0 * Math.log10(phredRate)) : 0); + rowIndex++; + } + } + return table; + } + + // create a GATK report from a set of accumulators while nesting into one level of their bins (1 deep) + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "Report", "error rate per " + name1 + " by " + name2, 4); + table.addColumn(name1, "%d"); + table.addColumn(name2, "%d"); + table.addColumn("count", "%d"); + table.addColumn("error", "%f"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + for ( int j = 0; j < report[i].bins.length ; j++ ) { + if ( omitZeros && (i != 0 || j != 0) && report[i].bins[j].getCount() == 0 ) + continue; + else { + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, j); + table.set(rowIndex, 2, report[i].bins[j].getCount()); + table.set(rowIndex, 3, report[i].bins[j].getFalseRate()); + rowIndex++; + } + } + } + return table; + } + + // create a GATK report from a set of accumulators while nesting into two levels of their bins (2 deep) + static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final String name3, final String name4, final boolean omitZeros) { + final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "_" + name3 + "_" + name4 + "_Report", "error rate per " + name1 + " by " + name2 + " and " + name3, 5); + table.addColumn(name1, "%d"); + table.addColumn(name2, "%d"); + table.addColumn(name3, "%s"); + table.addColumn(name4, "%s"); + table.addColumn("count", "%d"); + int rowIndex = 0; + for (int i = 0; i < report.length; i++) { + for ( int j = 0; j < report[i].bins.length ; j++ ) { + for ( int k = 0; k < report[i].bins[j].bins.length ; k++ ) { + for ( int m = 0; m < report[i].bins[j].bins[k].bins.length ; m++ ) { + if ( omitZeros && (i != 0 || j != 0 || k != 0 || m != 0) && report[i].bins[j].bins[k].bins[m].getCount() == 0 ) + continue; + else { + table.set(rowIndex, 0, i); + table.set(rowIndex, 1, j); + table.set(rowIndex, 2, binToDeviation(k)); + table.set(rowIndex, 3, String.format("%c", binToBase(m))); + table.set(rowIndex, 4, report[i].bins[j].bins[k].bins[m].getCount()); + rowIndex++; + } + } + } + } + } + return table; + } + } + + private static class PercentileReport extends SeriesStats { + + static GATKReportTable newReportTable(final Vector report, String qualityPercentiles) { + String[] qp = qualityPercentiles.split(","); + final GATKReportTable table = new GATKReportTable("PhredBinAccumulator", "PhredBinAccumulator", 8 + qp.length); + table.addColumn("flow", "%d"); + table.addColumn("count", "%d"); + table.addColumn("min", "%f"); + table.addColumn("max", "%f"); + table.addColumn("mean", "%f"); + table.addColumn("median", "%f"); + table.addColumn("std", "%f"); + for ( final String p : qp ) { + table.addColumn("p" + p, "%f"); + } + int rowIndex = 0; + for ( final PercentileReport r : report ) { + int col = 0; + table.set(rowIndex, col++, rowIndex); + table.set(rowIndex, col++, r.getCount()); + table.set(rowIndex, col++, r.getMin()); + table.set(rowIndex, col++, r.getMax()); + table.set(rowIndex, col++, r.getMean()); + table.set(rowIndex, col++, r.getMedian()); + table.set(rowIndex, col++, r.getStd()); + for ( String p : qp ) 
{ + table.set(rowIndex, col++, r.getPercentile(Double.parseDouble(p))); + } + rowIndex++; + } + return table; + } + + void addProb(double p) { + super.add(-10 * Math.log10(p)); + } + } + + @Argument(fullName = OUTPUT_CSV_LONG_NAME, doc="main CSV output file. supported file extensions: .csv, .csv.gz.") + public GATKPath outputCsvPath = null; + + @Argument(fullName = REPORT_FILE_LONG_NAME, doc="report output file.", optional = true) + public GATKPath reportFilePath = null; + + @ArgumentCollection + public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection(); + + @ArgumentCollection + public FlowBasedAlignmentArgumentCollection fbargs = new FlowBasedAlignmentArgumentCollection(); + + @Argument(fullName = USE_SOFTCLIPPED_BASES_LONG_NAME, doc="", optional = true) + public boolean useSoftclippedBases; + + @Argument(fullName = GENOME_PRIOR_LONG_NAME, doc="CSV input file containing genome-prior (one line per base with hmer frequencies).", optional = true) + public GATKPath genomePriorPath; + + @Argument(fullName = FEATURES_FILE_LONG_NAME, doc="A VCF file containing features to be used as a use for filtering reads.", optional = true) + public FeatureDataSource features; + + @Argument(fullName = NORMALIZED_SCORE_THRESHOLD_LONG_NAME, doc="threshold for normalized score, below which reads are ignored", optional = true) + public double normalizedScoreThreshold = NORMALIZED_SCORE_THRESHOLD_DEFAULT; + + @Argument(fullName = ADD_MEAN_CALL_LONG_NAME, doc="Add ReadMeanCall and ReadProbs columns to output", optional = true) + public boolean addMeanCalll; + + @Argument(fullName = GT_NO_OUTPUT_LONG_NAME, doc = "do not generate output records", optional = true) + public boolean noOutput = false; + + @Argument(fullName = OMIT_ZEROS_FROM_REPORT, doc = "omit zero values from output report", optional = true) + public boolean omitZerosFromReport = false; + + @Argument(fullName = QUALITY_PERCENTILES, doc = "list of quality percentiles, defaults to 10,25,50,75,90", optional = true) + public String qualityPercentiles = "10,25,50,75,90"; + + @Argument(fullName = EXCLUDE_ZERO_FLOWS, doc = "should flows with a call of zero be included in the percentile report?", optional = true) + public boolean excludeZeroFlows = false; + + // locals + private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine; + private PrintWriter outputCsv; + private DecimalFormat doubleFormat = new DecimalFormat("0.0#####"); + private GenomePriorDB genomePriorDB; + private BooleanAccumulator[] qualReport; + private String[] csvFieldOrder; + private Vector percentileReports; + + // static/const + static final private String[] CSV_FIELD_ORDER_BASIC = { + "ReadName", "ReadKey", "ReadIsReversed", "ReadMQ", "ReadRQ", "GroundTruthKey", "ReadSequence", + "Score", "NormalizedScore", "ErrorProbability", + "ReadKeyLength", "GroundTruthKeyLength", "CycleSkipStatus", "Cigar", "LowestQBaseTP" + }; + static final private String[] CSV_FIELD_ORDER_MEAN_CALL = { + "ReadProbs", "ReadMeanCall" + }; + + @Override + public void onTraversalStart() { + super.onTraversalStart(); + + // establish csv fields + List order = new LinkedList<>(Arrays.asList(CSV_FIELD_ORDER_BASIC)); + if ( addMeanCalll ) { + order.addAll(Arrays.asList(CSV_FIELD_ORDER_MEAN_CALL)); + } + csvFieldOrder = order.toArray(new String[0]); + + // create likelihood engine + final ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false); + if ( engine instanceof 
FlowBasedAlignmentLikelihoodEngine ) { + likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine; + } else { + throw new GATKException("must use a flow based likelihood calculation engine"); + } + + // open genome prior if provided + if ( genomePriorPath != null ) { + try { + genomePriorDB = new GenomePriorDB(genomePriorPath); + } catch (IOException e) { + throw new GATKException("failed to open genome-prior file: " + genomePriorPath); + } + } + + // open output, write header + try { + if (outputCsvPath.toPath().toString().endsWith(".gz")) { + outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream())); + } else { + outputCsv = new PrintWriter(outputCsvPath.getOutputStream()); + } + } catch (IOException e) { + throw new GATKException("failed to open csv output: " + outputCsvPath, e); + } + emitCsvHeaders(); + + // initialize reports + if ( reportFilePath != null ) { + qualReport = BooleanAccumulator.newReport(QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1); + + // establish max hmer size of flow input + int maxClass = FlowBasedRead.MAX_CLASS; + final SAMFileHeader header = getHeaderForReads(); + if ( header != null ) { + for ( SAMReadGroupRecord rg : header.getReadGroups() ) { + if ( rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG) != null ) { + maxClass = Math.max(maxClass, Integer.parseInt(rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG))); + } + } + } + percentileReports = new Vector<>(); + } + } + + @Override + public void closeTool() { + + // close main output csv + if ( outputCsv != null ) { + outputCsv.close(); + } + + // write reports + if ( reportFilePath != null ) { + final GATKReport report = new GATKReport( + BooleanAccumulator.newReportTable(qualReport, "qual", DEFAULT_RATIO_THRESHOLD, omitZerosFromReport), + BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport), + BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport), + PercentileReport.newReportTable(percentileReports, qualityPercentiles) + ); + try ( final PrintStream ps = new PrintStream(reportFilePath.getOutputStream()) ) { + report.print(ps); + } + } + + super.closeTool(); + } + + @Override + public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { + + // working with unmapped reads is really not practical, as we need the reference to work out ground truth + if ( read.isUnmapped() ) + return; + if ( referenceContext.getWindow() == null ) + return; + + // handle read clipping + final GATKRead clippedRead; + if (isSoftClipped(read) ) { + if (useSoftclippedBases) { + referenceContext.setWindow(read.getStart() - read.getUnclippedStart(), read.getUnclippedEnd() - read.getEnd());; + clippedRead = ReadClipper.revertSoftClippedBases(read); + } else { + clippedRead = ReadClipper.hardClipSoftClippedBases(read); + } + } else { + clippedRead = read; + } + + // filter? 
+ if ( (features != null) && !filter(clippedRead, referenceContext) ) { + return; + } + + // create flow read/haplotype + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), clippedRead); + final FlowBasedRead flowRead = new FlowBasedRead(clippedRead, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true); + final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, rgInfo.flowOrder); + + // is this really needed? + if ( !flowRead.isValid() ) { + return; + } + + // compute score + final int hapKeyLength = flowHaplotype.getKeyLength(); + final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false); + final double normalizedScore = score / flowRead.getKeyLength(); + if ( normalizedScore < normalizedScoreThreshold ) + return; + + // compute error probability + final double[] errorProb = computeErrorProb(flowRead, genomePriorDB); + + // cycle skip + final FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus = FlowBasedReadUtils.getCycleSkipStatus(flowRead, referenceContext); + + // accumulate reports + if ( cycleSkipStatus != FlowBasedReadUtils.CycleSkipStatus.CS && qualReport != null ) { + addToQualReport(flowRead, referenceContext, errorProb); + } + + // emit + try { + emit(flowRead, flowHaplotype, score, normalizedScore, errorProb, read, cycleSkipStatus); + } catch (IOException e) { + throw new GATKException("failed to write output record", e); + } + } + + private boolean filter(final GATKRead read, final ReferenceContext referenceContext) { + + // loop on features contained in the read, check that they are in agreement with read data + Iterator iter = features.query(new SimpleInterval(read)); + byte[] ref = referenceContext.getBases(); + while ( iter.hasNext() ) { + final VariantContext vc = iter.next(); + for ( int refCoord = vc.getStart() ; refCoord <= vc.getEnd() ; refCoord++ ) { + + // get byte from read + Optional readByte = ReadUtils.getReadBaseAtReferenceCoordinate(read, refCoord); + if ( !readByte.isPresent() ) { + return false; + } + + // get byte from reference + byte refByte = ref[refCoord - referenceContext.getWindow().getStart()]; + + // compare + if ( refByte != readByte.get() ) { + return false; + } + } + } + + // if here, no interference detected + return true; + } + + /* + * compute error probability vector for a read + * + * The vector has one element for each flow key, representing the probability complementing the call-probability to 1 + * This is further complicated by the optional presence of a genome-prior database, which provides factoring for + * each hmer length (on a base basis) + */ + private double[] computeErrorProb(final FlowBasedRead flowRead, final GenomePriorDB genomePriorDB) { + + final int[] key = flowRead.getKey(); + final byte[] flowOrder = flowRead.getFlowOrderArray(); + + final double[] probCol = new double[flowRead.getMaxHmer() + 1]; + double[] result = new double[key.length]; + + for ( int i = 0 ; i < key.length ; i++ ) { + + // step 1 - extract column & sum + double sum = 0; + for ( int j = 0 ; j < probCol.length ; j++ ) { + sum += (probCol[j] = flowRead.getProb(i, j)); + } + + // step 2 - normalize column + if ( sum != 0.0 ) { + for (int j = 0; j < probCol.length; j++) { + probCol[j] /= sum; + } + } + + // step 3 - scale according to prior genome? 
+ if ( genomePriorDB != null ) { + + long[] prior = genomePriorDB.getPriorForBase(flowOrder[i]); + sum = 0; + if ( prior != null ) { + for (int j = 0; j < probCol.length; j++) { + sum += (probCol[j] *= prior[j]); + } + } + + // assign normalized result + if ( sum != 0.0 ) { + result[i] = 1 - (probCol[Math.min(probCol.length - 1, Math.min(key[i], flowRead.getMaxHmer()))] / sum); + } else { + // revert to non-prior normalization + result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())]; + } + } else { + + // assign normalized result + result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())]; + } + + // accumulate error probabilities + if ( percentileReports != null ) { + if ( key[i] != 0 || !excludeZeroFlows ) { + while ( percentileReports.size() < (i + 1) ) { + percentileReports.add(new PercentileReport()); + } + percentileReports.get(i).addProb(result[i]); + } + } + } + + return result; + } + + /* + * compute lowest-quality-base-tp-value vector for a read + * + * The vector has one element for each flow key, representing the value of the tp for the hmer base + * which has the lowest quality value + * + * example: + * Bases: TTTTT + * Qs: ABCBA + * Tp: -1 1 0 1 -1 + * Output: -1 + */ + private byte[] computeLowestQBaseTP(final FlowBasedRead flowRead) { + + final int[] key = flowRead.getKey(); + byte[] result = new byte[key.length]; + final byte[] tp = flowRead.getAttributeAsByteArray("tp"); + final byte[] qual = flowRead.getBaseQualitiesNoCopy(); + + // loop om key + int seq_i = 0; + for ( int i = 0 ; i < key.length ; i++ ) { + + // extract hmer length, zero is easy + int hmer = key[i]; + if ( hmer == 0 ) { + result[i] = 0; + continue; + } + + // scan qualities for the lowest value, start with first + // as qualities and tp are symetric, we can scan up to the middle + // when finding the middle (offset) account for even/odd hmers + result[i] = tp[seq_i]; + byte lowestQ = qual[seq_i]; + int hmer_scan_length = (hmer + 1) / 2; + for ( int j = 1 ; j < hmer_scan_length ; j++ ) { + if ( qual[seq_i + j] < lowestQ ) { + result[i] = tp[seq_i + j]; + lowestQ = qual[seq_i + j]; + } + } + + // advance + seq_i += hmer; + } + + return result; + } + + private void emitCsvHeaders() { + + outputCsv.println(StringUtils.join(csvFieldOrder, ",")); + } + + private void emit(final FlowBasedRead flowRead, final FlowBasedHaplotype refHaplotype, double score, final double normalizedScore, final double[] errorProb, + GATKRead read, + FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus) throws IOException { + + // build line columns + final Map cols = new LinkedHashMap<>(); + + // read info + cols.put("ReadName", flowRead.getName()); + cols.put("ReadIsReversed", flowRead.isReverseStrand() ? 
1 : 0); + cols.put("ReadMQ", flowRead.getMappingQuality()); + cols.put("ReadRQ", flowRead.getAttributeAsFloat("rq")); + cols.put("CycleSkipStatus", cycleSkipStatus); + cols.put("Cigar", read.getCigar().toString()); + + + // keys, seq, etc + cols.put("ReadKey", "\"" + StringUtils.join(flowRead.getKey(), ',') + "\""); + cols.put("GroundTruthKey", "\"" + StringUtils.join(refHaplotype.getKey(), ',') + "\""); + cols.put("ReadSequence", flowRead.getBasesString()); + cols.put("ReadKeyLength", flowRead.getKeyLength()); + cols.put("GroundTruthKeyLength", refHaplotype.getKeyLength()); + + // scores + cols.put("Score", score); + cols.put("NormalizedScore", normalizedScore); + cols.put("ErrorProbability", "\"" + StringUtils.join( + Arrays.stream(errorProb).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + + // lowest q base tp + final byte[] lowestQBaseTP = computeLowestQBaseTP(flowRead); + cols.put("LowestQBaseTP", "\"" + StringUtils.join(lowestQBaseTP, ',') + "\""); + + // add read probabilities + if ( addMeanCalll ) { + double[][] readProbsAndMeanCall = collectReadProbs(flowRead); + cols.put("ReadProbs", "\"" + StringUtils.join( + Arrays.stream(readProbsAndMeanCall[0]).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + cols.put("ReadMeanCall", "\"" + StringUtils.join( + Arrays.stream(readProbsAndMeanCall[1]).mapToObj(v -> doubleFormat.format(v)).toArray(), + ',') + "\""); + } + + // construct line + StringBuilder sb = new StringBuilder(); + int colIndex = 0; + for ( String field : csvFieldOrder ) { + if ( colIndex++ > 0 ) { + sb.append(','); + } + if ( !cols.containsKey(field) ) { + throw new GATKException("column missing from csv line: " + field); + } + sb.append(cols.get(field)); + cols.remove(field); + } + if ( cols.size() > 0 ) { + throw new GATKException("invalid columns on csv line: " + cols.keySet()); + } + + // output line + if ( !noOutput ) { + outputCsv.println(sb); + } + } + + private double[][] collectReadProbs(final FlowBasedRead read) { + + final int keyLength = read.getKeyLength(); + final int maxHmer = read.getMaxHmer(); + final double[] probs = new double[keyLength * (maxHmer + 1)]; + final double[] meanCall = new double[keyLength]; + + // retrieve probs + int pos = 0; + for ( int flow = 0 ; flow < keyLength ; flow++ ) { + double mc = 0; + int mc_sum = 0; + for ( int hmer = 0 ; hmer <= maxHmer ; hmer++ ) { + final double p = read.getProb(flow, hmer); + probs[pos++] = p; + mc += (p * hmer); + mc_sum += hmer; + } + meanCall[flow] = mc / mc_sum; + } + + return new double[][] {probs, meanCall}; + } + + static private class GenomePriorDB { + + final private Map db = new LinkedHashMap<>(); + + GenomePriorDB(GATKPath path) throws IOException { + + final CSVReader csvReader = new CSVReader(new InputStreamReader(path.getInputStream())); + String[] line; + while ( (line = csvReader.readNext()) != null ) { + long[] prior = new long[HMER_VALUE_MAX + 1]; + Byte base = line[0].getBytes()[0]; + for ( int i = 0 ; i < prior.length ; i++ ) { + if ( i == 0 ){ + prior[i] = Long.parseLong(line[i+1]); + } else { + prior[i] = Long.parseLong(line[i]); + } + } + db.put(base, prior); + } + } + + long[] getPriorForBase(byte base) { + return db.get(base); + } + } + + private void addToQualReport(FlowBasedRead flowRead, ReferenceContext referenceContext, final double[] errorProb) { + + // convert reference to key space + final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true); + final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, 
flowRead.getFlowOrder()); + + // access keys & flow order + final int[] readKey = flowRead.getKey(); + final int[] hapKey = flowHaplotype.getKey(); + final byte[] flowOrder = flowRead.getFlowOrder().getBytes(); + + // loop on key positions + if ( readKey.length != hapKey.length ) { + return; + } + for ( int flow = 0 ; flow < readKey.length ; flow++ ) { + + // determine quality + final double prob = Precision.round(errorProb[flow], (QUAL_VALUE_MAX / 10) + 1); + final int qual = (int)Math.ceil(-10 * Math.log10(prob)); + + // determine if matches reference + final int deviation = readKey[flow] - hapKey[flow]; + final boolean same = (deviation == 0); + + // accumulate + if ( qual < qualReport.length ) { + int baseBin = baseToBin(flowOrder[flow % flowOrder.length], flowRead.isReverseStrand()); + qualReport[qual].add(same, readKey[flow], deviationToBin(deviation), baseBin); + } + } + } + + static private int baseToBin(byte base, boolean isReverseStrand) { + final byte trueBase = !isReverseStrand ? base : BaseUtils.simpleComplement(base); + return FlowBasedRead.DEFAULT_FLOW_ORDER.indexOf(trueBase); + } + + static private byte binToBase(int bin) { + return (byte)FlowBasedRead.DEFAULT_FLOW_ORDER.charAt(bin); + } + + // 0,-1,1,-2,2... -> 0,1,2,3,4... + static private int deviationToBin(final int deviation) { + if ( deviation >= 0 ) { + return deviation * 2; + } else { + return (-deviation * 2) - 1; + } + } + + // 0,1,2,3,4... -> 0,-1,1,-2,2... + static private String binToDeviation(final int bin) { + if ( bin == 0 ) { + return "0"; + } else if ( (bin % 2) == 0 ) { + return String.format("+%d", bin / 2); + } else { + return String.format("%d", -((bin+1) / 2)); + } + } + + static private boolean isSoftClipped( final GATKRead read ) { + if ( read.isUnmapped() ) + return false; + if ( read.getCigar().getFirstCigarElement() == null ) + return false; + final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator(); + final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator(); + return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) || + (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java new file mode 100644 index 00000000000..2ba594b7058 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java @@ -0,0 +1,160 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicInteger; + +public class SeriesStats { + + private static final Logger logger = LogManager.getLogger(SeriesStats.class); + + // local state + private double last = Double.NaN; + private int count = 0; + private double sum = 0; + private double min = Double.NaN; + private double max = Double.NaN; + private SortedMap bins = new TreeMap<>(); + private int intCount = 0; + private Map auxBins = new LinkedHashMap<>(); + + public void csvWrite(final String path) throws IOException { + logger.info("Writing SeriesStats " + toDigest() + " into " + path); + PrintWriter pw = new 
PrintWriter(path); + pw.println("value,count"); + boolean intKeys = isIntKeys(); + for (Map.Entry entry : bins.entrySet() ) { + if ( intKeys ) { + pw.println(String.format("%d,%d", entry.getKey().intValue(), entry.getValue().get())); + } else { + pw.println(String.format("%f,%d", entry.getKey(), entry.getValue().get())); + } + } + pw.close(); + } + + public void add(int v) { + add((double)v); + intCount++; + } + + public void add(double v) { + + // save in simple values + last = v; + sum += v; + if ( count > 0 ) { + min = Math.min(min, v); + max = Math.max(max, v); + } else { + min = max = v; + } + count++; + + // save in bins + final Double key = v; + if ( bins.containsKey(key) ) { + bins.get(key).incrementAndGet(); + } else { + bins.put(key, new AtomicInteger(1)); + } + } + + public double getLast() { + return last; + } + + public int getCount() { + return count; + } + + public double getMin() { + return (count != 0) ? min : Double.NaN; + } + + public double getMax() { + return (count != 0) ? max : Double.NaN; + } + + public int getUniq() { + return bins.size(); + } + + public double getMean() { + return (count != 0) ? (sum / count) : Double.NaN; + } + + public double getMedian() { + return getPercentile(50); + } + + public double getPercentile(double precentile) { + if ( count == 0 ) { + return Double.NaN; + } else if ( count == 1 ) { + return last; + } else { + + int percentileIndex = (int)(count * precentile / 100); + int index = 0; + for (Map.Entry entry : bins.entrySet() ) { + int binSize = entry.getValue().get(); + if ( percentileIndex >= index && (percentileIndex < (index + binSize)) ) { + return entry.getKey(); + } + index += binSize; + } + + // if here, we need the highest entry + return bins.lastKey(); + } + } + + public double getStd() { + + if (count == 0) { + return Double.NaN; + } + + // calculate mean + double mean = getMean(); + + // calculate sum of sqr(deviations) + double vsum = 0; + for (Map.Entry entry : bins.entrySet()) { + int binSize = entry.getValue().get(); + vsum += (Math.pow(entry.getKey() - mean, 2) * binSize); + } + + // calculate variance and std + double variance = vsum / count; + return Math.sqrt(variance); + } + + public Map getBins() { + return this.bins; + } + + public Map getAuxBins() { + return this.auxBins; + } + + public String toDigest() { + if ( isIntKeys() ) { + return String.format("count=%d, min=%d, max=%d, median=%d, bin.count=%d", getCount(), (int)getMin(), (int)getMax(), (int)getMedian(), getBins().size()); + } else { + return String.format("count=%d, min=%f, max=%f, median=%f, bin.count=%d", getCount(), getMin(), getMax(), getMedian(), getBins().size()); + } + } + + private boolean isIntKeys() { + return (count == intCount); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java index 17728fa6035..3a350228128 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java @@ -29,6 +29,8 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import com.google.common.annotations.VisibleForTesting; + /** * Filtering haplotypes that contribute weak alleles to the genotyping. 
* @@ -278,7 +280,8 @@ private AlleleLikelihoods subsetHaplotypesByAlleles(final A * @param sorThreshold only variants with SOR above threshold will be considered * @return list of alleles that can be removed */ - private List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, + @VisibleForTesting + List identifyBadAlleles(final List collectedRPLs, final List collectedSORs, final List alleles, final double qualThreshold, final double sorThreshold) { @@ -303,9 +306,11 @@ private List identifyBadAlleles(final List collectedRPLs, final //we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD //we will first filter the one with the lowest QUAL. logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount)); - for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) { - if (!result.contains(alleles.get(sorIndices[i]))) { - result.add(alleles.get(sorIndices[i])); + for (int i = sorIndices.length-1 ; (i >= 0) ; i--) { + if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){ + if (!result.contains(alleles.get(sorIndices[i]))) { + result.add(alleles.get(sorIndices[i])); + } } } logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount)); @@ -340,7 +345,7 @@ private AlleleLikelihoods getAlleleLikelihoodMatrix(final Alle .forEach(alleleHaplotypeMap.get(notAllele)::add); final AlleleLikelihoods alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap); - logger.debug(() -> String.format("GALM: %s %d %d", allele.toString(), alleleHaplotypeMap.get(allele).size(), alleleHaplotypeMap.get(notAllele).size())); + logger.debug(() -> String.format("GALM: %s %d %d", allele, alleleHaplotypeMap.get(allele.altAllele()).size(), alleleHaplotypeMap.get(notAllele).size())); return alleleLikelihoods; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java index 2788cc21575..010a9e21398 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java @@ -11,7 +11,10 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import java.io.OutputStreamWriter; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.List; /** * Filtering haplotypes that contribute weak alleles to the genotyping. 
This is a version that determines if allele is weak using @@ -57,14 +60,16 @@ int getAlleleLikelihoodVsInverse(final AlleleLikelihoods allel final GenotypingLikelihoods genotypingLikelihoods = genotypesModel.calculateLikelihoods(alleleList, genotypingData, null, 0, null); - AFCalculationResult af = afCalc.fastCalculateDiploidBasedOnGLs(genotypingLikelihoods, genotypingEngine.getPloidyModel().totalPloidy()); - final double log10Confidence = af.log10ProbOnlyRefAlleleExists(); - final double phredScaledConfidence = (10.0 * log10Confidence) + 0.0; - final int[] asPL = genotypingLikelihoods.sampleLikelihoods(0).getAsPLs(); - - logger.debug(() -> String.format("GAL:: %s: %d %d %d", allele.toString(), asPL[0], asPL[1], asPL[2])); - return Math.min(asPL[1]-asPL[0], asPL[2]-asPL[0]); + List perSamplePLs = new ArrayList<>(); + for (int i = 0; i < genotypingLikelihoods.numberOfSamples(); i++) { + final int[] pls = genotypingLikelihoods.sampleLikelihoods(i).getAsPLs(); + perSamplePLs.add(Math.min(pls[1] - pls[0], pls[2] - pls[0])); + final int finalI = i; + logger.debug(() -> String.format("GAL (%s):: %s: %d %d %d", + genotypingLikelihoods.getSample(finalI), allele.toString(), pls[0], pls[1], pls[2])); + } + return Collections.min(perSamplePLs); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java index 1d7eecc0fe8..fa172768608 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java @@ -133,12 +133,12 @@ public AlleleLikelihoods computeReadLikelihoods(final List< @Override public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) { final double log10ErrorRate = Math.log10(expectedErrorRate); - final double catastrophicErrorRate = fbargs.fillingValue; - final double log10catastrophicErrorRate = Math.log10(fbargs.fillingValue); + final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors. + final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate); return read -> { final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate); - final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * fbargs.fillingValue)) : - Math.ceil(read.getLength() * fbargs.fillingValue); + final double maxCatastrophicErrorsForRead = capLikelihoods ? 
Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) : + Math.ceil(read.getLength() * largeEventErrorRate); return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate; }; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java index 700659297c9..27c5f0eb875 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java @@ -132,12 +132,13 @@ public AlleleLikelihoods computeReadLikelihoods(final List< @Override public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) { final double log10ErrorRate = Math.log10(expectedErrorRate); - final double catastrophicErrorRate = Math.log10(fbargs.fillingValue); + final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors. + final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate); return read -> { final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate)); - final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * fbargs.fillingValue)); - return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*catastrophicErrorRate; + final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate)); + return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate; }; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java index f504d400b13..557e7f0add9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -127,6 +127,21 @@ * argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause * performance challenges including excessive slowness. We are working on resolving these limitations.

 *
+ * <p>For having variable ploidy in different regions, like making haploid calls outside the PAR on chrX or chrY,
+ * see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
+ * should be a .bed or .interval_list whose "name" column contains the desired ploidy to use in that region
+ * when genotyping. Note that variants near a boundary may not have the matching ploidy, since the ploidy used is
+ * determined by the following precedence (see the illustrative sketch after this excerpt):</p>
+ * <ol>
+ *     <li>ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant
+ * (with ties broken by using the largest ploidy); note that a ploidy interval only needs to overlap the active region
+ * to determine the ploidy of your variant, even if the end coordinate written for your variant lies outside the given region;</li>
+ *     <li>ploidy given via the global -ploidy flag;</li>
+ *     <li>ploidy determined by the default global built-in constant for humans (2).</li>
+ * </ol>
+ *
+ * <p>Coordinates for the PAR for GRCh38 can be found here.</p>
+ *
 * <h3>Additional Notes</h3>
 * <ul>
 *     <li>When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).</li>
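The precedence just described can be captured in a small, self-contained sketch. This is illustrative only: PloidyResolver, PloidyRegion and resolvePloidy are hypothetical names, and the real engine (see the HaplotypeCallerEngine changes further below) resolves overlaps with htsjdk's OverlapDetector over SimpleCount records parsed from the --ploidy-regions file.

import java.util.List;

public final class PloidyResolver {
    static final int DEFAULT_HUMAN_PLOIDY = 2;

    // One row of a --ploidy-regions style file: an interval plus the ploidy to use inside it.
    record PloidyRegion(String contig, int start, int end, int ploidy) {
        boolean overlaps(String otherContig, int otherStart, int otherEnd) {
            return contig.equals(otherContig) && start <= otherEnd && otherStart <= end;
        }
    }

    /** Largest custom ploidy overlapping the region wins; else the -ploidy value; else the built-in default. */
    static int resolvePloidy(List<PloidyRegion> ploidyRegions, String contig, int start, int end, Integer globalPloidy) {
        return ploidyRegions.stream()
                .filter(r -> r.overlaps(contig, start, end))
                .mapToInt(PloidyRegion::ploidy)
                .max()                                                 // rule 1: ties broken by the largest ploidy
                .orElse(globalPloidy != null ? globalPloidy            // rule 2: global -ploidy flag
                                             : DEFAULT_HUMAN_PLOIDY);  // rule 3: built-in human default (2)
    }

    public static void main(String[] args) {
        // A single haploid region on chrX; the coordinates are made up purely for illustration.
        List<PloidyRegion> regions = List.of(new PloidyRegion("chrX", 2_800_000, 155_700_000, 1));
        System.out.println(resolvePloidy(regions, "chrX", 3_000_000, 3_001_000, null)); // 1 (custom region)
        System.out.println(resolvePloidy(regions, "chr1", 1_000, 2_000, null));         // 2 (default)
    }
}

Taking the maximum over all overlapping custom regions matches the tie-breaking rule above; an empty overlap set falls through to the command-line or built-in default.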
    • diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index 7ab58d73264..f818659bb5e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -1,7 +1,10 @@ package org.broadinstitute.hellbender.tools.walkers.haplotypecaller; +import htsjdk.tribble.NamedFeature; import htsjdk.variant.variantcontext.VariantContext; +import org.apache.arrow.util.VisibleForTesting; import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.SerializationUtils; import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; @@ -9,6 +12,7 @@ import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection; +import org.broadinstitute.hellbender.engine.FeatureDataSource; import org.broadinstitute.hellbender.engine.FeatureInput; import org.broadinstitute.hellbender.engine.GATKPath; import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection; @@ -23,9 +27,11 @@ /** * Set of arguments for the {@link HaplotypeCallerEngine} */ -public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{ +public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable { private static final long serialVersionUID = 1L; + public static final String PLOIDY_REGIONS_NAME = "ploidy-regions"; + public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands"; public static final String GQ_BAND_SHORT_NAME = "GQB"; public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality"; @@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu return new HaplotypeCallerReadThreadingAssemblerArgumentCollection(); } + @Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true) + public FeatureInput ploidyRegions = null; + /** * You can use this argument to specify that HC should process a single sample out of a multisample BAM file. 
This * is especially useful if your samples are all in the same file but you need to run them individually through HC @@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() { return flowMode != FlowMode.NONE; } + // Copy method used to create new hcArgs with same fields except input ploidy model + public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) { + HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this); + newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy; + return newArgsWithNewPloidy; + } /** * the different flow modes, in terms of their parameters and their values diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 71f6d64ede7..36918f1fb91 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -3,7 +3,10 @@ import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.OverlapDetector; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.NamedFeature; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; @@ -20,12 +23,15 @@ import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.tools.walkers.annotator.*; import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod; +import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingEngine; import org.broadinstitute.hellbender.tools.walkers.genotyper.MinimalGenotypingEngine; import org.broadinstitute.hellbender.tools.walkers.genotyper.OutputMode; import org.broadinstitute.hellbender.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; +import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.hellbender.utils.haplotype.Event; import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles; @@ -83,8 +89,31 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { protected final OutputStreamWriter assemblyDebugOutStream; - // the genotyping engine for the isActive() determination - private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null; + /** + * List of interval file entries including regions and custom ploidy values to apply in that region. + */ + protected final List ploidyRegions = new ArrayList<>(); + + /** + * An OverlapDetector object for checking whether region overlaps given ploidyRegions. 
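+ * <p>Illustrative query only (the exact downstream call may differ):
+ * {@code ploidyRegionsOverlapDetector.getOverlaps(activeRegionSpan).stream().mapToInt(SimpleCount::getCount).max()}
+ * yields the largest custom ploidy overlapping an active region, if any.</p>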
+ */ + protected final OverlapDetector ploidyRegionsOverlapDetector; + + /** + * List of all custom ploidies provided by user + */ + private final LinkedHashSet allCustomPloidies; + + /** + * The default genotyping engine for the isActive() determination + */ + private MinimalGenotypingEngine defaultActiveRegionEvaluationGenotyperEngine = null; + + /** + * Map of user-provided ploidy values to corresponding active region genotyper. Values are checked as valid Integers during + * initialization, but Strings are used as keys to avoid parsing repeatedly during runtime. + */ + private final HashMap ploidyToActiveEvaluationGenotyper = new HashMap<>(); protected ReadThreadingAssembler assemblyEngine = null; @@ -93,7 +122,16 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { // If we are in PDHMM mode we need to hold onto two likelihoods engines for the fallback private ReadLikelihoodCalculationEngine pdhmmLikelihoodCalculationEngine = null; - protected HaplotypeCallerGenotypingEngine genotypingEngine = null; + /** + * The default genotyping engine to use for actual variant calling and genotyping in an active region. + */ + protected HaplotypeCallerGenotypingEngine defaultGenotypingEngine = null; + + /** + * Map of user-provided ploidy values to corresponding genotyper. Values are checked as valid Integers during + * initialization, but Strings are used as keys to avoid parsing repeatedly during runtime. + */ + protected final HashMap ploidyToGenotyperMap = new HashMap<>(); private VariantAnnotatorEngine annotationEngine = null; @@ -163,7 +201,7 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator { /** * Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header, * and a reference file - * @param hcArgs command-line arguments for the HaplotypeCaller + * @param hcArgs command-line arguments for the HaplotypeCaller * @param assemblyRegionArgs * @param createBamOutIndex true to create an index file for the bamout * @param createBamOutMD5 true to create an md5 file for the bamout @@ -196,6 +234,21 @@ public HaplotypeCallerEngine(final HaplotypeCallerArgumentCollection hcArgs, Ass HaplotypeCallerGenotypingDebugger.initialize(hcArgs.genotyperDebugOutStream); } + // Parse the user provided custom ploidy regions into ploidyRegions object containing SimpleCounts + if (this.hcArgs.ploidyRegions != null) { + FeatureDataSource ploidyDataSource = new FeatureDataSource<>(this.hcArgs.ploidyRegions, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, NamedFeature.class); + ploidyDataSource.forEach(r -> this.ploidyRegions.add(new SimpleCount(r))); + } + + for (SimpleCount region : this.ploidyRegions) { + if (!IntervalUtils.intervalIsOnDictionaryContig(region.getInterval(), readsHeader.getSequenceDictionary())) { + throw new UserException("Invalid region provided for --ploidy-regions at " + region.getContig() + ":" + region.getStart() + "-" + region.getEnd() + ". 
Contig name or endpoint doesn't match read sequence dictionary."); + } + } + + this.ploidyRegionsOverlapDetector = OverlapDetector.create(this.ploidyRegions); + this.allCustomPloidies = this.ploidyRegions.stream().map(SimpleCount::getCount).collect(Collectors.toCollection(LinkedHashSet::new)); + trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, readsHeader.getSequenceDictionary()); initialize(createBamOutIndex, createBamOutMD5); } @@ -242,8 +295,16 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5 initializeActiveRegionEvaluationGenotyperEngine(); - genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); - genotypingEngine.setAnnotationEngine(annotationEngine); + defaultGenotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); + defaultGenotypingEngine.setAnnotationEngine(annotationEngine); + + // Create other custom genotyping engines if user provided ploidyRegions + for (final int ploidy : this.allCustomPloidies) { + HaplotypeCallerArgumentCollection newPloidyHcArgs = hcArgs.copyWithNewPloidy(ploidy); + HaplotypeCallerGenotypingEngine newGenotypingEngine = new HaplotypeCallerGenotypingEngine(newPloidyHcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD); + newGenotypingEngine.setAnnotationEngine(annotationEngine); + this.ploidyToGenotyperMap.put(ploidy, newGenotypingEngine); + } boolean isFlowBased = (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBased) || (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBasedHMM); @@ -381,8 +442,18 @@ private void initializeActiveRegionEvaluationGenotyperEngine() { // Seems that at least with some test data we can lose genuine haploid variation if we use ploidy == 1 activeRegionArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, hcArgs.standardArgs.genotypeArgs.samplePloidy); - activeRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList); - activeRegionEvaluationGenotyperEngine.setLogger(logger); + defaultActiveRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList); + defaultActiveRegionEvaluationGenotyperEngine.setLogger(logger); + + // If custom ploidyRegions provided, create corresponding map for active region determination genotyper + for (final int ploidy : this.allCustomPloidies) { + StandardCallerArgumentCollection newPloidyActiveArgs = new StandardCallerArgumentCollection(); + newPloidyActiveArgs.copyStandardCallerArgsFrom(activeRegionArgs); + newPloidyActiveArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, ploidy); + MinimalGenotypingEngine newActiveGenotypingEngine = new MinimalGenotypingEngine(newPloidyActiveArgs, samplesList); + newActiveGenotypingEngine.setLogger(logger); + this.ploidyToActiveEvaluationGenotyper.put(ploidy, newActiveGenotypingEngine); + } } /** @@ -455,7 +526,7 @@ public VCFHeader makeVCFHeader( final SAMSequenceDictionary sequenceDictionary, final Set headerInfo = new HashSet<>(); headerInfo.addAll(defaultToolHeaderLines); - headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); + headerInfo.addAll(defaultGenotypingEngine.getAppropriateVCFInfoHeaders()); // all annotation fields from VariantAnnotatorEngine 
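The per-ploidy engine maps built above hinge on copyWithNewPloidy, which deep-copies the argument collection through Java serialization and then overrides only the sample ploidy, so every other setting stays in sync with the user's command line. A stripped-down sketch of that pattern follows; Args is a hypothetical stand-in for the real argument collection, and only SerializationUtils.clone from commons-lang3 is assumed.

import org.apache.commons.lang3.SerializationUtils;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class PerPloidyEngineSketch {
    // Stand-in for an argument collection; being Serializable is what makes the clone-then-override trick work.
    static class Args implements Serializable {
        private static final long serialVersionUID = 1L;
        int samplePloidy = 2;

        Args copyWithNewPloidy(final int ploidy) {
            final Args copy = SerializationUtils.clone(this); // deep copy via serialization
            copy.samplePloidy = ploidy;
            return copy;
        }
    }

    public static void main(String[] args) {
        final Args defaults = new Args();
        final Set<Integer> customPloidies = Set.of(1, 100); // hypothetical ploidies from --ploidy-regions

        // One pre-built configuration per custom ploidy, keyed by ploidy for constant-time lookup later.
        final Map<Integer, Args> ploidyToArgs = new HashMap<>();
        for (final int ploidy : customPloidies) {
            ploidyToArgs.put(ploidy, defaults.copyWithNewPloidy(ploidy));
        }

        System.out.println(ploidyToArgs.get(100).samplePloidy); // prints 100
    }
}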
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions(emitReferenceConfidence())); // all callers need to add these standard annotation header lines @@ -523,6 +594,57 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence vcfWriter.writeHeader(makeVCFHeader(sequenceDictionary, defaultToolHeaderLines)); } + /** + * Determines the appropriate ploidy to use at given site for different genotyping engines. + * @param region Current region of interest + * @return Ploidy value to use here given user inputs, or -1 if fall back to default + */ + private int getPloidyToUseAtThisSite(Locatable region) { + Set overlaps = this.ploidyRegionsOverlapDetector.getOverlaps(region); + // Return first engine for interval overlapping this region + if (!overlaps.isEmpty()) { + Iterator intervals = overlaps.iterator(); + int max = intervals.next().getCount(); + while (intervals.hasNext()) { + int next = intervals.next().getCount(); + if (next > max) { + max = next; + } + } + return max; + } else { + return -1; // Sentinel value to fall back to default genotyper + } + } + + /** + * Selects appropriate active region genotyping engine for given region + * @param region Current region of interest + * @return Active genotyping engine with correct ploidy setting for given region + */ + private MinimalGenotypingEngine getLocalActiveGenotyper(Locatable region) { + int currentPloidy = getPloidyToUseAtThisSite(region); + if (currentPloidy == -1) { + return this.defaultActiveRegionEvaluationGenotyperEngine; + } else { + return this.ploidyToActiveEvaluationGenotyper.get(currentPloidy); + } + } + + /** + * Selects appropriate genotyping engine for given region. + * @param region Current region of interest, e.g. AssemblyRegion + * @return Genotyping engine with correct ploidy setting for given region + */ + protected HaplotypeCallerGenotypingEngine getLocalGenotypingEngine(Locatable region) { + int currentPloidy = getPloidyToUseAtThisSite(region); + if (currentPloidy == -1) { + return this.defaultGenotypingEngine; + } else { + return this.ploidyToGenotyperMap.get(currentPloidy); + } + } + /** * Given a pileup, returns an ActivityProfileState containing the probability (0.0 to 1.0) that it's an "active" site. 
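getLocalGenotypingEngine and getLocalActiveGenotyper above share one shape: resolve the ploidy at the site, then return either the default engine or the pre-built engine for that ploidy. The sketch below restates that dispatch with String stand-ins for the engines and an OptionalInt in place of the -1 sentinel; it is only an illustration of the lookup, not code from this patch.

import java.util.Map;
import java.util.OptionalInt;
import java.util.Set;

public class LocalEngineSelectionSketch {

    // overlappingPloidies plays the role of the OverlapDetector result; String plays the role of an engine.
    static String selectEngine(final Set<Integer> overlappingPloidies,
                               final Map<Integer, String> ploidyToEngine,
                               final String defaultEngine) {
        final OptionalInt customPloidy = overlappingPloidies.stream().mapToInt(Integer::intValue).max();
        return customPloidy.isPresent() ? ploidyToEngine.get(customPloidy.getAsInt()) : defaultEngine;
    }

    public static void main(String[] args) {
        final Map<Integer, String> engines = Map.of(1, "engine(ploidy=1)", 100, "engine(ploidy=100)");
        System.out.println(selectEngine(Set.of(), engines, "default(ploidy=2)"));       // default(ploidy=2)
        System.out.println(selectEngine(Set.of(1, 100), engines, "default(ploidy=2)")); // engine(ploidy=100)
    }
}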
* @@ -537,6 +659,8 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence */ @Override public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext features) { + MinimalGenotypingEngine localActiveGenotypingEngine = getLocalActiveGenotyper(ref); + if (forceCallingAllelesPresent && features.getValues(hcArgs.alleles, ref).stream().anyMatch(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered())) { return new ActivityProfileState(ref.getInterval(), 1.0); } @@ -546,7 +670,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer return new ActivityProfileState(ref.getInterval(), 0.0); } - final int ploidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy; + final int ploidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy; final List noCall = GATKVariantContextUtils.noCallAlleles(ploidy); // used to noCall all genotypes until the exact model is applied final Map splitContexts; @@ -565,7 +689,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer sample.getValue().getBasePileup().forEach(p -> PileupBasedAlleles.addMismatchPercentageToRead(p.getRead(), readsHeader, ref)); } // The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not. - final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy; + final int activeRegionDetectionHackishSamplePloidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy; final double[] genotypeLikelihoods = ((RefVsAnyResult) referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny( activeRegionDetectionHackishSamplePloidy, sample.getValue().getBasePileup(), ref.getBase(), @@ -579,9 +703,9 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer if (genotypes.size() == 1) { // Faster implementation using exact marginalization instead of iteration - isActiveProb = activeRegionEvaluationGenotyperEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector()); + isActiveProb = localActiveGenotypingEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector()); } else { - final VariantContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make()); + final VariantContext vcOut = localActiveGenotypingEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make()); isActiveProb = vcOut == null ? 
0.0 : QualityUtils.qualToProb(vcOut.getPhredScaledQual()); } @@ -607,6 +731,8 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer * @return List of variants discovered in the region (may be empty) */ public List callRegion(final AssemblyRegion region, final FeatureContext features, final ReferenceContext referenceContext) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region); + if ( hcArgs.justDetermineActiveRegions ) { // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return NO_CALLS; @@ -632,6 +758,7 @@ public List callRegion(final AssemblyRegion region, final Featur final List givenAlleles = features.getValues(hcArgs.alleles).stream() .filter(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered()) .flatMap(vc -> GATKVariantContextUtils.splitVariantContextToEvents(vc, false, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, false).stream()) + .filter(event -> event.getStart() >= region.getSpan().getStart()) // filter out events that do not start within the region. This approach works because events that begin upstream of the calling window cannot be called by this region calling code in the first place. .collect(Collectors.toList()); if( givenAlleles.isEmpty() && region.size() == 0 ) { @@ -799,7 +926,7 @@ public List callRegion(final AssemblyRegion region, final Featur if (hcArgs.filterAlleles) { logger.debug("Filtering alleles"); - AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine); + AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine); //need to update haplotypes to find the alleles EventMap.buildEventMapsForHaplotypes(readLikelihoods.alleles(), assemblyResult.getFullReferenceWithPadding(), @@ -849,7 +976,7 @@ public List callRegion(final AssemblyRegion region, final Featur } } - final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( + final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods( haplotypes, subsettedReadLikelihoodsFinal, perSampleFilteredReadList, @@ -890,7 +1017,7 @@ public List callRegion(final AssemblyRegion region, final Featur result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - subsettedReadLikelihoodsFinal, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, + subsettedReadLikelihoodsFinal, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors)); trimmingResult.nonVariantRightFlankRegion().ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, VCpriors))); @@ -948,6 +1075,7 @@ protected boolean containsCalls(final CalledHaplotypes calledHaplotypes) { * @return a list of variant contexts (can be empty) to emit for this ref region */ protected List referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized, final List VCpriors) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region); if ( emitReferenceConfidence() ) { if ( needsToBeFinalized ) { AssemblyBasedCallerUtils.finalizeRegion(region, @@ -967,7 +1095,7 @@ protected List referenceModelForNoVariation(final
AssemblyRegion final List haplotypes = Collections.singletonList(refHaplotype); return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes, paddedLoc, region, AssemblyBasedCallerUtils.createDummyStratifiedReadMap(refHaplotype, samplesList, readsHeader, region), - genotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors); + localGenotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors); } else { return NO_CALLS; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java index b5b5c2cfa88..e47245c9f91 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java @@ -460,6 +460,7 @@ private void uncollapse(final CallRegionContext context) { } private void filter(final CallRegionContext context) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region); try { // no need for this step? @@ -479,7 +480,7 @@ private void filter(final CallRegionContext context) { context.suspiciousLocations = new HashSet<>(); if (hcArgs.filterAlleles) { logger.debug("Filtering alleles"); - AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine); + AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine); //need to update haplotypes to find the alleles EventMap.buildEventMapsForHaplotypes(context.readLikelihoods.alleles(), context.assemblyResult.getFullReferenceWithPadding(), @@ -520,6 +521,7 @@ private void filter(final CallRegionContext context) { } private void genotype(final CallRegionContext context) { + final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region); // no need for this step? if ( context.regionVariants != null ) { @@ -542,7 +544,7 @@ private void genotype(final CallRegionContext context) { // haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included // in the genotyping, but we lose information if we select down to a few haplotypes. 
[EB] List haplotypes = context.readLikelihoods.alleles(); - final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods( + final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods( haplotypes, context.readLikelihoods, perSampleFilteredReadList, @@ -582,7 +584,7 @@ private void genotype(final CallRegionContext context) { result.addAll(referenceConfidenceModel.calculateRefConfidence(context.assemblyResult.getReferenceHaplotype(), calledHaplotypes.getCalledHaplotypes(), context.assemblyResult.getPaddedReferenceLoc(), regionForGenotyping, - context.readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, + context.readLikelihoods, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, context.VCpriors)); context.nonVariantRightFlankRegion.ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, context.VCpriors))); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java index 095e2dab8ea..dd02b8e8d4b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java @@ -2,7 +2,7 @@ import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; -import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java new file mode 100644 index 00000000000..c166b061784 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java @@ -0,0 +1,203 @@ +package org.broadinstitute.hellbender.tools.walkers.mutect; + + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; +import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality; +import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; +import org.broadinstitute.hellbender.utils.haplotype.Haplotype; +import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine; +import org.broadinstitute.hellbender.utils.pileup.PileupElement; +import org.broadinstitute.hellbender.utils.read.AlignmentUtils; +import org.broadinstitute.hellbender.utils.read.Fragment; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment; +import 
org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * For each sample and for each allele a list feature vectors of supporting reads + * In order to reduce the number of delimiter characters, we flatten featurized reads. For example, suppose allele 1 has featurized reads + * [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is + * 1,2,3,4|5,6,7,8 + */ +public class FeaturizedReadSets { + private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class); + + public static final int DEFAULT_BASE_QUALITY = 25; + + private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA); + private static final int FEATURES_PER_RANGE = 5; + private static final List RANGES = List.of(5, 10, 25, 50); + public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size(); + private static final int VERY_BAD_QUAL_THRESHOLD = 10; + private static final int BAD_QUAL_THRESHOLD = 20; + + private FeaturizedReadSets() { } + + public static List>> getReadVectors(final VariantContext vc, + final Collection samples, + final AlleleLikelihoods likelihoods, + final AlleleLikelihoods haplotypeLikelihoods, + final int refDownsample, + final int altDownsample, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode, readGroupIndices); + } + + // returns Lists (in allele order) of lists of read vectors supporting each allele + public static List>> getReadVectors(final VariantContext vc, + final Collection samples, + final AlleleLikelihoods likelihoods, + final AlleleLikelihoods haplotypeLikelihoods, + final int refDownsample, + final int altDownsample, + final Map altDownsampleMap, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + final Map> readsByAllele = likelihoods.alleles().stream() + .collect(Collectors.toMap(a -> a, a -> new ArrayList<>())); + + samples.stream().flatMap(s -> likelihoods.bestAllelesBreakingTies(s).stream()) + .filter(ba -> ba.isInformative()) + .forEach(ba -> readsByAllele.get(ba.allele).add(ba.evidence)); + + // downsample if necessary + final Allele refAllele = likelihoods.alleles().stream().filter(Allele::isReference).findFirst().get(); + for (final Allele allele : likelihoods.alleles()) { + final int downsample = allele.isReference() ? 
refDownsample : altDownsampleMap.getOrDefault(allele, altDownsample); + if (readsByAllele.get(allele).size() > downsample) { + Collections.shuffle(readsByAllele.get(allele)); + readsByAllele.put(allele, readsByAllele.get(allele).subList(0, downsample)); + } + } + + final Map bestHaplotypes = new HashMap<>(); + samples.stream().flatMap(s -> haplotypeLikelihoods.bestAllelesBreakingTies(s).stream()) + .forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele))); + + return vc.getAlleles().stream() + .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode, readGroupIndices)).collect(Collectors.toList())) + .collect(Collectors.toList()); + } + + + private static List featurize(final GATKRead read, final VariantContext vc, + final Map bestHaplotypes, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode, + final Map readGroupIndices) { + final List result = new ArrayList<>(); + result.add(readGroupIndices.get(read.getReadGroup())); // this is read group metadata rather than part of the tensor + result.add(read.getMappingQuality()); + result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY)); + result.add(read.isFirstOfPair() ? 1 : 0); + result.add(read.isReverseStrand() ? 1 : 0); + + // distances from ends of read + final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0); + result.add(readPositionOfVariantStart); + result.add(read.getLength() - readPositionOfVariantStart); + + + result.add(Math.abs(read.getFragmentLength())); + + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) { + // distances from ends of fragment + final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart()); + final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength()); + result.add(vc.getStart() - fragmentStart); + result.add(fragmentEnd - vc.getEnd()); + } + + // Ultima-specific read tags + if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) { + result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s + result.add((int) (1000*read.getAttributeAsFloat("rq"))); // rq is a float from 0 and 1, so we multiply by 1000 and round + } + + // mismatches versus best haplotype + final Haplotype haplotype = bestHaplotypes.get(read); + + // TODO: fix this + // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina + if (!bestHaplotypes.containsKey(read)) { + logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(), + vc.getContig(), vc.getStart())); + result.add(3); + result.add(2); + + for (int n = 0; n < NUM_RANGED_FEATURES; n++) { + result.add(0); + } + } else { + byte[] haplotypeBases = haplotype.getBases(); + final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP); + final GATKRead copy = read.copy(); + copy.setCigar(readToHaplotypeAlignment.getCigar()); + final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches; + result.add(mismatchCount); + + final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count(); + result.add((int) indelsVsBestHaplotype); + + final int 
readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset(); + final AlignmentStateMachine asm = new AlignmentStateMachine(copy); + asm.stepForwardOnGenome(); + final List rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList(); + + while (!asm.isRightEdge()) { + final PileupElement pe = asm.makePileupElement(); + final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart); + + // pick which array's features we are accounting. If the ranges are 5, 10, 25, 50 and the distance is, say 8, then the '<= 10' range is relevant + final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst(); + if (relevantRange.isPresent()) { + final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt()); + if (pe.isAfterInsertion()) { + featuresToAddTo[0]++; + } + + if (pe.isDeletion()) { + featuresToAddTo[1]++; + } else { + final byte base = pe.getBase(); + final byte qual = pe.getQual(); + final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype]; + + if (base != haplotypeBase) { + featuresToAddTo[2]++; + } + + if (qual < VERY_BAD_QUAL_THRESHOLD) { + featuresToAddTo[3]++; + } else if (qual < BAD_QUAL_THRESHOLD) { + featuresToAddTo[4]++; + } + } + } + asm.stepForwardOnGenome(); + } + + for (final int[] featuresToAdd : rangedFeatures) { + for (final int val : featuresToAdd) { + result.add(val); + } + } + } + // the +1 is for the read group index that comes before the features + Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures() + 1, "Wrong number of features"); + + return result; + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index d25c7a33fc5..a6c619f1b17 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample"; public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset"; public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth"; + public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode"; public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10; public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20; - public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20; + public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1; @Override protected int getDefaultMaxMnpDistance() { return 1; } @@ -205,10 +206,33 @@ public double getDefaultAlleleFrequency() { @Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection") public File mutect3Dataset; + @Advanced + @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. 
Depends on sequencing technology.") + public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA; + + public enum Mutect3DatasetMode { + ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES), + ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES); + + final private int numReadFeatures; + + Mutect3DatasetMode(final int numReadFeatures) { + this.numReadFeatures = numReadFeatures; + } + + public int getNumReadFeatures() { + return numReadFeatures; + } + } + /** * VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field) * contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors. * If this VCF is not given the dataset is generated with an weak-labelling strategy based on allele fractions. + * + * Although the normal use of this input is in generating training data, it can also be used when generating test data + * for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as + * usual, and Permutect uses the labels to analyze the quality of its results. */ @Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true) public FeatureInput mutect3TrainingTruth; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java index f9c3733acca..0120fa9dae3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.mutect; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.ArgumentCollection; @@ -262,7 +263,8 @@ public boolean shouldTrackPileupsForAssemblyRegions() { @Override public void onTraversalStart() { VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), null, Collections.emptyList(), false, false); - m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), referenceArguments.getReferenceSpecifier(), annotatorEngine); + m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), + getBestAvailableSequenceDictionary(), referenceArguments.getReferenceSpecifier(), annotatorEngine); vcfWriter = createVCFWriter(outputVCF); if (m2Engine.emitReferenceConfidence()) { logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions."); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java index 1d2096f3c2b..c831a4169e0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.mutect; import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; import 
htsjdk.samtools.util.Locatable; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; @@ -72,7 +73,7 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseable { private static final List STANDARD_MUTECT_INFO_FIELDS = Arrays.asList(GATKVCFConstants.NORMAL_LOG_10_ODDS_KEY, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, GATKVCFConstants.NORMAL_ARTIFACT_LOG_10_ODDS_KEY, - GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY, + GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY, GATKVCFConstants.GERMLINE_QUAL_KEY, GATKVCFConstants.CONTAMINATION_QUAL_KEY, GATKVCFConstants.SEQUENCING_QUAL_KEY, GATKVCFConstants.POLYMERASE_SLIPPAGE_QUAL_KEY, GATKVCFConstants.READ_ORIENTATION_QUAL_KEY, GATKVCFConstants.STRAND_QUAL_KEY, GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, GATKVCFConstants.N_COUNT_KEY, GATKVCFConstants.AS_UNIQUE_ALT_READ_SET_COUNT_KEY); @@ -94,6 +95,7 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab final private M2ArgumentCollection MTAC; private SAMFileHeader header; + private SAMSequenceDictionary sequenceDictionary; private final int minCallableDepth; public static final String CALLABLE_SITES_NAME = "callable"; @@ -136,9 +138,12 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab * @param referenceSpec reference specifier for the reference * @param annotatorEngine annotator engine built with desired annotations */ - public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs, final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) { + public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs, + final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header, + final SAMSequenceDictionary sequenceDictionary, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) { this.MTAC = Utils.nonNull(MTAC); this.header = Utils.nonNull(header); + this.sequenceDictionary = sequenceDictionary; minCallableDepth = MTAC.callableDepth; referenceReader = ReferenceUtils.createReferenceReader(Utils.nonNull(referenceSpec)); aligner = SmithWatermanAligner.getAligner(MTAC.smithWatermanImplementation); @@ -162,7 +167,7 @@ public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentColl annotationEngine = Utils.nonNull(annotatorEngine); assemblyEngine = MTAC.createReadThreadingAssembler(); likelihoodCalculationEngine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(MTAC.likelihoodArgs, MTAC.fbargs, true, MTAC.likelihoodArgs.likelihoodEngineImplementation); - genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine); + genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine, header, sequenceDictionary); haplotypeBAMWriter = AssemblyBasedCallerUtils.createBamWriter(MTAC, createBamOutIndex, createBamOutMD5, header); trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, header.getSequenceDictionary()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java 
index e6a080cd14f..ad708c8dac8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java @@ -1,21 +1,22 @@ package org.broadinstitute.hellbender.tools.walkers.mutect; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.tuple.Triple; -import org.apache.commons.math3.util.CombinatoricsUtils; import org.apache.commons.math3.util.FastMath; -import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger; import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity; -import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets; import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases; import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine; import org.broadinstitute.hellbender.utils.IndexRange; import org.broadinstitute.hellbender.utils.MathUtils; -import org.broadinstitute.hellbender.utils.NaturalLogUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix; @@ -46,8 +47,9 @@ private enum Label { ARTIFACT, VARIANT, UNLABELED, IGNORE } - // number of features for each vectorized read - private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ; + private final SAMSequenceDictionary sequenceDictionary; + + private final Map readGroupIndices = new HashMap<>(); // number of additional variant features for assembly complexity, reference context private static final int NUM_EXTRA_FEATURES = 9; @@ -71,6 +73,8 @@ private enum Label { private static final int MIN_REF = 5; private final PrintWriter printWriter; + private final PrintWriter contigPrintWriter; + private final PrintWriter readGroupPrintWriter; // number of nonartifact data to keep for each artifact datum private final int nonArtifactPerArtifact; @@ -85,9 +89,15 @@ private enum Label { private final EnumMap> unmatchedArtifactAltCounts; - public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount, final int maxAltCount, final int nonArtifactPerArtifact, final Set normalSamples) { + public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount, + final int maxAltCount, final int nonArtifactPerArtifact, final Set normalSamples, + final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) { try { printWriter = new PrintWriter(new FileWriter(Utils.nonNull(datasetFile))); + final File contigTableFile = datasetFile.toPath().resolveSibling("contigs.table").toFile(); + final File readGroupTableFile = datasetFile.toPath().resolveSibling("read-groups.table").toFile(); + contigPrintWriter = new PrintWriter(new FileWriter(contigTableFile)); + readGroupPrintWriter = new PrintWriter(new FileWriter(readGroupTableFile)); } catch (IOException ex) { throw new 
UserException.BadInput("Could not create dataset file writer"); } @@ -98,6 +108,12 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, this.maxRefCount = maxRefCount; this.maxAltCount = maxAltCount; + this.sequenceDictionary = sequenceDictionary; + final List readGroups = header.getReadGroups(); + for (int n = 0; n < readGroups.size(); n++) { + readGroupIndices.put(readGroups.get(n).getReadGroupId(), n); + } + unmatchedArtifactAltCounts = new EnumMap<>(VariantType.class); for (final VariantType type : VariantType.values()) { unmatchedArtifactAltCounts.put(type, new ArrayBlockingQueue<>(CAPACITY)); @@ -108,10 +124,11 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, public void addData(final ReferenceContext ref, final VariantContext vc, Optional> truthVCs, final AlleleLikelihoods likelihoods, final AlleleLikelihoods logFragmentLikelihoods, - final AlleleLikelihoods logFragmentAlleleLikelihoods) { + final AlleleLikelihoods logFragmentAlleleLikelihoods, + final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) { final String refBases = ReferenceBases.annotate(ref, vc); final String refAllele = vc.getReference().getBaseString(); - final String contig = vc.getContig(); + final int contigIndex = sequenceDictionary.getSequenceIndex(vc.getContig()); final int position = vc.getStart(); final Set tumorSamples = likelihoods.samples().stream().filter(sample -> !normalSamples.contains(sample)).collect(Collectors.toSet()); final int numAlt = vc.getNAlleles() - 1; @@ -132,6 +149,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona final int normalDepth = (int) MathUtils.sum(normalADs); final boolean hasNormal = normalDepth > 0; + final List allRefAlleles = new ArrayList<>(); + allRefAlleles.add(vc.getReference()); + truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference()))); + final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get(); + + // skip(1) excludes the reference allele + final List remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream() + .skip(1).toList(); + + final Set truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream() + .filter(truthVC -> ! truthVC.isFiltered()) + .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream()) + .collect(Collectors.toSet()); + final List