From d40a485917c8e256ca1bd4967a7c3257cefcdc1b Mon Sep 17 00:00:00 2001
From: jamesemery
Date: Wed, 11 Oct 2023 10:01:21 -0400
Subject: [PATCH 01/64] Funcotator Update for Datasource Release V1.8 (#8512)
---
.../cosmic/createCosmicFusionGeneTsv.py | 4 +-
.../cosmic/createSqliteCosmicDb.sh | 6 ++
.../cosmic/getCosmicDataSources.sh | 15 ++-
.../data_sources/downloadHgncDataSource.sh | 2 +-
.../finalizeFuncotatorReleaseDirectory.sh | 28 ++++++
scripts/funcotator/data_sources/getGencode.sh | 2 +-
.../data_sources/getGencodeXHGNC.sh | 4 +-
.../data_sources/getGencodeXRefseq.sh | 4 +-
.../FuncotatorDataSourceDownloader.java | 77 ++++++++++++---
.../dataSources/DataSourceUtils.java | 37 +++++--
.../mafOutput/MafOutputRendererConstants.java | 43 ++++----
...orDataSourceDownloaderIntegrationTest.java | 34 +++++--
.../dataSources/DataSourceUtilsUnitTest.java | 99 +++++++++++++++++--
.../dataSources/DbSnpIntegrationTest.java | 8 +-
.../mafOutput/MafOutputRendererUnitTest.java | 22 +++++
.../utils/test/FuncotatorTestUtils.java | 8 +-
16 files changed, 312 insertions(+), 81 deletions(-)
create mode 100644 scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh
diff --git a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py
index 8964c9dc294..826795e92b5 100755
--- a/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py
+++ b/scripts/funcotator/data_sources/cosmic/createCosmicFusionGeneTsv.py
@@ -90,7 +90,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
tsvReader = GenericTsvReader(inputFilename)
headers = tsvReader.getFieldNames()
print('Found headers (input): ' + str(headers))
- if "Translocation Name" not in headers:
+ if "TRANSLOCATION_NAME" not in headers:
raise NotImplementedError("Could not find Translocation Name column in the input file.")
outputHeaders = ['gene', 'fusion_genes', 'fusion_id']
@@ -99,7 +99,7 @@ def renderFusionGeneDictEntry(geneKey, fusionGeneDict):
fusionGeneDict = OrderedDict()
last_i = 0
for i, line in enumerate(tsvReader):
- fusion_gene_description = line['Translocation Name']
+ fusion_gene_description = line['TRANSLOCATION_NAME']
if len(fusion_gene_description.strip()) == 0:
# blank
diff --git a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh
index 800242de91a..b7a756a7932 100755
--- a/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh
+++ b/scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh
@@ -18,6 +18,7 @@ set -e
COSMIC_FILE=CosmicCompleteTargetedScreensMutantExport.tsv
OUT_DB_FILE="Cosmic.db"
+OUT_TMP_FOLDER="${HOME}/tmp"  # use ${HOME}: a quoted "~" is never tilde-expanded and would name a literal "~" directory
################################################################################
@@ -29,6 +30,10 @@ if [[ $# -gt 1 ]] ; then
OUT_DB_FILE=$2
fi
+if [[ $# -gt 2 ]] ; then
+ OUT_TMP_FOLDER=$3
+fi
+
if [ ! -f ${COSMIC_FILE} ] ; then
echo "ERROR: Given COSMIC file does not exist: ${COSMIC_FILE}" 1>&2
exit 1
@@ -42,6 +47,7 @@ sqlite3 ${OUT_DB_FILE} < cosmic/metadata.txt
echo "User: ${EMAIL}" >> cosmic/metadata.txt
diff --git a/scripts/funcotator/data_sources/downloadHgncDataSource.sh b/scripts/funcotator/data_sources/downloadHgncDataSource.sh
index aa3d120d67d..b3e86a90b81 100755
--- a/scripts/funcotator/data_sources/downloadHgncDataSource.sh
+++ b/scripts/funcotator/data_sources/downloadHgncDataSource.sh
@@ -2,5 +2,5 @@
# Downloads the HGNC data source from the HGNC website.
-curl 'https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv
+curl 'https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_locus_type&col=gd_locus_group&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_name_aliases&col=gd_pub_chrom_map&col=gd_date_mod&col=gd_date_sym_change&col=gd_date_name_change&col=gd_pub_acc_ids&col=gd_enz_ids&col=gd_pub_eg_id&col=gd_pub_ensembl_id&col=gd_pubmed_ids&col=gd_pub_refseq_ids&col=family.id&col=family.name&col=gd_ccds_ids&col=gd_vega_ids&col=md_eg_id&col=md_mim_id&col=md_refseq_id&col=md_prot_id&col=md_ensembl_id&col=md_ucsc_id&status=Approved&status_opt=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbtag=on&submit=submit' > hgnc_download_$(date +%b%d%Y).tsv
diff --git a/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh
new file mode 100644
index 00000000000..5f124ef6a67
--- /dev/null
+++ b/scripts/funcotator/data_sources/finalizeFuncotatorReleaseDirectory.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+#NOTE: This script has been checked in to aid in the release process for future Funcotator datasource bundles.
+# Run from the directory that holds the four unpacked datasource directories.
+
+# Stop at the first failure so hashes are never written for a missing or partial tarball.
+set -euo pipefail
+
+# Bundle name components; set these in the environment to package a later release.
+RELEASE_PREFIX="${RELEASE_PREFIX:-funcotator_dataSources.v1.8}"
+RELEASE_DATE="${RELEASE_DATE:-20230908}"
+# Bundles in the order they are processed: hg38 somatic (s), hg38 germline (g), hg19 somatic, hg19 germline.
+BUNDLES=("${RELEASE_PREFIX}.hg38.${RELEASE_DATE}s" "${RELEASE_PREFIX}.hg38.${RELEASE_DATE}g" "${RELEASE_PREFIX}.hg19.${RELEASE_DATE}s" "${RELEASE_PREFIX}.hg19.${RELEASE_DATE}g")
+
+echo "Making Tarballs of each Datasource Directory..."
+
+for bundle in "${BUNDLES[@]}" ; do
+    tar -zcvf "${bundle}.tar.gz" "${bundle}"
+done
+
+echo "Making the various hashfiles for release"
+
+for bundle in "${BUNDLES[@]}" ; do
+    # NUL-delimit the file list so paths containing whitespace are hashed correctly.
+    find "${bundle}" -type f -print0 | xargs -0 md5sum > "${bundle}.dir.long.md5sum"
+    md5sum "${bundle}.tar.gz" | awk '{print $1}' > "${bundle}.dir.md5sum"
+    sha256sum "${bundle}.tar.gz" > "${bundle}.sha256"
+done
\ No newline at end of file
diff --git a/scripts/funcotator/data_sources/getGencode.sh b/scripts/funcotator/data_sources/getGencode.sh
index 7ab0316787b..aaf159add43 100755
--- a/scripts/funcotator/data_sources/getGencode.sh
+++ b/scripts/funcotator/data_sources/getGencode.sh
@@ -15,7 +15,7 @@ MAXARGS=0
# Latest release numbers for our references.
# Update these numbers when a new Gencode is released.
-LATEST_RELEASE=34
+LATEST_RELEASE=43
DATA_SOURCE_NAME="Gencode"
OUT_DIR_NAME='gencode'
diff --git a/scripts/funcotator/data_sources/getGencodeXHGNC.sh b/scripts/funcotator/data_sources/getGencodeXHGNC.sh
index 656e6c8bc45..cb4ada59b1f 100755
--- a/scripts/funcotator/data_sources/getGencodeXHGNC.sh
+++ b/scripts/funcotator/data_sources/getGencodeXHGNC.sh
@@ -8,10 +8,10 @@ outFileBaseName="gencode_xhgnc"
outExt=".tsv"
hg19db="homo_sapiens_core_75_37"
-hg38db="homo_sapiens_core_90_38"
+hg38db="homo_sapiens_core_110_38"
hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
-hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
+hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}
################################################################################
diff --git a/scripts/funcotator/data_sources/getGencodeXRefseq.sh b/scripts/funcotator/data_sources/getGencodeXRefseq.sh
index 58c06455e4b..809fe22fcbb 100755
--- a/scripts/funcotator/data_sources/getGencodeXRefseq.sh
+++ b/scripts/funcotator/data_sources/getGencodeXRefseq.sh
@@ -8,10 +8,10 @@ outFileBaseName="gencode_xrefseq"
outExt=".tsv"
hg19db="homo_sapiens_core_75_37"
-hg38db="homo_sapiens_core_90_38"
+hg38db="homo_sapiens_core_110_38"
hg19FileName=${outFileBaseName}_v75_37.hg19${outExt}
-hg38FileName=${outFileBaseName}_v90_38.hg38${outExt}
+hg38FileName=${outFileBaseName}_v110_38.hg38${outExt}
################################################################################
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java
index 8468594d506..80b7a5bce40 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloader.java
@@ -32,8 +32,8 @@
*
* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
*
- * - For somatic data sources:
{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}
- * - For germline data sources:
{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}
+ * - For somatic data sources:
{@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}
+ * - For germline data sources:
{@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}
*
*
*
@@ -63,6 +63,8 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
public static final String GERMLINE_ARG_LONG_NAME = "germline";
public static final String OVERWRITE_ARG_LONG_NAME = "overwrite-output-file";
public static final String EXTRACT_AFTER_DOWNLOAD = "extract-after-download";
+ public static final String HG38_ARG_LONG_NAME = "hg38";
+ public static final String HG19_ARG_LONG_NAME = "hg19";
//==================================================================================================================
// Private Static Members:
@@ -73,18 +75,27 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
// Private Static Members:
// Set to always get the latest version of the data sources:
- private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
- DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString();
+ private static final String HG38_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
+ DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(38);
+ private static final String HG19_BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
+ DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString(19);
+
+ private static final String HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
+ private static final String HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
- private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
@VisibleForTesting
- static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
- private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
+ static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
+ static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
+ private static final Path HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
+ private static final Path HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
- public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;;
+ public static final String HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG38_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER; // hg38 somatic bundle URL, without file extension
+ public static final String HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL = HG19_BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER; // hg19 somatic bundle URL, without file extension
- public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
- private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
+ public static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
+ public static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
+ private static final Path HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG38_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
+ private static final Path HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(HG19_SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);
//==================================================================================================================
// Private Members:
@@ -129,6 +140,23 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
optional = true)
protected boolean extractDataSourcesAfterDownload = false;
+ @Argument(
+ shortName = HG38_ARG_LONG_NAME,
+ fullName = HG38_ARG_LONG_NAME,
+ mutex = {HG19_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
+ doc = "If set, will extract data from the HG38 data sources bucket.",
+ optional = true)
+ protected boolean getHg38Datasources = false;
+
+ @Argument(
+ //TODO: Decide whether --hg38 and --hg19 should remain mutually exclusive, or whether a single run may download both bundles.
+ shortName = HG19_ARG_LONG_NAME,
+ fullName = HG19_ARG_LONG_NAME,
+ mutex = {HG38_ARG_LONG_NAME, TESTING_OVERRIDE_PATH_FOR_DATA_SOURCES_SHA256_ARG},
+ doc = "If set, will extract data from the HG19 data sources bucket.",
+ optional = true)
+ protected boolean getHg19Datasources = false;
+
// Testing arguments:
@Hidden
@Advanced
@@ -164,6 +192,11 @@ protected void onStartup() {
throw new UserException("Must select either somatic or germline datasources.");
}
+ // Make sure the user specified at least one reference source to download:
+ if ((!getHg38Datasources) && (!getHg19Datasources) && (testingOverrideDataSourcesPath == null)) {
+ throw new UserException("Must select either HG19 or HG38 datasources.");
+ }
+
// Make sure the testing inputs are correct:
if ( ((testingOverrideDataSourcesPath == null) && (testingOverrideDataSourcesSha256Path != null)) ||
((testingOverrideDataSourcesSha256Path == null) && (testingOverrideDataSourcesPath != null)) ) {
@@ -184,14 +217,26 @@ protected Object doWork() {
// Get the correct data source:
if ( getSomaticDataSources ) {
- dataSourceDescription = "Somatic";
- dataSourcesPath = SOMATIC_GCLOUD_DATASOURCES_PATH;
- dataSourcesSha256Path = SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
+ if (getHg38Datasources) {
+ dataSourceDescription = "HG38_Somatic";
+ dataSourcesPath = HG38_SOMATIC_GCLOUD_DATASOURCES_PATH;
+ dataSourcesSha256Path = HG38_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
+ } else { // Okay because HG38 and HG19 datasources are currently MUTEX and at least one is required
+ dataSourceDescription = "HG19_Somatic";
+ dataSourcesPath = HG19_SOMATIC_GCLOUD_DATASOURCES_PATH;
+ dataSourcesSha256Path = HG19_SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH;
+ }
}
else if ( getGermlineDataSources ) {
- dataSourceDescription = "Germline";
- dataSourcesPath = GERMLINE_GCLOUD_DATASOURCES_PATH;
- dataSourcesSha256Path = GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
+ if (getHg38Datasources) {
+ dataSourceDescription = "HG38_Germline";
+ dataSourcesPath = HG38_GERMLINE_GCLOUD_DATASOURCES_PATH;
+ dataSourcesSha256Path = HG38_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
+ } else {
+ dataSourceDescription = "HG19_Germline";
+ dataSourcesPath = HG19_GERMLINE_GCLOUD_DATASOURCES_PATH;
+ dataSourcesSha256Path = HG19_GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH;
+ }
}
else {
// Test case:
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
index 48dd6560bee..01791264a01 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
@@ -53,7 +53,9 @@ private DataSourceUtils() {}
private static final String MANIFEST_SOURCE_LINE_START = "Source:";
private static final String MANIFEST_ALT_SOURCE_LINE_START = "Alternate Source:";
@VisibleForTesting
- static final Pattern VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)");
+ static final Pattern OLD_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)");
+ static final Pattern NEW_VERSION_PATTERN = Pattern.compile(MANIFEST_VERSION_LINE_START + "\\s+(\\d+)\\.(\\d+)\\.hg(\\d+)\\.(\\d\\d\\d\\d)(\\d\\d)(\\d\\d)(.*)");
+
private static final Pattern SOURCE_PATTERN = Pattern.compile(MANIFEST_SOURCE_LINE_START + "\\s+(ftp.*)");
private static final Pattern ALT_SOURCE_PATTERN = Pattern.compile(MANIFEST_ALT_SOURCE_LINE_START + "\\s+(gs.*)");
@@ -69,9 +71,9 @@ private DataSourceUtils() {}
@VisibleForTesting
static final int MAX_MAJOR_VERSION_NUMBER = 1;
@VisibleForTesting
- static final int MAX_MINOR_VERSION_NUMBER = 7;
+ static final int MAX_MINOR_VERSION_NUMBER = 8;
@VisibleForTesting
- static final LocalDate MAX_DATE = LocalDate.of(2020, Month.MAY, 21);
+ static final LocalDate MAX_DATE = LocalDate.of(2023, Month.SEPTEMBER, 8);
//==================================================================================================================
// Public Static Members:
@@ -80,7 +82,7 @@ private DataSourceUtils() {}
public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = getDataSourceMinVersionString();
/** The maximum supported version of the data sources for funcotator to run. */
- public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString();
+ public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString(38);
public static final String MANIFEST_FILE_NAME = "MANIFEST.txt";
public static final String DATA_SOURCES_FTP_PATH = "ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/";
@@ -88,6 +90,8 @@ private DataSourceUtils() {}
public static final String DATA_SOURCES_NAME_PREFIX = "funcotator_dataSources";
public static final String DS_SOMATIC_NAME_MODIFIER = "s";
public static final String DS_GERMLINE_NAME_MODIFIER = "g";
+ public static final String DS_HG38_NAME_MODIFIER = "hg38";
+ public static final String DS_HG19_NAME_MODIFIER = "hg19";
public static final String DS_EXTENSION = ".tar.gz";
public static final String DS_CHECKSUM_EXTENSION = ".sha256";
@@ -137,8 +141,8 @@ public static String getDataSourceMinVersionString() {
* {@link #MAX_DATE}
* @return A {@link String} representing the Max version information as it would appear in the data sources file name.
*/
- public static String getDataSourceMaxVersionString() {
- return getDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, MAX_DATE);
+ public static String getDataSourceMaxVersionString(final int ref) {
+ return getNewDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, ref, MAX_DATE);
}
@@ -159,6 +163,25 @@ public static String getDataSourceVersionString(final int major, final int minor
date.getDayOfMonth()
);
}
+ /**
+ * Get the string representing the given version information for funcotator as it would be written in the data sources
+ * release files.
+ * @param major {@code int} representing the major version of the data sources to use.
+ * @param minor {@code int} representing the minor version of the data sources to use.
+ * @param ref {@code int} representing the hg reference number of the data sources to use.
+ * @param date {@link LocalDate} representing the date of the data sources to use.
+ * @return A {@link String} representing the given version information as it would appear in the data sources file name.
+ */
+ public static String getNewDataSourceVersionString(final int major, final int minor, final int ref, final LocalDate date) {
+ return String.format("v%d.%d.hg%d.%d%02d%02d",
+ major,
+ minor,
+ ref,
+ date.getYear(),
+ date.getMonthValue(),
+ date.getDayOfMonth()
+ );
+ }
/**
* Initializes the data sources for {@link Funcotator}.
@@ -704,7 +727,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {
while ((line != null) && ((version == null) || (source == null) || (alternateSource == null))) {
if (version == null && line.startsWith(MANIFEST_VERSION_LINE_START)) {
- final Matcher matcher = VERSION_PATTERN.matcher(line);
+ final Matcher matcher = NEW_VERSION_PATTERN.matcher(line);
if ( matcher.matches() ) {
versionMajor = Integer.valueOf(matcher.group(1));
versionMinor = Integer.valueOf(matcher.group(2));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java
index 359ab701153..9a7be220c34 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/mafOutput/MafOutputRendererConstants.java
@@ -178,19 +178,20 @@ public class MafOutputRendererConstants {
static final Map VariantClassificationMapInverse;
// Output Field Name Map Defaults:
- static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "gene", "Gene");
+ //TODO these are hardcoded to gencode versions and should be updated to generalize to any version of gencode (see https://github.com/broadinstitute/gatk/issues/8482)
+ static final List OutputFieldNameMap_Hugo_Symbol = Arrays.asList(FieldName_Hugo_Symbol, "Gencode_19_hugoSymbol", "Gencode_27_hugoSymbol", "Gencode_28_hugoSymbol", "Gencode_34_hugoSymbol", "Gencode_43_hugoSymbol", "gene", "Gene");
static final List OutputFieldNameMap_Entrez_Gene_Id = Arrays.asList(FieldName_Entrez_Gene_Id, "HGNC_Entrez_Gene_ID", "HGNC_Entrez Gene ID", "HGNC_Entrez_Gene_ID(supplied_by_NCBI)", "HGNC_Entrez Gene ID(supplied by NCBI)", "entrez_id", "gene_id");
static final List OutputFieldNameMap_Center = Arrays.asList(FieldName_Center, "center");
- static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "ncbi_build");
- static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom");
- static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "start", "Start", "start_pos", "pos");
- static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "end", "End", "end_pos");
+ static final List OutputFieldNameMap_NCBI_Build = Arrays.asList(FieldName_NCBI_Build, "Gencode_19_ncbiBuild", "Gencode_27_ncbiBuild", "Gencode_28_ncbiBuild", "Gencode_34_ncbiBuild", "Gencode_43_ncbiBuild", "ncbi_build");
+ static final List OutputFieldNameMap_Chromosome = Arrays.asList(FieldName_Chromosome, "Gencode_19_chromosome", "Gencode_27_chromosome", "Gencode_28_chromosome", "Gencode_34_chromosome", "Gencode_43_chromosome", "chr", "contig", "chromosome", "chrom", "Chrom");
+ static final List OutputFieldNameMap_Start_Position = Arrays.asList(FieldName_Start_Position, "Start_position", "Gencode_19_start", "Gencode_27_start", "Gencode_28_start", "Gencode_34_start", "Gencode_43_start", "start", "Start", "start_pos", "pos");
+ static final List OutputFieldNameMap_End_Position = Arrays.asList(FieldName_End_Position, "End_position", "Gencode_19_end", "Gencode_27_end", "Gencode_28_end", "Gencode_34_end", "Gencode_43_end", "end", "End", "end_pos");
static final List OutputFieldNameMap_Strand = Collections.singletonList(FieldName_Strand);
- static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "variant_classification");
- static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "variant_type");
- static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "ref", "ref_allele", "reference_allele");
- static final List OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "ref", "ref_allele", "reference_allele");
- static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1");
+ static final List OutputFieldNameMap_Variant_Classification = Arrays.asList(FieldName_Variant_Classification, "Gencode_19_variantClassification", "Gencode_27_variantClassification", "Gencode_28_variantClassification", "Gencode_34_variantClassification", "Gencode_43_variantClassification", "variant_classification");
+ static final List OutputFieldNameMap_Variant_Type = Arrays.asList(FieldName_Variant_Type, "Gencode_19_variantType", "Gencode_27_variantType", "Gencode_28_variantType", "Gencode_34_variantType", "Gencode_43_variantType", "variant_type");
+ static final List OutputFieldNameMap_Reference_Allele = Arrays.asList(FieldName_Reference_Allele, "Gencode_19_refAllele", "Gencode_27_refAllele", "Gencode_28_refAllele", "Gencode_34_refAllele", "Gencode_43_refAllele", "ref", "ref_allele", "reference_allele");
+ static final List OutputFieldNameMap_Tumor_Seq_Allele1 = Arrays.asList(FieldName_Tumor_Seq_Allele1, "Gencode_19_tumorSeqAllele1", "Gencode_27_tumorSeqAllele1", "Gencode_28_tumorSeqAllele1", "Gencode_34_tumorSeqAllele1", "Gencode_43_tumorSeqAllele1", "ref", "ref_allele", "reference_allele");
+ static final List OutputFieldNameMap_Tumor_Seq_Allele2 = Arrays.asList(FieldName_Tumor_Seq_Allele2, "Gencode_19_tumorSeqAllele2", "Gencode_27_tumorSeqAllele2", "Gencode_28_tumorSeqAllele2", "Gencode_34_tumorSeqAllele2", "Gencode_43_tumorSeqAllele2", "alt", "alt_allele", "alt2", "alt_allele2", "alternate_allele2", "observed_allele2", "alternate_allele", "observed_allele", "alt1", "alt_allele1", "alternate_allele1", "observed_allele1");
static final List OutputFieldNameMap_dbSNP_RS = Arrays.asList(FieldName_dbSNP_RS, "dbsnp_rs", "dbSNP_RSPOS");
static final List OutputFieldNameMap_dbSNP_Val_Status = Arrays.asList(FieldName_dbSNP_Val_Status, MAF_DBSNP_VAL_STATUS_FIELD, "dbsnp_val_status", DBSNP_VLD_NAME);
static final List OutputFieldNameMap_Tumor_Sample_Barcode = Arrays.asList(FieldName_Tumor_Sample_Barcode, "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name");
@@ -212,15 +213,15 @@ public class MafOutputRendererConstants {
static final List OutputFieldNameMap_Sequencer = Arrays.asList(FieldName_Sequencer, "sequencer", "platform");
static final List OutputFieldNameMap_Tumor_Sample_UUID = Arrays.asList(FieldName_Tumor_Sample_UUID, "tumor_uuid", "case_uuid", "tumor_barcode", "tumor_id", "case_barcode", "case_id", "tumor_name", "Tumor_Sample_Barcode");
static final List OutputFieldNameMap_Matched_Norm_Sample_UUID = Arrays.asList(FieldName_Matched_Norm_Sample_UUID, "normal_uuid", "control_uuid", "normal_barcode", "normal_id", "control_barcode", "control_id", "normal_name", "sample_name", "Matched_Norm_Sample_Barcode");
- static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "genome_change");
- static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "annotation_transcript", "transcript_id");
- static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "transcript_strand");
- static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "transcript_exon");
- static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "transcript_position");
- static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "transcript_change");
- static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "codon_change");
- static final List OutputFieldNameMap_Protein_Change = Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "protein_change");
- static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "other_transcripts");
+ static final List OutputFieldNameMap_Genome_Change = Arrays.asList(FieldName_Genome_Change, "Gencode_19_genomeChange", "Gencode_27_genomeChange", "Gencode_28_genomeChange", "Gencode_34_genomeChange", "Gencode_43_genomeChange", "genome_change");
+ static final List OutputFieldNameMap_Annotation_Transcript = Arrays.asList(FieldName_Annotation_Transcript, "Gencode_19_annotationTranscript", "Gencode_27_annotationTranscript", "Gencode_28_annotationTranscript", "Gencode_34_annotationTranscript", "Gencode_43_annotationTranscript", "annotation_transcript", "transcript_id");
+ static final List OutputFieldNameMap_Transcript_Strand = Arrays.asList(FieldName_Transcript_Strand, "Gencode_19_transcriptStrand", "Gencode_27_transcriptStrand", "Gencode_28_transcriptStrand", "Gencode_34_transcriptStrand", "Gencode_43_transcriptStrand", "transcript_strand");
+ static final List OutputFieldNameMap_Transcript_Exon = Arrays.asList(FieldName_Transcript_Exon, "Gencode_19_transcriptExon", "Gencode_27_transcriptExon", "Gencode_28_transcriptExon", "Gencode_34_transcriptExon", "Gencode_43_transcriptExon", "transcript_exon");
+ static final List OutputFieldNameMap_Transcript_Position = Arrays.asList(FieldName_Transcript_Position, "Gencode_19_transcriptPos", "Gencode_27_transcriptPos", "Gencode_28_transcriptPos", "Gencode_34_transcriptPos", "Gencode_43_transcriptPos", "transcript_position");
+ static final List OutputFieldNameMap_cDNA_Change = Arrays.asList(FieldName_cDNA_Change, "Gencode_19_cDnaChange", "Gencode_27_cDnaChange", "Gencode_28_cDnaChange", "Gencode_34_cDnaChange", "Gencode_43_cDnaChange", "transcript_change");
+ static final List OutputFieldNameMap_Codon_Change = Arrays.asList(FieldName_Codon_Change, "Gencode_19_codonChange", "Gencode_27_codonChange", "Gencode_28_codonChange", "Gencode_34_codonChange", "Gencode_43_codonChange", "codon_change");
+ static final List OutputFieldNameMap_Protein_Change = Arrays.asList(FieldName_Protein_Change, "Gencode_19_proteinChange", "Gencode_27_proteinChange", "Gencode_28_proteinChange", "Gencode_34_proteinChange", "Gencode_43_proteinChange", "protein_change");
+ static final List OutputFieldNameMap_Other_Transcripts = Arrays.asList(FieldName_Other_Transcripts, "Gencode_19_otherTranscripts", "Gencode_27_otherTranscripts", "Gencode_28_otherTranscripts", "Gencode_34_otherTranscripts", "Gencode_43_otherTranscripts", "other_transcripts");
static final List OutputFieldNameMap_Refseq_mRNA_Id = Arrays.asList(FieldName_Refseq_mRNA_Id, "Gencode_XRefSeq_mRNA_id", "gencode_xref_refseq_mRNA_id", "ENSEMBL_RefSeq_mRNA_accession", "RefSeq_mRNA_Id", "HGNC_RefSeq IDs");
static final List OutputFieldNameMap_Refseq_prot_Id = Arrays.asList(FieldName_Refseq_prot_Id, "Gencode_XRefSeq_prot_acc", "gencode_xref_refseq_prot_acc", "ENSEMBL_RefSeq_protein_accession", "RefSeq_prot_Id");
static final List OutputFieldNameMap_SwissProt_acc_Id = Arrays.asList(FieldName_SwissProt_acc_Id, "Simple_Uniprot_uniprot_accession", "uniprot_accession", "UniProt_uniprot_accession");
@@ -243,8 +244,8 @@ public class MafOutputRendererConstants {
static final List OutputFieldNameMap_TCGAscape_Amplification_Peaks = Arrays.asList(FieldName_TCGAscape_Amplification_Peaks, "TCGAScape_Amplification_Peaks");
static final List OutputFieldNameMap_TCGAscape_Deletion_Peaks = Arrays.asList(FieldName_TCGAscape_Deletion_Peaks, "TCGAScape_Deletion_Peaks");
static final List OutputFieldNameMap_DrugBank = Arrays.asList(FieldName_DrugBank, "Simple_Uniprot_DrugBank", "UniProt_DrugBank");
- static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "ref_context");
- static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "gc_content");
+ static final List OutputFieldNameMap_ref_context = Arrays.asList(FieldName_ref_context, "Gencode_19_referenceContext", "Gencode_27_referenceContext", "Gencode_28_referenceContext", "Gencode_34_referenceContext", "Gencode_43_referenceContext", "ref_context");
+ static final List OutputFieldNameMap_gc_content = Arrays.asList(FieldName_gc_content, "Gencode_19_gcContent", "Gencode_27_gcContent", "Gencode_28_gcContent", "Gencode_34_gcContent", "Gencode_43_gcContent", "gc_content");
static final List OutputFieldNameMap_CCLE_ONCOMAP_overlapping_mutations = Arrays.asList(FieldName_CCLE_ONCOMAP_overlapping_mutations, "CCLE_By_GP_overlapping_mutations");
static final List OutputFieldNameMap_CCLE_ONCOMAP_total_mutations_in_gene = Arrays.asList(FieldName_CCLE_ONCOMAP_total_mutations_in_gene, "CCLE_By_Gene_total_mutations_in_gene");
static final List OutputFieldNameMap_CGC_Mutation_Type = Arrays.asList(FieldName_CGC_Mutation_Type, "CGC_Mutation Type");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java
index 932cd39f087..e8f477c130c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorDataSourceDownloaderIntegrationTest.java
@@ -30,20 +30,33 @@ public class FuncotatorDataSourceDownloaderIntegrationTest extends CommandLinePr
//==================================================================================================================
// Helper Methods:
- private Path getDataSourceRemotePath(final String dsTypeArg) {
+ private Path getDataSourceRemotePath(final String dsTypeArg, final String refVer) { // Resolves (data source type, reference version) to the matching remote data sources path; unknown values throw GATKException.
switch (dsTypeArg) {
case FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME:
- return FuncotatorDataSourceDownloader.SOMATIC_GCLOUD_DATASOURCES_PATH;
+ switch (refVer) { // refVer is matched against the same argument-name constants the test data provider passes in
+ case FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME:
+ return FuncotatorDataSourceDownloader.HG19_SOMATIC_GCLOUD_DATASOURCES_PATH;
+ case FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME:
+ return FuncotatorDataSourceDownloader.HG38_SOMATIC_GCLOUD_DATASOURCES_PATH;
+ default: throw new GATKException("Data source Reference Version does not exist: " + refVer);
+ }
+
case FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME:
- return FuncotatorDataSourceDownloader.GERMLINE_GCLOUD_DATASOURCES_PATH;
+ switch (refVer) { // same reference-version dispatch as the somatic branch, returning the germline paths
+ case FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME:
+ return FuncotatorDataSourceDownloader.HG19_GERMLINE_GCLOUD_DATASOURCES_PATH;
+ case FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME:
+ return FuncotatorDataSourceDownloader.HG38_GERMLINE_GCLOUD_DATASOURCES_PATH;
+ default: throw new GATKException("Data source Reference Version does not exist: " + refVer);
+ }
default: throw new GATKException("Data source type does not exist: " + dsTypeArg);
}
}
- private void verifyDataSourcesExistThenDeleteThem(final String dsTypeArg, final boolean doExtract) {
+ private void verifyDataSourcesExistThenDeleteThem(final String dsTypeArg, final String refVer, final boolean doExtract) {
// Get the path to our files:
final Path currentPath = IOUtils.getPath(".");
- final Path remoteDataSourcePath = getDataSourceRemotePath(dsTypeArg);
+ final Path remoteDataSourcePath = getDataSourceRemotePath(dsTypeArg, refVer);
final Path expectedDownloadedDataSourcePath = currentPath.resolve(remoteDataSourcePath.getFileName().toString());
// Verify it exists and delete it:
@@ -105,36 +118,42 @@ private Object[][] provideForTestDownload() {
return new Object[][] {
{
FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME,
true,
true,
false
},
{
FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME,
true,
false,
false
},
{
FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME,
true,
true,
false
},
{
FuncotatorDataSourceDownloader.GERMLINE_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME,
true,
false,
false
},
{
FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG38_ARG_LONG_NAME,
true,
false,
true
},
{
FuncotatorDataSourceDownloader.SOMATIC_ARG_LONG_NAME,
+ FuncotatorDataSourceDownloader.HG19_ARG_LONG_NAME,
true,
false,
true
@@ -149,10 +168,11 @@ private Object[][] provideForTestDownload() {
dataProvider = "provideForTestDownload",
groups = {"funcotatorValidation", "bucket"}
)
- void testDownloadRealDataSources(final String dsTypeArg, final boolean doOverwrite, final boolean doValidate, final boolean doExtract) {
+ void testDownloadRealDataSources(final String dsTypeArg, final String refVer, final boolean doOverwrite, final boolean doValidate, final boolean doExtract) {
final ArgumentsBuilder arguments = new ArgumentsBuilder();
arguments.add(dsTypeArg, true);
+ arguments.add(refVer, true);
arguments.add(FuncotatorDataSourceDownloader.OVERWRITE_ARG_LONG_NAME, doOverwrite);
arguments.add(FuncotatorDataSourceDownloader.VALIDATE_INTEGRITY_ARG_LONG_NAME, doValidate);
arguments.add(FuncotatorDataSourceDownloader.EXTRACT_AFTER_DOWNLOAD, doExtract);
@@ -162,7 +182,7 @@ void testDownloadRealDataSources(final String dsTypeArg, final boolean doOverwri
// Now verify we got the data sources and clean up the files
// so we don't have up to 30 gigs of stuff lying around:
- verifyDataSourcesExistThenDeleteThem(dsTypeArg, doExtract);
+ verifyDataSourcesExistThenDeleteThem(dsTypeArg, refVer, doExtract);
}
@Test(dataProvider = "provideForTestDownloadSmallDummyDataSources",
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java
index 442281e3a4d..943043724bf 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtilsUnitTest.java
@@ -318,7 +318,7 @@ private Iterator