diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index cffd62b9e09..43d0180780a 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -20,8 +20,14 @@ If this is a new visual feature please add a before/after screenshot or gif here with e.g. [GifGrabber](http://www.gifgrabber.com/). # Notify reviewers -If your pull request involves changes to an existing file, one or more of the -people that edited before you might be good candidates to review it. Please use -`git blame ` to determine that and notify them here. Otherwise notify -everybody in the core team: -@cBioPortal/core-developers +Read our [Pull request merging +policy](../CONTRIBUTING.md#pull-request-merging-policy). If you are part of the +cBioPortal organization, notify the appropriate team (remove inappropriate): + +@cBioPortal/frontend +@cBioPortal/backend +@cBioPortal/devops + +If you are not part of the cBioPortal organization, look at who worked on the +file before you. Please use `git blame ` to determine that +and notify them here: diff --git a/.gitignore b/.gitignore index 4d19924dfed..ff84bb544be 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,5 @@ nbactions.xml */.project dependency-reduced-pom.xml /core/nbactions-private.xml +.profile.d/ +Procfile diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b7c7a7c5ef0..f6655e21280 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -86,6 +86,67 @@ Once you downloaded the images you do the following for each screenshot: - If the change in the screenshot is **desired**, add the screenshot to the repo, commit it and push it to your PR's branch. +## Pull Request Merging Policy +Pull Requests (PRs) are reviewed by the +[backend](https://github.com/orgs/cBioPortal/teams/backend), +[frontend](https://github.com/orgs/cBioPortal/teams/frontend) and +[devops](https://github.com/orgs/cBioPortal/teams/devops) teams of cBioPortal. 
+For each PR the submitter should propose what team(s) is/are appropriate to +review it. This is the current merging policy: + +- A documentation change needs one **Approve** +- A simple bugfix to rc or hotfix requires one **Approve** +- A new feature requires two **Approve**. One from someone at MSKCC and one from + another institution. + +If these requirements are met, any person with merge rights can merge to rc or +hotfix. + +## Pull Request Reviewers Guide +Here we describe the guidelines for the reviewer. Always follow the checks in +general, then follow the other checks that apply: + +### General +- Double check all the things in the **Checks** section of the Pull Request. + Remind the submitter if any of them are not fulfilled +- Are the test cases spanning a decent number of scenarios? It is the + submitter's as well as the reviewer's responsibility to not let any errors + sneak into the portal. + +Bug fixes: + +- Should the bug that causes the issue be added as a test case? + +New features: + +- If this is a new feature make sure the proposed changes are in line with the + current planning of cBioPortal e.g. is the right API used, is this in line + with current refactoring efforts. + +### Backend +New features: + +- Is the new persistence stack used? + +### Frontend +New features: + +- What APIs are used to get the data? Is the REST API used? +- Should this be a separate library in a separate repo or should it be part of cBioPortal? +- Are dependencies properly listed? Ideally in a package.json +- How is the package included in cBioPortal? + +### Devops +New features: + +- Does the configuration style follow the config guidelines? That is, compile + (Maven) config goes in the appropriate `pom.xml` (root, `scripts/`, `portal/`, `core/`). + Runtime (Spring) goes in `portal.properties`. Default values should be in `GlobalProperties.java`. +- Non-stable configuration should be done through war overlays. +- Is the configuration tested as part of Travis CI? 
It's not a necessity but be + aware that untested configuration will be tough to maintain. +- Is there documentation on the proposed changes? + ## Additional Resources * [cBioPortal Issue Tracker](https://github.com/cBioPortal/cbioportal/issues) diff --git a/business/pom.xml b/business/pom.xml index f4386f9b128..b724c35f2f0 100644 --- a/business/pom.xml +++ b/business/pom.xml @@ -4,7 +4,7 @@ master org.mskcc.cbio - 1.2.4 + 1.2.5 4.0.0 business diff --git a/core/pom.xml b/core/pom.xml index b5fa7daaf68..29a59b66e4a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -4,7 +4,7 @@ master org.mskcc.cbio - 1.2.4 + 1.2.5 4.0.0 core diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/ExtendedMutation.java b/core/src/main/java/org/mskcc/cbio/portal/model/ExtendedMutation.java index a48ea665403..b63a026e43a 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/model/ExtendedMutation.java +++ b/core/src/main/java/org/mskcc/cbio/portal/model/ExtendedMutation.java @@ -255,6 +255,14 @@ public String getOncotatorUniprotName() { return oncotatorUniprotName; } + /** + * Set the UniprotKB name (formerly known as ID) of the protein record. + * + * @param oncotatorUniprotName the UniprotKB name + * @deprecated set the accession with + * {@link #setOncotatorUniprotAccession(String)} instead + */ + @Deprecated public void setOncotatorUniprotName(String oncotatorUniprotName) { this.oncotatorUniprotName = oncotatorUniprotName; } @@ -835,6 +843,14 @@ public String getOncotatorUniprotName() return event.getOncotatorUniprotName(); } + /** + * Set the UniprotKB name (formerly known as ID) of the protein record. 
+ * + * @param oncotatorUniprotName the UniprotKB name + * @deprecated set the accession with + * {@link #setOncotatorUniprotAccession(String)} instead + */ + @Deprecated public void setOncotatorUniprotName(String oncotatorUniprotName) { event.setOncotatorUniprotName(oncotatorUniprotName); diff --git a/core/src/main/java/org/mskcc/cbio/portal/model/GeneticProfile.java b/core/src/main/java/org/mskcc/cbio/portal/model/GeneticProfile.java index c091e60fffd..83eca3ff0cc 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/model/GeneticProfile.java +++ b/core/src/main/java/org/mskcc/cbio/portal/model/GeneticProfile.java @@ -32,6 +32,8 @@ package org.mskcc.cbio.portal.model; +import java.util.Properties; + import org.apache.commons.lang.builder.ToStringBuilder; /** @@ -47,6 +49,7 @@ public class GeneticProfile { private String profileDescription; private String targetLine; private boolean showProfileInAnalysisTab; + private Properties otherMetadataFields; public GeneticProfile() { super(); @@ -54,7 +57,7 @@ public GeneticProfile() { public GeneticProfile(String stableId, int cancerStudyId, GeneticAlterationType geneticAlterationType, String datatype, String profileName, String profileDescription, boolean showProfileInAnalysisTab) { - super(); + this(); this.stableId = stableId; this.cancerStudyId = cancerStudyId; this.geneticAlterationType = geneticAlterationType; @@ -64,6 +67,26 @@ public GeneticProfile(String stableId, int cancerStudyId, GeneticAlterationType this.showProfileInAnalysisTab = showProfileInAnalysisTab; } + + /** + * Constructs a new genetic profile object with the same attributes as the one given as an argument. 
+ * + * @param template the object to copy + */ + public GeneticProfile(GeneticProfile template) { + this( + template.getStableId(), + template.getCancerStudyId(), + template.getGeneticAlterationType(), + template.getDatatype(), + template.getProfileName(), + template.getProfileDescription(), + template.showProfileInAnalysisTab()); + this.setGeneticProfileId(template.geneticProfileId); + this.setTargetLine(template.getTargetLine()); + this.setOtherMetadataFields(template.getAllOtherMetadataFields()); + } + public int getGeneticProfileId() { return geneticProfileId; } @@ -135,10 +158,42 @@ public boolean showProfileInAnalysisTab() { public void setShowProfileInAnalysisTab(boolean showProfileInAnalysisTab) { this.showProfileInAnalysisTab = showProfileInAnalysisTab; } - + + /** + * Stores metadata fields only recognized in particular data file types. + * + * @param fields a properties instance holding the keys and values + */ + public void setOtherMetadataFields(Properties fields) { + this.otherMetadataFields = fields; + } + + /** + * Returns all file-specific metadata fields as a Properties object. + * + * @return a properties instance holding the keys and values or null + */ + public Properties getAllOtherMetadataFields() { + return this.otherMetadataFields; + } + + /** + * Retrieves metadata fields specific to certain data file types. 
+ * + * @param fieldname the name of the field to retrieve + * @return the value of the field or null + */ + public String getOtherMetaDataField(String fieldname) { + if (otherMetadataFields == null) { + return null; + } else { + return otherMetadataFields.getProperty(fieldname); + } + } + @Override public String toString() { return ToStringBuilder.reflectionToString(this); } - + } \ No newline at end of file diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/FetchPfamGraphicsData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/FetchPfamGraphicsData.java index dc4ddd158ec..5df5349322a 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/FetchPfamGraphicsData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/FetchPfamGraphicsData.java @@ -48,7 +48,7 @@ */ public class FetchPfamGraphicsData { - public static final String URL_PREFIX = "http://pfam.sanger.ac.uk/protein/"; + public static final String URL_PREFIX = "http://pfam.xfam.org/protein/"; public static final String URL_SUFFIX = "/graphic"; /** diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCaisesClinicalXML.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCaisesClinicalXML.java index 4a83ca336ce..dc6dbab6a6f 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCaisesClinicalXML.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCaisesClinicalXML.java @@ -35,79 +35,105 @@ import java.io.File; import org.mskcc.cbio.portal.dao.*; import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.SpringUtil; import org.dom4j.*; import org.dom4j.io.SAXReader; import joptsimple.*; import java.util.*; -import java.io.FileInputStream; - /** * * @author jgao */ -public final class ImportCaisesClinicalXML { +public class ImportCaisesClinicalXML extends ConsoleRunnable { + + private File xmlFile; + private int cancerStudyId; - private ImportCaisesClinicalXML() {} + public ImportCaisesClinicalXML(String[] args) { + 
super(args); + } + + /** + * Runs the command as a script and exits with an appropriate exit code. + * + * @param args the arguments given on the command line + */ + public static void main(String[] args) { + ConsoleRunnable runner = new ImportCaisesClinicalXML(args); + runner.runInConsole(); + } - public static void main(String[] args) throws Exception { + public void run() { // args = new String[] {"--data","/Users/jgao/projects/cbio-portal-data/studies/prad/su2c/data_clinical_caises.xml", // "--meta","/Users/jgao/projects/cbio-portal-data/studies/prad/su2c/meta_clinical_caises.txt", // "--loadMode", "bulkLoad"}; - if (args.length < 4) { - System.out.println("command line usage: importCaisesXml --data --meta "); - return; - } - - OptionParser parser = new OptionParser(); - parser.accepts("noprogress"); - OptionSpec data = parser.accepts( "data", - "caises data file" ).withRequiredArg().describedAs( "data_clinical_caises.xml" ).ofType( String.class ); - OptionSpec meta = parser.accepts( "meta", - "meta (description) file" ).withRequiredArg().describedAs( "meta_clinical_caises.txt" ).ofType( String.class ); - parser.acceptsAll(Arrays.asList("dbmsAction", "loadMode")); - OptionSet options = null; - try { - options = parser.parse( args ); - //exitJVM = !options.has(returnFromMain); - } catch (OptionException e) { - e.printStackTrace(); - } - - String dataFile = null; - if( options.has( data ) ){ - dataFile = options.valueOf( data ); - }else{ - throw new Exception( "'data' argument required."); - } + try { + String progName = "ImportCaisesClinicalXML"; + String description = "Import clinical Caises XML files"; - String descriptorFile = null; - if( options.has( meta ) ){ - descriptorFile = options.valueOf( meta ); - }else{ - throw new Exception( "'meta' argument required."); - } - - Properties properties = new TrimmedProperties(); - properties.load(new FileInputStream(descriptorFile)); - SpringUtil.initDataSource(); - - CancerStudy cancerStudy = 
DaoCancerStudy.getCancerStudyByStableId(properties.getProperty("cancer_study_identifier")); - if (cancerStudy == null) { - throw new Exception("Unknown cancer study: " + properties.getProperty("cancer_study_identifier")); + OptionParser parser = new OptionParser(); + parser.accepts("noprogress"); + OptionSpec data = parser.accepts( "data", + "caises data file" ).withRequiredArg().describedAs( "data_clinical_caises.xml" ).ofType( String.class ); + OptionSpec study = parser.accepts("study", + "cancer study identifier").withRequiredArg().describedAs("study").ofType(String.class); + parser.acceptsAll(Arrays.asList("dbmsAction", "loadMode")); + OptionSet options = null; + try { + options = parser.parse( args ); + //exitJVM = !options.has(returnFromMain); + } catch (OptionException e) { + throw new UsageException( + progName, description, parser, + e.getMessage()); + } + + String dataFile = null; + if( options.has( data ) ){ + dataFile = options.valueOf( data ); + } else{ + throw new UsageException( + progName, description, parser, + "'data' argument required"); + } + + String cancerStudyIdentifier = null; + if( options.has( study ) ){ + cancerStudyIdentifier = options.valueOf( study ); + } else{ + throw new UsageException( + progName, description, parser, + "'study' argument required"); + } + + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); + if (cancerStudy == null) { + throw new RuntimeException("Unknown cancer study: " + cancerStudyIdentifier); + } + + this.cancerStudyId = cancerStudy.getInternalId(); + DaoClinicalEvent.deleteByCancerStudyId(cancerStudyId); + this.xmlFile = new File(dataFile); + + importData(); + + System.out.println("Done!"); + } + catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); } - int cancerStudyId = cancerStudy.getInternalId(); - DaoClinicalEvent.deleteByCancerStudyId(cancerStudyId); - - importData(new File(dataFile), cancerStudy.getInternalId()); - - 
System.out.println("Done!"); } - static void importData(File xmlFile, int cancerStudyId) throws Exception { + public void setFile(File xmlFile, CancerStudy cancerStudy) { + this.xmlFile = xmlFile; + this.cancerStudyId = cancerStudy.getInternalId(); + } + + public void importData() throws Exception { MySQLbulkLoader.bulkLoadOn(); // add unknow attriutes -- this diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index b0ff8654bb4..d826fbb6f0e 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -60,6 +60,7 @@ public class ImportExtendedMutationData{ private File mutationFile; private int geneticProfileId; + private boolean swissprotIsAccession; private MutationFilter myMutationFilter; private int entriesSkipped = 0; private int samplesSkipped = 0; @@ -73,11 +74,23 @@ public class ImportExtendedMutationData{ public ImportExtendedMutationData(File mutationFile, int geneticProfileId) { this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; + this.swissprotIsAccession = false; // create default MutationFilter myMutationFilter = new MutationFilter( ); } + /** + * Turns parsing the SWISSPROT column as an accession on or off again. + * + * If off, the column will be parsed as the name (formerly ID). 
+ * + * @param swissprotIsAccession whether to parse the column as an accession + */ + public void setSwissprotIsAccession(boolean swissprotIsAccession) { + this.swissprotIsAccession = swissprotIsAccession; + } + public void importData() throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); @@ -212,7 +225,6 @@ public void importData() throws IOException, DaoException { aaChange, codonChange, refseqMrnaId, - uniprotName, uniprotAccession; int proteinPosStart, @@ -244,8 +256,12 @@ public void importData() throws IOException, DaoException { aaChange = record.getAminoAcidChange(); codonChange = record.getCodons(); refseqMrnaId = record.getRefSeq(); - uniprotName = record.getSwissprot(); - uniprotAccession = DaoUniProtIdMapping.mapFromUniprotIdToAccession(record.getSwissprot()); + if (this.swissprotIsAccession) { + uniprotAccession = record.getSwissprot(); + } else { + String uniprotName = record.getSwissprot(); + uniprotAccession = DaoUniProtIdMapping.mapFromUniprotIdToAccession(uniprotName); + } proteinPosStart = ExtendedMutationUtil.getProteinPosStart( record.getProteinPosition(), proteinChange); proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd( @@ -369,7 +385,6 @@ public void importData() throws IOException, DaoException { // TODO rename the oncotator column names (remove "oncotator") mutation.setOncotatorCodonChange(codonChange); mutation.setOncotatorRefseqMrnaId(refseqMrnaId); - mutation.setOncotatorUniprotName(uniprotName); mutation.setOncotatorUniprotAccession(uniprotAccession); mutation.setOncotatorProteinPosStart(proteinPosStart); mutation.setOncotatorProteinPosEnd(proteinPosEnd); diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index fae85ea922d..9a2e8357855 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -78,6 +78,16 
@@ public void run() { ImportExtendedMutationData importer = new ImportExtendedMutationData( dataFile, geneticProfile.getGeneticProfileId()); + String swissprotIdType = geneticProfile.getOtherMetaDataField("swissprot_identifier"); + if (swissprotIdType != null && swissprotIdType.equals("accession")) { + importer.setSwissprotIsAccession(true); + } else if ( + swissprotIdType != null && + !swissprotIdType.equals("name")) { /* compare by value: != on String only compares references */ + throw new RuntimeException( + "Unrecognized swissprot_identifier " + + "specification, must be 'name' or 'accession'."); + } importer.importData(); } else if (geneticProfile.getGeneticAlterationType().equals(GeneticAlterationType.FUSION)) { diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java index 79d05fa3991..0ff5649d3b8 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java @@ -260,7 +260,7 @@ private static void computeZScoreXP(String file){ // Double.NaN indicates an invalid expression value if(zscores[k] != Double.NaN){ // limit precision - outputLine.add( String.format( "%.4f", zscores[k] ) ); + outputLine.add( String.format( Locale.US, "%.4f", zscores[k] ) ); }else{ outputLine.add( NOT_AVAILABLE ); } diff --git a/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index ad5259be8d1..8fed3c6ab2b 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/core/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -84,7 +84,12 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D // For mutation data only we can have multiple files with the same genetic_profile. 
// There is a constraint in the mutation database table to prevent duplicated data // If this constraint is hit (mistakenly importing the same maf twice) MySqlBulkLoader will throw an exception - return existingGeneticProfile; + + // make an object combining the pre-existing profile with the file-specific properties of the current file + GeneticProfile gp = new GeneticProfile(existingGeneticProfile); + gp.setTargetLine(geneticProfile.getTargetLine()); + gp.setOtherMetadataFields(geneticProfile.getAllOtherMetadataFields()); + return gp; } } // add new profile @@ -178,6 +183,7 @@ public static GeneticProfile loadGeneticProfileFromMeta(File file) throws IOExce geneticProfile.setDatatype(datatype); geneticProfile.setShowProfileInAnalysisTab(showProfileInAnalysisTab); geneticProfile.setTargetLine(properties.getProperty("target_line")); + geneticProfile.setOtherMetadataFields(properties); return geneticProfile; } } diff --git a/core/src/main/scripts/importer/cbioportal_common.py b/core/src/main/scripts/importer/cbioportal_common.py index 1fa8d88ea76..e145ee31c02 100644 --- a/core/src/main/scripts/importer/cbioportal_common.py +++ b/core/src/main/scripts/importer/cbioportal_common.py @@ -124,7 +124,8 @@ class MetaFileTypes(object): 'profile_name': True, 'profile_description': True, 'data_filename': True, - 'normal_samples_list': False + 'normal_samples_list': False, + 'swissprot_identifier': False }, MetaFileTypes.EXPRESSION: { 'cancer_study_identifier': True, @@ -640,6 +641,16 @@ def parse_metadata_file(filename, extra={'filename_': filename, 'cause': metaDictionary['reference_genome_id']}) meta_file_type = None + if meta_file_type == MetaFileTypes.MUTATION: + if ('swissprot_identifier' in metaDictionary and + metaDictionary['swissprot_identifier'] not in ('name', + 'accession')): + logger.error( + "Invalid swissprot_identifier specification, must be either " + "'name' or 'accession'", + extra={'filename_': filename, + 'cause': metaDictionary['swissprot_identifier']}) + 
meta_file_type = None logger.info('Validation of meta file complete', extra={'filename_': filename}) return metaDictionary, meta_file_type diff --git a/core/src/main/scripts/importer/validateData.py b/core/src/main/scripts/importer/validateData.py index f10aea6d020..205167a7ae8 100755 --- a/core/src/main/scripts/importer/validateData.py +++ b/core/src/main/scripts/importer/validateData.py @@ -1,9 +1,32 @@ #!/usr/bin/env python2.7 -# ------------------------------------------------------------------------------ -# Data validation script - validates files before import into portal. -# ------------------------------------------------------------------------------ - +# +# Copyright (c) 2016 The Hyve B.V. +# This code is licensed under the GNU Affero General Public License (AGPL), +# version 3, or (at your option) any later version. +# + +# +# This file is part of cBioPortal. +# +# cBioPortal is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# + +"""Data validation script - validate files before import into portal. + +Run with the command line option --help for usage information. 
+""" # imports import sys @@ -291,7 +314,12 @@ def _validate_file(self): self.logger.debug('Starting validation of file') - with open(self.filename, 'rU') as data_file: + try: + opened_file = open(self.filename, 'rU') + except IOError: + self.logger.error('File could not be opened') + return + with opened_file as data_file: # parse any block of start-of-file comment lines and the tsv header top_comments = [] @@ -967,6 +995,12 @@ def checkHeader(self, cols): 'that the Uniprot canonical isoform is used when drawing Pfam ' 'domains in the mutations view', extra={'line_number': self.line_number}) + elif not 'swissprot_identifier' in self.meta_dict: + self.logger.warning( + "A SWISSPROT column was found in a file without an " + "associated 'swissprot_identifier' metadatum, assuming " + "'swissprot_identifier: name'.", + extra={'column_number': self.cols.index('SWISSPROT') + 1}) # one of these columns should be present: if not ('HGVSp_Short' in self.cols or 'Amino_Acid_Change' in self.cols): @@ -1189,18 +1223,38 @@ def checkNotBlank(self, value): if value is None or value.strip() == '': return False return True - + def checkSwissProt(self, value): - """Test whether SWISSPROT string is blank and give warning if blank.""" + """Validate the name or accession in the SWISSPROT column.""" if value is None or value.strip() == '': self.logger.warning( 'Missing value in SWISSPROT column; this column is ' 'recommended to make sure that the Uniprot canonical isoform ' 'is used when drawing Pfam domains in the mutations view', extra={'line_number': self.line_number, - 'cause':'blank value in SWISSPROT column'}) - - # it is just a warning, so we can return True always: + 'cause':''}) + # no value to test, return without error + return True + if self.meta_dict.get('swissprot_identifier', 'name') == 'accession': + if not re.match( + # regex from http://www.uniprot.org/help/accession_numbers + r'^([OPQ][0-9][A-Z0-9]{3}[0-9]|' + r'[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$', + value): + 
# return this as an error + self.extra = 'SWISSPROT value is not a UniprotKB accession' + self.extra_exists = True + return False + else: + # format described on http://www.uniprot.org/help/entry_name + if not re.match( + r'^[A-Z0-9]{2,5}_[A-Z0-9]{2,5}$', + value): + # return this as an error + self.extra = 'SWISSPROT value is not a UniprotKB/Swiss-Prot name' + self.extra_exists = True + return False + # if no reasons to return with a message were found, return valid return True @@ -2344,28 +2398,28 @@ def process_metadata_files(directory, portal_instance, logger): def processCaseListDirectory(caseListDir, cancerStudyId, logger, - stableid_files=None): - """Validate the case lists in a directory and log findings. + prev_stableid_files=None): + """Validate the case lists in a directory and return an id/file mapping. Args: caseListDir (str): path to the case list directory. cancerStudyId (str): cancer_study_identifier expected in the files. logger: logging.Logger instance through which to send output. - stableid_files (Optional): dict mapping the stable ids of any case + prev_stableid_files (Optional): dict mapping the stable IDs of any case lists already defined to the files they were defined in. 
+ + Returns: + Dict[str, str]: dict mapping the stable IDs of all valid defined case + lists to the files they were defined in, including the + prev_stableid_files argument """ logger.debug('Validating case lists') - # start with an empty dictionary if none was given - # (using mutable objects as default arguments directly is confusing) - if stableid_files == None: - stableid_files = {} - - # TODO: include ids based on the defined profiles here - required_id_suffixes = ('all', ) - required_stable_ids = [cancerStudyId + '_' + suffix for suffix in - required_id_suffixes] + stableid_files = {} + # include the previously defined stable IDs + if prev_stableid_files is not None: + stableid_files.update(prev_stableid_files) case_list_fns = [os.path.join(caseListDir, fn) for fn in os.listdir(caseListDir) if @@ -2411,21 +2465,30 @@ def processCaseListDirectory(caseListDir, cancerStudyId, logger, 'White space in sample id is not supported', extra={'filename_': case, 'cause': value}) - - for required_id in required_stable_ids: - if required_id not in stableid_files: - if required_id == cancerStudyId + '_all': - suggestion = ("Consider adding 'add_global_case_list: true' " - "to the study metadata file") - else: - suggestion = "Please define it in the 'case_lists' folder" - logger.error("No case list found for stable_id '%s'. %s", - required_id, - suggestion) + logger.info('Validation of case list folder complete') + + return stableid_files - logger.info('Validation of case lists complete') +def validate_defined_caselists(cancer_study_id, case_list_ids, file_types, logger): + + """Validate the set of case lists defined in a study. 
+ + Args: + cancer_study_id (str): the study ID to be expected in the stable IDs + case_list_ids (Iterable[str]): stable ids of defined case lists + file_types (Dict[str, str]): listing of the MetaFileTypes with high- + dimensional data in this study--these may imply certain case lists + logger: logging.Logger instance to log output to + """ + + if cancer_study_id + '_all' not in case_list_ids: + logger.error( + "No case list found for stable_id '%s', consider adding " + "'add_global_case_list: true' to the study metadata file", + cancer_study_id + '_all') + # TODO: check for required suffixes based on the defined profiles def request_from_portal_api(server_url, api_name, logger): """Send a request to the portal API and return the decoded JSON object.""" @@ -2736,13 +2799,20 @@ def validate_study(study_dir, portal_instance, logger): continue validator.validate() - # finally validate case lists if present + # finally validate the case list directory if present case_list_dirname = os.path.join(study_dir, 'case_lists') if not os.path.isdir(case_list_dirname): logger.info("No directory named 'case_lists' found, so assuming no custom case lists.") else: - processCaseListDirectory(case_list_dirname, study_id, logger, - stableid_files=defined_case_list_fns) + # add case lists IDs defined in the directory to any previous ones + defined_case_list_fns = processCaseListDirectory( + case_list_dirname, study_id, logger, + prev_stableid_files=defined_case_list_fns) + + validate_defined_caselists( + study_id, defined_case_list_fns.keys(), + file_types=validators_by_meta_type.keys(), + logger=logger) logger.info('Validation complete') diff --git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportCaisesClinicalXML.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportCaisesClinicalXML.java index 93692e19871..a5c12ce538f 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportCaisesClinicalXML.java +++ 
b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportCaisesClinicalXML.java @@ -61,33 +61,27 @@ public class TestImportCaisesClinicalXML { @Before - public void setUp() { - try { - - TypeOfCancer typeOfCancer = new TypeOfCancer(); - typeOfCancer.setTypeOfCancerId("prad"); - typeOfCancer.setName("prad"); - typeOfCancer.setShortName("prad"); - DaoTypeOfCancer.addTypeOfCancer(typeOfCancer); - - CancerStudy cancerStudy = new CancerStudy("prad","prad","prad","prad",true); - DaoCancerStudy.addCancerStudy(cancerStudy); - - int studyId = DaoCancerStudy.getCancerStudyByStableId("prad").getInternalId(); + public void setUp() throws Exception { + TypeOfCancer typeOfCancer = new TypeOfCancer(); + typeOfCancer.setTypeOfCancerId("prad"); + typeOfCancer.setName("prad"); + typeOfCancer.setShortName("prad"); + DaoTypeOfCancer.addTypeOfCancer(typeOfCancer); - DaoPatient.addPatient(new Patient(cancerStudy, "97115001")); - DaoPatient.addPatient(new Patient(cancerStudy, "97115002")); - DaoPatient.addPatient(new Patient(cancerStudy, "97115003")); + CancerStudy cancerStudy = new CancerStudy("prad","prad","prad","prad",true); + DaoCancerStudy.addCancerStudy(cancerStudy); + + int studyId = DaoCancerStudy.getCancerStudyByStableId("prad").getInternalId(); - int patient1 = DaoPatient.getPatientByCancerStudyAndPatientId(studyId, "97115001").getInternalId(); - int patient2 = DaoPatient.getPatientByCancerStudyAndPatientId(studyId, "97115002").getInternalId(); + DaoPatient.addPatient(new Patient(cancerStudy, "97115001")); + DaoPatient.addPatient(new Patient(cancerStudy, "97115002")); + DaoPatient.addPatient(new Patient(cancerStudy, "97115003")); - DaoSample.addSample(new Sample("SC_9022-Tumor", patient1, "prad")); - DaoSample.addSample(new Sample("SC_9023-Tumor", patient2, "prad")); - - } catch (Exception e) { - e.printStackTrace(); - } + int patient1 = DaoPatient.getPatientByCancerStudyAndPatientId(studyId, "97115001").getInternalId(); + int patient2 = 
DaoPatient.getPatientByCancerStudyAndPatientId(studyId, "97115002").getInternalId(); + + DaoSample.addSample(new Sample("SC_9022-Tumor", patient1, "prad")); + DaoSample.addSample(new Sample("SC_9023-Tumor", patient2, "prad")); } // TODO add test methods here. @@ -96,7 +90,10 @@ public void setUp() { @Test public void test() throws Exception { File xmlFile = new File("target/test-classes/data_clinical_caises.xml"); - ImportCaisesClinicalXML.importData(xmlFile, 1); + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId("prad"); + ImportCaisesClinicalXML importCaisesClinicalXML = new ImportCaisesClinicalXML(null); + importCaisesClinicalXML.setFile(xmlFile, cancerStudy); + importCaisesClinicalXML.importData(); } // @Test @@ -110,4 +107,4 @@ public void test() throws Exception { // "--meta","/Users/jgao/projects/cbio-portal-data/studies/prad/su2c/meta_clinical_caises.txt", // "--loadMode", "bulkLoad"}); // } -} \ No newline at end of file +} diff --git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportExtendedMutationData.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportExtendedMutationData.java index 91b6f461f96..74f519ba1b6 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportExtendedMutationData.java +++ b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportExtendedMutationData.java @@ -33,10 +33,7 @@ package org.mskcc.cbio.portal.scripts; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; -import org.junit.Rule; -import org.junit.rules.ExpectedException; import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.*; import org.mskcc.cbio.portal.model.*; @@ -47,7 +44,6 @@ import org.springframework.transaction.annotation.Transactional; import static org.junit.Assert.*; -import static org.junit.matchers.JUnitMatchers.containsString; import java.io.*; import java.util.ArrayList; @@ -82,33 +78,15 @@ public void setUp() throws DaoException { 
ProgressMonitor.setConsoleMode(false); loadGenes(); - } - - @Rule - public ExpectedException exception = ExpectedException.none(); - - - @Test - public void testException() { - MySQLbulkLoader.bulkLoadOn(); - - // TBD: change this to use getResourceAsStream() - File file = new File("target/test-classes/data_mutations_extended.txt"); - - //TODO - find new exception, germline option was removed some time ago already.... - //exception.expect(IllegalArgumentException.class); - //exception.expectMessage(containsString("Gene list 'no_such_germline_whitelistfile' not found")); + } - new ImportExtendedMutationData(file, geneticProfileId); - } - @Test public void testImportExtendedMutationDataExtended() throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); // TBD: change this to use getResourceAsStream() - File file = new File("target/test-classes/data_mutations_extended.txt"); + File file = new File("src/test/resources/data_mutations_extended.txt"); ImportExtendedMutationData parser = new ImportExtendedMutationData(file, geneticProfileId); parser.importData(); MySQLbulkLoader.flushAll(); @@ -121,8 +99,82 @@ public void testImportExtendedMutationDataExtended() throws IOException, DaoExce // accept everything else validateMutationAminoAcid(geneticProfileId, sampleId, 51806, "P113L"); // valid Unknown validateMutationAminoAcid(geneticProfileId, sampleId, 89, "S116R"); // Unknown Somatic - } - + } + + /** + * Tests import of data files with names in the SWISSPROT column. 
+ * + * @throws IOException if something goes wrong reading from the data file + * @throws DaoException if something goes wrong talking to the database + */ + @Test + public void testImportExtendedMutationDataSwissprotName() throws IOException, DaoException { + loadStudyContext1(); + MySQLbulkLoader.bulkLoadOn(); + + File file = new File("src/test/resources/data_mutations_swissprotname.maf"); + ImportExtendedMutationData parser = new ImportExtendedMutationData(file, geneticProfileId); + parser.importData(); + MySQLbulkLoader.flushAll(); + + checkSwissprotLoaded(); + // unknown accessions are only loaded if the column lists accessions + int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-A2-A0CR-01").getInternalId(); + ExtendedMutation m = DaoMutation.getMutations( + geneticProfileId, sampleId, 64581).get(0); + assertNull(m.getOncotatorUniprotAccession()); + } + + /** + * Tests import of data files with accessions in the SWISSPROT column. + * + * @throws IOException if something goes wrong reading from the data file + * @throws DaoException if something goes wrong talking to the database + */ + @Test + public void testImportExtendedMutationDataSwissprotAccession() throws IOException, DaoException { + loadStudyContext1(); + MySQLbulkLoader.bulkLoadOn(); + + File file = new File("src/test/resources/data_mutations_swissprotaccession.maf"); + ImportExtendedMutationData parser = new ImportExtendedMutationData(file, geneticProfileId); + parser.setSwissprotIsAccession(true); + parser.importData(); + MySQLbulkLoader.flushAll(); + + checkSwissprotLoaded(); + // unknown accessions are only loaded if the column lists accessions + int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-A2-A0CR-01").getInternalId(); + ExtendedMutation m = DaoMutation.getMutations( + geneticProfileId, sampleId, 64581).get(0); + assertEquals("Z9ZZZ9ZZZ9", m.getOncotatorUniprotAccession()); + } + + /** + * Performs common assertions for the tests about the 
SWISSPROT column. + * + * @throws DaoException on errors reading from the database + */ + private void checkSwissprotLoaded() throws DaoException { + + int sampleId; + ExtendedMutation m; + + sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-A2-A0CR-01").getInternalId(); + validateMutationAminoAcid(geneticProfileId, sampleId, 64581, "K145Q"); + + sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-A2-A0T5-01").getInternalId(); + validateMutationAminoAcid(geneticProfileId, sampleId, 3339, "X4137?"); + m = DaoMutation.getMutations( + geneticProfileId, sampleId, 3339).get(0); + assertEquals("P98160", m.getOncotatorUniprotAccession()); + validateMutationAminoAcid(geneticProfileId, sampleId, 54407, "T32P"); + m = DaoMutation.getMutations( + geneticProfileId, sampleId, 54407).get(0); + assertEquals("Q96QD8", m.getOncotatorUniprotAccession()); + + } + /** * Check that import of oncotated data works * @throws IOException @@ -130,7 +182,7 @@ public void testImportExtendedMutationDataExtended() throws IOException, DaoExce */ @Test public void testImportExtendedMutationDataOncotated() throws IOException, DaoException { - File file = new File("target/test-classes/data_mutations_oncotated.txt"); + File file = new File("src/test/resources/data_mutations_oncotated.txt"); ImportExtendedMutationData parser = new ImportExtendedMutationData(file, geneticProfileId); parser.importData(); MySQLbulkLoader.flushAll(); @@ -149,14 +201,6 @@ public void testImportExtendedMutationDataOncotated() throws IOException, DaoExc // mutationList.get(15).getOncotatorCosmicOverlapping()); } - // reject somatic mutations that aren't valid somatic, or on one of the somatic whitelists - private void acceptEverythingElse() throws DaoException { - int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-AA-3664-01").getInternalId(); - - assertEquals(1, DaoMutation.getMutations(geneticProfileId, sampleId, 51806).size()); // valid Unknown - 
assertEquals(1, DaoMutation.getMutations(geneticProfileId, sampleId, 89).size()); // Unknown Somatic - } - private void checkBasicFilteringRules() throws DaoException { rejectSilentLOHIntronWildtype(); acceptValidSomaticMutations(); @@ -223,6 +267,30 @@ private void loadGenes() throws DaoException { daoGene.addGene(new CanonicalGene(6667L, "SP1")); daoGene.addGene(new CanonicalGene(2842L, "GPR19")); + boolean origBulkLoad = MySQLbulkLoader.isBulkLoad(); + try { + MySQLbulkLoader.bulkLoadOn(); + DaoUniProtIdMapping.addUniProtIdMapping("Q08043", "ACTN3_HUMAN", 89); + DaoUniProtIdMapping.addUniProtIdMapping("Q9H694", "BICC1_HUMAN", 80114); + DaoUniProtIdMapping.addUniProtIdMapping("Q9NZT1", "CALL5_HUMAN", 51806); + DaoUniProtIdMapping.addUniProtIdMapping("Q9BXN2", "CLC7A_HUMAN", 64581); + DaoUniProtIdMapping.addUniProtIdMapping("Q15760", "GPR19_HUMAN", 2842); + DaoUniProtIdMapping.addUniProtIdMapping("P98160", "PGBM_HUMAN", 3339); + DaoUniProtIdMapping.addUniProtIdMapping("Q96P20", "NALP3_HUMAN", 114548); + DaoUniProtIdMapping.addUniProtIdMapping("Q8NH19", "O10AG_HUMAN", 282770); + DaoUniProtIdMapping.addUniProtIdMapping("Q7Z3Z2", "RD3_HUMAN", 343035); + DaoUniProtIdMapping.addUniProtIdMapping("Q96QD8", "S38A2_HUMAN", 54407); + DaoUniProtIdMapping.addUniProtIdMapping("P17405", "ASM_HUMAN", 6609); + DaoUniProtIdMapping.addUniProtIdMapping("P08047", "SP1_HUMAN", 6667); + DaoUniProtIdMapping.addUniProtIdMapping("Q6ZVD7", "STOX1_HUMAN", 219736); + DaoUniProtIdMapping.addUniProtIdMapping("Q9NYW0", "T2R10_HUMAN", 50839); + DaoUniProtIdMapping.addUniProtIdMapping("Q9P0N5", "TM216_HUMAN", 51259); + } finally { + if (!origBulkLoad) { + MySQLbulkLoader.bulkLoadOff(); + } + } + // additional genes for "data_mutations_oncotated.txt" daoGene.addGene(new CanonicalGene(55138L, "FAM90A1")); daoGene.addGene(new CanonicalGene(10628L, "TXNIP")); @@ -248,4 +316,30 @@ private void loadGenes() throws DaoException { MySQLbulkLoader.flushAll(); } + + /** + * Loads the study context (defined 
samples) for specific test data files. + * + * @throws DaoException if failing to write to the database + */ + private void loadStudyContext1() throws DaoException { + CancerStudy study = DaoCancerStudy.getCancerStudyByInternalId(studyId); + int pId; + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A04T")); + DaoSample.addSample(new Sample("TCGA-A2-A04T-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A0CR")); + DaoSample.addSample(new Sample("TCGA-A2-A0CR-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A0CW")); + DaoSample.addSample(new Sample("TCGA-A2-A0CW-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A0D3")); + DaoSample.addSample(new Sample("TCGA-A2-A0D3-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A0SY")); + DaoSample.addSample(new Sample("TCGA-A2-A0SY-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A0T5")); + DaoSample.addSample(new Sample("TCGA-A2-A0T5-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A25D")); + DaoSample.addSample(new Sample("TCGA-A2-A25D-01", pId, "brca")); + pId = DaoPatient.addPatient(new Patient(study, "TCGA-A2-A4RW")); + DaoSample.addSample(new Sample("TCGA-A2-A4RW-01", pId, "brca")); + } } diff --git a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java index aea280d4339..17cd435be95 100644 --- a/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java +++ b/core/src/test/java/org/mskcc/cbio/portal/scripts/TestImportProfileData.java @@ -24,6 +24,7 @@ package org.mskcc.cbio.portal.scripts; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import java.util.ArrayList; import java.util.Arrays; @@ -77,7 +78,7 @@ public void setUp() throws DaoException { public ExpectedException exception = 
ExpectedException.none(); @Test - public void testImportMutationsFile() throws Exception { + public void testImportMutationFiles() throws Exception { String[] args = { "--data","src/test/resources/data_mutations_extended.txt", "--meta","src/test/resources/meta_mutations_extended.txt", @@ -89,10 +90,25 @@ public void testImportMutationsFile() throws Exception { //dataset study id (e.g. studyStableId + "_breast_mutations"): String studyStableId = "study_tcga_pub"; studyId = DaoCancerStudy.getCancerStudyByStableId(studyStableId).getInternalId(); + // the sample is added on the fly when encountered in the mutation data file int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-AA-3664-01").getInternalId(); geneticProfileId = DaoGeneticProfile.getGeneticProfileByStableId(studyStableId + "_breast_mutations").getGeneticProfileId(); - validateMutationAminoAcid (geneticProfileId, sampleId, 54407, "T433A"); + validateMutationAminoAcid (geneticProfileId, sampleId, 54407, "T433A"); + + // data for this sample should not exist before loading the next data file + assertNull(DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-AA-3665-01")); + // load a second mutation data file + String[] secondArgs = { + "--data","src/test/resources/data_mutations_extended_continued.txt", + "--meta","src/test/resources/meta_mutations_extended.txt", + "--loadMode", "bulkLoad" + }; + ImportProfileData secondRunner = new ImportProfileData(secondArgs); + secondRunner.run(); + // again, the sample is added on the fly + int secondSampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "TCGA-AA-3665-01").getInternalId(); + validateMutationAminoAcid (geneticProfileId, secondSampleId, 2842, "L113P"); } @Test diff --git a/core/src/test/resources/data_mutations_extended_continued.txt b/core/src/test/resources/data_mutations_extended_continued.txt new file mode 100644 index 00000000000..d62e1dfbe14 --- /dev/null +++ 
b/core/src/test/resources/data_mutations_extended_continued.txt @@ -0,0 +1,3 @@ +Hugo_Symbol Entrez_Gene_Id Center Tumor_Sample_Barcode Verification_Status Validation_Status Mutation_Status Sequencer Chromosome Start_position End_position Variant_Classification HGVSp_Short MA:FImpact MA:link.MSA MA:link.PDB +GPR19 2842 broad.mit.edu TCGA-AA-3665-01 Unknown valid Germline Illumina GAIIx chr12 12706312 12706312 Nonsense_Mutation L113P high mutationassessor.org/?cm=msa&ty=f&p=GPR19_HUMAN&rb=82&re=330&var=L113P mutationassessor.org/pdb.php?prot=GPR19_HUMAN&from=82&to=330&var=L113P +SP1 6667 broad.mit.edu TCGA-AA-3665-01 Unknown valid Somatic Illumina GAIIx chr12 52063536 52063536 Missense_Mutation A513V low mutationassessor.org/?cm=msa&ty=f&p=SP1_HUMAN&rb=401&re=600&var=A513V diff --git a/core/src/test/resources/data_mutations_swissprotaccession.maf b/core/src/test/resources/data_mutations_swissprotaccession.maf new file mode 100644 index 00000000000..0c28f8c85d1 --- /dev/null +++ b/core/src/test/resources/data_mutations_swissprotaccession.maf @@ -0,0 +1,12 @@ +Hugo_Symbol Tumor_Sample_Barcode Variant_Classification HGVSp_Short SWISSPROT +BICC1 TCGA-A2-A04T-01 Missense_Mutation p.L405S Q9H694 +CLEC7A TCGA-A2-A0CR-01 Missense_Mutation p.K145Q Z9ZZZ9ZZZ9 +HSPG2 TCGA-A2-A0CW-01 Missense_Mutation p.D468N P98160 +NLRP3 TCGA-A2-A0D3-01 Nonsense_Mutation p.R699* Q96P20 +SMPD1 TCGA-A2-A0SY-01 Silent p.P63P P17405 +HSPG2 TCGA-A2-A0T5-01 Splice_Site p.X4137? 
P98160 +HSPG2 TCGA-A2-A0T5-01 Silent p.= P98160 +HSPG2 TCGA-A2-A0T5-01 Intron p.(=) P98160 +SLC38A2 TCGA-A2-A0T5-01 Missense_Mutation p.T32P Q96QD8 +SLC38A2 TCGA-A2-A25D-01 Silent p.V485V Q96QD8 +TMEM216 TCGA-A2-A4RW-01 Missense_Mutation p.R115C Q9P0N5 diff --git a/core/src/test/resources/data_mutations_swissprotname.maf b/core/src/test/resources/data_mutations_swissprotname.maf new file mode 100644 index 00000000000..0a5a3518e6d --- /dev/null +++ b/core/src/test/resources/data_mutations_swissprotname.maf @@ -0,0 +1,12 @@ +Hugo_Symbol Tumor_Sample_Barcode Variant_Classification HGVSp_Short SWISSPROT +BICC1 TCGA-A2-A04T-01 Missense_Mutation p.L405S BICC1_HUMAN +CLEC7A TCGA-A2-A0CR-01 Missense_Mutation p.K145Q HBB_YEAST +HSPG2 TCGA-A2-A0CW-01 Missense_Mutation p.D468N PGBM_HUMAN +NLRP3 TCGA-A2-A0D3-01 Nonsense_Mutation p.R699* NALP3_HUMAN +SMPD1 TCGA-A2-A0SY-01 Silent p.P63P ASM_HUMAN +HSPG2 TCGA-A2-A0T5-01 Splice_Site p.X4137? PGBM_HUMAN +HSPG2 TCGA-A2-A0T5-01 Silent p.= PGBM_HUMAN +HSPG2 TCGA-A2-A0T5-01 Intron p.(=) PGBM_HUMAN +SLC38A2 TCGA-A2-A0T5-01 Missense_Mutation p.T32P S38A2_HUMAN +SLC38A2 TCGA-A2-A25D-01 Silent p.V485V S38A2_HUMAN +TMEM216 TCGA-A2-A4RW-01 Missense_Mutation p.R115C TM216_HUMAN diff --git a/core/src/test/resources/meta_mutations_extended.txt b/core/src/test/resources/meta_mutations_extended.txt index d1bc7856a22..5aa1eed1048 100644 --- a/core/src/test/resources/meta_mutations_extended.txt +++ b/core/src/test/resources/meta_mutations_extended.txt @@ -1,6 +1,7 @@ cancer_study_identifier: study_tcga_pub cancer_type_id: breast genetic_alteration_type: MUTATION_EXTENDED +datatype: MAF stable_id: breast_mutations show_profile_in_analysis_tab: true profile_description: Mutation data from whole exome sequencing. 
diff --git a/core/src/test/scripts/system_tests_validate_data.py b/core/src/test/scripts/system_tests_validate_data.py index 8c806b07e18..418b83cc77b 100755 --- a/core/src/test/scripts/system_tests_validate_data.py +++ b/core/src/test/scripts/system_tests_validate_data.py @@ -239,7 +239,7 @@ def test_files_with_quotes(self): # should fail because of errors with quotes self.assertEquals(1, exit_status) self.assertFileGenerated(out_file_name, - 'test_data/study_quotes/result_report.html') + 'test_data/study_quotes/result_report.html') if __name__ == '__main__': diff --git a/core/src/test/scripts/test_data/mutations/data_mutations_invalid_norm_samples.maf b/core/src/test/scripts/test_data/mutations/data_mutations_invalid_norm_samples.maf index d12f5f2afa2..f5e0c30b686 100644 --- a/core/src/test/scripts/test_data/mutations/data_mutations_invalid_norm_samples.maf +++ b/core/src/test/scripts/test_data/mutations/data_mutations_invalid_norm_samples.maf @@ -1,8 +1,8 @@ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_DBSNP_VAL_STATUS ONCOTATOR_VARIANT_CLASSIFICATION HGVSp_Short SWISSPROT ONCOTATOR_REFSEQ_MRNA_ID ONCOTATOR_REFSEQ_PROT_ID ONCOTATOR_UNIPROT_ENTRY_NAME ONCOTATOR_UNIPROT_ACCESSION ONCOTATOR_CODON_CHANGE ONCOTATOR_TRANSCRIPT_CHANGE ONCOTATOR_EXON_AFFECTED ONCOTATOR_PROTEIN_POS_START ONCOTATOR_PROTEIN_POS_END ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT ONCOTATOR_GENE_SYMBOL_BEST_EFFECT 
ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT ONCOTATOR_CODON_CHANGE_BEST_EFFECT ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT ONCOTATOR_EXON_AFFECTED_BEST_EFFECT ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT MA:FImpact MA:FIS MA:protein.change MA:link.MSA MA:link.PDB MA:link.var chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU Protein_position amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU annotation_errors_WU -A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 
2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp 
PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown 
TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 
NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 
superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 
genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors diff --git a/core/src/test/scripts/test_data/mutations/data_mutations_invalid_swissprot.maf b/core/src/test/scripts/test_data/mutations/data_mutations_invalid_swissprot.maf new file mode 100644 index 00000000000..aff874f34f7 --- /dev/null +++ b/core/src/test/scripts/test_data/mutations/data_mutations_invalid_swissprot.maf @@ -0,0 +1,8 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_DBSNP_VAL_STATUS ONCOTATOR_VARIANT_CLASSIFICATION HGVSp_Short SWISSPROT ONCOTATOR_REFSEQ_MRNA_ID ONCOTATOR_REFSEQ_PROT_ID ONCOTATOR_UNIPROT_ENTRY_NAME 
ONCOTATOR_UNIPROT_ACCESSION ONCOTATOR_CODON_CHANGE ONCOTATOR_TRANSCRIPT_CHANGE ONCOTATOR_EXON_AFFECTED ONCOTATOR_PROTEIN_POS_START ONCOTATOR_PROTEIN_POS_END ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT ONCOTATOR_GENE_SYMBOL_BEST_EFFECT ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT ONCOTATOR_CODON_CHANGE_BEST_EFFECT ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT ONCOTATOR_EXON_AFFECTED_BEST_EFFECT ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT MA:FImpact MA:FIS MA:protein.change MA:link.MSA MA:link.PDB MA:link.var chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU Protein_position amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU annotation_errors_WU +A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E P99999 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic 
Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V A1CF_HUMAN NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I P99999 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C P99999,Z9ZZZ9ZZZ9 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C 
getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N P99999 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E P99999 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL 
PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* Z9ZZZ9ZZZ9 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors diff --git a/core/src/test/scripts/test_data/mutations/data_mutations_name_swissprot.maf b/core/src/test/scripts/test_data/mutations/data_mutations_name_swissprot.maf new file mode 100644 index 00000000000..f9dad84fe97 --- /dev/null +++ b/core/src/test/scripts/test_data/mutations/data_mutations_name_swissprot.maf @@ -0,0 +1,8 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status 
Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_DBSNP_VAL_STATUS ONCOTATOR_VARIANT_CLASSIFICATION HGVSp_Short SWISSPROT ONCOTATOR_REFSEQ_MRNA_ID ONCOTATOR_REFSEQ_PROT_ID ONCOTATOR_UNIPROT_ENTRY_NAME ONCOTATOR_UNIPROT_ACCESSION ONCOTATOR_CODON_CHANGE ONCOTATOR_TRANSCRIPT_CHANGE ONCOTATOR_EXON_AFFECTED ONCOTATOR_PROTEIN_POS_START ONCOTATOR_PROTEIN_POS_END ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT ONCOTATOR_GENE_SYMBOL_BEST_EFFECT ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT ONCOTATOR_CODON_CHANGE_BEST_EFFECT ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT ONCOTATOR_EXON_AFFECTED_BEST_EFFECT ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT MA:FImpact MA:FIS MA:protein.change MA:link.MSA MA:link.PDB MA:link.var chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU Protein_position amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU annotation_errors_WU +A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E A1CF_HUMAN NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 
NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I A1CF_HUMAN NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C 
A1CF_HUMAN,HBB_YEAST NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N A2MG_HUMAN NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E A2MG_HUMAN NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 
737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* HBB_YEAST NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors diff --git a/core/src/test/scripts/test_data/mutations/data_mutations_wrong_aa_change.maf b/core/src/test/scripts/test_data/mutations/data_mutations_wrong_aa_change.maf index 755d9353b3e..7118c04caef 100644 --- a/core/src/test/scripts/test_data/mutations/data_mutations_wrong_aa_change.maf +++ b/core/src/test/scripts/test_data/mutations/data_mutations_wrong_aa_change.maf @@ -1,8 +1,8 @@ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 
dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_DBSNP_VAL_STATUS ONCOTATOR_VARIANT_CLASSIFICATION Amino_Acid_Change SWISSPROT ONCOTATOR_REFSEQ_MRNA_ID ONCOTATOR_REFSEQ_PROT_ID ONCOTATOR_UNIPROT_ENTRY_NAME ONCOTATOR_UNIPROT_ACCESSION ONCOTATOR_CODON_CHANGE ONCOTATOR_TRANSCRIPT_CHANGE ONCOTATOR_EXON_AFFECTED ONCOTATOR_PROTEIN_POS_START ONCOTATOR_PROTEIN_POS_END ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT ONCOTATOR_GENE_SYMBOL_BEST_EFFECT ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT ONCOTATOR_CODON_CHANGE_BEST_EFFECT ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT ONCOTATOR_EXON_AFFECTED_BEST_EFFECT ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT MA:FImpact MA:FIS MA:protein.change MA:link.MSA MA:link.PDB MA:link.var chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU Protein_position amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU annotation_errors_WU -A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 
c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V p.I167I A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167= A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding 
domain-like - no_errors -A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.N851,Y1055delinsCC A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Missense_Mutation SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.[N851N];[Y1055C] A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Missense_Mutation p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 
genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation NULL A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 
c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V p.I167I Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167= Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain 
RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.N851,Y1055delinsCC P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Missense_Mutation SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.[N851N];[Y1055C] P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Missense_Mutation p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein 
prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation NULL P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors diff --git a/core/src/test/scripts/test_data/mutations/meta_mutations_invalid_swissprot_idspec.txt 
b/core/src/test/scripts/test_data/mutations/meta_mutations_invalid_swissprot_idspec.txt new file mode 100644 index 00000000000..fe4ec15ce93 --- /dev/null +++ b/core/src/test/scripts/test_data/mutations/meta_mutations_invalid_swissprot_idspec.txt @@ -0,0 +1,9 @@ +cancer_study_identifier: spam +genetic_alteration_type: MUTATION_EXTENDED +datatype: MAF +stable_id: mutations +show_profile_in_analysis_tab: true +profile_description: Mutation data from whole exome sequencing. +profile_name: Mutations +data_filename: spam.maf +swissprot_identifier: namelessly diff --git a/core/src/test/scripts/test_data/study_cancertype_two_files/meta_study.txt b/core/src/test/scripts/test_data/study_cancertype_two_files/meta_study.txt index a9f15f74da7..89f18fc3683 100644 --- a/core/src/test/scripts/test_data/study_cancertype_two_files/meta_study.txt +++ b/core/src/test/scripts/test_data/study_cancertype_two_files/meta_study.txt @@ -3,3 +3,4 @@ type_of_cancer: luad name: Spam (spam) description: Baked beans short_name: Spam +add_global_case_list: true diff --git a/core/src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt b/core/src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt index be1b8b72dba..813510be072 100644 --- a/core/src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt +++ b/core/src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt @@ -6,3 +6,4 @@ show_profile_in_analysis_tab: true profile_description: Mutation data from whole exome sequencing. profile_name: Mutations data_filename: brca_tcga_pub.maf +swissprot_identifier: name diff --git a/core/src/test/scripts/test_data/study_es_0/result_report.html b/core/src/test/scripts/test_data/study_es_0/result_report.html index 78a1bc2760d..e2464d0e8d1 100644 --- a/core/src/test/scripts/test_data/study_es_0/result_report.html +++ b/core/src/test/scripts/test_data/study_es_0/result_report.html @@ -170,7 +170,7 @@

General

Info – – - Validation of case lists complete + Validation of case list folder complete – diff --git a/core/src/test/scripts/test_data/study_maf_test/brca_tcga_pub.maf b/core/src/test/scripts/test_data/study_maf_test/brca_tcga_pub.maf index b9cf1fede57..9bf69c8ca71 100644 --- a/core/src/test/scripts/test_data/study_maf_test/brca_tcga_pub.maf +++ b/core/src/test/scripts/test_data/study_maf_test/brca_tcga_pub.maf @@ -1,16 +1,16 @@ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer ONCOTATOR_COSMIC_OVERLAPPING ONCOTATOR_DBSNP_RS ONCOTATOR_DBSNP_VAL_STATUS ONCOTATOR_VARIANT_CLASSIFICATION HGVSp_Short SWISSPROT ONCOTATOR_REFSEQ_MRNA_ID ONCOTATOR_REFSEQ_PROT_ID ONCOTATOR_UNIPROT_ENTRY_NAME ONCOTATOR_UNIPROT_ACCESSION ONCOTATOR_CODON_CHANGE ONCOTATOR_TRANSCRIPT_CHANGE ONCOTATOR_EXON_AFFECTED ONCOTATOR_PROTEIN_POS_START ONCOTATOR_PROTEIN_POS_END ONCOTATOR_VARIANT_CLASSIFICATION_BEST_EFFECT ONCOTATOR_PROTEIN_CHANGE_BEST_EFFECT ONCOTATOR_GENE_SYMBOL_BEST_EFFECT ONCOTATOR_REFSEQ_MRNA_ID_BEST_EFFECT ONCOTATOR_REFSEQ_PROT_ID_BEST_EFFECT ONCOTATOR_UNIPROT_ENTRY_NAME_BEST_EFFECT ONCOTATOR_UNIPROT_ACCESSION_BEST_EFFECT ONCOTATOR_CODON_CHANGE_BEST_EFFECT ONCOTATOR_TRANSCRIPT_CHANGE_BEST_EFFECT ONCOTATOR_EXON_AFFECTED_BEST_EFFECT ONCOTATOR_PROTEIN_POS_START_BEST_EFFECT ONCOTATOR_PROTEIN_POS_END_BEST_EFFECT MA:FImpact MA:FIS MA:protein.change MA:link.MSA MA:link.PDB MA:link.var chromosome_name_WU Protein_position stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU 
transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU annotation_errors_WU -A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A1CF 29974 genome.wustl.edu 37 10 52595937 
52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I A1CF NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors -A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N A2M NM_000014 
NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 
HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2M 2 genome.wustl.edu 37 12 9256962 9256962 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0IQ-01 TCGA-B6-A0IQ-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.P380Q A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1138-1140)CCA>CAA c.1139C>A 11 380 380 Missense_Mutation p.P380Q A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1138-1140)CCA>CAA c.1139C>A 11 380 380 medium 2.625 P380Q getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=222&re=421&var=P380Q getma.org/pdb.php?prot=A2MG_HUMAN&from=222&to=421&var=P380Q getma.org/?cm=var&var=hg19,12,9256962,G,T&fts=all 12 9256962 9256962 G T SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.1139 p.P380Q 0.957 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 8975820 8975820 + Silent SNP C C A novel unknown TCGA-C8-A12Y-01 TCGA-C8-A12Y-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.S35S A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(103-105)TCC>TCA c.105C>A 2 35 35 Silent p.S35S A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(103-105)TCC>TCA c.105C>A 2 35 35 NA NA NA NA NA NA 12 8975820 8975820 C A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.105 p.S35 0.801 HMMPfam_A2M_N HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein 
prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 8994108 8994108 + Missense_Mutation SNP G G C novel unknown TCGA-A1-A0SO-01 TCGA-A1-A0SO-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.W408C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1222-1224)TGG>TGC c.1224G>C 11 408 408 Missense_Mutation p.W408C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1222-1224)TGG>TGC c.1224G>C 11 408 408 medium 2.005 W408C getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=239&re=438&var=W408C getma.org/pdb.php?prot=A2ML1_HUMAN&from=239&to=438&var=W408C getma.org/?cm=var&var=hg19,12,8994108,G,C&fts=all 12 8994108 8994108 G C SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1224 p.W408C 0.995 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 8995779 8995779 + Missense_Mutation SNP G G A novel unknown TCGA-A8-A08P-01 TCGA-A8-A08P-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.R433H A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1297-1299)CGT>CAT c.1298G>A 12 433 433 Missense_Mutation p.R433H A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1297-1299)CGT>CAT c.1298G>A 12 433 433 neutral 0.255 R433H getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=239&re=438&var=R433H getma.org/pdb.php?prot=A2ML1_HUMAN&from=239&to=438&var=R433H getma.org/?cm=var&var=hg19,12,8995779,G,A&fts=all 12 8995779 8995779 G A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1298 p.R433H 0.108 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid 
cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 8995942 8995942 + Silent SNP C C T novel unknown TCGA-C8-A12T-01 TCGA-C8-A12T-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I487I A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1459-1461)ATC>ATT c.1461C>T 12 487 487 Silent p.I487I A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1459-1461)ATC>ATT c.1461C>T 12 487 487 NA NA NA NA NA NA 12 8995942 8995942 C T SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1461 p.I487 0.040 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 8998791 8998791 + Silent SNP C C T novel unknown TCGA-BH-A0AV-01 TCGA-BH-A0AV-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.F552F A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1654-1656)TTC>TTT c.1656C>T 14 552 552 Silent p.F552F A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1654-1656)TTC>TTT c.1656C>T 14 552 552 NA NA NA NA NA NA 12 8998791 8998791 C T SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1656 p.F552 0.669 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 9000231 9000231 + Silent SNP G G A novel unknown TCGA-AN-A0FT-01 TCGA-AN-A0FT-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx 
NA NA NA Silent p.A590A A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1768-1770)GCG>GCA c.1770G>A 15 590 590 Silent p.A590A A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1768-1770)GCG>GCA c.1770G>A 15 590 590 NA NA NA NA NA NA 12 9000231 9000231 G A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1770 p.A590 0.000 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors -A2ML1 144568 genome.wustl.edu 37 12 9001389 9001389 + Missense_Mutation SNP C C G novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.S636C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1906-1908)TCT>TGT c.1907C>G 16 636 636 Missense_Mutation p.S636C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1906-1908)TCT>TGT c.1907C>G 16 636 636 neutral -0.22 S636C getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=602&re=735&var=S636C getma.org/pdb.php?prot=A2ML1_HUMAN&from=602&to=735&var=S636C getma.org/?cm=var&var=hg19,12,9001389,C,G&fts=all 12 9001389 9001389 C G SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1907 p.S636C 0.165 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A1CF 29974 genome.wustl.edu 37 10 52573692 52573692 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0RS-01 TCGA-B6-A0RS-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.D424E Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(1270-1272)GAC>GAA c.1272C>A 10 424 424 Missense_Mutation p.D432E A1CF NM_138933 NP_620311 
A1CF_HUMAN Q9NQ94 c.(1294-1296)GAC>GAA c.1296C>A 12 432 432 neutral 0.345 D424E getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=298&re=497&var=D424E NA getma.org/?cm=var&var=hg19,10,52573692,G,T&fts=all 10 52573692 52573692 G T SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.1272 p.D424E 0.768 NULL superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595854 52595854 + Missense_Mutation SNP G G A novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.A195V Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(583-585)GCG>GTG c.584C>T 6 195 195 Missense_Mutation p.A203V A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(607-609)GCG>GTG c.608C>T 8 203 203 medium 2.955 A195V getma.org/?cm=msa&ty=f&p=A1CF_HUMAN&rb=138&re=203&var=A195V getma.org/pdb.php?prot=A1CF_HUMAN&from=138&to=203&var=A195V getma.org/?cm=var&var=hg19,10,52595854,G,A&fts=all 10 52595854 52595854 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed missense c.584 p.A195V 1.000 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A1CF 29974 genome.wustl.edu 37 10 52595937 52595937 + Silent SNP G G A novel unknown TCGA-BH-A18P-01 TCGA-BH-A18P-11 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I167I Q9NQ94 NM_138932 NP_620310 A1CF_HUMAN Q9NQ94 c.(499-501)ATC>ATT c.501C>T 6 167 167 Silent p.I175I A1CF NM_138933 NP_620311 A1CF_HUMAN Q9NQ94 c.(523-525)ATC>ATT c.525C>T 8 175 175 NA NA NA NA NA NA 10 52595937 52595937 G A SNP A1CF NM_138932 human genbank 57_37b -1 reviewed silent c.501 p.I167 0.615 superfamily_RNA-binding domain RBD,HMMSmart_SM00360,HMMPfam_RRM_1 superfamily_RNA-binding domain 
RBD,HMMSmart_SM00360,HMMPfam_RRM_1,superfamily_dsRNA-binding domain-like - no_errors +A2M 2 genome.wustl.edu 37 12 9230409 9230409 + Missense_Mutation SNP T T C novel unknown TCGA-BH-A18H-01 TCGA-BH-A18H-10 T T NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.Y1055C P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 Missense_Mutation p.Y1055C A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(3163-3165)TAC>TGC c.3164A>G 26 1055 1055 high 3.73 Y1055C getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=1010&re=1266&var=Y1055C getma.org/pdb.php?prot=A2MG_HUMAN&from=1010&to=1266&var=Y1055C getma.org/?cm=var&var=hg19,12,9230409,T,C&fts=all 12 9230409 9230409 T C SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.3164 p.Y1055C 0.143 superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_A2M_comp PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9242995 9242995 + Silent SNP G G A novel unknown TCGA-C8-A138-01 TCGA-C8-A138-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.N851N P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 Silent p.N851N A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2551-2553)AAC>AAT c.2553C>T 20 851 851 NA NA NA NA NA NA 12 9242995 9242995 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2553 p.N851 0.003 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - 
no_errors +A2M 2 genome.wustl.edu 37 12 9246090 9246090 + Silent SNP C C T novel unknown TCGA-A2-A0EY-01 TCGA-A2-A0EY-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.E737E P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 Silent p.E737E A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(2209-2211)GAG>GAA c.2211G>A 18 737 737 NA NA NA NA NA NA 12 9246090 9246090 C T SNP A2M NM_000014 human genbank 57_37b -1 reviewed silent c.2211 p.E737 1.000 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9251298 9251298 + Nonsense_Mutation SNP G G A novel unknown TCGA-A8-A08G-01 TCGA-A8-A08G-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Nonsense_Mutation p.R586* P01023 NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 Nonsense_Mutation p.R586* A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1756-1758)CGA>TGA c.1756C>T 15 586 586 NA 0.0 R586* NA NA getma.org/?cm=var&var=hg19,12,9251298,G,A&fts=all 12 9251298 9251298 G A SNP A2M NM_000014 human genbank 57_37b -1 reviewed nonsense c.1756 p.R586* 0.003 HMMPfam_A2M_N_2 PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2M 2 genome.wustl.edu 37 12 9256962 9256962 + Missense_Mutation SNP G G T novel unknown TCGA-B6-A0IQ-01 TCGA-B6-A0IQ-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.P380Q P01023 NM_000014 
NP_000005 A2MG_HUMAN P01023 c.(1138-1140)CCA>CAA c.1139C>A 11 380 380 Missense_Mutation p.P380Q A2M NM_000014 NP_000005 A2MG_HUMAN P01023 c.(1138-1140)CCA>CAA c.1139C>A 11 380 380 medium 2.625 P380Q getma.org/?cm=msa&ty=f&p=A2MG_HUMAN&rb=222&re=421&var=P380Q getma.org/pdb.php?prot=A2MG_HUMAN&from=222&to=421&var=P380Q getma.org/?cm=var&var=hg19,12,9256962,G,T&fts=all 12 9256962 9256962 G T SNP A2M NM_000014 human genbank 57_37b -1 reviewed missense c.1139 p.P380Q 0.957 NULL PatternScan_TONB_DEPENDENT_REC_1,HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 8975820 8975820 + Silent SNP C C A novel unknown TCGA-C8-A12Y-01 TCGA-C8-A12Y-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.S35S A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(103-105)TCC>TCA c.105C>A 2 35 35 Silent p.S35S A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(103-105)TCC>TCA c.105C>A 2 35 35 NA NA NA NA NA NA 12 8975820 8975820 C A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.105 p.S35 0.801 HMMPfam_A2M_N HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 8994108 8994108 + Missense_Mutation SNP G G C novel unknown TCGA-A1-A0SO-01 TCGA-A1-A0SO-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.W408C A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1222-1224)TGG>TGC c.1224G>C 11 408 408 Missense_Mutation p.W408C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1222-1224)TGG>TGC c.1224G>C 11 408 408 
medium 2.005 W408C getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=239&re=438&var=W408C getma.org/pdb.php?prot=A2ML1_HUMAN&from=239&to=438&var=W408C getma.org/?cm=var&var=hg19,12,8994108,G,C&fts=all 12 8994108 8994108 G C SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1224 p.W408C 0.995 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 8995779 8995779 + Missense_Mutation SNP G G A novel unknown TCGA-A8-A08P-01 TCGA-A8-A08P-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.R433H A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1297-1299)CGT>CAT c.1298G>A 12 433 433 Missense_Mutation p.R433H A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1297-1299)CGT>CAT c.1298G>A 12 433 433 neutral 0.255 R433H getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=239&re=438&var=R433H getma.org/pdb.php?prot=A2ML1_HUMAN&from=239&to=438&var=R433H getma.org/?cm=var&var=hg19,12,8995779,G,A&fts=all 12 8995779 8995779 G A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1298 p.R433H 0.108 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 8995942 8995942 + Silent SNP C C T novel unknown TCGA-C8-A12T-01 TCGA-C8-A12T-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.I487I A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1459-1461)ATC>ATT c.1461C>T 12 487 487 Silent p.I487I A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1459-1461)ATC>ATT c.1461C>T 12 487 487 NA NA NA NA NA NA 12 
8995942 8995942 C T SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1461 p.I487 0.040 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 8998791 8998791 + Silent SNP C C T novel unknown TCGA-BH-A0AV-01 TCGA-BH-A0AV-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.F552F A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1654-1656)TTC>TTT c.1656C>T 14 552 552 Silent p.F552F A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1654-1656)TTC>TTT c.1656C>T 14 552 552 NA NA NA NA NA NA 12 8998791 8998791 C T SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1656 p.F552 0.669 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 genome.wustl.edu 37 12 9000231 9000231 + Silent SNP G G A novel unknown TCGA-AN-A0FT-01 TCGA-AN-A0FT-10 G G NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Silent p.A590A A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1768-1770)GCG>GCA c.1770G>A 15 590 590 Silent p.A590A A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1768-1770)GCG>GCA c.1770G>A 15 590 590 NA NA NA NA NA NA 12 9000231 9000231 G A SNP A2ML1 NM_144670 human genbank 57_37b +1 validated silent c.1770 p.A590 0.000 HMMPfam_A2M_N_2 HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors +A2ML1 144568 
genome.wustl.edu 37 12 9001389 9001389 + Missense_Mutation SNP C C G novel unknown TCGA-BH-A0HP-01 TCGA-BH-A0HP-10 C C NA NA NA NA Unknown Untested Somatic Phase_IV Capture NA 1 dbGAP Illumina GAIIx NA NA NA Missense_Mutation p.S636C A8K2U0 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1906-1908)TCT>TGT c.1907C>G 16 636 636 Missense_Mutation p.S636C A2ML1 NM_144670 NP_653271 A2ML1_HUMAN A8K2U0 c.(1906-1908)TCT>TGT c.1907C>G 16 636 636 neutral -0.22 S636C getma.org/?cm=msa&ty=f&p=A2ML1_HUMAN&rb=602&re=735&var=S636C getma.org/pdb.php?prot=A2ML1_HUMAN&from=602&to=735&var=S636C getma.org/?cm=var&var=hg19,12,9001389,C,G&fts=all 12 9001389 9001389 C G SNP A2ML1 NM_144670 human genbank 57_37b +1 validated missense c.1907 p.S636C 0.165 NULL HMMPfam_A2M_N,HMMPfam_A2M_N_2,HMMPfam_A2M,superfamily_Terpenoid cyclases/Protein prenyltransferases,HMMPfam_Thiol-ester_cl,PatternScan_ALPHA_2_MACROGLOBULIN,HMMPfam_A2M_comp,superfamily_Alpha-macroglobulin receptor domain,HMMPfam_A2M_recep - no_errors diff --git a/core/src/test/scripts/test_data/study_maf_test/meta_mutations_extended.txt b/core/src/test/scripts/test_data/study_maf_test/meta_mutations_extended.txt index fa760e233dd..bb41d75e38f 100644 --- a/core/src/test/scripts/test_data/study_maf_test/meta_mutations_extended.txt +++ b/core/src/test/scripts/test_data/study_maf_test/meta_mutations_extended.txt @@ -7,3 +7,4 @@ profile_description: Mutation data from whole exome sequencing. profile_name: Mutations data_filename: brca_tcga_pub.maf normal_samples_list: TCGA-B6-A0RS-10,TCGA-BH-A0HP-10,TCGA-BH-A18P-11,TCGA-BH-A18H-10,TCGA-C8-A138-10,TCGA-BH-A0HP-10 +swissprot_identifier: accession diff --git a/core/src/test/scripts/test_data/study_maf_test/result_report.html b/core/src/test/scripts/test_data/study_maf_test/result_report.html index c52e4061056..0ba0bdb1a5e 100644 --- a/core/src/test/scripts/test_data/study_maf_test/result_report.html +++ b/core/src/test/scripts/test_data/study_maf_test/result_report.html @@ -159,18 +159,18 @@

General

Validating case lists – - - Error + + Info – – - No case list found for stable_id 'brca_tcga_pub_all'. Consider adding 'add_global_case_list: true' to the study metadata file + Validation of case list folder complete – - - Info + + Error – – - Validation of case lists complete + No case list found for stable_id 'brca_tcga_pub_all', consider adding 'add_global_case_list: true' to the study metadata file – diff --git a/core/src/test/scripts/test_data/study_missing_caselists/data_samples.txt b/core/src/test/scripts/test_data/study_missing_caselists/data_samples.txt new file mode 100644 index 00000000000..50f22028c97 --- /dev/null +++ b/core/src/test/scripts/test_data/study_missing_caselists/data_samples.txt @@ -0,0 +1,13 @@ +#Patient Identifier Sample Identifier Subtype Cancer Type Cancer Type Detailed +#Identifier to uniquely specify a patient. A unique sample identifier. Subtype description. Disease type. Cancer Type Detailed. +#STRING STRING STRING STRING STRING +#1 1 1 1 1 +PATIENT_ID SAMPLE_ID SUBTYPE CANCER_TYPE CANCER_TYPE_DETAILED +TEST-PAT1 TEST-PAT1-SAMPLE1 Luminal A Cancer_type20 Cancer_type20_Sub16 +TEST-PAT1 TEST-PAT1-SAMPLE2 Luminal B Cancer_type27 Cancer_type27_Sub16 +TEST-PAT1 TEST-PAT1-SAMPLE3 basal-like Cancer_type8 Cancer_type8_Sub13 +TEST-PAT1 TEST-PAT1-SAMPLE4 basal-like Cancer_type17 Cancer_type17_Sub10 +TEST-PAT1 TEST-PAT1-SAMPLE5 Her2 enriched Cancer_type27 Cancer_type27_Sub18 +TEST-PAT1 TEST-PAT1-SAMPLE6 Luminal A Cancer_type7 Cancer_type7_Sub13 +TEST-PAT2 TEST-PAT2-SAMPLE1 Luminal A Cancer_type23 Cancer_type23_Sub3 +TEST-PAT2 TEST-PAT2-SAMPLE2 basal-like Cancer_type7 Cancer_type7_Sub5 diff --git a/core/src/test/scripts/test_data/study_missing_caselists/meta_samples.txt b/core/src/test/scripts/test_data/study_missing_caselists/meta_samples.txt new file mode 100644 index 00000000000..3dba005bab9 --- /dev/null +++ b/core/src/test/scripts/test_data/study_missing_caselists/meta_samples.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: spam 
+genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: data_samples.txt diff --git a/core/src/test/scripts/test_data/study_missing_caselists/meta_study.txt b/core/src/test/scripts/test_data/study_missing_caselists/meta_study.txt new file mode 100644 index 00000000000..9ccec17d188 --- /dev/null +++ b/core/src/test/scripts/test_data/study_missing_caselists/meta_study.txt @@ -0,0 +1,5 @@ +cancer_study_identifier: spam +type_of_cancer: brca +name: Spam (spam) +description: Baked beans +short_name: Spam diff --git a/core/src/test/scripts/test_data/study_quotes/meta_mutations_extended.txt b/core/src/test/scripts/test_data/study_quotes/meta_mutations_extended.txt index be1b8b72dba..813510be072 100644 --- a/core/src/test/scripts/test_data/study_quotes/meta_mutations_extended.txt +++ b/core/src/test/scripts/test_data/study_quotes/meta_mutations_extended.txt @@ -6,3 +6,4 @@ show_profile_in_analysis_tab: true profile_description: Mutation data from whole exome sequencing. profile_name: Mutations data_filename: brca_tcga_pub.maf +swissprot_identifier: name diff --git a/core/src/test/scripts/test_data/study_quotes/result_report.html b/core/src/test/scripts/test_data/study_quotes/result_report.html index 98e8bb25d9d..4da015163ed 100644 --- a/core/src/test/scripts/test_data/study_quotes/result_report.html +++ b/core/src/test/scripts/test_data/study_quotes/result_report.html @@ -163,7 +163,7 @@

General

Info – – - Validation of case lists complete + Validation of case list folder complete – diff --git a/core/src/test/scripts/unit_tests_validate_data.py b/core/src/test/scripts/unit_tests_validate_data.py index 9e12d38e9e8..d5355df79e6 100755 --- a/core/src/test/scripts/unit_tests_validate_data.py +++ b/core/src/test/scripts/unit_tests_validate_data.py @@ -686,11 +686,12 @@ def test_normal_samples_list_in_maf(self): {'normal_samples_list': 'TCGA-BH-A18H-10,' 'TCGA-B6-A0RS-10,' - '' # TCGA-BH-A0HP-10 + '' # TCGA-BH-A0HP-10 'TCGA-BH-A18P-11, ' 'TCGA-C8-A138-10' 'TCGA-A2-A0EY-10,' - ''}) # TCGA-A8-A08G-10 + '', # TCGA-A8-A08G-10 + 'swissprot_identifier': 'accession'}) # we expect 2 errors about invalid normal samples self.assertEqual(len(record_list), 2) # check if both messages come from printDataInvalidStatement: @@ -727,14 +728,94 @@ def test_warning_for_missing_SWISSPROT(self): # WARNING: data_mutations_missing_swissprot.maf: line 1: SWISSPROT column is recommended if you want to make sure that a specific isoform is used for the PFAM domains drawing in the mutations view.; wrong value: 'SWISSPROT column not found' self.assertEqual(len(record_list), 1) # check if both messages come from printDataInvalidStatement: - self.assertIn("swissprot", record_list[0].getMessage().lower()) + self.assertIn("swissprot column is recommended", + record_list[0].getMessage().lower()) + + def test_unknown_or_invalid_swissprot(self): + """Test errors for invalid and unknown accessions under SWISSPROT.""" + self.logger.setLevel(logging.WARNING) + record_list = self.validate( + 'mutations/data_mutations_invalid_swissprot.maf', + validateData.MutationsExtendedValidator, + extra_meta_fields={ + 'swissprot_identifier': 'accession'}) + self.assertEqual(len(record_list), 2) + record_iterator = iter(record_list) + # used a name instead of an accession + record = record_iterator.next() + self.assertEqual(record.levelno, logging.ERROR) + self.assertEqual(record.line_number, 3) + 
self.assertEqual(record.cause, 'A1CF_HUMAN') + self.assertNotIn('portal', record.getMessage().lower()) + # neither a name nor an accession + record = record_iterator.next() + self.assertEqual(record.levelno, logging.ERROR) + self.assertEqual(record.line_number, 5) + self.assertEqual(record.cause, 'P99999,Z9ZZZ9ZZZ9') + self.assertNotIn('portal', record.getMessage().lower()) + + def test_name_as_swissprot_identifier(self): + """Test if the SWISSPROT column is parsed as a name if meta says so.""" + self.logger.setLevel(logging.WARNING) + record_list = self.validate( + 'mutations/data_mutations_name_swissprot.maf', + validateData.MutationsExtendedValidator, + extra_meta_fields={'swissprot_identifier': 'name'}) + # the same errors as in test_implicit_name_as_swissprot_identifier() + self.assert_swissprotname_validated(record_list) + + def test_implicit_name_as_swissprot_identifier(self): + """Test if the SWISSPROT column is parsed as a name if unspecified.""" + self.logger.setLevel(logging.WARNING) + record_list = self.validate( + 'mutations/data_mutations_name_swissprot.maf', + validateData.MutationsExtendedValidator) + # warning about the implicit Swiss-Prot identifier type + record = record_list[0] + self.assertEqual(record.levelno, logging.WARNING) + self.assertIn('swissprot_identifier', record.getMessage()) + # the same errors as in test_name_as_swissprot_identifier() + self.assert_swissprotname_validated(record_list[1:]) + + def assert_swissprotname_validated(self, record_list): + """Assert names are validated in data_mutations_name_swissprot.maf.""" + self.assertEqual(len(record_list), 2) + record_iterator = iter(record_list) + # used an accession instead of a name + record = record_iterator.next() + self.assertEqual(record.levelno, logging.ERROR) + self.assertEqual(record.line_number, 3) + self.assertEqual(record.cause, 'Q9NQ94') + self.assertNotIn('portal', record.getMessage().lower()) + # neither a name nor an accession + record = record_iterator.next() + 
self.assertEqual(record.levelno, logging.ERROR) + self.assertEqual(record.line_number, 5) + self.assertEqual(record.cause, 'A1CF_HUMAN,HBB_YEAST') + self.assertNotIn('portal', record.getMessage().lower()) + + def test_invalid_swissprot_identifier_type(self): + """Test if the validator rejects files with nonsensical id types.""" + self.logger.setLevel(logging.ERROR) + mvals, mtype = validateData.cbioportal_common.parse_metadata_file( + 'test_data/mutations/meta_mutations_invalid_swissprot_idspec.txt', + self.logger, + study_id='spam') + record_list = self.get_log_records() + self.assertEqual(len(record_list), 1) + record = record_list.pop() + self.assertEqual(record.levelno, logging.ERROR) + self.assertEqual(record.cause, 'namelessly') + self.assertIsNone(mtype, 'metadata file was not rejected as invalid') def test_isValidAminoAcidChange(self): """Test if proper warnings are given for wrong/blank AA change vals.""" # set level according to this test case: self.logger.setLevel(logging.WARNING) - record_list = self.validate('mutations/data_mutations_wrong_aa_change.maf', - validateData.MutationsExtendedValidator) + record_list = self.validate( + 'mutations/data_mutations_wrong_aa_change.maf', + validateData.MutationsExtendedValidator, + extra_meta_fields={'swissprot_identifier': 'accession'}) self.assertEqual(len(record_list), 5) record_iterator = iter(record_list) # empty field (and no HGVSp_Short column) @@ -774,7 +855,9 @@ def test_silent_mutation_skipped(self): # set level according to this test case: self.logger.setLevel(logging.INFO) record_list = self.validate('mutations/data_mutations_some_silent.maf', - validateData.MutationsExtendedValidator) + validateData.MutationsExtendedValidator, + extra_meta_fields={ + 'swissprot_identifier': 'name'}) # we expect 5 infos: 3 about silent mutations, 2 general info messages: self.assertEqual(len(record_list), 5) # First 3 INFO messages should be something like: "Validation of line skipped due to cBioPortal's filtering. 
Filtered types:" @@ -795,7 +878,9 @@ def test_alternative_notation_for_intergenic_mutation(self): # set level according to this test case: self.logger.setLevel(logging.WARNING) record_list = self.validate('mutations/data_mutations_silent_alternative.maf', - validateData.MutationsExtendedValidator) + validateData.MutationsExtendedValidator, + extra_meta_fields={ + 'swissprot_identifier': 'name'}) # we expect 1 ERROR and 2 WARNINGs : self.assertEqual(len(record_list), 3) @@ -1096,6 +1181,21 @@ def test_duplicated_stable_id(self): self.assertTrue(record.cause.startswith('brca_tcga_pub_all'), "Error is not about the id 'brca_tcga_pub_all'") + def test_missing_caselists(self): + """Test if errors are issued if certain case lists are not defined.""" + self.logger.setLevel(logging.ERROR) + validateData.validate_study( + 'test_data/study_missing_caselists', + PORTAL_INSTANCE, + self.logger) + record_list = self.get_log_records() + self.assertEqual(len(record_list), 1) + # _all + record = record_list.pop() + self.assertEqual(record.levelno, logging.ERROR) + self.assertIn('spam_all', record.getMessage()) + self.assertIn('add_global_case_list', record.getMessage()) + class StableIdValidationTestCase(LogBufferTestCase): @@ -1129,5 +1229,19 @@ def test_unnecessary_and_wrong_stable_id(self): self.assertEqual(warning.cause, 'stable_id') +class DataFileIOTestCase(PostClinicalDataFileTestCase): + """Test if the right behavior occurs if study files cannot be read.""" + + def test_missing_datafile(self): + """Test the error if files referenced from meta files do not exist.""" + self.logger.setLevel(logging.ERROR) + record_list = self.validate('filename-that-does-not-exist.txt', + validateData.ContinuousValuesValidator) + self.assertEqual(len(record_list), 1) + record = record_list.pop() + self.assertEqual(record.levelno, logging.ERROR) + self.assertIn('file', record.getMessage().lower()) + + if __name__ == '__main__': unittest.main(buffer=True) diff --git a/docs/File-Formats.md 
b/docs/File-Formats.md index 33596124a6e..c0e670383c3 100644 --- a/docs/File-Formats.md +++ b/docs/File-Formats.md @@ -34,7 +34,7 @@ This file contains metadata about the cancer study. The file contains the follow 3. **name**: The name of the cancer study, e.g., "Breast Cancer (Jones Lab 2013)". 4. **description**: A description of the cancer study, e.g., "Comprehensive profiling of 103 breast cancer samples. Generated by the Jones Lab 2013". This description may contain one or more URLs to relevant information. 5. **citation (optional)**: A relevant citation, e.g., "TCGA, Nature 2012". -6. **pmid (optional)**: A relevant pubmed id. +6. **pmid (optional)**: A relevant pubmed id. If used, the field citation has to be filled, too. 7. **short_name**: A short name used for display used on various web pages within the cBioPortal, e.g., "BRCA (Jones)". 8. **groups (optional)**: When using an authenticating cBioPortal, lists the user-groups that are allowed access to this study. Multiple groups are separated with a semicolon ";". The study will be invisible to users not in _at least one_ of the listed groups, as if it wasn't loaded at all. e.g., "PUBLIC;GDAC;SU2C-PI3K". see [User-Authorization](User-Authorization.md) for more information on groups 9. **add_global_case_list (optional)**: set to 'true' if you would like the "All samples" case list to be generated automatically for you. See also [Case lists](#case-lists). @@ -484,7 +484,8 @@ The mutation metadata file should contain the following fields: 5. **show_profile_in_analysis_tab**: true 6. **profile_name**: A name for the mutation data, e.g., "Mutations". 7. **profile_description**: A description of the mutation data, e.g., "Mutation data from whole exome sequencing.". -8. **data_filename**: <your datafile> +8. **data_filename**: <your data file> +9. 
**swissprot_identifier (optional)**: either `accession` or `name`, indicating the type of identifier in the `SWISSPROT` column An example metadata file would be: @@ -516,7 +517,7 @@ Note: next to Hugo_Symbol, it is recommended to have the Entrez gene ID: The following extra annotation columns are also important for making sure mutation specific UI functionality works well in the portal: * **Protein_position**: (annotation column) Required to initialize the 3D viewer in [mutations view](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#mutation_details) -* **SWISSPROT**: (annotation column) swissprot code, e.g. O11H1_HUMAN. Is not absolutely required, but not having it may result in inconsistent PDB structure matching in [mutations view](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#mutation_details). 
:warning: After running vcf2maf (or VEP) the SWISSPROT column contains the uniprot accession and **NOT** the entry name (e.g. for TP53 the column will contain P04637 and not P53_HUMAN). cBioPortal currently only supports the entry name. Work is presently being done to allow cBioPortal to import the accessions, but until that work is finished you will have to convert the accessions to entry names yourself. +* **SWISSPROT**: (annotation column) UniProtKB/SWISS-PROT name (formerly called ID) or accession code depending on the value of the `swissprot_identifier` metadatum, e.g. O11H1_HUMAN or Q8NG94. Is not absolutely required, but not having it may result in inconsistent PDB structure matching in [mutations view](http://www.cbioportal.org/index.do?cancer_study_list=brca_tcga_pub&cancer_study_id=brca_tcga_pub&genetic_profile_ids_PROFILE_MUTATION_EXTENDED=brca_tcga_pub_mutations&genetic_profile_ids_PROFILE_COPY_NUMBER_ALTERATION=brca_tcga_pub_gistic&genetic_profile_ids_PROFILE_MRNA_EXPRESSION=brca_tcga_pub_mrna_median_Zscores&Z_SCORE_THRESHOLD=2.0&RPPA_SCORE_THRESHOLD=2.0&data_priority=0&case_set_id=brca_tcga_pub_complete&case_ids=&patient_case_select=sample&gene_set_choice=prostate-cancer%3A-ar-signaling-%2810-genes%29&gene_list=TP53&clinical_param_selection=null&tab_index=tab_visualize&Action=Submit#mutation_details). ##### Extending the MAF format :warning: **Attention**: for the list of ***required*** and ***recommended*** fields, check the subsection above :arrow_up:. The section below :arrow_down: only describes some of the *extra* fields you can have in your mutations file. @@ -720,7 +721,7 @@ When **not** using the *add_global_case_list* attribute in [Study metadata](#can The case list file should contain the following fields: 1. **cancer_study_identifier**: same value as specified in [study meta file](#cancer-study) -2. **stable_id**: typically the cancer_study_identifier with an relevant suffix, e.g., "_custom". 
There are some naming rules to follow if you want the case list to be selected automatically in the query UI base on the selected sample profiles. See subsection below. +2. **stable_id**: it must contain the cancer_study_identifier followed by an underscore. Typically, after this a relevant suffix, e.g., "_custom", is added. There are some naming rules to follow if you want the case list to be selected automatically in the query UI based on the selected sample profiles. See subsection below. 3. **case_list_name**: A name for the patient list, e.g., "All Tumors". 4. **case_list_description**: A description of the patient list, e.g., "All tumor samples (825 samples).". 5. **case_list_ids**: A tab-delimited list of sample ids from the dataset. diff --git a/docs/Updating-pfam_graphics-table.md b/docs/Updating-pfam_graphics-table.md new file mode 100644 index 00000000000..b89e93bd33a --- /dev/null +++ b/docs/Updating-pfam_graphics-table.md @@ -0,0 +1,18 @@ +# Updating pfam_graphics table + +Updating the pfam_graphics table is easy using the scripts provided in cBioPortal. There is no need to download any files beforehand. + +1- Empty the table `pfam_graphics` (the scripts raise an error if one of the domains that we want to add is already in the table). +```sql +TRUNCATE TABLE pfam_graphics; +``` + +2- Run the script `FetchPfamGraphics.java` with the file path of the output file and its name as an argument. This function first searches for all the proteins from mouse that have been reviewed in UniProtKB, then tries to retrieve their Pfam domains, and finally saves the information in a file. This script can also be run in the command line by going into `/core/src/main/scripts/` and typing: +``` +fetchPfamGraphicsData.sh +``` + +3- Run the script `ImportPfamGraphics.java`, which uses the output of the previous script as an input to write all the information retrieved to the database.
This script can also be run in the command line by typing: +``` +importPfamGraphics.pl +``` \ No newline at end of file diff --git a/docs/portal.properties-Reference.md b/docs/portal.properties-Reference.md index 96db601ebf8..4fab42e212c 100644 --- a/docs/portal.properties-Reference.md +++ b/docs/portal.properties-Reference.md @@ -45,7 +45,15 @@ google_analytics_profile_id # Password Authentication -The portal supports password authentication via Google+. To active password authentication, then the following properties are required: +The portal supports password authentication via Google+. Before you start, you need to set up a Google account that will own the authentication API. Follow https://developers.google.com/identity/sign-in/web/devconsole-project to get a client ID and secret. Enter them in portal.properties: +``` +googleplus.consumer.key=195047654890-499gl89hj65j8d2eorqe0jvjnfaxcln0.apps.googleusercontent.com +googleplus.consumer.secret=2jCfg4SPWdGfXF44WC588dK +``` +(Note: these are just examples; you need to get your own.) You will also need to go to "Google+ API" and click the Enable button. In case of problems, make sure to enable DEBUG logging for org.springframework.social and org.springframework.security.web.authentication.
+ + +To activate password authentication, the following properties are required: ``` app.name= authenticate=googleplus diff --git a/pom.xml b/pom.xml index 97742048ee1..b7a145b88ab 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ master pom Portal Master - 1.2.4 + 1.2.5 master maven module diff --git a/portal/pom.xml b/portal/pom.xml index 550f1b45e02..ef43fc1b192 100644 --- a/portal/pom.xml +++ b/portal/pom.xml @@ -4,7 +4,7 @@ master org.mskcc.cbio - 1.2.4 + 1.2.5 4.0.0 cbioportal diff --git a/portal/src/main/webapp/WEB-INF/jsp/mutation_details.jsp b/portal/src/main/webapp/WEB-INF/jsp/mutation_details.jsp index ed1497cff97..530f87ac509 100644 --- a/portal/src/main/webapp/WEB-INF/jsp/mutation_details.jsp +++ b/portal/src/main/webapp/WEB-INF/jsp/mutation_details.jsp @@ -97,8 +97,8 @@ }, view: { vis3d: { - //for https, use a proxy since rcsb.org is not serving https and browsers will complain about the mixed https/http content - pdbUri: (document.location.protocol != "https:"? "http://files.rcsb.org/view/" : "api/proxy/jsmol/") + // use https for all portal instances + pdbUri: "https://files.rcsb.org/view/" }, mutationTable: { columnRender: { diff --git a/portal/src/main/webapp/WEB-INF/jsp/patient_view/clinical_timeline.jsp b/portal/src/main/webapp/WEB-INF/jsp/patient_view/clinical_timeline.jsp index 180829df094..2b1614a9e18 100644 --- a/portal/src/main/webapp/WEB-INF/jsp/patient_view/clinical_timeline.jsp +++ b/portal/src/main/webapp/WEB-INF/jsp/patient_view/clinical_timeline.jsp @@ -134,10 +134,11 @@ if (specRefNum) { if (specRefNum.length > 1) { console.warn("More than 1 specimen reference number found in tooltip table"); - } - sortOrder = caseIds.indexOf(specRefNum[0][1]); - if (sortOrder === -1) { - sortOrder = Infinity; + } else if (specRefNum.length === 1) { + sortOrder = caseIds.indexOf(specRefNum[0][1]); + if (sortOrder === -1) { + sortOrder = Infinity; + } } } return sortOrder; diff --git a/portal/src/main/webapp/WEB-INF/jsp/patient_view/cna.jsp
b/portal/src/main/webapp/WEB-INF/jsp/patient_view/cna.jsp index 5367207d10c..198b2dbc4dd 100644 --- a/portal/src/main/webapp/WEB-INF/jsp/patient_view/cna.jsp +++ b/portal/src/main/webapp/WEB-INF/jsp/patient_view/cna.jsp @@ -35,39 +35,14 @@