Merge pull request #16 from macarthur-lab/move_output_files_to_base_dir

Move output files to base dir
macarthur-lab · Oct 4, 2016 · e46e239 · e46e239
2 parents 4d9dde4 + c73e7a7
commit e46e239
Show file tree

Hide file tree

Showing 23 changed files with 2,507 additions and 46,790 deletions.
diff --git a/README.md b/README.md
@@ -21,14 +21,14 @@ To create a flat representation of ClinVar suited for our purposes, we took seve
 1. Download the latest XML and TXT dumps from ClinVar FTP.
 2. Parse the XML file using [src/parse_clinvar_xml.py](src/parse_clinvar_xml.py) to extract fields of interest into a flat file.
 3. Sort on genomic coordinates (we use GRCh37).
-4. De-duplicate using [src/dedup_clinvar.py](src/dedup_clinvar.py), combining records that refer to the same genomic variant.
-5. Normalize using [our Python implementation](https://github.com/ericminikel/minimal_representation/blob/master/normalize.py) of [vt normalize](http://genome.sph.umich.edu/wiki/Variant_Normalization) (see [[Tan 2015]]).
-6. Join to some of the fields of interest from the TXT file using [src/join_data.R](src/join_data.R), and create some new fields&dagger;.
-7. Sort and de-duplicate again (this removes dups arising from duplicate records in the TXT dump).
+4. Normalize using [our Python implementation](https://github.com/ericminikel/minimal_representation/blob/master/normalize.py) of [vt normalize](http://genome.sph.umich.edu/wiki/Variant_Normalization) (see [[Tan 2015]]).
+5. Join to some of the fields of interest from the TXT file using [src/join_data.R](src/join_data.R), and create some new fields&dagger;.
+6. Sort and de-duplicate (this removes duplicates arising from duplicate records in the TXT dump).
 
-&dagger;Because a ClinVar record may contain multiple assertions of Clinical Significance, we defined two additional columns:
+&dagger;Because a ClinVar record may contain multiple assertions of Clinical Significance, we defined three additional columns:
 
 + `pathogenic` is `1` if the variant has *ever* been asserted "Pathogenic" or "Likely pathogenic" by any submitter for any phenotype, and `0` otherwise
++ `benign` is `1` if the variant has *ever* been asserted "Benign" or "Likely benign" by any submitter for any phenotype, and `0` otherwise
 + `conflicted` is `1` if the variant has *ever* been asserted "Pathogenic" or "Likely pathogenic" by any submitter for any phenotype, and has also been asserted "Benign" or "Likely benign" by any submitter for any phenotype, and `0` otherwise. Note that having one assertion of pathogenic and one of uncertain significance does *not* count as conflicted for this column. 
 
 To run the pipeline:
@@ -40,10 +40,10 @@ python master.py -R hg19.fasta -E ExAC.r0.3.1.sites.vep.vcf.gz
 
 #### Results
 
-The resulting output files are:
-* [output/clinvar.tsv.gz](output/clinvar.tsv.gz)  
-* [output/clinvar.vcf.gz](output/clinvar.vcf)  
-* [output/clinvar_with_exac.tsv.gz](output/clinvar_with_exac.tsv.gz)  
+The main output files are:
+* [clinvar.tsv.gz](clinvar.tsv.gz)  
+* [clinvar.vcf.gz](clinvar.vcf.gz)  
+* [clinvar_with_exac.tsv.gz](clinvar_with_exac.tsv.gz)  
 
 
 #### Usage notes

diff --git a/clinvar.tsv.gz b/clinvar.tsv.gz
diff --git a/clinvar.tsv.gz.tbi b/clinvar.tsv.gz.tbi
diff --git a/clinvar.vcf.gz b/clinvar.vcf.gz
diff --git a/clinvar.vcf.gz.tbi b/clinvar.vcf.gz.tbi
diff --git a/clinvar_example_750_rows.tsv b/clinvar_example_750_rows.tsv
diff --git a/clinvar_example_750_rows.vcf b/clinvar_example_750_rows.vcf
diff --git a/clinvar_stats.txt b/clinvar_stats.txt
diff --git a/clinvar_with_exac.tsv.gz b/clinvar_with_exac.tsv.gz
diff --git a/clinvar_with_exac.tsv.gz.tbi b/clinvar_with_exac.tsv.gz.tbi
diff --git a/clinvar_with_exac_example_750_rows.tsv b/clinvar_with_exac_example_750_rows.tsv
diff --git a/output/DEPRECATED.txt b/output/DEPRECATED.txt
@@ -0,0 +1,3 @@
+This output directory is being deprecated.
+For easier access, all output files are now located in the top-level directory of the repo.
+
diff --git a/output/clinvar.tsv.gz b/output/clinvar.tsv.gz
diff --git a/output/clinvar.tsv.gz b/output/clinvar.tsv.gz
@@ -0,0 +1 @@
+../clinvar.tsv.gz
diff --git a/output/clinvar.tsv.gz.tbi b/output/clinvar.tsv.gz.tbi
diff --git a/output/clinvar.tsv.gz.tbi b/output/clinvar.tsv.gz.tbi
@@ -0,0 +1 @@
+../clinvar.tsv.gz.tbi
diff --git a/output/clinvar.vcf.gz b/output/clinvar.vcf.gz
diff --git a/output/clinvar.vcf.gz b/output/clinvar.vcf.gz
@@ -0,0 +1 @@
+../clinvar.vcf.gz
diff --git a/output/clinvar.vcf.gz.tbi b/output/clinvar.vcf.gz.tbi
diff --git a/output/clinvar.vcf.gz.tbi b/output/clinvar.vcf.gz.tbi
@@ -0,0 +1 @@
+../clinvar.vcf.gz.tbi
diff --git a/output/clinvar_example_750_rows.tsv b/output/clinvar_example_750_rows.tsv
diff --git a/output/clinvar_example_750_rows.tsv b/output/clinvar_example_750_rows.tsv
@@ -0,0 +1 @@
+../clinvar_example_750_rows.tsv
diff --git a/output/clinvar_example_750_rows.vcf b/output/clinvar_example_750_rows.vcf
diff --git a/output/clinvar_example_750_rows.vcf b/output/clinvar_example_750_rows.vcf
@@ -0,0 +1 @@
+../clinvar_example_750_rows.vcf
diff --git a/output/clinvar_stats.txt b/output/clinvar_stats.txt
diff --git a/output/clinvar_stats.txt b/output/clinvar_stats.txt
@@ -0,0 +1 @@
+../clinvar_stats.txt
diff --git a/output/clinvar_with_exac.tsv.gz b/output/clinvar_with_exac.tsv.gz
diff --git a/output/clinvar_with_exac.tsv.gz b/output/clinvar_with_exac.tsv.gz
@@ -0,0 +1 @@
+../clinvar_with_exac.tsv.gz
diff --git a/output/clinvar_with_exac.tsv.gz.tbi b/output/clinvar_with_exac.tsv.gz.tbi
diff --git a/output/clinvar_with_exac.tsv.gz.tbi b/output/clinvar_with_exac.tsv.gz.tbi
@@ -0,0 +1 @@
+../clinvar_with_exac.tsv.gz.tbi
diff --git a/output/clinvar_with_exac_example_750_rows.tsv b/output/clinvar_with_exac_example_750_rows.tsv
diff --git a/output/clinvar_with_exac_example_750_rows.tsv b/output/clinvar_with_exac_example_750_rows.tsv
@@ -0,0 +1 @@
+../clinvar_with_exac_example_750_rows.tsv
diff --git a/src/master.py b/src/master.py
@@ -87,12 +87,12 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
 # now de-dup _again_, because the tab-delimited summary contains dups
 job.add("python -u IN:dedup_clinvar.py < IN:clinvar_combined_sorted.tsv | tee clinvar.tsv | bgzip -c > OUT:clinvar.tsv.gz")  # clinvar_combined_sorted_dedup.tsv.gz
 job.add("tabix -S 1 -s 1 -b 2 -e 2 IN:clinvar.tsv.gz", output_filenames=["clinvar.tsv.gz.tbi"])
-job.add("cp IN:clinvar.tsv.gz IN:clinvar.tsv.gz.tbi ../output", output_filenames=["../output/clinvar.tsv", "../output/clinvar.tsv.gz", "../output/clinvar.tsv.gz.tbi"])
+job.add("cp IN:clinvar.tsv.gz IN:clinvar.tsv.gz.tbi ../", output_filenames=["../clinvar.tsv", "../clinvar.tsv.gz", "../clinvar.tsv.gz.tbi"])
 
 # create vcf
 job.add("python -u IN:clinvar_table_to_vcf.py IN:clinvar.tsv | bgzip -c > OUT:clinvar.vcf.gz")  # create compressed version
 job.add("tabix IN:clinvar.vcf.gz", output_filenames=["clinvar.vcf.gz.tbi"])
-job.add("cp IN:clinvar.vcf.gz IN:clinvar.vcf.gz.tbi ../output", output_filenames=["../output/clinvar.vcf.gz", "../output/clinvar.vcf.gz.tbi"])
+job.add("cp IN:clinvar.vcf.gz IN:clinvar.vcf.gz.tbi ../", output_filenames=["../clinvar.vcf.gz", "../clinvar.vcf.gz.tbi"])
 
 # create tsv table with extra fields from ExAC: filter, ac_adj, an_adj, popmax_ac, popmax_an, popmax
 if args.exac_sites_vcf:
@@ -101,12 +101,12 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
     job.add("tabix IN:"+normalized_vcf, output_filenames=[normalized_vcf+".tbi"])
     job.add("python -u IN:add_exac_fields.py -i IN:clinvar.tsv -e IN:%(normalized_vcf)s | bgzip -c > OUT:clinvar_with_exac.tsv.gz" % locals())
     job.add("tabix -S 1 -s 1 -b 2 -e 2 IN:clinvar_with_exac.tsv.gz", output_filenames=["clinvar_with_exac.tsv.gz.tbi"])
-    job.add("cp IN:clinvar_with_exac.tsv.gz IN:clinvar_with_exac.tsv.gz.tbi ../output", output_filenames=["../output/clinvar_with_exac.tsv.gz", "../output/clinvar_with_exac.tsv.gz.tbi"])
+    job.add("cp IN:clinvar_with_exac.tsv.gz IN:clinvar_with_exac.tsv.gz.tbi ../", output_filenames=["../clinvar_with_exac.tsv.gz", "../clinvar_with_exac.tsv.gz.tbi"])
 
 # create uncompressed example files that contain the first 750 lines of the compressed TSV and VCF files so people can easily see typical values online on GitHub
-job.add("gunzip -c IN:clinvar.vcf.gz | head -n 750 > OUT:../output/clinvar_example_750_rows.vcf")
-job.add("gunzip -c IN:clinvar.tsv.gz | head -n 750 > OUT:../output/clinvar_example_750_rows.tsv")
-job.add("gunzip -c IN:clinvar_with_exac.tsv.gz | head -n 750 > OUT:../output/clinvar_with_exac_example_750_rows.tsv")
+job.add("gunzip -c IN:clinvar.vcf.gz | head -n 750 > OUT:../clinvar_example_750_rows.vcf")
+job.add("gunzip -c IN:clinvar.tsv.gz | head -n 750 > OUT:../clinvar_example_750_rows.tsv")
+job.add("gunzip -c IN:clinvar_with_exac.tsv.gz | head -n 750 > OUT:../clinvar_with_exac_example_750_rows.tsv")
 
 # create a stats file that summarizes some of the columns
 # Columns: 1: chrom, 2: pos, 3: ref, 4: alt, 5: mut, 6: measureset_id, 7: symbol, 8: clinical_significance, 9: pathogenic, 10: benign, 11: conflicted, 12: review_status, 13: gold_stars, 14: hgvs_c, 15: hgvs_p, 16: all_submitters, 17: all_traits, 18: all_pmids, 19: inheritance_modes, 20: age_of_onset, 21: prevalence, 22: disease_mechanism, 23: origin, 24: xrefs
@@ -122,7 +122,7 @@ def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
 done
 """, input_filenames=["clinvar.tsv.gz", "master.py"])
 
-job.add("cp IN:clinvar_stats.txt OUT:../output/clinvar_stats.txt")
+job.add("cp IN:clinvar_stats.txt OUT:../clinvar_stats.txt")
 
 # run the above commands
 jr.run(job)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		This output directory is being deprecated.
		For easier access, all output files will now be in the top level directory of the repo.