Merge pull request #17 from calico/data-processing
Added training data and QTL scripts
Showing 17 changed files with 2,465 additions and 0 deletions.
README.md (+33 lines)
## QTL data processing

The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs, and ipaQTLs from the results of the eQTL Catalogue, and to construct distance- and expression-matched negative SNPs.

*Notes*:
- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
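If you prefer to script that download, a minimal sketch (assuming the file should land in the working directory; adjust `local_path` as needed):

```python
import os
import urllib.request

# GTEx v8 median TPM matrix (URL from the note above)
TPM_URL = ('https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/'
           'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz')
local_path = os.path.basename(TPM_URL)

# Skip the download if the file is already present
if not os.path.isfile(local_path):
    urllib.request.urlretrieve(TPM_URL, local_path)
```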
As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):
1. download_finemap.py
2. download_sumstat.py
3. merge_finemapping_tables.py
4. make_expression_tables.py

To prepare the sQTL dataset, run these scripts:
1. sqtl_make_positive_sets.py
2. sqtl_make_negative_sets.py

To prepare the paQTL dataset, run these scripts:
1. paqtl_make_positive_sets.py
2. paqtl_make_negative_sets.py

To prepare the ipaQTL dataset, run these scripts:
1. ipaqtl_make_positive_sets.py
2. ipaqtl_make_negative_sets.py

Finally, to generate the QTL VCF files, run this script:
1. make_vcfs.py
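For reference, the whole ordering can be captured in a small driver. This is a hypothetical sketch, assuming each script runs from this folder without required arguments:

```python
import subprocess

# Ordered pipeline, as documented above: prerequisites first,
# then the three QTL datasets, then the final VCF generation.
PIPELINE = [
    'download_finemap.py',
    'download_sumstat.py',
    'merge_finemapping_tables.py',
    'make_expression_tables.py',
    'sqtl_make_positive_sets.py',
    'sqtl_make_negative_sets.py',
    'paqtl_make_positive_sets.py',
    'paqtl_make_negative_sets.py',
    'ipaqtl_make_positive_sets.py',
    'ipaqtl_make_negative_sets.py',
    'make_vcfs.py',
]

for script in PIPELINE:
    # Stop at the first failure so later steps never see partial inputs
    subprocess.run(['python', script], check=True)
```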
download_finemap.py (+62 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_finemap.py
Download QTL Catalogue fine-mapping results.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']

    ################################################
    # txrevise for splicing / polyA / TSS QTLs

    os.makedirs('txrev', exist_ok=True)
    txrev_df = samples_df[samples_df.quant_method == 'txrev']

    jobs = []
    for all_ftp_path in txrev_df.ftp_path:
        # e.g. ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
        # maps to ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz

        # derive the credible-set filename from the sumstat filename
        all_ftp_file = all_ftp_path.split('/')[-1]
        fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt')

        fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/'
        fine_ftp_path += fine_ftp_file

        # download only files not already present locally
        local_path = 'txrev/%s' % fine_ftp_file
        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, fine_ftp_path)
            jobs.append(cmd)

    # download in parallel, 4 files at a time
    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
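Both download scripts lean on `util.exec_par`, which is not among the files shown here. A minimal sketch of what it presumably does, assuming it runs shell commands in parallel with a bounded worker count (the repo's actual `util` module may differ):

```python
import subprocess
from concurrent.futures import ThreadPoolExecutor

def exec_par(cmds, max_proc, verbose=False):
    """Run shell commands in parallel, at most max_proc at a time."""
    def run(cmd):
        if verbose:
            print(cmd)
        subprocess.run(cmd, shell=True, check=True)

    with ThreadPoolExecutor(max_workers=max_proc) as pool:
        # list() forces all jobs to finish and surfaces any exception
        list(pool.map(run, cmds))
```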
download_sumstat.py (+56 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_sumstat.py
Download QTL Catalogue sumstats.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']

    ################################################
    # ge for sumstat (we want SNPs and possibly also base expression)

    os.makedirs('ge', exist_ok=True)
    ge_df = samples_df[samples_df.quant_method == 'ge']

    jobs = []
    for all_ftp_path in ge_df.ftp_path:
        # e.g. ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz

        local_path = 'ge/%s' % all_ftp_path.split('/')[-1]

        # download only files not already present locally
        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, all_ftp_path)
            jobs.append(cmd)

    # download in parallel, 4 files at a time
    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
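Both scripts keep only GTEx rows for now. To see which other studies and quantification methods the same index exposes before widening that filter, a quick sketch using the columns the scripts already rely on:

```python
import pandas as pd

# Same remote index both download scripts read
paths_url = ('https://raw.githubusercontent.com/eQTL-Catalogue/'
             'eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv')
samples_df = pd.read_csv(paths_url, sep='\t')

# Count datasets per study and quantification method
print(samples_df.groupby(['study', 'quant_method']).size())
```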
paqtl_make_negative_sets.py (+196 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import util

import numpy as np
import pandas as pd

import pyranges as pr

'''
paqtl_make_negative_sets.py
Build tables with negative (non-causal) SNPs for paQTLs.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # Parameters
    pip_cutoff = 0.01
    max_distance = 10000
    gene_pad = 50
    apa_file = 'polyadb_intron.bed'
    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'

    # Define tissues
    tissue_names = [
        'adipose_subcutaneous',
        'adipose_visceral',
        'adrenal_gland',
        'artery_aorta',
        'artery_coronary',
        'artery_tibial',
        'blood',
        'brain_amygdala',
        'brain_anterior_cingulate_cortex',
        'brain_caudate',
        'brain_cerebellar_hemisphere',
        'brain_cerebellum',
        'brain_cortex',
        'brain_frontal_cortex',
        'brain_hippocampus',
        'brain_hypothalamus',
        'brain_nucleus_accumbens',
        'brain_putamen',
        'brain_spinal_cord',
        'brain_substantia_nigra',
        'breast',
        'colon_sigmoid',
        'colon_transverse',
        'esophagus_gej',
        'esophagus_mucosa',
        'esophagus_muscularis',
        'fibroblast',
        'heart_atrial_appendage',
        'heart_left_ventricle',
        'kidney_cortex',
        'LCL',
        'liver',
        'lung',
        'minor_salivary_gland',
        'muscle',
        'nerve_tibial',
        'ovary',
        'pancreas',
        'pituitary',
        'prostate',
        'skin_not_sun_exposed',
        'skin_sun_exposed',
        'small_intestine',
        'spleen',
        'stomach',
        'testis',
        'thyroid',
        'uterus',
        'vagina',
    ]

    # Compile negative SNP set for each tissue
    for tissue_name in tissue_names:

        print("-- " + str(tissue_name) + " --")

        # Load summary stats and extract unique set of SNPs
        vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)

        # Only keep SNPs (no indels)
        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)

        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
        vcf_df['start'] = vcf_df['position'].astype(int)
        vcf_df['end'] = vcf_df['start'] + 1
        vcf_df['strand'] = "."

        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
        vcf_df = vcf_df.rename(columns={'chromosome': 'Chromosome', 'start': 'Start', 'end': 'End', 'strand': 'Strand'})

        print("len(vcf_df) = " + str(len(vcf_df)))

        # Store intermediate SNPs
        #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)

        # Load polyadenylation site annotation (shift 0-based BED starts to 1-based)
        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
        apa_df['Start'] += 1

        # Load gene span annotation
        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)

        # Parse the unversioned Ensembl gene id from the GTF attribute string
        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])

        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)

        # Pad gene spans
        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad

        # Join dataframes against gtf annotation
        apa_pr = pr.PyRanges(apa_df)
        gtf_pr = pr.PyRanges(gtf_df)
        vcf_pr = pr.PyRanges(vcf_df)

        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)

        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)

        # Expand each polyA site into a +/- max_distance window
        apa_gtf_df['Start'] -= max_distance
        apa_gtf_df['End'] += max_distance

        # Join vcf against polyadenylation annotation
        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)

        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)

        # Force gene_id of SNP to be same as the gene_id of the polyA site
        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]

        # Recover the polyA site position (undo the window expansion)
        vcf_apa_df['Start_b'] += max_distance
        vcf_apa_df = vcf_apa_df.rename(columns={'Start': 'Pos', 'Start_b': 'pas_pos', 'Strand_b': 'Strand'})

        # Distance to polyA site
        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])

        # Choose unique SNPs by shortest distance to polyA site
        vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)

        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome': 'chrom', 'Pos': 'pos', 'Strand': 'strand'})
        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]

        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))

        # Store intermediate SNPs (filtered)
        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False)

        # Reload filtered SNP file
        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip')

        # Create variant identifier
        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']

        # Load merged fine-mapping dataframe
        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]

        # Join against fine-mapping dataframe; SNPs absent from it get PIP = 0
        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.

        # Only keep SNPs with PIP < cutoff
        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)

        # Store final table of negative SNPs
        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False)

        print("len(neg_df) = " + str(len(neg_df)))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
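To make the windowed join at the heart of this script concrete, here is a hypothetical toy run of the core matching logic on two SNPs and one polyA site (column names follow the script; positions and `max_distance` are made up for illustration):

```python
import numpy as np
import pandas as pd
import pyranges as pr

# Two SNPs: one 50 bp from a polyA site, one far away on the same chromosome
vcf_df = pd.DataFrame({
    'Chromosome': ['chr1', 'chr1'],
    'Start': [1000, 50000],
    'End': [1001, 50001],
    'ref': ['A', 'C'],
    'alt': ['G', 'T'],
})

# One polyA site at position 1050, expanded to a +/-100 bp window
max_distance = 100
apa_df = pd.DataFrame({
    'Chromosome': ['chr1'],
    'Start': [1050 - max_distance],
    'End': [1050 + max_distance],
    'pas_id': ['pas1'],
})

# Overlap join: only the nearby SNP survives (pyranges suffixes the
# other frame's columns with _b, as the script relies on)
hits = pr.PyRanges(vcf_df).join(pr.PyRanges(apa_df), strandedness=False).df

# Recover the site position and the SNP-to-site distance, as in the script
hits['pas_pos'] = hits['Start_b'] + max_distance
hits['distance'] = np.abs(hits['Start'] - hits['pas_pos'])
print(hits[['Chromosome', 'Start', 'ref', 'alt', 'pas_id', 'distance']])
```

Running this prints a single row with `distance` 50; in the real script, such rows are then deduplicated by shortest distance and kept only when the variant's fine-mapped PIP falls below `pip_cutoff`.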