Merge pull request #17 from calico/data-processing
Added training data and QTL scripts
johli authored Apr 19, 2024
2 parents 3ef4859 + 2eebb71 commit 8bef0e2
Showing 17 changed files with 2,465 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/scripts/data/qtl_data/README.md
@@ -0,0 +1,33 @@
## QTL data processing

The scripts in this folder extract fine-mapped causal sQTLs, paQTLs, and ipaQTLs from the eQTL Catalogue fine-mapping results, and construct distance- and expression-matched negative SNPs.<br/>

*Notes*:
- The pipeline requires the GTEx v8 median gene TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
<br/>

As a prerequisite to generating any of the QTL datasets, run the following scripts in order (a combined driver sketch appears at the end of this README):
1. download_finemap.py
2. download_sumstat.py
3. merge_finemapping_tables.py
4. make_expression_tables.py
<br/>

To prepare the sQTL dataset, run these scripts:
1. sqtl_make_positive_sets.py
2. sqtl_make_negative_sets.py
<br/>

To prepare the paQTL dataset, run these scripts:
1. paqtl_make_positive_sets.py
2. paqtl_make_negative_sets.py
<br/>

To prepare the ipaQTL dataset, run these scripts:
1. ipaqtl_make_positive_sets.py
2. ipaqtl_make_negative_sets.py
<br/>

Finally, to generate the QTL VCF files, run this script:
1. make_vcfs.py
<br/>
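The full run order can also be captured in a single driver. The sketch below is hypothetical (`run_qtl_pipeline.py` is not part of this commit) and assumes each script runs from this directory with no extra arguments:

```python
# run_qtl_pipeline.py -- hypothetical driver, not part of this commit.
# Assumes every script below runs from this directory with no arguments.
import subprocess

pipeline = [
    # prerequisites (order matters)
    'download_finemap.py',
    'download_sumstat.py',
    'merge_finemapping_tables.py',
    'make_expression_tables.py',
    # positive and negative SNP sets per QTL type
    'sqtl_make_positive_sets.py',
    'sqtl_make_negative_sets.py',
    'paqtl_make_positive_sets.py',
    'paqtl_make_negative_sets.py',
    'ipaqtl_make_positive_sets.py',
    'ipaqtl_make_negative_sets.py',
    # final QTL VCFs
    'make_vcfs.py',
]

for script in pipeline:
    print('Running %s' % script)
    subprocess.run(['python', script], check=True)
```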
62 changes: 62 additions & 0 deletions src/scripts/data/qtl_data/download_finemap.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_finemap.py
Download QTL Catalogue fine-mapping results.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    # parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']


    ################################################
    # txrevise for splicing / polyA / TSS QTLs

    os.makedirs('txrev', exist_ok=True)
    txrev_df = samples_df[samples_df.quant_method == 'txrev']

    jobs = []
    for all_ftp_path in txrev_df.ftp_path:
        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz

        all_ftp_file = all_ftp_path.split('/')[-1]
        fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt')

        fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/'
        fine_ftp_path += fine_ftp_file

        local_path = 'txrev/%s' % fine_ftp_file
        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, fine_ftp_path)
            jobs.append(cmd)

    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
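`util` is a local helper module that is not included in this diff. A minimal sketch of an `exec_par`-style helper, assuming all it needs to do is run shell commands in parallel with a bounded worker count (the real helper may differ; this is only to make the download scripts self-contained for readers):

```python
# Hypothetical stand-in for the local util module (not part of this commit).
# Assumes exec_par(cmds, max_proc, verbose) runs shell commands in parallel.
import subprocess
from concurrent.futures import ThreadPoolExecutor

def exec_par(cmds, max_proc=4, verbose=False):
    """Run shell commands in parallel, at most max_proc at a time."""
    def run(cmd):
        if verbose:
            print(cmd)
        subprocess.run(cmd, shell=True, check=True)

    with ThreadPoolExecutor(max_workers=max_proc) as pool:
        # list() forces iteration so worker exceptions are raised here
        list(pool.map(run, cmds))
```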
56 changes: 56 additions & 0 deletions src/scripts/data/qtl_data/download_sumstat.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_sumstat.py
Download QTL Catalogue sumstats.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    # parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']


    ################################################
    # ge for sumstat (we want SNPs and possibly also base expression)

    os.makedirs('ge', exist_ok=True)
    ge_df = samples_df[samples_df.quant_method == 'ge']

    jobs = []
    for all_ftp_path in ge_df.ftp_path:
        # ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz

        local_path = 'ge/%s' % all_ftp_path.split('/')[-1]

        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, all_ftp_path)
            jobs.append(cmd)

    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
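A quick way to sanity-check one downloaded sumstat file is to read a few rows back. This is a hypothetical snippet (not part of this commit); the file name pattern and column names follow their usage in ipaqtl_make_negative_sets.py below:

```python
# Hypothetical sanity check on one downloaded GTEx ge sumstat file.
import pandas as pd

df = pd.read_csv('ge/GTEx_ge_liver.all.tsv.gz', sep='\t', compression='gzip',
                 usecols=['chromosome', 'position', 'ref', 'alt'], nrows=1000)
print(df.head())
print('unique SNP rows:', len(df.drop_duplicates()))
```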
196 changes: 196 additions & 0 deletions src/scripts/data/qtl_data/ipaqtl_make_negative_sets.py
@@ -0,0 +1,196 @@
#!/usr/bin/env python
from optparse import OptionParser

import os

import util

import numpy as np
import pandas as pd

import pyranges as pr

'''
ipaqtl_make_negative_sets.py
Build tables with negative (non-causal) SNPs for ipaQTLs.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    # parser.add_option()
    (options, args) = parser.parse_args()

    # Parameters
    pip_cutoff = 0.01
    max_distance = 10000
    gene_pad = 50
    apa_file = 'polyadb_intron.bed'
    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'

    # Define tissues
    tissue_names = [
        'adipose_subcutaneous',
        'adipose_visceral',
        'adrenal_gland',
        'artery_aorta',
        'artery_coronary',
        'artery_tibial',
        'blood',
        'brain_amygdala',
        'brain_anterior_cingulate_cortex',
        'brain_caudate',
        'brain_cerebellar_hemisphere',
        'brain_cerebellum',
        'brain_cortex',
        'brain_frontal_cortex',
        'brain_hippocampus',
        'brain_hypothalamus',
        'brain_nucleus_accumbens',
        'brain_putamen',
        'brain_spinal_cord',
        'brain_substantia_nigra',
        'breast',
        'colon_sigmoid',
        'colon_transverse',
        'esophagus_gej',
        'esophagus_mucosa',
        'esophagus_muscularis',
        'fibroblast',
        'heart_atrial_appendage',
        'heart_left_ventricle',
        'kidney_cortex',
        'LCL',
        'liver',
        'lung',
        'minor_salivary_gland',
        'muscle',
        'nerve_tibial',
        'ovary',
        'pancreas',
        'pituitary',
        'prostate',
        'skin_not_sun_exposed',
        'skin_sun_exposed',
        'small_intestine',
        'spleen',
        'stomach',
        'testis',
        'thyroid',
        'uterus',
        'vagina',
    ]

    # Compile negative SNP set for each tissue
    for tissue_name in tissue_names:

        print("-- " + str(tissue_name) + " --")

        # Load summary stats and extract unique set of SNPs
        vcf_df = pd.read_csv(
            "ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip',
            usecols=['chromosome', 'position', 'ref', 'alt']
        ).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)

        # Only keep SNPs (no indels)
        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)

        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
        vcf_df['start'] = vcf_df['position'].astype(int)
        vcf_df['end'] = vcf_df['start'] + 1
        vcf_df['strand'] = "."

        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
        vcf_df = vcf_df.rename(columns={'chromosome': 'Chromosome', 'start': 'Start', 'end': 'End', 'strand': 'Strand'})

        print("len(vcf_df) = " + str(len(vcf_df)))

        # Store intermediate SNPs
        # vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)

        # Load polyadenylation site annotation
        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
        apa_df['Start'] += 1  # shift 0-based BED starts to match the 1-based SNP positions above

        # Load gene span annotation
        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)

        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])

        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)

        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad

        # Join dataframes against gtf annotation
        apa_pr = pr.PyRanges(apa_df)
        gtf_pr = pr.PyRanges(gtf_df)
        vcf_pr = pr.PyRanges(vcf_df)

        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)

        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)

        # Expand polyA sites into +/- max_distance windows
        apa_gtf_df['Start'] -= max_distance
        apa_gtf_df['End'] += max_distance

        # Join vcf against polyadenylation annotation
        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)

        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)

        # Force gene_id of SNP to be same as the gene_id of the polyA site
        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]

        # PolyA site position (undo the window expansion)
        vcf_apa_df['Start_b'] += max_distance
        vcf_apa_df = vcf_apa_df.rename(columns={'Start': 'Pos', 'Start_b': 'pas_pos', 'Strand_b': 'Strand'})

        # Distance to polyA site
        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])

        # Choose unique SNPs by shortest distance to polyA site
        vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)

        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome': 'chrom', 'Pos': 'pos', 'Strand': 'strand'})
        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]

        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))

        # Store intermediate SNPs (filtered)
        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False)

        # Reload filtered SNP file
        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip')

        # Create variant identifier
        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']

        # Load merged fine-mapping dataframe
        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]

        # Join against fine-mapping dataframe
        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.

        # Only keep SNPs with PIP < cutoff
        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)

        # Store final table of negative SNPs
        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False)

        print("len(neg_df) = " + str(len(neg_df)))

################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
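The core matching step in this script (joining SNPs to windows of +/- max_distance around polyA sites, recovering the original site position, and keeping each SNP's nearest site) can be illustrated on toy data. A minimal sketch with made-up coordinates, not part of this commit:

```python
# Toy illustration of the windowed join + nearest-site logic above
# (hypothetical data; not part of this commit).
import numpy as np
import pandas as pd
import pyranges as pr

max_distance = 10

# Two SNPs and one polyA site on a toy chromosome
snps = pr.PyRanges(pd.DataFrame({
    'Chromosome': ['chr1', 'chr1'],
    'Start': [100, 150],
    'End': [101, 151],
}))
pas = pd.DataFrame({
    'Chromosome': ['chr1'],
    'Start': [105],
    'End': [106],
})
# Expand the polyA site into a +/- max_distance window
pas['Start'] -= max_distance
pas['End'] += max_distance

hits = snps.join(pr.PyRanges(pas), strandedness=False).df
# Recover the original site position and the SNP-to-site distance
hits['pas_pos'] = hits['Start_b'] + max_distance
hits['distance'] = np.abs(hits['Start'] - hits['pas_pos'])
print(hits[['Chromosome', 'Start', 'pas_pos', 'distance']])
# Only the SNP at 100 falls inside the window; the SNP at 150 is dropped.
```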