Merge pull request #17 from calico/data-processing
Added training data and QTL scripts
Showing 17 changed files with 2,465 additions and 0 deletions.
README.md (+33 lines)
## QTL data processing

The scripts in this folder are used to extract fine-mapped causal sQTLs, paQTLs, and ipaQTLs from the results of the eQTL Catalogue, and to construct distance- and expression-matched negative SNPs.

*Notes*:
- The pipeline requires the GTEx v8 (median) TPM matrix, which can be downloaded [here](https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz).
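If you prefer to script that download, a minimal sketch (assuming the file should land in the working directory; adjust `local_path` as needed):

```python
import os
import urllib.request

# GTEx v8 median TPM matrix (URL from the note above)
TPM_URL = ('https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/'
           'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz')
local_path = os.path.basename(TPM_URL)

# Skip the download if the file is already present
if not os.path.isfile(local_path):
    urllib.request.urlretrieve(TPM_URL, local_path)
```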
As a prerequisite to generating any of the QTL datasets, run the following scripts (in order):
1. download_finemap.py
2. download_sumstat.py
3. merge_finemapping_tables.py
4. make_expression_tables.py

To prepare the sQTL dataset, run these scripts:
1. sqtl_make_positive_sets.py
2. sqtl_make_negative_sets.py

To prepare the paQTL dataset, run these scripts:
1. paqtl_make_positive_sets.py
2. paqtl_make_negative_sets.py

To prepare the ipaQTL dataset, run these scripts:
1. ipaqtl_make_positive_sets.py
2. ipaqtl_make_negative_sets.py

Finally, to generate the QTL VCF files, run this script:
1. make_vcfs.py
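For reference, the whole ordering can be captured in a small driver. This is a hypothetical sketch, assuming each script runs from this folder without required arguments:

```python
import subprocess

# Ordered pipeline, as documented above: prerequisites first,
# then the three QTL datasets, then the final VCF generation.
PIPELINE = [
    'download_finemap.py',
    'download_sumstat.py',
    'merge_finemapping_tables.py',
    'make_expression_tables.py',
    'sqtl_make_positive_sets.py',
    'sqtl_make_negative_sets.py',
    'paqtl_make_positive_sets.py',
    'paqtl_make_negative_sets.py',
    'ipaqtl_make_positive_sets.py',
    'ipaqtl_make_negative_sets.py',
    'make_vcfs.py',
]

for script in PIPELINE:
    # Stop at the first failure so later steps never see partial inputs
    subprocess.run(['python', script], check=True)
```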
download_finemap.py (+62 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_finemap.py
Download QTL Catalogue fine-mapping results.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']

    ################################################
    # txrevise for splicing / polyA / TSS QTLs

    os.makedirs('txrev', exist_ok=True)
    txrev_df = samples_df[samples_df.quant_method == 'txrev']

    jobs = []
    for all_ftp_path in txrev_df.ftp_path:
        # e.g. ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz
        # maps to ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets//Alasoo_2018_txrev_macrophage_IFNg+Salmonella.purity_filtered.txt.gz

        # derive the credible-set filename from the sumstat filename
        all_ftp_file = all_ftp_path.split('/')[-1]
        fine_ftp_file = all_ftp_file.replace('all.tsv', 'purity_filtered.txt')

        fine_ftp_path = 'ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/credible_sets/'
        fine_ftp_path += fine_ftp_file

        # download only files not already present locally
        local_path = 'txrev/%s' % fine_ftp_file
        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, fine_ftp_path)
            jobs.append(cmd)

    # download in parallel, 4 files at a time
    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
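Both download scripts lean on `util.exec_par`, which is not among the files shown here. A minimal sketch of what it presumably does, assuming it runs shell commands in parallel with a bounded worker count (the repo's actual `util` module may differ):

```python
import subprocess
from concurrent.futures import ThreadPoolExecutor

def exec_par(cmds, max_proc, verbose=False):
    """Run shell commands in parallel, at most max_proc at a time."""
    def run(cmd):
        if verbose:
            print(cmd)
        subprocess.run(cmd, shell=True, check=True)

    with ThreadPoolExecutor(max_workers=max_proc) as pool:
        # list() forces all jobs to finish and surfaces any exception
        list(pool.map(run, cmds))
```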
download_sumstat.py (+56 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import pandas as pd

import util

'''
download_sumstat.py
Download QTL Catalogue sumstats.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # read remote table
    samples_df = pd.read_csv('https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv', sep='\t')

    # filter GTEx (for now)
    samples_df = samples_df[samples_df.study == 'GTEx']

    ################################################
    # ge for sumstat (we want SNPs and possibly also base expression)

    os.makedirs('ge', exist_ok=True)
    ge_df = samples_df[samples_df.quant_method == 'ge']

    jobs = []
    for all_ftp_path in ge_df.ftp_path:
        # e.g. ftp://ftp.ebi.ac.uk/pub/databases/spot/eQTL/sumstats/Alasoo_2018/txrev/Alasoo_2018_txrev_macrophage_IFNg+Salmonella.all.tsv.gz

        local_path = 'ge/%s' % all_ftp_path.split('/')[-1]

        # download only files not already present locally
        if not os.path.isfile(local_path):
            cmd = 'curl -o %s %s' % (local_path, all_ftp_path)
            jobs.append(cmd)

    # download in parallel, 4 files at a time
    util.exec_par(jobs, 4, verbose=True)
    # print('\n'.join(jobs))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
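Both scripts keep only GTEx rows for now. To see which other studies and quantification methods the same index exposes before widening that filter, a quick sketch using the columns the scripts already rely on:

```python
import pandas as pd

# Same remote index both download scripts read
paths_url = ('https://raw.githubusercontent.com/eQTL-Catalogue/'
             'eQTL-Catalogue-resources/master/tabix/tabix_ftp_paths.tsv')
samples_df = pd.read_csv(paths_url, sep='\t')

# Count datasets per study and quantification method
print(samples_df.groupby(['study', 'quant_method']).size())
```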
paqtl_make_negative_sets.py (+196 lines)
#!/usr/bin/env python
from optparse import OptionParser

import os

import util

import numpy as np
import pandas as pd

import pyranges as pr

'''
paqtl_make_negative_sets.py
Build tables with negative (non-causal) SNPs for paQTLs.
'''

################################################################################
# main
################################################################################
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options, args) = parser.parse_args()

    # Parameters
    pip_cutoff = 0.01
    max_distance = 10000
    gene_pad = 50
    apa_file = 'polyadb_intron.bed'
    gtf_file = '/home/drk/common/data/genomes/hg38/genes/gencode41/gencode41_basic_nort.gtf'
    finemap_file = 'txrev/GTEx_txrev_finemapped_merged.csv.gz'

    # Define tissues
    tissue_names = [
        'adipose_subcutaneous',
        'adipose_visceral',
        'adrenal_gland',
        'artery_aorta',
        'artery_coronary',
        'artery_tibial',
        'blood',
        'brain_amygdala',
        'brain_anterior_cingulate_cortex',
        'brain_caudate',
        'brain_cerebellar_hemisphere',
        'brain_cerebellum',
        'brain_cortex',
        'brain_frontal_cortex',
        'brain_hippocampus',
        'brain_hypothalamus',
        'brain_nucleus_accumbens',
        'brain_putamen',
        'brain_spinal_cord',
        'brain_substantia_nigra',
        'breast',
        'colon_sigmoid',
        'colon_transverse',
        'esophagus_gej',
        'esophagus_mucosa',
        'esophagus_muscularis',
        'fibroblast',
        'heart_atrial_appendage',
        'heart_left_ventricle',
        'kidney_cortex',
        'LCL',
        'liver',
        'lung',
        'minor_salivary_gland',
        'muscle',
        'nerve_tibial',
        'ovary',
        'pancreas',
        'pituitary',
        'prostate',
        'skin_not_sun_exposed',
        'skin_sun_exposed',
        'small_intestine',
        'spleen',
        'stomach',
        'testis',
        'thyroid',
        'uterus',
        'vagina',
    ]

    # Compile negative SNP set for each tissue
    for tissue_name in tissue_names:

        print("-- " + str(tissue_name) + " --")

        # Load summary stats and extract unique set of SNPs
        vcf_df = pd.read_csv("ge/GTEx_ge_" + tissue_name + ".all.tsv.gz", sep='\t', compression='gzip', usecols=['chromosome', 'position', 'ref', 'alt']).drop_duplicates(subset=['chromosome', 'position', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)

        # Only keep SNPs (no indels)
        vcf_df = vcf_df.loc[(vcf_df['ref'].str.len() == vcf_df['alt'].str.len()) & (vcf_df['ref'].str.len() == 1)].copy().reset_index(drop=True)

        vcf_df['chromosome'] = 'chr' + vcf_df['chromosome'].astype(str)
        vcf_df['start'] = vcf_df['position'].astype(int)
        vcf_df['end'] = vcf_df['start'] + 1
        vcf_df['strand'] = "."

        vcf_df = vcf_df[['chromosome', 'start', 'end', 'ref', 'alt', 'strand']]
        vcf_df = vcf_df.rename(columns={'chromosome': 'Chromosome', 'start': 'Start', 'end': 'End', 'strand': 'Strand'})

        print("len(vcf_df) = " + str(len(vcf_df)))

        # Store intermediate SNPs
        #vcf_df.to_csv("ge/GTEx_snps_" + tissue_name + ".bed.gz", sep='\t', index=False, header=False)

        # Load polyadenylation site annotation (shift 0-based BED starts to 1-based)
        apa_df = pd.read_csv(apa_file, sep='\t', names=['Chromosome', 'Start', 'End', 'pas_id', 'feat1', 'Strand'])
        apa_df['Start'] += 1

        # Load gene span annotation
        gtf_df = pd.read_csv(gtf_file, sep='\t', skiprows=5, names=['Chromosome', 'havana_str', 'feature', 'Start', 'End', 'feat1', 'Strand', 'feat2', 'id_str'])
        gtf_df = gtf_df.query("feature == 'gene'").copy().reset_index(drop=True)

        # Parse the unversioned Ensembl gene id from the GTF attribute string
        gtf_df['gene_id'] = gtf_df['id_str'].apply(lambda x: x.split("gene_id \"")[1].split("\";")[0].split(".")[0])

        gtf_df = gtf_df[['Chromosome', 'Start', 'End', 'gene_id', 'feat1', 'Strand']].drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index(drop=True)

        # Pad gene spans
        gtf_df['Start'] = gtf_df['Start'].astype(int) - gene_pad
        gtf_df['End'] = gtf_df['End'].astype(int) + gene_pad

        # Join dataframes against gtf annotation
        apa_pr = pr.PyRanges(apa_df)
        gtf_pr = pr.PyRanges(gtf_df)
        vcf_pr = pr.PyRanges(vcf_df)

        apa_gtf_pr = apa_pr.join(gtf_pr, strandedness='same')
        vcf_gtf_pr = vcf_pr.join(gtf_pr, strandedness=False)

        apa_gtf_df = apa_gtf_pr.df[['Chromosome', 'Start', 'End', 'pas_id', 'gene_id', 'Strand']].copy().reset_index(drop=True)
        vcf_gtf_df = vcf_gtf_pr.df[['Chromosome', 'Start', 'End', 'ref', 'alt', 'Strand', 'gene_id']].copy().reset_index(drop=True)

        # Expand each polyA site into a +/- max_distance window
        apa_gtf_df['Start'] -= max_distance
        apa_gtf_df['End'] += max_distance

        # Join vcf against polyadenylation annotation
        apa_gtf_pr = pr.PyRanges(apa_gtf_df)
        vcf_gtf_pr = pr.PyRanges(vcf_gtf_df)

        vcf_apa_pr = vcf_gtf_pr.join(apa_gtf_pr, strandedness=False)

        # Force gene_id of SNP to be same as the gene_id of the polyA site
        vcf_apa_df = vcf_apa_pr.df.query("gene_id == gene_id_b").copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df[['Chromosome', 'Start', 'ref', 'alt', 'gene_id', 'pas_id', 'Strand_b', 'Start_b']]

        # Recover the polyA site position (undo the window expansion)
        vcf_apa_df['Start_b'] += max_distance
        vcf_apa_df = vcf_apa_df.rename(columns={'Start': 'Pos', 'Start_b': 'pas_pos', 'Strand_b': 'Strand'})

        # Distance to polyA site
        vcf_apa_df['distance'] = np.abs(vcf_apa_df['Pos'] - vcf_apa_df['pas_pos'])

        # Choose unique SNPs by shortest distance to polyA site
        vcf_apa_df = vcf_apa_df.sort_values(by='distance', ascending=True).drop_duplicates(subset=['Chromosome', 'Pos', 'ref', 'alt'], keep='first').copy().reset_index(drop=True)
        vcf_apa_df = vcf_apa_df.sort_values(['Chromosome', 'Pos', 'alt'], ascending=True).copy().reset_index(drop=True)

        vcf_df_filtered = vcf_apa_df.rename(columns={'Chromosome': 'chrom', 'Pos': 'pos', 'Strand': 'strand'})
        vcf_df_filtered = vcf_df_filtered[['chrom', 'pos', 'ref', 'alt', 'gene_id', 'pas_id', 'strand', 'pas_pos', 'distance']]

        print("len(vcf_df_filtered) = " + str(len(vcf_df_filtered)))

        # Store intermediate SNPs (filtered)
        vcf_df_filtered.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', index=False)

        # Reload filtered SNP file
        vcf_df_filtered = pd.read_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_filtered.bed.gz", sep='\t', compression='gzip')

        # Create variant identifier
        vcf_df_filtered['variant'] = vcf_df_filtered['chrom'] + "_" + vcf_df_filtered['pos'].astype(str) + "_" + vcf_df_filtered['ref'] + "_" + vcf_df_filtered['alt']

        # Load merged fine-mapping dataframe
        finemap_df = pd.read_csv(finemap_file, sep='\t')[['variant', 'pip']]

        # Join against fine-mapping dataframe; SNPs absent from it get PIP = 0
        neg_df = vcf_df_filtered.join(finemap_df.set_index('variant'), on='variant', how='left')
        neg_df.loc[neg_df['pip'].isnull(), 'pip'] = 0.

        # Only keep SNPs with PIP < cutoff
        neg_df = neg_df.query("pip < " + str(pip_cutoff)).copy().reset_index(drop=True)

        # Store final table of negative SNPs
        neg_df.to_csv("ge/GTEx_snps_" + tissue_name + "_intronic_polya_negatives.bed.gz", sep='\t', index=False)

        print("len(neg_df) = " + str(len(neg_df)))


################################################################################
# __main__
################################################################################
if __name__ == '__main__':
    main()
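To make the windowed join at the heart of this script concrete, here is a hypothetical toy run of the core matching logic on two SNPs and one polyA site (column names follow the script; positions and `max_distance` are made up for illustration):

```python
import numpy as np
import pandas as pd
import pyranges as pr

# Two SNPs: one 50 bp from a polyA site, one far away on the same chromosome
vcf_df = pd.DataFrame({
    'Chromosome': ['chr1', 'chr1'],
    'Start': [1000, 50000],
    'End': [1001, 50001],
    'ref': ['A', 'C'],
    'alt': ['G', 'T'],
})

# One polyA site at position 1050, expanded to a +/-100 bp window
max_distance = 100
apa_df = pd.DataFrame({
    'Chromosome': ['chr1'],
    'Start': [1050 - max_distance],
    'End': [1050 + max_distance],
    'pas_id': ['pas1'],
})

# Overlap join: only the nearby SNP survives (pyranges suffixes the
# other frame's columns with _b, as the script relies on)
hits = pr.PyRanges(vcf_df).join(pr.PyRanges(apa_df), strandedness=False).df

# Recover the site position and the SNP-to-site distance, as in the script
hits['pas_pos'] = hits['Start_b'] + max_distance
hits['distance'] = np.abs(hits['Start'] - hits['pas_pos'])
print(hits[['Chromosome', 'Start', 'ref', 'alt', 'pas_id', 'distance']])
```

Running this prints a single row with `distance` 50; in the real script, such rows are then deduplicated by shortest distance and kept only when the variant's fine-mapped PIP falls below `pip_cutoff`.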