-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathMakefile
91 lines (71 loc) · 3.36 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Number of CPU threads handed to hmmscan via --cpu; override on the command
# line with `make cores=N`.
cores = 1

# `all` and `clean` name actions, not files. Declaring them phony keeps a
# stray file called "all" or "clean" from masking them.
.PHONY: all clean

# Remove a target file whose recipe failed, so a half-written output is never
# mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

# Default goal: build the summary statistics and everything they depend on.
all: working/summarystats.txt

# Delete everything under working/ (downloads and build products alike).
clean:
	rm -rf working/*
##########################################
# download external dependencies
# Note: all external files should go here
##########################################
# Mirror PubChem BioAssay into the target directory.
# The directory is created first, then the mirroring script is run with that
# directory as its only argument.
working/bioassayMirror: src/mirrorBioassay.sh
	mkdir -p $@
	$< $@
# Download the gzip-compressed UniProt ID-mapping table.
# The file is fetched under a temporary name and renamed only after wget
# succeeds: this target has no prerequisites, so a truncated download left
# under the final name would otherwise be treated as up to date forever.
working/uniprot_id_mapping.dat.gz:
	mkdir -p $(@D)
	wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz -O $@.part
	mv -f $@.part $@
# Download the FASTA sequence of every distinct protein target listed in the
# assay database, one NCBI efetch request per target identifier.
#  - Output goes to a temporary file that is renamed on success, so a rebuild
#    starts from an empty file instead of appending to stale or partial
#    results.
#  - 'protein' is single-quoted: in SQLite, double quotes denote identifiers
#    and are only accepted as string literals by a legacy fallback.
#  - The E-utilities endpoint is addressed over HTTPS.
working/targets.fasta: working/bioassayDatabase.sqlite
	sqlite3 $< "SELECT DISTINCT target FROM targets WHERE target_type = 'protein';" \
		| xargs -I '{}' wget -O - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id={}&rettype=fasta&retmode=text" > $@.tmp
	mv -f $@.tmp $@
# Download the Pfam-A HMM library (Pfam release 29.0), decompress it, and run
# hmmpress on the result.
# The archive is saved next to the target as $@.gz so that gunzip produces
# exactly $@; the output directory is created first because this rule has no
# prerequisite that would do so.
working/Pfam-A.hmm:
	mkdir -p $(@D)
	wget -O $@.gz ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam29.0/Pfam-A.hmm.gz
	gunzip $@.gz
	hmmpress $@
# download kClust linux binary
# working/kClust:
# wget -O $@ ftp://toolkit.lmb.uni-muenchen.de/pub/kClust/kClust
# chmod u+x $@
##########################################
# build database
##########################################
# Keep only the rows of the UniProt ID mapping whose second field is "GI"
# and store them as plain, uncompressed text.
working/gi_uniprot_mapping.dat: working/uniprot_id_mapping.dat.gz
	zcat $< | awk '$$2 == "GI"' > $@
# Build the initial SQLite database from the local BioAssay mirror.
# The builder script is invoked as: <mirror dir> proteinsOnly <output db>.
working/bioassayDatabase.sqlite: src/buildBioassayDatabase.R working/bioassayMirror
	$< working/bioassayMirror proteinsOnly $@
# Run hmmscan on the target sequences against the Pfam-A HMM library.
# The tabular hit report (--tblout) is the target file; -E and --domE are both
# set to 0.01, alignments are omitted (--noali), and $(cores) threads are used.
working/domainsFromHmmscan: working/Pfam-A.hmm working/targets.fasta
	hmmscan -E 0.01 --domE 0.01 --tblout $@ --cpu $(cores) --noali $^
# Reduce the hmmscan table to two space-separated columns: fields 2 and 3 of
# every line that does not start with '#'.
working/domainsFromHmmscanTwoCols: working/domainsFromHmmscan
	awk '!/^#/ { print $$2 " " $$3 }' $< > $@
# Cluster the protein target sequences with kClust, writing results into the
# target directory.
# `mkdir -p` is required: when this rule re-runs (e.g. targets.fasta changed)
# the directory already exists, and a plain `mkdir` would abort the build.
# NOTE(review): -s 0.52 and -M 16000MB are presumably the clustering threshold
# and memory limit; confirm against the kClust documentation.
working/targetClusters: src/kClust working/targets.fasta
	mkdir -p $@
	$< -i working/targets.fasta -d $@ -s 0.52 -M 16000MB
# Load target annotations (GI->UniProt mappings, sequence clusters, HMM
# domains) into a copy of the assay database.
# The copy is modified under a temporary name and renamed only after the
# loader succeeds; otherwise a failed run would leave an unannotated copy
# that make considers up to date.
working/databaseWithTargetTranslations.sqlite: src/loadTranslations.R working/bioassayDatabase.sqlite working/gi_uniprot_mapping.dat working/targetClusters working/domainsFromHmmscanTwoCols
	cp working/bioassayDatabase.sqlite $@.tmp
	$< working/gi_uniprot_mapping.dat working/targetClusters working/domainsFromHmmscanTwoCols $@.tmp
	mv -f $@.tmp $@
# Run the indexing script on a copy of the annotated database.
# The copy is processed under a temporary name and renamed only on success,
# so a failed indexing run cannot leave an unindexed file under the target
# name that make would accept as finished.
working/indexedBioassayDatabase.sqlite: src/indexDatabase.R working/databaseWithTargetTranslations.sqlite
	cp working/databaseWithTargetTranslations.sqlite $@.tmp
	$< $@.tmp
	mv -f $@.tmp $@
# Run the species-annotation script on a copy of the indexed database.
# The copy is annotated under a temporary name and renamed only on success,
# so a failed run cannot leave an unannotated file that looks complete.
working/bioassayDatabaseWithSpecies.sqlite: src/annotateSpecies.R working/indexedBioassayDatabase.sqlite
	cp working/indexedBioassayDatabase.sqlite $@.tmp
	$< $@.tmp
	mv -f $@.tmp $@
# Publish the final database under a stable name via a symbolic link.
# The link target is relative to working/, where the link itself lives.
# -f replaces a stale or dangling link left by an earlier build; plain
# `ln -s` would fail with "File exists" in that case.
working/pubchemBioassay.sqlite: working/bioassayDatabaseWithSpecies.sqlite
	ln -sf bioassayDatabaseWithSpecies.sqlite $@
# Write a text summary of the database contents.
# The statistics script is invoked as: <database> <mirror dir> <output file>.
working/summarystats.txt: src/computeStats.R working/pubchemBioassay.sqlite working/bioassayMirror
	$< working/pubchemBioassay.sqlite working/bioassayMirror $@
# List the CSV header columns (score categories) found in the PubChem
# BioAssay mirror.
# The script is invoked as: <mirror dir> <output file>.
working/scoreCategories.txt: src/enumerateScoreCategories.R working/bioassayMirror
	$< working/bioassayMirror $@
# Optional: derive a small sample database for the bioassayR vignette.
# The sampling script is invoked as: <full database> <output database>.
working/sampleDatabase.sqlite: src/sampleDatabase.R working/pubchemBioassay.sqlite
	$< working/pubchemBioassay.sqlite $@