-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-selects_informative_SNPs.R
145 lines (107 loc) · 5.86 KB
/
01-selects_informative_SNPs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
##%######################################################%##
# #
#### Stage 1, processing parental genotypes ####
#### for families WXa and ZM ###
# #
##%######################################################%##
# This stage uses the genotyped table generated by GATK as input and saves
# - a tabular file counting the number of heterozygous SNPs in parents
# this is used to estimate heterozygous SNP density
# - a file indicating the 95% sequencing depth quantiles
# (also used to estimate heterozygous SNP density)
# - a tabular file listing all potential informative SNPs, indicating the maternal and
# paternal allele, among other things
source("WZ_functions.R")
# we imports the genotype table of the 4 parents
vcf <- fread(
"GATK_filtered_parents_snps.table", sep = "\t", header = T,
drop = c("ID", "FILTER", "INFO", "fWXA.PL", "fZM.PL", "mWXA.PL", "mZM.PL")
)
# to facilitate the script, we will not show the 4 parents on the same row, but
# only the two parents of the same family (the two families will be at different
# row) This functions extracts columns related to a given family (and "shared"
# columns), removes family names from columns and adds a new columns to denote
# the family
oneFam <- function(fam) {
GATK_fam <- vcf[, c(1:5, grep(fam, names(vcf))), with = F]
setnames(GATK_fam, gsub(stri_c(fam, "."), "", names(GATK_fam), fixed = T))
GATK_fam[, fam := fam]
GATK_fam
}
vcf <- rbindlist(lapply(c("WXA", "ZM"), oneFam))
setnames(vcf, 6:13, c(
"mother.GT", "mother.AD", "mother.DP", "mother.GQ",
"father.GT", "father.AD", "father.DP", "father.GQ"
))
# We look for possible SNPs in each family ----------------------
baseTypes <- c("A", "C", "G", "T")
# we convert bases to integers, which speeds things up, takes less memory and
# allows discarding non-base variation (indels, missing value) at the same time
# starting with the "left" base of the mother
mother.L <- chmatch(stri_sub(vcf$mother.GT, 1, 1), baseTypes)
mother.R <- chmatch(stri_sub(vcf$mother.GT, 3, 3), baseTypes)
father.L <- chmatch(stri_sub(vcf$father.GT, 1, 1), baseTypes)
father.R <- chmatch(stri_sub(vcf$father.GT, 3, 3), baseTypes)
# We count bases with variation within each family (to obtain general information)
mat <- cbind(mother.L, mother.R, father.L, father.R)
# for each base type, we count the alleles that have this base
diffs <- vapply(1:4, function(base) {
test <- mat == base
test[is.na(test)] <- T # we consider NA (missing base) as identity
as.integer(rowSums(test))
}, integer(nrow(mat)))
rM <- rowMaxs(diffs)
# when there is less than 4 chromosomes (among the two parents) that have a
# given base, it means there is a SNP
SNP <- rM < 4L
# we report the number of position with variation (possible SNPs)
sum(SNP[vcf$fam == "WXA"])
sum(SNP[vcf$fam == "ZM"])
vcf[SNP, sum(!duplicated(data.table(CHROM, POS)))]
# we count heterozygous SNPs -----------------------------
# this is to estimate female heterozygous SNP density at stage 05)
# For this, we record the 95% quantile of sequencing depth on SNPs (for each family.
# we also do it for fathers even though we didn't use this information afterwards
# (at some point, we recorded male heterozygous SNP density)
depthQuantiles <- vcf[, .(
mother = quantile(mother.DP, probs = 0.95, na.rm = T),
), by = .(fami = fam)]
# logicals indicating whether the mother is heterozygous
motherHet <- mother.L != mother.R
# we count heterozygous position per contig (both mothers combined)
motherHetero <- vcf[motherHet &
mother.GQ > 40L & # the genotype quality should be > 40
mother.DP < depthQuantiles[match(fam, fami), mother] & #the sequencing depth must not be too high
mother.DP >= 5L, # nor too low
.(mothers = length(unique(POS))), by = CHROM]
writeT(motherHetero, "heteroSNPs_byContigGQ40.txt")
writeT(depthQuantiles, "DPquantile.txt")
# We now select informative SNPs ----------------------------------------
# the mother must be heterozygous and the father homozygous
# note that we don't yet apply filters on genotype quality, depth and such
# as we always prefer applying filters at the last moment
informative <- motherHet & father.L == father.R
informative[is.na(informative)] <- F
# we create a matrix to receive the bases (numbers) of "maternal" alleles and
# "paternal" alleles for informative SNPs (will stay NAs for other SNPs). The
# maternal allele is in the first column
alleles <- matrix(NA, ncol = 2, nrow = length(mother.L))
# logical that is TRUE if the left allele of the mother is in the father
mother.L.in.father <- mother.L == father.L & informative
# same, but this time we consider the "right" allele
mother.R.in.father <- mother.R == father.L & informative
# we use these logical to retrieve the maternal and paternal alleles
# remember that the maternal allele is the one that is not in the father
alleles[mother.L.in.father, ] <- cbind(mother.R[mother.L.in.father], mother.L[mother.L.in.father])
alleles[mother.R.in.father, ] <- cbind(mother.L[mother.R.in.father], mother.R[mother.R.in.father])
# we add these to the VCF. Column "A1" will indicate the maternal allele
vcf[, c("A1", "A2") := data.table(alleles)] # A2 is the alternative allele
# we retain only informative SNPs and the columns we want
vcf <- vcf[!is.na(A1) & !is.na(A2), .(
CHROM, POS, REF = chmatch(REF, baseTypes),
mother.AD, mother.DP, mother.GQ, father.AD, father.DP, father.GQ, fam, A1, A2)]
writeT(vcf, "informativeSNPs.txt")
# we also write informative SNP positions separately for each family
# as we need these to scan the bams of F1 pools in the next stage
writeT(vcf[fam =="WXA", .(CHROM, POS)], "informativeSNPs_WXA.txt")
writeT(vcf[fam =="ZM", .(CHROM, POS)], "informativeSNPs_ZM.txt")