-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScript_01_Xmethdata preparation.Rmd
189 lines (155 loc) · 7.33 KB
/
Script_01_Xmethdata preparation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
---
title: "X-chromosome methylation data preparation"
author: "Yunfeng Liu"
date: "1/16/2023"
output: html_document
---
#Script information
This script was written to prepare X-chromosome methylation data on BIOS.
```{r}
#set library path.
.libPaths("~/researchdrive/yliu/Rlibs")
```
```{r}
#Load all necessary libraries.
library(BBMRIomics)
library(tidyverse)
library(DNAmArray)
library(minfi)
library(irlba)
library(ggfortify)
library(data.table)
#View available datasets.
data(package="BBMRIomics")
#Load DNA-methylation data (large, takes about 30 minutes).
bbmri.data(methData_Mvalues_BIOS_Freeze2_unrelated)
dim(mvalues) # 481388 4386
```
```{r}
#Keep X chromosome for the DNA-methylation data.
mvalues_chrX <- keepSeqlevels(mvalues, "chrX", pruning.mode = "coarse")
dim(mvalues_chrX) # 11130 4386
## remove unreliable probes based on zhou et.al paper
maskProbes <- as.data.frame(fread("./Input/Input_01_HM450.hg19.manifest.tsv"))
maskProbes <- filter(maskProbes,MASK_general=="TRUE")
mvalues_chrX <- mvalues_chrX[!(rownames(mvalues_chrX) %in% maskProbes$probeID),]
dim(mvalues_chrX) # 9777 4386
```
```{r}
# Remove some samples (N=263) who their age is missing
table(is.na(mvalues_chrX$sampling_age)) # 263
col.mvalues.chrX<-as.data.frame(colData(mvalues_chrX))
col.mvalues.chrX<-col.mvalues.chrX[!is.na(col.mvalues.chrX$sampling_age),]
mvalues_chrX<-mvalues_chrX[,colnames(mvalues_chrX) %in% rownames(col.mvalues.chrX)]
dim(mvalues_chrX) #9777 4123
#Remove problematic samples.
#There is 1 covariate which contain certain levels causing singularities if it is included into the model:
#sample_plate: levels "OV0192DNA001" and "OV0192DNA002".
idx <- which(mvalues_chrX$sample_plate == "OV0192DNA001" | mvalues_chrX$sample_plate == "OV0192DNA002")
#Now remove them.
mvalues_chrX <- mvalues_chrX[,-idx]
dim(mvalues_chrX) # 9777 4044
# Sex mislabeled issues (Unfortunately,some samples are mixed-up in sex,we should exclude them from this analysis)
# Because we don't know if their age are still correct
# Load beta values
bbmri.data(methData_Betas_BIOS_Freeze2_unrelated)
#Keep sex chromosome for the DNA-methylation data.
betas_chrX <- keepSeqlevels(betas,"chrX",pruning.mode = "coarse")
## remove unreliable probes
betas_chrX <- betas_chrX[!(rownames(betas_chrX) %in% maskProbes$probeID),]
# Select same samples in betas.
betas_chrX<-betas_chrX[,colnames(mvalues)]
dim(betas_chrX) # 9777 4044
# Check if predicted and real sexes are identical
predictedSex <- as.data.frame(getSex.DNAmArray(assay(betas)))
colnames(predictedSex)<-"predictedSex"
predictedSex$predictedSex[grepl("Male",predictedSex$predictedSex)]<-"male"
predictedSex$predictedSex[grepl("Female",predictedSex$predictedSex)]<-"female"
mvalues_chrX$predictedSex<-predictedSex$predictedSex
mvalues_chrX$correctSex<-ifelse(colData(mvalues_chrX)$predictedSex==colData(mvalues_chrX)$sex,TRUE,FALSE)
table(colData(mvalues_chrX)$correctSex) # FALSE:11 TRUE:4032 NA:2(sex is missing)
col.mvalues.chrX<-filter(col.mvalues.chrX,correctSex==TRUE)
mvalues_chrX<-mvalues_chrX[,rownames(col.mvalues.chrX)]
betas_chrX<-betas_chrX[,rownames(col.mvalues.chrX)]
dim(mvalues_chrX) # 9777 4032
dim(betas_chrX) # 9777 4032
### One female sample should be removed based on PCA.
pc_betas_chrX<- prcomp_irlba(t(assay(betas_chrX)))
summary(pc_betas_chrX)
tiff("PCA plot for sex.tiff", units="in", width=8, height=6, res=600,compression = 'lzw')
autoplot(pc_betas_chrX, data=as.data.frame(colData(betas_chrX)), colour="sex",main="Principal components plot")
dev.off()
# Find this problematic "female" samples
# Spilited samples into men and women separately
col.mvalues.Xfemales<-filter(col.mvalues.chrX,sex=="female")
col.mvalues.Xmales<-filter(col.mvalues.chrX,sex=="male")
betas_Xfemales<-betas_chrX[,rownames(col.mvalues.Xfemales)]
betas_Xmales<-betas_chrX[,rownames(col.mvalues.Xmales)]
dim(betas_Xfemales) # 9777 2344
dim(betas_Xmales) # 9777 1688
# Calculate the mean methylation of each female samples.
# we found "BIOS691E2BA5" sample have lowest DNAm level,remove it.
betas_Xfemales_mean<-as.data.frame(rowMeans(t(assay(betas_Xfemales)),na.rm = TRUE))
colnames(betas_Xfemales_mean)<-"Mean methylation"
betas_Xfemales<-betas_Xfemales[,!(colnames(betas_Xfemales)%in%"BIOS691E2BA5")]
dim(betas_Xfemales) # 9777 2343
# Similarly,remove this sample from betas and mvalues object.
mvalues_chrX<-mvalues_chrX[,!(colnames(mvalues_chrX)%in%"BIOS691E2BA5")]
betas_chrX<-betas_chrX[,!(colnames(betas_chrX)%in%"BIOS691E2BA5")]
col.mvalues.Xfemales<-col.mvalues.chrX[!(rownames(col.mvalues.Xfemales)%in%"BIOS691E2BA5"),]
dim(mvalues_chrX) # 9777 4031
dim(betas_chrX) # 9777 4031
dim(betas_Xfemales) # 9777 2343
dim(betas_Xmales) # 9777 1688
#Save betas,betas_Xfemales and betas_Xfemales object.
save(betas_Xfemales,betas_Xmales,file = "./Output/Output_01/Output_01_chrX_betas.RData")
```
```{r}
# Predicted Cell counts for all BIOS data by minfi
## Creat a object called RGset_female
load(file="Input_01_samplesheet_BIOS.RData")
rownames(samplesheet)<-make.names(samplesheet$uuid, unique=TRUE)
id<-intersect(colnames(mvalues_chrX),rownames(samplesheet)) # 4031
samplesheet<-samplesheet[id,]
mvalues_chrX<-mvalues_chrX[,id]
## Spilit women and men separately
mvalues_Xfemales<-mvalues_chrX[,na.omit(match(rownames(col.mvalues.Xfemales),colnames(mvalues_chrX)))]
dim(mvalues_Xfemales) # 9783 2343
mvalues_Xmales<-mvalues_chrX[,na.omit(match(rownames(col.mvalues.Xmales),colnames(mvalues_chrX)))]
dim(mvalues_Xmales) # 9783 1688
samplesheet_female<-filter(samplesheet,sex=="female") # 2343
samplesheet_male<-filter(samplesheet,sex=="male") # 1688
## loading IDAT files
# women
library(BiocParallel)
register(MulticoreParam(10, log=TRUE))
RGset_female <- read.metharray.exp(targets = samplesheet_female)
Cellcounts_female<-estimateCellCounts(RGset_female,referencePlatform = "IlluminaHumanMethylation450k")
rownames(Cellcounts_female)<-make.names(rownames(samplesheet_female), unique=TRUE)
Cellcounts_female<-as.data.frame(Cellcounts_female)
save(Cellcounts_female,file = "./Output_01_Cellcounts_female.RData")
Cellcounts_female<-Cellcounts_female[colnames(mvalues_Xfemales),]
#Add additional variables to mvalues_Xfemales.
mvalues_Xfemales$CD8T_predicted <- Cellcounts_female$CD8T
mvalues_Xfemales$CD4T_predicted <- Cellcounts_female$CD4T
mvalues_Xfemales$NK_predicted <- Cellcounts_female$NK
mvalues_Xfemales$Bcell_predicted <- Cellcounts_female$Bcell
mvalues_Xfemales$Mono_predicted <- Cellcounts_female$Mono
mvalues_Xfemales$Gran_predicted <- Cellcounts_female$Gran
# Men
RGset_male <- read.metharray.exp(targets = samplesheet_male)
Cellcounts_male<-estimateCellCounts(RGset_male,referencePlatform = "IlluminaHumanMethylation450k")
rownames(Cellcounts_male)<-make.names(rownames(samplesheet_male), unique=TRUE)
Cellcounts_male<-as.data.frame(Cellcounts_male)
save(Cellcounts_male,file = "./Output_01_Cellcounts_male.RData")
Cellcounts_male<-Cellcounts_male[colnames(mvalues_Xmales),]
#Add additional variables to mvalues_males.
mvalues_Xmales$CD8T_predicted <- Cellcounts_male$CD8T
mvalues_Xmales$CD4T_predicted <- Cellcounts_male$CD4T
mvalues_Xmales$NK_predicted <- Cellcounts_male$NK
mvalues_Xmales$Bcell_predicted <- Cellcounts_male$Bcell
mvalues_Xmales$Mono_predicted <- Cellcounts_male$Mono
mvalues_Xmales$Gran_predicted <- Cellcounts_male$Gran
#Save the files.
save(mvalues_Xfemales,mvalues_Xmales,file = "./Output/Output_01/Output_01_chrX_mvalues.RData")
```