-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a37c95b
commit a9b76f9
Showing
3 changed files
with
295 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
###### | ||
# Script : Analisis de expresion diferencial | ||
# Author: Sofia Salazar, Diego Ramirez y Evelia Coss | ||
# Date: 27/02/2024 | ||
# Description: El siguiente script nos permite realiza el Analisis de expresion Diferencial | ||
# a partir de los datos provenientes del alineamiento de STAR a R, | ||
# Primero correr el script "load_data_inR.R" | ||
# Usage: Correr las lineas en un nodo de prueba en el cluster. | ||
# Arguments: | ||
# - Input: metadata.csv, cuentas de STAR (Terminacion ReadsPerGene.out.tab) | ||
# - Output: Matriz de cuentas (CSV y RData) | ||
####### | ||
|
||
# qlogin | ||
# module load r/4.0.2 | ||
# R | ||
|
||
# --- Load packages ---------- | ||
library(DESeq2) | ||
|
||
# --- Load data ----- | ||
# Cargar archivos | ||
indir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/STAR_output" | ||
outdir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/" | ||
figdir <- '/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/figures/' | ||
|
||
#Cargar variable "counts", proveniente del script "load_data_inR.R" | ||
load("/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/counts/STAR_counts.RData") | ||
samples <- metadata$sample_id # Extraer los nombres de los Transcriptomas | ||
metadata$type <- as.factor(metadata$type) # convertir a factor | ||
|
||
# --- DEG ---- | ||
counts <- counts[5:129239, ] # Filtramos los rows con informacion general sobre el mapeo | ||
counts <- counts[which(rowSums(counts) > 10),] #Seleccionamos genes con mas de 10 cuentas | ||
|
||
# Convertir al formato dds | ||
dds <- DESeqDataSetFromMatrix(countData = counts, | ||
colData = metadata, design = ~type) #Se hace un DESeqDataSet para realizar un analisis | ||
|
||
dim(dds) # checar las dimensiones | ||
#[1] 33470 8 | ||
|
||
## -- Asignar la referencia y generar contrastes ----- | ||
# Las comparaciones se realizan por pares | ||
#Si no se indica de manera explicita que se va a comparara, lo va a tomar de manera alfabetica, | ||
# en este caso se indica que control es la referencia, | ||
dds$type <- relevel(dds$type, ref = "CONTROL") | ||
|
||
## --- Obtener archivo dds ---- | ||
|
||
dds <- DESeq(dds) | ||
|
||
# estimating size factors | ||
# estimating dispersions | ||
# gene-wise dispersion estimates | ||
# mean-dispersion relationship | ||
# final dispersion estimates | ||
# fitting model and testing | ||
|
||
# Obtener la lista de coeficientes | ||
resultsNames(dds) | ||
|
||
# [1] "Intercept" "type_PLS_15min_vs_CONTROL" | ||
# [3] "type_PLS_30min_vs_CONTROL" "type_PLS_4h_vs_CONTROL" | ||
|
||
# Guardar la salida del diseno | ||
save(metadata, dds, file = paste0(outdir, 'dds_Times_vs_control.RData')) | ||
|
||
## --- Normalizacion de los datos --------- | ||
# Opcion 1. log2(n + 1) | ||
ntd <- normTransform(dds) | ||
|
||
# Opcion 2. regularized logarithm or rlog | ||
# Normalizacion de las cuentas por logaritmo y podrias hacer el analisis usando este objeto en lugar del dds | ||
ddslog <- rlog(dds, blind = F) | ||
|
||
# Almacenar la grafica | ||
png(file = paste0(figdir, "PCA_rlog.png")) | ||
plt <- plotPCA(ddslog, intgroup = "type") | ||
print(plt) | ||
dev.off() | ||
|
||
# Opcion 3. vsd | ||
# Estima la tendencia de dispersion de los datos y calcula la varianza, hace una normalizacion de las | ||
# cuentas con respecto al tamaño de la libreria | ||
vsdata <- vst(dds, blind = F) | ||
|
||
# Almacenar la grafica | ||
png(file = paste0(figdir, "PCA_vsd.png")) | ||
plt <- plotPCA(vsdata, intgroup = "type") | ||
print(plt) | ||
dev.off() | ||
|
||
# Guardar la salida del diseno (vsdata) | ||
save(metadata, vsdata, file = paste0(outdir, 'vst_Times_vs_control.RData')) | ||
|
||
# En la grafica de las primeras dos componentes principales son notorias las diferencias | ||
# entre tipos de muestras con respecto a las componente principales que capturan su varianza, | ||
# cada componente principal representa una combinacion lineal de las variables (en este caso genes) | ||
# que explican la mayor cantidad de varianza en nuestros datos (las cuentas). | ||
|
||
|
||
## ---- Obtener informacion del contraste 1 ---- | ||
# results(dds, contrast=c("condition","treated","untreated")) | ||
res_15t <- results(dds, name = "type_PLS_15min_vs_CONTROL") | ||
res_15t | ||
|
||
summary(res_15t) | ||
|
||
# out of 33470 with nonzero total read count | ||
# adjusted p-value < 0.1 | ||
# LFC > 0 (up) : 1657, 5% | ||
# LFC < 0 (down) : 811, 2.4% | ||
# outliers [1] : 0, 0% | ||
# low counts [2] : 18169, 54% | ||
# (mean count < 12) | ||
# [1] see 'cooksCutoff' argument of ?results | ||
# [2] see 'independentFiltering' argument of ?results | ||
|
||
# Guardar los resultados | ||
write.csv(res_15t, file=paste0(outdir, 'DE_15min_vs_control.csv')) | ||
|
||
## ---- Obtener informacion del contraste 2 ---- | ||
res_30t <- results(dds, name = "type_PLS_30min_vs_CONTROL") | ||
res_30t | ||
|
||
summary(res_30t) | ||
|
||
# out of 33470 with nonzero total read count | ||
# adjusted p-value < 0.1 | ||
# LFC > 0 (up) : 1309, 3.9% | ||
# LFC < 0 (down) : 1300, 3.9% | ||
# outliers [1] : 0, 0% | ||
# low counts [2] : 19467, 58% | ||
# (mean count < 15) | ||
# [1] see 'cooksCutoff' argument of ?results | ||
# [2] see 'independentFiltering' argument of ?results | ||
|
||
# Guardar los resultados | ||
write.csv(res_30t, file=paste0(outdir, 'DE_30min_vs_control.csv')) | ||
|
||
## ---- Obtener informacion del contraste 3 ---- | ||
res_4t <- results(dds, name = "type_PLS_4h_vs_CONTROL") | ||
res_4t | ||
|
||
summary(res_4t) | ||
|
||
# out of 33470 with nonzero total read count | ||
# adjusted p-value < 0.1 | ||
# LFC > 0 (up) : 4289, 13% | ||
# LFC < 0 (down) : 3409, 10% | ||
# outliers [1] : 0, 0% | ||
# low counts [2] : 6489, 19% | ||
# (mean count < 3) | ||
# [1] see 'cooksCutoff' argument of ?results | ||
# [2] see 'independentFiltering' argument of ?results | ||
|
||
# Guardar los resultados | ||
write.csv(res_4t, file=paste0(outdir, 'DE_4h_vs_control.csv')) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
###### | ||
# Script : Visualizacion grafica de los resultados de DEG | ||
# Author: Sofia Salazar, Diego Ramirez y Evelia Coss | ||
# Date: 27/02/2024 | ||
# Description: El siguiente script nos permite realiza el Analisis de expresion Diferencial | ||
# a partir de los datos provenientes del alineamiento de STAR a R, | ||
# Primero correr el script "load_data_inR.R" | ||
# Usage: Correr las lineas en un nodo de prueba en el cluster. | ||
# Arguments: | ||
# - Input: metadata.csv, cuentas de STAR (Terminacion ReadsPerGene.out.tab) | ||
# - Output: Matriz de cuentas (CSV y RData) | ||
####### | ||
|
||
# qlogin | ||
# module load r/4.0.2 | ||
# R | ||
|
||
# --- Load packages ---------- | ||
library(dplyr) | ||
library(pheatmap) | ||
library(ggplot2) | ||
|
||
# --- Load data ----- | ||
# Cargar archivos | ||
indir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/STAR_output" | ||
outdir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/" | ||
figdir <- '/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/figures/' | ||
|
||
#Cargar variable "dds", proveniente del script "DEG_analysis.R" | ||
load("/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/dds_Times_vs_control.RData") | ||
|
||
#Cargar variable "vsdata", proveniente del script "DEG_analysis.R" | ||
load("/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/vst_Times_vs_control.RData") | ||
|
||
#Cargar variable "res_30t", proveniente del script "DEG_analysis.R" | ||
load("/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/DE_30min_vs_control.csv") | ||
|
||
# ---- volcano plot ---- | ||
df <- as.data.frame(res_30t) | ||
# padj 0.05 y log2FoldChange de 2 | ||
df <- df %>% | ||
mutate(Expression = case_when(log2FoldChange >= 2 & padj < 0.05 ~ "Up-regulated", | ||
log2FoldChange <= -(2) & padj < 0.05 ~ "Down-regulated", | ||
TRUE ~ "Unchanged")) | ||
|
||
# visualizacion | ||
png(file = paste0(figdir, "VolcanoPlot_30min_vs_control.png")) | ||
|
||
ggplot(df, aes(log2FoldChange, -log(padj,10))) + | ||
geom_point(aes(color = Expression), size = 0.7) + | ||
labs(title = "30min vs control") + | ||
xlab(expression("log"[2]*"FC")) + | ||
ylab(expression("-log"[10]*"p-adj")) + | ||
scale_color_manual(values = c("dodgerblue3", "gray50", "firebrick3")) + | ||
guides(colour = guide_legend(override.aes = list(size=1.5))) + | ||
geom_vline(xintercept = 2, linetype = "dashed", color = "black", alpha = 0.5) + | ||
geom_vline(xintercept = -(2), linetype = "dashed", color = "black", alpha = 0.5) + | ||
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black", alpha = 0.5) | ||
|
||
dev.off() | ||
|
||
# --- Heatmap (vsd) ----- | ||
topGenes <- head(order(res_30t$padj), 20) # Obtener el nombre de los 20 genes con p value mas significativo | ||
|
||
png(file = paste0(figdir, "Heatmap_vsd_topgenes.png")) | ||
pheatmap(assay(vsdata)[topGenes,], cluster_rows=FALSE, show_rownames=TRUE, | ||
cluster_cols=FALSE) | ||
dev.off() | ||
|
||
# --- Heatmap (por contrastes) (log2 Fold Change) ----- | ||
betas <- coef(dds) | ||
colnames(betas) | ||
|
||
# [1] "Intercept" "type_PLS_15min_vs_CONTROL" | ||
# [3] "type_PLS_30min_vs_CONTROL" "type_PLS_4h_vs_CONTROL" | ||
|
||
mat <- betas[topGenes, -c(1,2)] # crear la matriz con el topgene de genes | ||
|
||
# Filtro de 3 log2foldchange | ||
thr <- 3 | ||
mat[mat < -thr] <- -thr | ||
mat[mat > thr] <- thr | ||
|
||
# Almacenar la grafica | ||
png(file = paste0(figdir, "Heatmap_log2FoldChage_topgenes.png")) | ||
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101), | ||
cluster_col=FALSE) | ||
dev.off() | ||
|
||
# https://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#time-course-experiments | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
###### | ||
# Script : Importar datos de cuentas en R | ||
# Author: Sofia Salazar, Diego Ramirez y Evelia Coss | ||
# Date: 27/02/2024 | ||
# Description: El siguiente script nos permite importar los datos provenientes del alineamiento de STAR a R, | ||
# para el posterior analisis de Expresion diferencial con DESEq2. | ||
# Usage: Correr las lineas en un nodo de prueba en el cluster. | ||
# Arguments: | ||
# - Input: metadata.csv, cuentas de STAR (Terminacion ReadsPerGene.out.tab) | ||
# - Output: Matriz de cuentas (CSV y RData) | ||
####### | ||
|
||
# qlogin | ||
# module load r/4.0.2 | ||
# R | ||
|
||
# --- Load data ----- | ||
# Cargar archivos | ||
indir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/STAR_output" | ||
outdir <- "/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/results/" | ||
setwd(indir) | ||
files <- dir(pattern = "ReadsPerGene.out.tab") | ||
counts <- c() # esta sera la matriz | ||
for(i in seq_along(files)){ | ||
x <- read.table(file = files[i], sep = "\t", header = F, as.is = T) | ||
# as.is para no convertir tipo de datos | ||
counts <- cbind(counts, x[,2]) | ||
} | ||
|
||
# Cargar Metadatos | ||
metadata <- read.csv("/mnt/Guanina/bioinfo24/data/Clase_RNASeq2024/metadata.csv", header = F) | ||
# Renombrar columnas con el ID de los transcriptomas | ||
colnames(metadata) <- c("sample_id", "type") | ||
# Convertir a formato dataframe | ||
counts <- as.data.frame(counts) | ||
rownames(counts) <- x[,1] # Renombrar las filas con el nombre de los genes | ||
colnames(counts) <- sub("_ReadsPerGene.out.tab", "", files) | ||
|
||
# almacenar metadata y matriz de cuentas | ||
save(metadata, counts, file = paste0(outdir, 'counts/STAR_counts.RData')) | ||
|
||
# Guardar informacion de ejecucion | ||
sessionInfo() |