In this repo, we will be doing:
(i) Assigning cell labels from reference data: or Automatic cell annotation for bladder cancer dataset used in scRNA-seq analysis using the reference data from Tabula Sapiens.
(ii) Assigning cell labels to sub-cluster of epithelial cells from gene sets (to be added)
# loading libraries
library(Seurat)
library(SeuratData)
library(SeuratDisk)
# Set the destination directory for saving the downloaded file
destination_dir <- getwd()
# Set the URL of the file to be downloaded
file_url <- "https://ndownloader.figshare.com/files/27388874"
# Define the file name for saving the downloaded file
file_name <- "TS_Bladder.h5ad"
# Download the file and save it to the specified directory
#download.file(file_url, file.path(destination_dir, file_name))
in python:
# import libs
import scanpy as sc
from scipy import io
import os
# Set working directory
os.chdir("C://Users/qaedi/OneDrive - Queen's University/Documents/scRNA_cell_annotation")
# creating a dir
!mkdir matrix_files
# reading h5ad file
adata = sc.read_h5ad('TS_Bladder.h5ad')
adata
#AnnData object with n_obs × n_vars = 21568 × 58833
# obs: 'Annotation', 'Predictability', 'Manually Annotated', 'Donor', 'Method', 'Organ', #'Compartment', 'Anatomical Information'
# var: 'gene_symbol', 'ensembl_id', 'gene_length'
# uns: 'Annotation_colors', 'Compartment_colors', 'Donor_colors', 'Manually Annotated_colors', #'Method_colors', 'Organ_colors', 'Propagated.Annotationcollapsed_colors', '_scvi', 'donor_colors', #'leiden', 'method_colors', 'neighbors', 'tissue_colors', 'umap'
# obsm: 'X_umap'
# layers: 'counts', 'raw_counts'
# obsp: 'connectivities', 'distances'
# Ensuring that the raw count is the main matrix
adata.X = adata.layers['raw_counts']
# generating files needed to create Seurat object
with open('matrix_files/barcodes.tsv', 'w') as f:
for item in adata.obs_names:
f.write(item + '\n')
with open('matrix_files/features.tsv', 'w') as f:
for item in ['\t'.join([x,x,'Gene Expression']) for x in adata.var_names]:
f.write(item + '\n')
io.mmwrite('matrix_files/matrix', adata.X.T)
# gzipping the files
import gzip
import glob
file_list = glob.glob("matrix_files/*")
for file_path in file_list:
with open(file_path, "rb") as file_in:
with gzip.open(file_path + ".gz", "wb") as file_out:
file_out.write(file_in.read())
Now we have a directory with all needed files to create a Seurat object:
library(Seurat)
library(SingleR)
library(patchwork)
library(cowplot)
# create seurat object
## locating files
raw_data <- Read10X(data.dir = "matrix_files/")
## reading metadata
metadata <- read.csv("ref_metadata.csv")
rownames(metadata) <- metadata$X
## creating Seurat object
su <- CreateSeuratObject(counts = raw_data, meta.data = metadata)
## retain cells that are manually annotated
su <- subset(su, subset = Manually.Annotated == TRUE)
## creating a group for batch id
su$batch_id <- paste0(su$Donor, "_", su$Method)
#____________________Exploring source of variation______________#
## Normalizing the counts
su <- NormalizeData(object = su, normalization.method = "LogNormalize")
# Find variable feature
su <- FindVariableFeatures(su,
selection.method = "vst",
nfeatures = 2000,
verbose = TRUE)
## Scaling
su <- ScaleData(su)
## Perform PCA
su <- RunPCA(su)
## Plot the PCA colored by cell cycle phase
no_split <- DimPlot(su,
reduction = "pca",
group.by= "batch_id")
with_split <- DimPlot(su,
reduction = "pca",
group.by= "batch_id",
split.by= "batch_id")
no_split + with_split
So there is no difference between batches , so we can proceed with file as is.
## Converting Seurat to sce object
refSce <- as.SingleCellExperiment(su)
hsu <- readRDS("~/scRNA/github/harmonized_seurat.RDS")
# Adding cluster data
hsu$clusters <- Idents(hsu)
query_su <- CreateSeuratObject(counts = GetAssayData(object = hsu, assay = "RNA"), meta.data = hsu@meta.data)
# convert to sce
querySce <- as.SingleCellExperiment(query_su)
pred <- SingleR(test=querySce, ref=refSce,
labels=refSce$Annotation, de.method="wilcox")
# Checking rownames
all(rownames(query_su@meta.data) == rownames(data.frame(pred)))
# Adding predicted classes to metadata
query_su$singleR_pred <- data.frame(pred)$pruned.labels
library(pheatmap)
tab <- table(query_su$clusters, query_su$singleR_pred)
pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101))
As it showed in the below plot, the overlap between Tabula Sapiens labels and our manual labels is significant: