Skip to content

Commit

Permalink
feat: metadata and motif
Browse files Browse the repository at this point in the history
  • Loading branch information
nahid18 committed Jun 18, 2024
1 parent 411a086 commit 9faa71d
Show file tree
Hide file tree
Showing 11 changed files with 211 additions and 83 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export(plot_clonal_prop_per_group)
export(plot_clonal_prop_per_sample)
export(plot_clonotype_abundance)
export(plot_diversity_index)
export(plot_motif_counts)
export(plot_motifs)
export(plot_reads_group_abundance)
exportClasses(Basic)
exportClasses(Clonality)
Expand Down
27 changes: 18 additions & 9 deletions R/get_study.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
source("R/utils-diversity.R")
source("R/utils-clonality.R")
source("R/utils-basic.R")
source("R/utils-motif.R")
source("R/utils-hill.R")

#' Get pyTCR data of a project
Expand All @@ -14,25 +15,33 @@ source("R/utils-hill.R")
#'
#' @examples
#' \dontrun{
#' proj <- reTCR::get_study(id = "PRJNA473147", attr_col = "cmv_status")
#' proj <- reTCR::get_study(id="PRJNA473147", attr_col="cmv_status")
#' }
get_study <- function(id, attr_col) {
stopifnot(is.character(id) && nchar(id) > 0)
filename <- paste0(id, "_mixcr_metadata_file.csv")
filepath <- system.file("extdata", filename, package = "reTCR")
data <- utils::read.csv(filepath)

basic <- .get_basic(data = data, attr_col = attr_col)
diversity <- .get_diversity(data = data, attr_col = attr_col)
clonality <- .get_clonality(data = data, attr_col = attr_col)
hill <- .get_hill_numbers(df = data)
samplefile <- paste0(id, "_metadata_file.csv")
samplepath <- system.file("extdata", samplefile, package = "reTCR")
sampledata <- utils::read.csv(samplepath)

mixfile <- paste0(id, "_mixcr_metadata_file.csv")
mixpath <- system.file("extdata", mixfile, package = "reTCR")
mixdata <- utils::read.csv(mixpath)

basic <- .get_basic(data = mixdata, attr_col = attr_col)
diversity <- .get_diversity(data = mixdata, attr_col = attr_col)
clonality <- .get_clonality(data = mixdata, attr_col = attr_col)
motif <- .get_motif(data = mixdata, attr_col = attr_col)
hill <- .get_hill_numbers(df = mixdata)

return(methods::new(
"reTCRProj",
data = data,
data = mixdata,
metadata = sampledata,
basic = basic,
diversity = diversity,
clonality = clonality,
motif = motif,
hill = hill
))
}
20 changes: 13 additions & 7 deletions R/retcr_proj_class.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,34 +72,40 @@ setClass("Clonality",
#'
#' A class to represent motif metrics
#'
#' @slot aa_spectratype aa spectratype data
#' @slot aa_max_spectratype aa max spectratype data
#' @slot aa_motif_count aa motif count data
#' @slot aa_spectra Amino acid spectratype
#' @slot aa_max_spectra Amino acid max spectratype
#' @slot aa_motif_count Amino acid motif count table
#' @slot aa_most_motif Most abundant amino acid motif per sample
#' @exportClass Motif
setClass("Motif",
slots = c(
aa_spectratype = "data.frame",
aa_max_spectratype = "data.frame",
aa_motif_count = "data.frame"
aa_spectra = "data.frame",
aa_max_spectra = "data.frame",
aa_motif_count = "data.frame",
aa_most_motif = "data.frame"
)
)

#' Class "reTCRProj"
#'
#' A class to represent a reTCR project
#'
#' @slot data contains the main data for the project
#' @slot data contains the MIXCR data for the project
#' @slot metadata contains the sample metadata
#' @slot basic contains basic metrics
#' @slot diversity contains diversity metrics
#' @slot clonality contains clonality metrics
#' @slot motif contains motif metrics
#' @slot hill contains Hill numbers
#' @exportClass reTCRProj
setClass("reTCRProj",
slots = c(
data = "data.frame",
metadata = "data.frame",
basic = "Basic",
diversity = "Diversity",
clonality = "Clonality",
motif = "Motif",
hill = "data.frame"
)
)
25 changes: 15 additions & 10 deletions R/utils-motif-plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,41 @@
#' This function plots the motif counts.
#'
#' @param data A data frame containing the motif counts.
#' @param threshold the minimum count threshold (default: 60).
#' @param k the minimum count threshold (default: 60).
#' @param attr_col Character. Attribute column name.
#'
#' @return A ggplot2 object representing the motif counts plot.
#' @export
#'
#' @examples
#' \dontrun{
#' proj <- reTCR::get_study(id="PRJNA473147", attr_col="cmv_status")
#' reTCR::plot_motif_counts(proj@motif@aa_motif_count, 20)
#' reTCR::plot_motifs(proj@motif@aa_motif_count, "cmv_status", 50)
#' }
plot_motif_counts <- function(data, threshold = 60) {
data <- dplyr::filter(data, count > threshold)
df <- stats::aggregate(count ~ cmv_status + motif, data = data, FUN = length)
plot_motifs <- function(data, attr_col, k = 60) {
data <- dplyr::filter(data, count > k)
df <- stats::aggregate(
count ~ cmv_status + motif,
data = data,
FUN = length
)
colnames(df)[3] <- "number_of_samples"
df <- dplyr::filter(df, number_of_samples > 2)
data <- dplyr::inner_join(data, df, by = c("cmv_status", "motif"))
data <- dplyr::inner_join(data, df, by = c(attr_col, "motif"))

ggplot2::ggplot(
data,
ggplot2::aes(
x = motif,
y = count,
color = cmv_status
color = attr_col
)
) +
ggplot2::geom_jitter(width = 0.2, size = 3) +
ggplot2::labs(
x = "Amino acid motif",
y = "Count",
color = "CMV Status"
x = "motif",
y = "count",
color = "status"
) +
ggplot2::theme_bw() +
ggplot2::theme(
Expand Down
72 changes: 35 additions & 37 deletions R/utils-motif.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,4 @@
.get_aa_spectratype <- function(data) {
aa_spec <- data %>%
dplyr::mutate(aa_length = nchar(cdr3aa)) %>%
dplyr::group_by(sample, cmv_status, aa_length) %>%
dplyr::summarise(
spectratype = sum(freq, na.rm = TRUE),
.groups = "drop"
)
return(aa_spec)
}

.get_most_freq_aa_spectratype <- function(aa_spec) {
.get_most_freq_aa_spec <- function(aa_spec) {
aa_max <- aa_spec %>%
dplyr::group_by(sample) %>%
dplyr::slice_max(spectratype, n = 1)
Expand All @@ -27,51 +16,60 @@
return(aa_motif_list)
}

.get_aa_motif_count <- function(data, aa_max) {
.get_aa_motif_count <- function(data, aa_max, attr_col) {
return(aa_motif_count)
}

.get_motif <- function(data, attr_col) {
aa_spec <- data %>%
dplyr::mutate(aa_length = nchar(cdr3aa)) %>%
dplyr::group_by(sample, !!rlang::sym(attr_col), aa_length) %>%
dplyr::summarise(
spectratype = sum(freq, na.rm = TRUE),
.groups = "drop"
)

aa_max <- aa_spec %>%
dplyr::group_by(sample) %>%
dplyr::slice_max(spectratype, n = 1)

# aa motif count table
samples <- unique(data$sample)
aa_motif <- dplyr::tibble(
aa_motif_tb <- dplyr::tibble(
motif = character(),
count = numeric(),
sample = character()
)

for (sample in samples) {
data_temp <- data %>% dplyr::filter(.data$sample == !!sample)
motif_counts <- .get_aa_motif_list(data_temp$cdr3aa, 6)
data_temp <- data %>% dplyr::filter(data$sample == !!sample)
motif_list <- .get_aa_motif_list(data_temp$cdr3aa, 6)
data_temp <- dplyr::tibble(
motif = names(motif_counts),
count = as.numeric(unlist(motif_counts)),
motif = names(motif_list),
count = as.numeric(unlist(motif_list)),
sample = sample
)
aa_motif <- dplyr::bind_rows(aa_motif, data_temp)
aa_motif_tb <- dplyr::bind_rows(aa_motif_tb, data_temp)
}

aa_motif_count <- dplyr::left_join(
aa_motif,
dplyr::select(aa_max, sample, cmv_status),
aa_motif_tb,
dplyr::select(aa_max, sample, !!rlang::sym(attr_col)),
by = "sample"
)
return(aa_motif_count)
}

.get_most_motif_per_sample <- function(data) {
aa_max <- data %>%
# most abundant motif per sample
aa_most_abundant_motif <- aa_motif_count %>%
dplyr::group_by(sample) %>%
dplyr::slice_max(freq, n = 1)
return(aa_max)
}

.get_motif <- function(data) {
aa_spec <- .get_aa_spectratype(data)
aa_max <- .get_most_freq_aa_spectratype(aa_spec)
aa_motif_count <- .get_aa_motif_count(data, aa_max)
aa_most_abundant_motif <- .get_most_motif_per_sample(aa_motif_count)
dplyr::slice_max(count, n = 1)

return(
methods::new(
"Motif",
aa_spectratype = aa_spec,
aa_max_spectratype = aa_max,
aa_spectra = aa_spec,
aa_max_spectra = aa_max,
aa_motif_count = aa_motif_count,
aa_most_abundant_motif = aa_most_abundant_motif
aa_most_motif = aa_most_abundant_motif
)
)
}
44 changes: 35 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ library(reTCR)

proj <- reTCR::get_study(id="PRJNA473147", attr_col="cmv_status")

# view data
# get MIXCR data
print(proj@data)

# get sample metadata
print(proj@metadata)
```

## 1. Basic Metrics
## 1. Basic metrics

```r
# summary data
Expand Down Expand Up @@ -57,7 +60,7 @@ print(proj@basic@convergence)
print(proj@basic@spectratype)
```

### 1.2 Statistical Analysis
### 1.2 Statistical analysis

D'Agostino normality test

Expand All @@ -77,7 +80,7 @@ library(stats)
shapiro.test(proj@basic@summary_data$clonotype_count)
```

## 2. Diversity Metrics
## 2. Diversity metrics

```r
# shannon index values
Expand All @@ -96,7 +99,7 @@ print(proj@diversity@chao1)
print(proj@diversity@gini_coeff)
```

### 2.2 Diversity Visualization
### 2.2 Diversity visualization

```r
reTCR::plot_diversity_index(proj@diversity@shannon, "shannon_index", "cmv_status")
Expand All @@ -114,7 +117,7 @@ reTCR::plot_diversity_index(proj@diversity@chao1, "chao1", "cmv_status")
reTCR::plot_diversity_index(proj@diversity@gini_coeff, "gini_coeff", "cmv_status")
```

## 3. Clonality Metrics
## 3. Clonality metrics

```r
# most frequent clonotypes
Expand All @@ -139,7 +142,7 @@ print(proj@clonality@abundance_top)
print(proj@clonality@abundance_rare)
```

### 3.2 Clonality Visualization
### 3.2 Clonality visualization

```r
# let's store clonal proportion in `clonal_data`
Expand All @@ -161,7 +164,30 @@ reTCR::plot_reads_group_abundance(proj@clonality@abundance_top)
reTCR::plot_reads_group_abundance(proj@clonality@abundance_rare)
```

## 4. Segment usage metrics
## 4. Motif metrics

```r
# Amino acid spectratype
print(proj@motif@aa_spectra)

# Amino acid max spectratype
print(proj@motif@aa_max_spectra)

# Amino acid motif count table
print(proj@motif@aa_motif_count)

# Most abundant amino acid motif per sample
print(proj@motif@aa_most_motif)
```

### 4.2 Motif visualization

```r
# plot amino acid motif counts
reTCR::plot_motifs(proj@motif@aa_motif_count, "cmv_status", 20)
```

## 5. Segment usage metrics

```r
# Top n highest clonotypes
Expand All @@ -171,7 +197,7 @@ reTCR::get_top_n_clonotypes(proj@data, 15, "cmv_status")
reTCR::get_bottom_n_clonotypes(proj@data, 20, "cmv_status")
```

## 5. Hill numbers
## 6. Hill numbers

```r
# get hill numbers
Expand Down
Loading

0 comments on commit 9faa71d

Please sign in to comment.