From a02f7e1b837bb6d4cdf2ae9aed44fe2511868554 Mon Sep 17 00:00:00 2001 From: Kaushik Acharya Date: Wed, 4 Mar 2020 11:40:12 +0530 Subject: [PATCH 1/2] typos correction --- R/benchmarks.R | 22 +++++++++++----------- R/data.R | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/benchmarks.R b/R/benchmarks.R index 635e9df..13992fd 100755 --- a/R/benchmarks.R +++ b/R/benchmarks.R @@ -1,9 +1,9 @@ #' Benchmarking comorbidities task #' -#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts. +#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts. #' We compare how similar the embeddings for a pair of concepts are by computing the #' cosine similarity of their corresponding vectors, -#' and use this similarity to assess whether or not thetwo concepts are related. +#' and use this similarity to assess whether or not the two concepts are related. #' \code{\link{benchmark_comorbidities}} focuses on an embedding's ability to identify comorbidities. #' A comorbidity is a disease or condition that frequently accompanies a primary diagnosis. #' @@ -79,10 +79,10 @@ benchmark_comorbidities <- function(embedding_df,sig_level=0.05, bootstraps=1000 #' Benchmarking causative task #' -#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts. +#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts. #' We compare how similar the embeddings for a pair of concepts are by computing the #' cosine similarity of their corresponding vectors, -#' and use this similarity to assess whether or not thetwo concepts are related. +#' and use this similarity to assess whether or not the two concepts are related. #' \code{\link{benchmark_causative}} assesses an embedding's ability to recover causes from #' the UMLS' table (MRREL) of entities known to be the cause of a certain result. 
#' @@ -155,10 +155,10 @@ benchmark_causative <- function(embedding_df,sig_level = 0.05, bootstraps=10000, #' Benchmarking NDF RT task #' -#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts. +#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts. #' We compare how similar the embeddings for a pair of concepts are by computing the #' cosine similarity of their corresponding vectors, -#' and use this similarity to assess whether or not thetwo concepts are related. +#' and use this similarity to assess whether or not the two concepts are related. #' \code{\link{benchmark_ndf_rt}} assesses an embedding's ability to power to detect "may treat" and "may prevent" #' relationships using bootstrap scores of random drug-disease pairs. #' @@ -239,12 +239,12 @@ benchmark_ndf_rt <- function(embedding_df, sig_level = 0.05, bootstraps=10000){ return(df) } -#' Benchmarking semnatic type task +#' Benchmarking semantic type task #' -#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts. +#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts. #' We compare how similar the embeddings for a pair of concepts are by computing the #' cosine similarity of their corresponding vectors, -#' and use this similarity to assess whether or not thetwo concepts are related. +#' and use this similarity to assess whether or not the two concepts are related. #' \code{\link{benchmark_semantic_type}} assesses an ability to identify semantic types. Semantic types are #' meta-information about which category a concept belongs to, and these categories are arranged in a hierarchy. 
#' @@ -293,10 +293,10 @@ benchmark_semantic_type <- function(embedding_df, sig_level = 0.05, bootstraps=1 #' Benchmarking similarity task #' -#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts. +#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts. #' We compare how similar the embeddings for a pair of concepts are by computing the #' cosine similarity of their corresponding vectors, -#' and use this similarity to assess whether or not thetwo concepts are related. +#' and use this similarity to assess whether or not the two concepts are related. #' \code{link{benchmark_similarity}} reports the spearman correlation between the human assessment scores #' and cosine similarity from the embeddings. #' diff --git a/R/data.R b/R/data.R index d25b0a2..62972d5 100755 --- a/R/data.R +++ b/R/data.R @@ -1,6 +1,6 @@ #' UMLS Semantic Type Data #' -#' Semnatic type data and English descriptions for 3,063,795 CUIs +#' Semantic type data and English descriptions for 3,063,795 CUIs #' #' @docType data #' From c33492d3eedf293c5348b10a68f7677490dbd6a4 Mon Sep 17 00:00:00 2001 From: Kaushik Acharya Date: Wed, 4 Mar 2020 11:41:56 +0530 Subject: [PATCH 2/2] constructing pointwise mutual information without converting sparse matrix into a regular matrix --- R/word2vec_fit.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/R/word2vec_fit.R b/R/word2vec_fit.R index 76cc3ee..839ff91 100755 --- a/R/word2vec_fit.R +++ b/R/word2vec_fit.R @@ -1,6 +1,6 @@ #' Construct pointwise mutual information matrix #' -#' @param coccur The co-occurrence matrix +#' @param cooccur The co-occurrence matrix #' @param singletons The dataframe of CUIs and their counts #' @param N The number of bins #' @param smooth The smoothing factor @@ -8,14 +8,13 @@ #' @return A dataframe of pointwise mutual information #' @export construct_pmi <- 
function(cooccur,singletons,N,smooth=0.75) { - cooccur_matrix <- as.matrix(cooccur) + singletons$Count <- singletons$Count^smooth/N^smooth + concept_list <- row.names(cooccur) + nz <- Matrix::which(cooccur != 0, arr.ind = TRUE) # masking the lower half of the matrix because cooccur will always be symmetric # don't want to double count CUI1-CUI2 as CUI2-CUI1 - cooccur_matrix[lower.tri(cooccur_matrix, diag = FALSE)] <- 0 - singletons$Count <- singletons$Count^smooth/N^smooth - concept_list <- row.names(cooccur_matrix) - nz <- which(cooccur_matrix != 0, arr.ind = TRUE) - pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur_matrix[nz]/N, stringsAsFactors = F) + nz <- nz[which(nz[,1] <= nz[,2]),] + pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur[nz]/N, stringsAsFactors = F) pmi_df <- pmi_df %>% dplyr::inner_join(singletons,by=c("Concept_1" = "CUI")) %>% dplyr::rename(Concept_1_Prob=.data$Count) %>%