beamandrew · kaushikacharya · Mar 4, 2020 · Mar 4, 2020
diff --git a/R/benchmarks.R b/R/benchmarks.R
@@ -1,9 +1,9 @@
 #' Benchmarking comorbidities task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_comorbidities}} focuses on an embedding's ability to identify comorbidities.
-#' A comorbidity is a disease or condition that frequently accompanies a  primary  diagnosis.
+#' A comorbidity is a disease or condition that frequently accompanies a primary diagnosis.
 #'
@@ -79,10 +79,10 @@ benchmark_comorbidities <- function(embedding_df,sig_level=0.05, bootstraps=1000
 
 #' Benchmarking causative task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_causative}} assesses an embedding's ability to recover causes from
 #' the UMLS' table (MRREL) of entities known to be the cause of a certain result.
 #'
@@ -155,10 +155,10 @@ benchmark_causative <- function(embedding_df,sig_level = 0.05, bootstraps=10000,
 
 #' Benchmarking NDF RT task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
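-#' \code{\link{benchmark_ndf_rt}} assesses an embedding's ability to power to detect "may treat" and "may prevent"
+#' \code{\link{benchmark_ndf_rt}} assesses an embedding's power to detect "may treat" and "may prevent"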
 #' relationships using bootstrap scores of random drug-disease pairs.
 #'
@@ -239,12 +239,12 @@ benchmark_ndf_rt <- function(embedding_df, sig_level = 0.05, bootstraps=10000){
   return(df)
 }
 
-#' Benchmarking semnatic type task
+#' Benchmarking semantic type task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
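-#' \code{\link{benchmark_semantic_type}} assesses an ability to identify semantic types. Semantic types are
+#' \code{\link{benchmark_semantic_type}} assesses an embedding's ability to identify semantic types. Semantic types are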
 #' meta-information about which category a concept belongs to, and these categories are arranged in a hierarchy.
 #'
@@ -293,10 +293,10 @@ benchmark_semantic_type <- function(embedding_df, sig_level = 0.05, bootstraps=1
 
 #' Benchmarking similarity task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
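-#' \code{link{benchmark_similarity}} reports the spearman correlation between the human assessment scores
+#' \code{\link{benchmark_similarity}} reports the Spearman correlation between the human assessment scores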
 #' and cosine similarity from the embeddings.
 #'

diff --git a/R/data.R b/R/data.R
@@ -1,6 +1,6 @@
 #' UMLS Semantic Type Data
 #'
-#' Semnatic type data and English descriptions for 3,063,795 CUIs
+#' Semantic type data and English descriptions for 3,063,795 CUIs
 #'
 #' @docType data
 #'

diff --git a/R/word2vec_fit.R b/R/word2vec_fit.R
@@ -1,21 +1,20 @@
 #' Construct pointwise mutual information matrix
 #'
-#' @param coccur The co-occurrence matrix
+#' @param cooccur The co-occurrence matrix
 #' @param singletons The dataframe of CUIs and their counts
 #' @param N The number of bins
 #' @param smooth The smoothing factor
 #'
 #' @return A dataframe of pointwise mutual information
 #' @export
 construct_pmi <- function(cooccur,singletons,N,smooth=0.75) {
-  cooccur_matrix <- as.matrix(cooccur)
+  singletons$Count <- singletons$Count^smooth/N^smooth
+  concept_list <- row.names(cooccur)
+  nz <- Matrix::which(cooccur != 0, arr.ind = TRUE)
   # masking the lower half of the matrix because cooccur will always be symmetric
   # don't want to double count CUI1-CUI2 as CUI2-CUI1
-  cooccur_matrix[lower.tri(cooccur_matrix, diag = FALSE)] <- 0
-  singletons$Count <- singletons$Count^smooth/N^smooth
-  concept_list <- row.names(cooccur_matrix)
-  nz <- which(cooccur_matrix != 0, arr.ind = TRUE)
-  pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur_matrix[nz]/N, stringsAsFactors = F)
+  nz <- nz[which(nz[,1] <= nz[,2]),]
+  pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur[nz]/N, stringsAsFactors = F)
   pmi_df <- pmi_df %>%
     dplyr::inner_join(singletons,by=c("Concept_1" = "CUI")) %>%
     dplyr::rename(Concept_1_Prob=.data$Count) %>%