From a02f7e1b837bb6d4cdf2ae9aed44fe2511868554 Mon Sep 17 00:00:00 2001
From: Kaushik Acharya <acharya.kaushik@gmail.com>
Date: Wed, 4 Mar 2020 11:40:12 +0530
Subject: [PATCH 1/2] typos correction

---
 R/benchmarks.R | 22 +++++++++++-----------
 R/data.R       |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/benchmarks.R b/R/benchmarks.R
index 635e9df..13992fd 100755
--- a/R/benchmarks.R
+++ b/R/benchmarks.R
@@ -1,9 +1,9 @@
 #' Benchmarking comorbidities task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_comorbidities}} focuses on an embedding's ability to identify comorbidities.
 #' A comorbidity is a disease or condition that frequently accompanies a  primary  diagnosis.
 #'
@@ -79,10 +79,10 @@ benchmark_comorbidities <- function(embedding_df,sig_level=0.05, bootstraps=1000
 
 #' Benchmarking causative task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_causative}} assesses an embedding's ability to recover causes from
 #' the UMLS' table (MRREL) of entities known to be the cause of a certain result.
 #'
@@ -155,10 +155,10 @@ benchmark_causative <- function(embedding_df,sig_level = 0.05, bootstraps=10000,
 
 #' Benchmarking NDF RT task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_ndf_rt}} assesses an embedding's ability to power to detect "may treat" and "may prevent"
 #' relationships using bootstrap scores of random drug-disease pairs.
 #'
@@ -239,12 +239,12 @@ benchmark_ndf_rt <- function(embedding_df, sig_level = 0.05, bootstraps=10000){
   return(df)
 }
 
-#' Benchmarking semnatic type task
+#' Benchmarking semantic type task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{\link{benchmark_semantic_type}} assesses an ability to identify semantic types. Semantic types are
 #' meta-information about which category a concept belongs to, and these categories are arranged in a hierarchy.
 #'
@@ -293,10 +293,10 @@ benchmark_semantic_type <- function(embedding_df, sig_level = 0.05, bootstraps=1
 
 #' Benchmarking similarity task
 #'
-#' The benchmarking strategy leverages previously published ‘known’ relationships between medicalconcepts.
+#' The benchmarking strategy leverages previously published ‘known’ relationships between medical concepts.
 #' We compare how similar the embeddings for a pair of concepts are by computing the
 #' cosine similarity of their corresponding vectors,
-#' and use this similarity to assess whether or not thetwo concepts are related.
+#' and use this similarity to assess whether or not the two concepts are related.
 #' \code{link{benchmark_similarity}} reports the spearman correlation between the human assessment scores
 #' and cosine similarity from the embeddings.
 #'
diff --git a/R/data.R b/R/data.R
index d25b0a2..62972d5 100755
--- a/R/data.R
+++ b/R/data.R
@@ -1,6 +1,6 @@
 #' UMLS Semantic Type Data
 #'
-#' Semnatic type data and English descriptions for 3,063,795 CUIs
+#' Semantic type data and English descriptions for 3,063,795 CUIs
 #'
 #' @docType data
 #'

From c33492d3eedf293c5348b10a68f7677490dbd6a4 Mon Sep 17 00:00:00 2001
From: Kaushik Acharya <acharya.kaushik@gmail.com>
Date: Wed, 4 Mar 2020 11:41:56 +0530
Subject: [PATCH 2/2] constructing pointwise mutual information without
 converting sparse matrix into a regular matrix

---
 R/word2vec_fit.R | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/R/word2vec_fit.R b/R/word2vec_fit.R
index 76cc3ee..839ff91 100755
--- a/R/word2vec_fit.R
+++ b/R/word2vec_fit.R
@@ -1,6 +1,6 @@
 #' Construct pointwise mutual information matrix
 #'
-#' @param coccur The co-occurrence matrix
+#' @param cooccur The co-occurrence matrix
 #' @param singletons The dataframe of CUIs and their counts
 #' @param N The number of bins
 #' @param smooth The smoothing factor
@@ -8,14 +8,13 @@
 #' @return A dataframe of pointwise mutual information
 #' @export
 construct_pmi <- function(cooccur,singletons,N,smooth=0.75) {
-  cooccur_matrix <- as.matrix(cooccur)
+  singletons$Count <- singletons$Count^smooth/N^smooth
+  concept_list <- row.names(cooccur)
+  nz <- Matrix::which(cooccur != 0, arr.ind = TRUE)
   # masking the lower half of the matrix because cooccur will always be symmetric
   # don't want to double count CUI1-CUI2 as CUI2-CUI1
-  cooccur_matrix[lower.tri(cooccur_matrix, diag = FALSE)] <- 0
-  singletons$Count <- singletons$Count^smooth/N^smooth
-  concept_list <- row.names(cooccur_matrix)
-  nz <- which(cooccur_matrix != 0, arr.ind = TRUE)
-  pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur_matrix[nz]/N, stringsAsFactors = F)
+  nz <- nz[which(nz[,1] <= nz[,2]),]
+  pmi_df <- data.frame(Concept_1 = concept_list[nz[,1]], Concept_2 = concept_list[nz[,2]], JointProb = cooccur[nz]/N, stringsAsFactors = F)
   pmi_df <- pmi_df %>%
     dplyr::inner_join(singletons,by=c("Concept_1" = "CUI")) %>%
     dplyr::rename(Concept_1_Prob=.data$Count) %>%