Finalized HLM code.

GallupGovt · Oct 31, 2018 · f277366 · f277366
1 parent ff4be7e
commit f277366
Show file tree

Hide file tree

Showing 4 changed files with 101 additions and 53 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,5 +8,6 @@ articles.zip
 comments.zip
 config.py
 
-# hlm data files
+# hlm files
 hlm/*.csv
+hlm/*.pdf
diff --git a/hlm/create_bias.r b/hlm/create_bias.r
@@ -65,8 +65,7 @@ build_scores <- function(n = 500) {
                              low_vals = los,
                              high_vals = his
         )
-        names(results)[i] <- paste(c('z', sample(x = c(letters, 0:9), size = 6)),
-                                     collapse = '')
+        names(results)[i] <- paste0('z', i)
         print(paste('Finished iteration', i, 'of', n))
     }
     return(results)

diff --git a/hlm/hlm.r b/hlm/hlm.r
@@ -1,66 +1,114 @@
-#!/usr/loca/bin/R
+#!/usr/local/bin/R
 # hierarchical modeling for ugb
 # questions? contact [email protected]
 
 # load libraries
 library(data.table)
+library(ggplot2)
 library(lme4)
+library(reshape2)
+library(sjstats)
 
 # get passed arguments
 args <- commandArgs(trailingOnly = TRUE)
-args <- c('breitbart_modeling_data.csv')
+nargs <- sapply(args, function(x) {strsplit(x, '_')[[1]][1]})
 
 # load data
-d <- fread(paste('hlm', args[1], sep = '/'))
+d <- list()
+for(i in 1:(length(args)-1)) {
+    d[[i]] <- fread(paste('hlm', args[i], sep = '/'))
+    d[[i]]$source <- nargs[i]
+}
+d <- do.call(rbind, d)
 
-gender_mod <- lmer(gender_bias ~
-            title_embeddings_gender +
-            art_text_embeddings_gender +
-            title_sent_neg +
-            title_sent_pos +
-            art_sent_neg +
-            art_sent_pos +
-            art_comments +
-            comment_embeddings_gender +
-            upvotes +
-            comm_sent_neg +
-            comm_sent_pos +
-            (1 | art_id) +
-            (1 | CLUSTER),
-            d, REML = FALSE
-)
+# prep modeling loop
+metric_measure <- grep(args[length(args)], names(d))
+outcomes <- grep('^z', names(d))
+mod_results <- list()
+i <- 1
+
+# model against synthetic outcomes
+for(outcome in outcomes) {
+    mod <- d[, lmer(.SD[[outcome]] ~
+                    .SD[[metric_measure[1]]] +
+                    .SD[[metric_measure[2]]] +
+                    title_sent_neg +
+                    title_sent_pos +
+                    art_sent_neg +
+                    art_sent_pos +
+                    art_comments +
+                    .SD[[metric_measure[3]]] +
+                    upvotes +
+                    comm_sent_neg +
+                    comm_sent_pos +
+                    (1 | source) +
+                    (1 | art_id) +
+                    (1 | CLUSTER),
+                    d, REML = FALSE)]
+    mod_results[[i]] <- list(icc = icc(mod),
+                             predictors = names(fixef(mod)),
+                             estimate = coef(summary(mod))[, 1],
+                             sterr = coef(summary(mod))[, 2],
+                             tval = coef(summary(mod))[, 3])
+    print(paste('Finished model', i, 'of', length(outcomes)))
+    i <- i + 1
+}
 
-race_mod <- lmer(race_bias ~
-            title_embeddings_race +
-            art_text_embeddings_race +
-            title_sent_neg +
-            title_sent_pos +
-            art_sent_neg +
-            art_sent_pos +
-            art_comments +
-            comment_embeddings_race +
-            upvotes +
-            comm_sent_neg +
-            comm_sent_pos +
-            (1 | art_id) +
-            (1 | CLUSTER),
-            d, REML = FALSE
+# create diagnostics
+# iccs
+iccs <- data.frame(
+    n = 1:length(outcomes),
+    Article = do.call(c, lapply(mod_results, function(x) {x$icc[1]})),
+    Cluster = do.call(c, lapply(mod_results, function(x) {x$icc[2]})),
+    Source = do.call(c, lapply(mod_results, function(x) {x$icc[3]}))
 )
+iccs_l <- melt(iccs, id.vars = 'n')
+names(iccs_l)[2:3] <- c('Level', 'ICC')
 
-power_mod <- lmer(power_bias ~
-            title_embeddings_power +
-            art_text_embeddings_power +
-            title_sent_neg +
-            title_sent_pos +
-            art_sent_neg +
-            art_sent_pos +
-            art_comments +
-            comment_embeddings_power +
-            upvotes +
-            comm_sent_neg +
-            comm_sent_pos +
-            (1 | art_id) +
-            (1 | CLUSTER),
-            d, REML = FALSE
+pdf('hlm/icc_visual.pdf', height = 7.5, width = 10)
+    print(ggplot(iccs_l, aes(x = n, y = ICC)) +
+        geom_point(aes(color = Level)) +
+        theme_bw() +
+        labs(title = 'ICC Values by Level', x = 'Iteration') +
+        theme(legend.position = 'bottom')
+    )
+dev.off()
+
+# estimates
+varnames <- c(
+    paste0('Title embeddings (', args[length(args)], ')'),
+    paste0('Article embeddings (', args[length(args)], ')'),
+    'Title sentiment (negative)',
+    'Title sentiment (positive)',
+    'Article sentiment (negative)',
+    'Article sentiment (positive)',
+    'Nbr article comments',
+    'Nbr upvotes',
+    'Comment sentiment (negative)',
+    'Comment sentiment (positive)'
 )
 
+# note: predictors 2-8 and 10-12 are of interest; predictor 1 is the intercept
+#       and predictor 10 is weakly correlated to the outcome and therefore
+#       inherently biased
+ests <- do.call(rbind, mapply(function(x, i) {
+    return(data.frame(
+        n = i,
+        Variable = varnames,
+        Estimate = x$estimate[c(2:8, 10:12)],
+        Signif = ifelse(abs(x$tval[c(2:8, 10:12)]) > 1.96, 1, 0)
+    ))
+}, mod_results, as.list(1:50), SIMPLIFY = FALSE))
+
+pdf('hlm/estimate_visual.pdf', height = 7.5, width = 10)
+    print(ggplot(ests, aes(x = n, y = Estimate)) +
+        geom_point(aes(color = as.factor(Signif))) +
+        scale_color_manual(name = 'Significance',
+                           values = c('black', 'red'),
+                           labels = c('Insignificant', 'Significant')) +
+        facet_wrap(vars(Variable), nrow = 5, ncol = 2, scales = 'free_y') +
+        theme_bw() +
+        labs(title = 'Point Estimates and Statistical Significance by Variable',
+             x = 'Iteration') +
+        theme(legend.position = 'bottom'))
+dev.off()
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 beautifulsoup4==4.6.0
 elasticsearch==6.2.0
 pandas==0.23.0
-requests==2.19.1
+requests==2.20.0
 selenium=3.14.0=py37hfa6e2cd_0
 cachetools==2.1.0
 google-api-python-client==1.7.4