Skip to content

Commit

Permalink
Finalized HLM code.
Browse files Browse the repository at this point in the history
  • Loading branch information
mhoover-gallup committed Oct 31, 2018
1 parent ff4be7e commit f277366
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 53 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ articles.zip
comments.zip
config.py

# hlm data files
# hlm files
hlm/*.csv
hlm/*.pdf
3 changes: 1 addition & 2 deletions hlm/create_bias.r
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ build_scores <- function(n = 500) {
low_vals = los,
high_vals = his
)
names(results)[i] <- paste(c('z', sample(x = c(letters, 0:9), size = 6)),
collapse = '')
names(results)[i] <- paste0('z', i)
print(paste('Finished iteration', i, 'of', n))
}
return(results)
Expand Down
146 changes: 97 additions & 49 deletions hlm/hlm.r
Original file line number Diff line number Diff line change
@@ -1,66 +1,114 @@
#!/usr/loca/bin/R
#!/usr/local/bin/R
# hierarchical modeling for ugb
# questions? contact [email protected]

# load libraries
library(data.table)
library(ggplot2)
library(lme4)
library(reshape2)
library(sjstats)

# get passed arguments
args <- commandArgs(trailingOnly = TRUE)
args <- c('breitbart_modeling_data.csv')
nargs <- sapply(args, function(x) {strsplit(x, '_')[[1]][1]})

# load data
d <- fread(paste('hlm', args[1], sep = '/'))
d <- list()
for(i in 1:(length(args)-1)) {
d[[i]] <- fread(paste('hlm', args[i], sep = '/'))
d[[i]]$source <- nargs[i]
}
d <- do.call(rbind, d)

gender_mod <- lmer(gender_bias ~
title_embeddings_gender +
art_text_embeddings_gender +
title_sent_neg +
title_sent_pos +
art_sent_neg +
art_sent_pos +
art_comments +
comment_embeddings_gender +
upvotes +
comm_sent_neg +
comm_sent_pos +
(1 | art_id) +
(1 | CLUSTER),
d, REML = FALSE
)
# prep modeling loop
metric_measure <- grep(args[length(args)], names(d))
outcomes <- grep('^z', names(d))
mod_results <- list()
i <- 1

# model against synthetic outcomes
for(outcome in outcomes) {
mod <- d[, lmer(.SD[[outcome]] ~
.SD[[metric_measure[1]]] +
.SD[[metric_measure[2]]] +
title_sent_neg +
title_sent_pos +
art_sent_neg +
art_sent_pos +
art_comments +
.SD[[metric_measure[3]]] +
upvotes +
comm_sent_neg +
comm_sent_pos +
(1 | source) +
(1 | art_id) +
(1 | CLUSTER),
d, REML = FALSE)]
mod_results[[i]] <- list(icc = icc(mod),
predictors = names(fixef(mod)),
estimate = coef(summary(mod))[, 1],
sterr = coef(summary(mod))[, 2],
tval = coef(summary(mod))[, 3])
print(paste('Finished model', i, 'of', length(outcomes)))
i <- i + 1
}

race_mod <- lmer(race_bias ~
title_embeddings_race +
art_text_embeddings_race +
title_sent_neg +
title_sent_pos +
art_sent_neg +
art_sent_pos +
art_comments +
comment_embeddings_race +
upvotes +
comm_sent_neg +
comm_sent_pos +
(1 | art_id) +
(1 | CLUSTER),
d, REML = FALSE
# create diagnostics
# iccs
iccs <- data.frame(
n = 1:length(outcomes),
Article = do.call(c, lapply(mod_results, function(x) {x$icc[1]})),
Cluster = do.call(c, lapply(mod_results, function(x) {x$icc[2]})),
Source = do.call(c, lapply(mod_results, function(x) {x$icc[3]}))
)
iccs_l <- melt(iccs, id.vars = 'n')
names(iccs_l)[2:3] <- c('Level', 'ICC')

power_mod <- lmer(power_bias ~
title_embeddings_power +
art_text_embeddings_power +
title_sent_neg +
title_sent_pos +
art_sent_neg +
art_sent_pos +
art_comments +
comment_embeddings_power +
upvotes +
comm_sent_neg +
comm_sent_pos +
(1 | art_id) +
(1 | CLUSTER),
d, REML = FALSE
pdf('hlm/icc_visual.pdf', height = 7.5, width = 10)
print(ggplot(iccs_l, aes(x = n, y = ICC)) +
geom_point(aes(color = Level)) +
theme_bw() +
labs(title = 'ICC Values by Level', x = 'Iteration') +
theme(legend.position = 'bottom')
)
dev.off()

# estimates
varnames <- c(
paste0('Title embeddings (', args[length(args)], ')'),
paste0('Article embeddings (', args[length(args)], ')'),
'Title sentiment (negative)',
'Title sentiment (positive)',
'Article sentiment (negative)',
'Article sentiment (positive)',
'Nbr article comments',
'Nbr upvotes',
'Comment sentiment (negative)',
'Comment sentiment (positive)'
)

# note: predictors 2-8 and 10-12 are of interest; predictor 1 is the intercept
# and predictor 10 is weakly correlated to the outcome and therefore
# inherently biased
ests <- do.call(rbind, mapply(function(x, i) {
return(data.frame(
n = i,
Variable = varnames,
Estimate = x$estimate[c(2:8, 10:12)],
Signif = ifelse(abs(x$tval[c(2:8, 10:12)]) > 1.96, 1, 0)
))
}, mod_results, as.list(1:50), SIMPLIFY = FALSE))

pdf('hlm/estimate_visual.pdf', height = 7.5, width = 10)
print(ggplot(ests, aes(x = n, y = Estimate)) +
geom_point(aes(color = as.factor(Signif))) +
scale_color_manual(name = 'Significance',
values = c('black', 'red'),
labels = c('Insignificant', 'Significant')) +
facet_wrap(vars(Variable), nrow = 5, ncol = 2, scales = 'free_y') +
theme_bw() +
labs(title = 'Point Estimates and Statistical Significance by Variable',
x = 'Iteration') +
theme(legend.position = 'bottom'))
dev.off()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
beautifulsoup4==4.6.0
elasticsearch==6.2.0
pandas==0.23.0
requests==2.19.1
requests==2.20.0
selenium=3.14.0=py37hfa6e2cd_0
cachetools==2.1.0
google-api-python-client==1.7.4
Expand Down

0 comments on commit f277366

Please sign in to comment.