Skip to content

Commit

Permalink
Merge pull request #72 from kaybenleroll/nlp-workshop
Browse files Browse the repository at this point in the history
nlp workshop
  • Loading branch information
kaybenleroll authored May 31, 2024
2 parents 937fcf4 + 7f3f68d commit 20a8e58
Show file tree
Hide file tree
Showing 20 changed files with 1,843 additions and 4,072 deletions.
3 changes: 3 additions & 0 deletions ws_nlp_202301/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
output.log

temp*.R

*_cache/*
*_files/*
42 changes: 29 additions & 13 deletions ws_nlp_202301/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@ FROM rocker/verse:4.2.1

ENV RETICULATE_MINICONDA_ENABLED=FALSE


WORKDIR /tmp

COPY build/test_report.qmd /tmp
RUN quarto render test_report.qmd --to pdf \
&& rm -fv /tmp/test_report.pdf


RUN git clone https://github.com/lindenb/makefile2graph.git \
&& cd makefile2graph \
&& make \
&& make install


RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --no-install-recommends \
Expand Down Expand Up @@ -46,12 +60,20 @@ RUN apt-get update \
broom \
cleanNLP \
conflicted \
corpora \
corporaexplorer \
corpus \
cowplot \
DataExplorer \
DT \
directlabels \
fs \
furrr \
ggnetwork \
ggraph \
ggwordcloud \
hunspell \
ineq \
kernlab \
lda \
lobstr \
Expand All @@ -66,13 +88,19 @@ RUN apt-get update \
pryr \
qdap \
quanteda \
quanteda.textmodels \
quanteda.textplots \
quanteda.textstats \
quarto \
readtext \
rvest \
sessioninfo \
sna \
snakecase \
spacyr \
tesseract \
text2vec \
textdata \
textfeatures \
textplot \
tidygraph \
Expand All @@ -82,29 +110,17 @@ RUN apt-get update \
topicmodels \
udpipe \
wordcloud \
word2vec \
zipfR


WORKDIR /tmp

COPY build/test_report.qmd /tmp
RUN quarto render test_report.qmd --to pdf \
&& rm -fv /tmp/test_report.pdf


RUN git clone https://github.com/lindenb/makefile2graph.git \
&& cd makefile2graph \
&& make \
&& make install

RUN cp -r $HOME/.R /home/rstudio \
&& chown -R rstudio:rstudio /home/rstudio/.R


COPY build/docker_install_sys_rpkgs.R /tmp
RUN Rscript /tmp/docker_install_sys_rpkgs.R


WORKDIR /home/rstudio
USER rstudio

Expand Down
11 changes: 8 additions & 3 deletions ws_nlp_202301/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,22 @@ full_deps.dot:

depgraph: full_deps.png

initial_nlp_work.html: retrieve_extract_parse_film_scripts.html
text_annotation_politeness.html: initial_nlp_work.html



mrproper: clean-data clean-scraped clean-scripts


mrproper: clean-html clean-data clean-scraped clean-scripts
rm -fv *.dot
rm -fv output.log


clean-html:
rm -fv ${HTML_FILES}

clean-data:
rm -rfv data/*.rds
rm -fv data/*.rds

clean-parsed: clean-data
rm -fv data/parsed_scripts/*.rds
Expand Down
12 changes: 12 additions & 0 deletions ws_nlp_202301/build/docker_install_sys_rpkgs.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ remotes::install_github(
upgrade = "never"
)

remotes::install_github(
"quanteda/quanteda.corpora",
ref = "ec4b76d841afc9a734cc0351b1fa87236c83b456",
upgrade = "never"
)

remotes::install_github(
"kbenoit/quanteda.dictionaries",
ref = "b3c91606afad56603915fd622ef0aba4cc95135f",
upgrade = "never"
)

install.packages("StanfordCoreNLP",
dependencies = TRUE,
repos = "http://datacube.wu.ac.at/"
Expand Down
1 change: 1 addition & 0 deletions ws_nlp_202301/build/docker_install_user_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ cnlp_download_corenlp(lang = "en")
cnlp_init_corenlp(lang = "en")

cnlp_init_udpipe(model_name = "english")

55 changes: 1 addition & 54 deletions ws_nlp_202301/build/test_report.qmd
Original file line number Diff line number Diff line change
@@ -1,63 +1,12 @@
---
title: "Temporary Quarto Document"
author: "Mick Cooney <mcooney@describedata.com>"
author: "Mick Cooney <mickcooney@gmail.com>"
date: "`r Sys.Date()`"
---


```{r knit_opts, include = FALSE}
library(conflicted)
library(tidyverse)
library(magrittr)
library(rlang)
library(scales)
library(cowplot)
resolve_conflicts <- function(pkg_priority) {
get_index <- function(pkg_name) {
idx <- str_which(pkg_priority, pkg_name)
if(length(idx) == 0) {
idx <- 0L
}
return(idx)
}
conflict_lst <- conflict_scout()
for(func_name in names(conflict_lst)) {
pkg_index <- map_int(conflict_lst[[func_name]], get_index)
pkg_index <- pkg_index[pkg_index > 0]
if(length(pkg_index) == 0) {
pkg_use <- conflict_lst[[func_name]][1]
} else {
pkg_use <- pkg_index %>%
min() %>%
pkg_priority[.]
}
conflict_prefer(func_name, pkg_use)
}
return(conflict_lst)
}
conflict_lst <- resolve_conflicts(
c("xml2", "magrittr", "rlang", "dplyr", "readr", "purrr", "ggplot2")
)
knitr::opts_chunk$set(
tidy = FALSE,
cache = FALSE,
warning = FALSE,
message = FALSE,
fig.height = 8,
fig.width = 11
)
options(
width = 80L,
Expand All @@ -66,6 +15,4 @@ options(
)
set.seed(42)
theme_set(theme_cowplot())
```
Loading

0 comments on commit 20a8e58

Please sign in to comment.