Add script that processes raw data, edit UTF-8 to ASCII lines

juliasilge · Apr 9, 2016 · f998a8e · f998a8e
1 parent 3f70347
commit f998a8e
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 4 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,4 @@
 .travis.yml
 ^CONDUCT\.md$
 ^cran-comments\.md$
+^data-raw$
diff --git a/README.md b/README.md
@@ -19,6 +19,8 @@ This package provides access to the full texts of Jane Austen's 6 completed, pub
 * `northangerabbey`:  *Northanger Abbey*, published posthumously in 1818
 * `persuasion`:  *Persuasion*, also published posthumously in 1818
 
+There is also a function `austen_books()` that returns a tidy data frame of all 6 novels.
+
 ## Installation
 
 To install the package type the following:

diff --git a/cran-comments.md b/cran-comments.md
@@ -9,8 +9,7 @@ This is the first attempted CRAN release of janeaustenr, and my first submission
 
 ## R CMD check results
 
-0 errors | 0 warnings | 1 note
+0 errors | 0 warnings | 0 notes
 
-* The 1 note is for marked UTF-8 strings in data files; the texts of the novels are UTF-8 plain text.
-* On Win-builder, there was a note for possibly invalid URLs for the Project Gutenberg URLs in the .Rd files because Project Gutenberg blocks automated traffic
-* On Win-builder, there was a note for possibly mis-spelled words in DESCRIPTION (Austen's at 2:30 and 6:34, Northanger at 8:32) but these are the correct spellings
+* On Win-builder, there was a message about possibly invalid URLs for the Project Gutenberg URLs in the .Rd files because Project Gutenberg blocks automated traffic
+* On Win-builder, there was a message about possibly mis-spelled words in DESCRIPTION (Austen's at 2:30 and 6:34, Northanger at 8:32) but these are the correct spellings
diff --git a/data-raw/prep_data.R b/data-raw/prep_data.R
@@ -0,0 +1,56 @@
+## This is the script used to process the UTF-8 plain text files from
+## Project Gutenberg and add them to the janeaustenr package.
+## Project Gutenberg doesn't like automated traffic very much, so be careful
+## about connecting too often.
+
+library(readr)
+library(devtools)
+
+## Read one novel from Project Gutenberg as a character vector of lines.
+##   id     - ebook number, used to build the download URL
+##   skip   - lines to skip at the beginning (Project Gutenberg header)
+##   footer - lines to remove at the end (Project Gutenberg footer)
+## A few of these files end up with NA lines; they are dropped last, so the
+## line numbers edited below index the NA-free text.
+## Returns the trimmed text as a character vector.
+read_novel <- function(id, skip, footer) {
+  url <- paste0("http://www.gutenberg.org/cache/epub/", id, "/pg", id, ".txt")
+  lines <- read_lines(url, skip = skip)
+  ## Stop on a short download; 1:(length(lines) - footer) would count backwards
+  stopifnot(length(lines) > footer)
+  lines <- head(lines, -footer)
+  lines[!is.na(lines)]
+}
+
+sensesensibility <- read_novel(161L, skip = 33, footer = 370)
+prideprejudice <- read_novel(1342L, skip = 30, footer = 366)
+
+## Mansfield Park has one line with a non-ASCII character (a British pound
+## symbol); let's edit it for CRAN. Check that the target line really is
+## non-ASCII first, so a changed upstream file cannot clobber the wrong line.
+
+mansfieldpark <- read_novel(141L, skip = 29, footer = 367)
+stopifnot(any(utf8ToInt(mansfieldpark[14652]) > 127L))
+mansfieldpark[14652] <- "the command of her beauty, and her 20,000 pounds, any one who could satisfy the"
+
+emma <- read_novel(158L, skip = 29, footer = 367)
+northangerabbey <- read_novel(121L, skip = 29, footer = 383)
+
+## Persuasion also has a line with a non-ASCII character (e with an accent);
+## let's edit it for CRAN, with the same check
+
+persuasion <- read_novel(105L, skip = 35, footer = 371)
+stopifnot(any(utf8ToInt(persuasion[7066]) > 127L))
+persuasion[7066] <- "concert.  Something so formal and _arrange_ in her air!  and she sits so"
+
+
+## Now, add the data files to the package
+
+devtools::use_data(sensesensibility, overwrite = TRUE)
+devtools::use_data(prideprejudice, overwrite = TRUE)
+devtools::use_data(mansfieldpark, overwrite = TRUE)
+devtools::use_data(emma, overwrite = TRUE)
+devtools::use_data(northangerabbey, overwrite = TRUE)
+devtools::use_data(persuasion, overwrite = TRUE)
+
+## Finished!
diff --git a/data/mansfieldpark.rda b/data/mansfieldpark.rda
diff --git a/data/persuasion.rda b/data/persuasion.rda