- Download data manually
- Use an API (Application Programming Interface)
- Scrape it
- Check the Terms and Conditions (T&C)
Scraping is useful for getting tables from Wikipedia
Getting US States Abbreviations
https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
library(RCurl)
## Loading required package: bitops
library(XML)
library(testthat)
library(stringr)
# The variable is called `wiki_url` rather than `url` because `url` is already
# the name of a base R function.
wiki_url <- RCurl::getURL(
  "https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations"
)
# Parse the tables found in the downloaded page; the result is a list.
tables <- XML::readHTMLTable(wiki_url)
# Inspect what the parser returned: its class and how many elements it holds.
class(tables)
## [1] "list"
length(tables)
## [1] 5
# Take the first element of the list and preview its first rows.
abbrevs <- tables[[1]]
head(abbrevs)
## V1
## 1 Codes:
## 2 ISO
## 3 ANSI
## 4 USPS
## 5 USCG
## 6 Abbreviations:
## V2
## 1 <NA>
## 2 ISO 3166 codes (2-letter, 3-letter and 3-digit codes from ISO 3166-1; 2+2-letter codes from ISO 3166-2)
## 3 2-letter and 2-digit codes from the ANSI standard INCITS 38:2009
## 4 2-letter codes used by the United States Postal Service
## 5 2-letter codes used by the United States Coast Guard (red text shows differences between ANSI and USPS)
## 6 <NA>
## V3 V4 V5 V6 V7 V8 V9 V10
## 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
# Drop the first 10 rows of the table and keep the rest; in the captured page
# the first retained row (row 11) is "United States of America".
# tail() with a negative n is used instead of `11:nrow(abbrevs)` because that
# range counts backwards (11, 10, ...) when the table has fewer than 11 rows,
# which would silently select the wrong rows. Original row names are kept.
us <- tail(abbrevs, n = -10L)
head(us)
## V1 V2 V3 V4 V5 V6 V7 V8
## 11 United States of America Federal state US\nUSA\n840 US 00 U.S.
## 12 Alabama State US-AL AL 01 AL AL Ala.
## 13 Alaska State US-AK AK 02 AK AK Alaska
## 14 Arizona State US-AZ AZ 04 AZ AZ Ariz.
## 15 Arkansas State US-AR AR 05 AR AR Ark.
## 16 California State US-CA CA 06 CA CF Calif.
## V9 V10
## 11 U.S. U.S.A.
## 12 Ala.
## 13 Alaska Alas.
## 14 Ariz. Az.
## 15 Ark.
## 16 Calif. Ca., Cal.
# Sanity check: after trimming whitespace, the first cell of `us` must be the
# country-level entry.
first_value <- stringr::str_trim(as.character(us[1, 1]))
testthat::expect_equal(
  object = first_value,
  expected = "United States of America"
)
# Example of an expectation that fails (kept disabled):
# testthat::expect_equal(
#   object = stringr::str_trim(as.character(us[1, 1])),
#   expected = "does not match"
# )
https://stat4701.github.io/edav/2015/04/02/rvest_tutorial/
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
# Relative path to the working data directory: "data/working" in an
# interactive session, "../../data/working" otherwise.
# NOTE(review): presumably the non-interactive path accounts for the document
# being rendered from a subdirectory -- confirm.
data_location <- if (interactive()) {
  "data/working"
} else {
  "../../data/working"
}
IMDB Top Rated Movies:
http://www.imdb.com/chart/top?ref_=nv_mv_250_6
CSS class and id
# Download and parse the IMDB title page used in the examples below.
lego_movie <- read_html("http://www.imdb.com/title/tt1490017/")

# Rating: take the node matching "strong span", read its text, and convert
# that text to a number.
lego_movie %>%
  html_node("strong span") %>%
  html_text() %>%
  as.numeric()
## [1] 7.8
# First page of actors: text of every element with class "itemprop" that sits
# inside another element with class "itemprop".
lego_movie %>%
  html_nodes(".itemprop .itemprop") %>%
  html_text()
## [1] "Will Arnett" "Elizabeth Banks" "Craig Berry"
## [4] "Alison Brie" "David Burrows" "Anthony Daniels"
## [7] "Charlie Day" "Amanda Farinos" "Keith Ferguson"
## [10] "Will Ferrell" "Will Forte" "Dave Franco"
## [13] "Morgan Freeman" "Todd Hansen" "Jonah Hill"
# Collect all <table> nodes, keep the first one, and convert it to a data frame.
lego_movie %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table()
## X1 X2
## 1 Cast overview, first billed only: Cast overview, first billed only:
## 2 Will Arnett
## 3 Elizabeth Banks
## 4 Craig Berry
## 5 Alison Brie
## 6 David Burrows
## 7 Anthony Daniels
## 8 Charlie Day
## 9 Amanda Farinos
## 10 Keith Ferguson
## 11 Will Ferrell
## 12 Will Forte
## 13 Dave Franco
## 14 Morgan Freeman
## 15 Todd Hansen
## 16 Jonah Hill
## X3
## 1 Cast overview, first billed only:
## 2 ...
## 3 ...
## 4 ...
## 5 ...
## 6 ...
## 7 ...
## 8 ...
## 9 ...
## 10 ...
## 11 ...
## 12 ...
## 13 ...
## 14 ...
## 15 ...
## 16 ...
## X4
## 1 Cast overview, first billed only:
## 2 Batman / \n Bruce Wayne \n \n \n (voice)
## 3 Wyldstyle / \n Lucy \n \n \n (voice)
## 4 Blake / \n Additional Voices \n \n \n (voice)
## 5 Unikitty \n \n \n (voice)
## 6 Octan Robot / \n Additional Voices \n \n \n (voice)
## 7 C-3PO \n \n \n (voice)
## 8 Benny \n \n \n (voice)
## 9 Mom \n \n \n (voice)
## 10 Han Solo \n \n \n (voice)
## 11 Lord Business (voice) / \n President Business (voice) / \n The Man Upstairs
## 12 Abraham Lincoln \n \n \n (voice) (as Orville Forte)
## 13 Wally \n \n \n (voice)
## 14 Vitruvius \n \n \n (voice)
## 15 Gandalf / \n Additional Voices \n \n \n (voice)
## 16 Green Lantern \n \n \n (voice)
# Select several kinds of nodes in one call by passing a comma-separated list
# of CSS selectors ("." prefixes a class, "#" prefixes an id).
lego_movie %>%
  html_nodes(
    ".primary_photo , .ellipsis, .character, #titleCast .itemprop, #titleCast .loadlate"
  )
## {xml_nodeset (87)}
## [1] <td class="primary_photo">\n<a href="/name/nm0004715/?ref_=tt_cl_i1 ...
## [2] <img height="44" width="32" alt="Will Arnett" title="Will Arnett" s ...
## [3] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [4] <span class="itemprop" itemprop="name">Will Arnett</span>
## [5] <td class="ellipsis">\n ...\n </td>
## [6] <td class="character">\n <div>\n <a href="/ ...
## [7] <td class="primary_photo">\n<a href="/name/nm0006969/?ref_=tt_cl_i2 ...
## [8] <img height="44" width="32" alt="Elizabeth Banks" title="Elizabeth ...
## [9] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [10] <span class="itemprop" itemprop="name">Elizabeth Banks</span>
## [11] <td class="ellipsis">\n ...\n </td>
## [12] <td class="character">\n <div>\n <a href="/ ...
## [13] <td class="primary_photo">\n<a href="/name/nm1911947/?ref_=tt_cl_i3 ...
## [14] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## [15] <span class="itemprop" itemprop="name">Craig Berry</span>
## [16] <td class="ellipsis">\n ...\n </td>
## [17] <td class="character">\n <div>\n <a href="/ ...
## [18] <td class="primary_photo">\n<a href="/name/nm1555340/?ref_=tt_cl_i4 ...
## [19] <img height="44" width="32" alt="Alison Brie" title="Alison Brie" s ...
## [20] <td class="itemprop" itemprop="actor" itemscope itemtype="http://sc ...
## ...
# More manual way: start from the first table, step down to its rows, then to
# the <span> elements inside those rows, and read their text.
lego_movie %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_nodes("tr") %>%
  html_nodes("span") %>%
  html_text()
## [1] "Will Arnett" "Elizabeth Banks" "Craig Berry"
## [4] "Alison Brie" "David Burrows" "Anthony Daniels"
## [7] "Charlie Day" "Amanda Farinos" "Keith Ferguson"
## [10] "Will Ferrell" "Will Forte" "Dave Franco"
## [13] "Morgan Freeman" "Todd Hansen" "Jonah Hill"