### -----------------------------
### simon munzert
### scraping with rvest
### -----------------------------
## preparations -------------------
source("00-course-setup.r")
wd <- getwd()
## breaking up the HTML ----------
## What's HTML?
# HyperText Markup Language
# markup language = plain text + markup (tags)
# the standard language for constructing websites
# relevance for web scraping: the markup structure of a page determines where and how information is stored, so understanding it tells us where to look
## browsing vs. scraping
# browsing
# 1. you click on something
# 2. browser sends request to server that hosts website
# 3. server returns resource (often an HTML document)
# 4. browser interprets HTML and renders it in a nice fashion
# scraping with R
# 1. you manually specify a resource
# 2. R sends request to server that hosts website
# 3. server returns resource
# 4. R parses HTML (i.e., interprets the structure), but does not render it in a nice fashion
# 5. it's up to you to tell R which parts of the structure to focus on and what content to extract
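# a minimal sketch of these five steps in R (httr and rvest/xml2 are assumed to be loaded or
# installed via the course setup; the URL and the "h2" selector are just illustrations):
url <- "https://www.buzzfeed.com/?country=us"           # 1. specify a resource
resp <- httr::GET(url)                                  # 2. send request to the server
httr::status_code(resp)                                 # 3. inspect what the server returned
parsed <- read_html(httr::content(resp, as = "text"))   # 4. parse (not render) the HTML
html_text(html_nodes(parsed, "h2"))                     # 5. extract the parts you care about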
## inspect the source code in your browser ---------------
browseURL("https://www.buzzfeed.com/?country=us")
# Chrome:
# 1. right click on page
# 2. select "view source"
# Firefox:
# 1. right click on page
# 2. select "view source"
# Microsoft Edge:
# 1. right click on page
# 2. select "view source"
# Safari
# 1. click on "Safari"
# 2. select "Preferences"
# 3. go to "Advanced"
# 4. check "Show Develop menu in menu bar"
# 5. click on "Develop"
# 6. select "show page source"
# 7. alternatively to 5./6., right click on page and select "view source"
## a quick primer to CSS selectors ----------
## What's CSS?
# Cascading Style Sheets
# a style sheet language that tells browsers how to render an HTML document, e.g., its layout, colors, and fonts
# CSS code can be stored within an HTML document or in an external CSS file
# the good thing for us: selectors, i.e., patterns that specify which elements to format in a certain way, can also be used to address the elements we want to extract information from
# works via tag name (e.g., <h2>, <p>, ...) or element attributes "id" and "class"
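# a tiny, made-up HTML snippet to illustrate these three ways of selecting elements, using
# rvest's read_html()/html_nodes(), which we introduce in detail below; read_html() also
# accepts a literal HTML string:
snippet <- read_html('<div>
  <h2 id="top" class="headline">First headline</h2>
  <p class="teaser">Some teaser text</p>
  <p>Other text</p>
</div>')
html_nodes(snippet, css = "p")        # by tag name: both paragraphs
html_nodes(snippet, css = ".teaser")  # by class: only the teaser paragraph
html_nodes(snippet, css = "#top")     # by id: only the headline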
## How does it work?
browseURL("http://flukeout.github.io/") # let's play this together until plate 8 or so!
#######################
### IT'S YOUR SHOT! ###
#######################
# 1. repeat playing CSS diner until plate 10!
# 2. go to the following website
browseURL("https://www.jstatsoft.org/about/editorialTeam")
# a) which CSS identifiers can be used to describe all names of the editorial team?
# b) write a corresponding CSS selector that targets them!
# possible CSS selectors:
# .member a
# div.member a
# #group a
## a quick primer to XPath ------------------
# XPath is a query language for selecting nodes from an XML-style document (including HTML)
# provides just another way of extracting data from static webpages
# you can also use XPath with R
# can be more powerful than CSS selectors
# learning XPath probably takes a day (and some practice)
# you probably won't need it very often, so we don't cover it here
# if you want to know more, consult the book -- we give it an extensive treatment
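# just to show what XPath looks like in practice: the class-based selection from the toy
# snippet above, expressed as an XPath expression instead of a CSS selector
html_nodes(snippet, xpath = "//p[@class = 'teaser']")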
## the rvest package ----------
## overview
# see also: https://github.com/hadley/rvest
# convenient package to scrape information from web pages
# builds on other packages, such as xml2 and httr
# provides very intuitive functions to import and process webpages
## basic workflow of scraping with rvest
# 1. specify URL
url <- "https://www.buzzfeed.com/?country=us"
# 2. download the static HTML behind the URL and parse it into an XML object
url_parsed <- read_html(url)
class(url_parsed)
html_structure(url_parsed)
as_list(url_parsed)
# 3. extract specific nodes with CSS (or XPath)
headings_nodes <- html_nodes(url_parsed, css = ".lede__link")
# 4. extract content from nodes
headings <- html_text(headings_nodes)
headings <- str_replace_all(headings, "\\n", "") %>% str_trim()
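head(headings) # quick look at the cleaned-up headlines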
#######################
### IT'S YOUR SHOT! ###
#######################
# 1. revisit the jstatsoft.org website from above and use rvest to extract the names!
url <- "https://www.jstatsoft.org/about/editorialTeam"
# 2. bonus: try and extract the full lines including the affiliation, and count how many of the editors are at a statistics or mathematics department or institution!
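# a possible solution sketch, reusing the selectors from the CSS exercise above (whether
# ".member" nodes still contain name plus affiliation depends on the current page layout):
url_parsed <- read_html(url)
names_jss <- html_nodes(url_parsed, css = ".member a") %>% html_text()
members <- html_nodes(url_parsed, css = ".member") %>% html_text()
sum(str_detect(members, regex("statistic|mathematic", ignore_case = TRUE)))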
### extract data from tables --------------
## HTML tables
# ... are a special case for scraping because they are already very close to the data structure you want to build up in R
# ... come with standard tags and are usually easily identifiable
## scraping HTML tables with rvest
url <- "https://en.wikipedia.org/wiki/Joint_Statistical_Meetings"
browseURL(url)
url_parsed <- read_html(url)
tables <- html_table(url_parsed, fill = TRUE)
tables
meetings <- tables[[2]]
class(meetings)
head(meetings)
table(meetings$Location) %>% sort()
## note: HTML tables can get quite complex. there are more flexible solutions than html_table() on the market (e.g., package "htmltab")
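# for illustration, the same table scraped with htmltab (the package is assumed to be
# installed; "which" picks the table by position, here assumed to match the index used above):
if (requireNamespace("htmltab", quietly = TRUE)) {
  meetings_ht <- htmltab::htmltab(doc = url, which = 2)
  head(meetings_ht)
}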
#######################
### IT'S YOUR SHOT! ###
#######################
# 1. scrape the table of tall buildings (300m+) currently under construction from
browseURL("https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_the_world")
# 2. how many of those buildings are currently being built in China? and in which city are most of them being built? (a possible sketch follows below)
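# a possible approach (sketch): the index of the "under construction" table and its column
# names are assumptions -- inspect the output of html_table() first and adjust as needed
buildings_url <- "https://en.wikipedia.org/wiki/List_of_tallest_buildings_in_the_world"
buildings_tables <- read_html(buildings_url) %>% html_table(fill = TRUE)
# construction <- buildings_tables[[3]]      # hypothetical position of the under-construction table
# table(construction$Country) %>% sort()     # how many are being built in China?
# table(construction$City) %>% sort()        # which city has the most?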
### working with SelectorGadget ----------
# to learn about it, visit
vignette("selectorgadget")
# to install it, visit
browseURL("http://selectorgadget.com/")
# and follow the advice below: "drag this link to your bookmark bar: >>SelectorGadget>> (updated August 7, 2013)"
## SelectorGadget is magic. Proof:
browseURL("https://www.buzzfeed.com/?country=us")
css <- ".sm-text-3" # selector found with SelectorGadget (BuzzFeed front page)
url <- "http://spiegel.de/schlagzeilen"
css <- ".schlagzeilen-headline" # selector found with SelectorGadget (Spiegel Online headlines page)
url_parsed <- read_html(url)
html_nodes(url_parsed, css = css) %>% html_text()
#######################
### IT'S YOUR SHOT! ###
#######################
# 1. use SelectorGadget to identify a CSS selector that helps extract all article author names from Buzzfeed's main page!
# 2. use rvest to scrape these names!
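# a possible sketch; ".lede__byline" is a hypothetical selector -- identify the real one with
# SelectorGadget yourself, since BuzzFeed's layout changes frequently:
# url_parsed <- read_html("https://www.buzzfeed.com/?country=us")
# html_nodes(url_parsed, css = ".lede__byline") %>% html_text() %>% str_trim()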
## dealing with multiple pages ----------
# often, we want to scrape data from multiple pages
# in such scenarios, automating the scraping process becomes r e a l l y powerful
# my philosophy: download the pages first, then import and extract the information locally. that way you hit the server only once per page, which minimizes server load and saves time when you re-run the extraction
## example: fetching and analyzing jstatsoft download statistics
# set temporary working directory
setwd(wd)
tempwd <- "data/jstatsoftStats"
dir.create(tempwd, recursive = TRUE, showWarnings = FALSE) # also creates "data/" if it does not exist yet
setwd(tempwd)
browseURL("http://www.jstatsoft.org/")
# construct list of urls
baseurl <- "http://www.jstatsoft.org/article/view/v"
volurl <- paste0("0", seq(1, 73, 1))       # volumes 1 to 73, zero-padded ...
volurl[1:9] <- paste0("00", seq(1, 9, 1))  # ... to three digits ("001", "002", ...)
brurl <- paste0("0", seq(1, 9, 1))         # issues 1 to 9 per volume, zero-padded to two digits
urls_list <- paste0(baseurl, volurl)
urls_list <- paste0(rep(urls_list, each = 9), "i", brurl)   # one URL per volume-issue combination
names <- paste0(rep(volurl, each = 9), "_", brurl, ".html") # matching local file names
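# quick sanity check of the constructed URLs and file names
head(urls_list)
head(names)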
# download pages
folder <- "html_articles/"
dir.create(folder)
for (i in 1:length(urls_list)) {
  if (!file.exists(paste0(folder, names[i]))) { # only download pages we do not have yet
    download.file(urls_list[i], destfile = paste0(folder, names[i]))
    Sys.sleep(runif(1, 0, 1)) # pause briefly between requests to be polite to the server
  }
}
# check success
list_files <- list.files(folder, pattern = "0.*")
list_files_path <- list.files(folder, pattern = "0.*", full.names = TRUE)
length(list_files)
# delete non-existing articles
files_size <- sapply(list_files_path, file.size)
table(files_size) %>% sort()
delete_files <- list_files_path[files_size == 24265] # 24265 bytes is the size shared by the pages of non-existing articles (see the size table above)
sapply(delete_files, file.remove)
list_files_path <- list.files(folder, pattern = "0.*", full.names = TRUE) # update list of files
# import pages and extract content
authors <- character()
title <- character()
statistics <- character()
numViews <- numeric()
datePublish <- character()
for (i in 1:length(list_files_path)) {
  html_out <- read_html(list_files_path[i])
  table_out <- html_table(html_out, fill = TRUE)[[6]]
  authors[i] <- table_out[1, 2]
  title[i] <- table_out[2, 2]
  statistics[i] <- table_out[4, 2]
  numViews[i] <- statistics[i] %>% str_extract("[[:digit:]]+") %>% as.numeric()
  datePublish[i] <- statistics[i] %>% str_extract("[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}.$") %>% str_replace("\\.", "")
}
# construct data frame
dat <- data.frame(authors = authors, title = title, numViews = numViews, datePublish = datePublish, stringsAsFactors = FALSE)
head(dat)
# download statistics
dattop <- dat[order(dat$numViews, decreasing = TRUE),]
dattop[1:10,]
summary(dat$numViews)
plot(density(dat$numViews, from = 0), yaxt = "n", ylab = "", xlab = "Number of views", main = "Distribution of article page views in JSTATSOFT")