-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper.R
60 lines (48 loc) · 1.58 KB
/
helper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
library(tm)
library(wordcloud)
library(memoise)
library(XML)
# Build a term-frequency vector for `text`.
# Wrapped in memoise() so repeated calls with the same `text` reuse the
# cached result instead of rebuilding the corpus.
# Refer to http://shiny.rstudio.com/gallery/word-cloud.html
#
# text: character vector; each element becomes one document of the corpus.
# Returns a named numeric vector (names are terms, values are total counts
# across all documents), sorted by decreasing count.
getTermMatrix <- memoise(function(text) {
  corpus <- Corpus(VectorSource(text))

  # Normalise the text: lower-case, then drop punctuation, digits and
  # stop words (SMART list plus a few explicit extras).
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("SMART"), "and", "but", "the"))

  # `wordLengths` is the control option tm reads for the minimum/maximum
  # term length; the former `minWordLength` entry is not a recognised option
  # in tm releases that ship content_transformer(), so the default lower
  # bound of 3 characters was being applied instead of the intended 1.
  tdm <- TermDocumentMatrix(corpus,
                            control = list(wordLengths = c(1, Inf)))

  term_counts <- rowSums(as.matrix(tdm))
  sort(term_counts, decreasing = TRUE)
})
# Remove every "<...>" tag from `text`, keeping the text between tags.
# The match is non-greedy, so each tag is removed on its own rather than
# everything between the first "<" and the last ">".
#
# text: character vector of markup.
# Returns a character vector of the same length with the tags deleted.
stripMarkup <- function(text) {
  gsub(pattern = "<.*?>", replacement = "", x = text)
}
# Remove an entire HTML node including its content, e.g. <script> or <style>.
#
# text: character vector of markup.
# tag:  element name without angle brackets, e.g. "script". It is pasted
#       into the regular expression verbatim, so it must not contain regex
#       metacharacters.
# Returns `text` with every "<tag ...> ... </tag>" span deleted.
stripNodeAndContent <- function(text, tag) {
  # Require whitespace or ">" right after the tag name so that, for example,
  # tag "b" does not start matching at "<body" and "head" does not start at
  # "<header".
  node_pattern <- paste0("<", tag, "[[:space:]>].*?</", tag, ">")
  # HTML element names are case-insensitive, so "<SCRIPT>" must be removed
  # just like "<script>".
  gsub(node_pattern, "", text, ignore.case = TRUE)
}
# Collapse each run of whitespace in `text` into a single space.
#
# text: character vector.
# Returns a character vector of the same length; leading and trailing
# whitespace is reduced to one space, not trimmed away.
stripSpace <- function(text) {
  collapsed <- gsub(pattern = "\\s+", replacement = " ", x = text)
  collapsed
}
# Replace every "/* ... */" block comment in `text` with a single space.
# The match is non-greedy, so separate comments are replaced one by one and
# the text between them is kept.
#
# text: character vector.
# Returns a character vector of the same length.
stripComment <- function(text) {
  block_comment <- "/\\*.*?\\*/"
  gsub(block_comment, " ", text)
}
# Reduce an HTML document to its visible text.
#
# Steps, in order:
#   1. drop <script>, <style> and <head> nodes together with their content,
#   2. drop all remaining tags,
#   3. replace "/* ... */" block comments with a space,
#   4. collapse runs of whitespace into single spaces.
#
# text: character vector holding the HTML source.
# Returns the cleaned text as a character vector of the same length.
cleanMarkup <- function(text) {
  # Whole nodes must go first, while their opening/closing tags still exist.
  for (tag in c("script", "style", "head")) {
    text <- stripNodeAndContent(text, tag)
  }
  text <- stripMarkup(text)
  # Comments are stripped before whitespace is collapsed: stripComment()
  # substitutes a space for each comment, and doing it last would leave runs
  # of spaces in the final result.
  text <- stripComment(text)
  stripSpace(text)
}
# Download the HTML document at `url` and return its text content with the
# markup removed by cleanMarkup().
#
# NOTE(review): getURL() is not provided by any package attached in this
# file; presumably it is RCurl::getURL and RCurl is attached by the calling
# script - confirm.
#
# url: address of the page to fetch.
getText <- function(url) {
  cleanMarkup(getURL(url))
}