Im trying to web-scrape the Obama's spechees page, to create things like wordclouds, etc.
When I try to do it for like 1, 5, 10 different pages (speeches), not in a loop, separately, the code works. But with this loop I created (above), the resulting object contains nothing (NULL
).
Somebody can help me, please?
library(wordcloud)
library(tm)
library(XML)
library(RCurl)
site <- "http://obamaspeeches.com/"
url <- readLines(site)
h <- htmlTreeParse(file = url, asText = TRUE, useInternalNodes = TRUE,
encoding = "utf-8")
# getting the phrases that will form the web adresses for the speeches
teste <- data.frame(h[42:269, ])
teste2 <- teste[grep("href=", teste$h.42.269...), ]
teste2 <- as.data.frame(teste2)
teste3 <- gsub("^.*href=", "", teste2[, "teste2"])
teste3 <- as.data.frame(teste3)
teste4 <- gsub("^/", "", teste3[, "teste3"])
teste4 <- as.data.frame(teste4)
teste5 <- gsub(">.*$", "", teste4[, "teste4"])
teste5 <- as.data.frame(teste5)
# loop to read pages
l <- vector(mode = "list", length = nrow(teste5))
i <- 1
for (i in nrow(teste5)) {
site <- paste("http://obamaspeeches.com/", teste5[i, ], sep = "")
url <- readLines(site)
l[[i]] <- url
i <- i + 1
}
str(l)