Building off of what JackStat outlined above, I modified the page-determination scheme so that it also picks up units that have fewer than 5 pages (JackStat's algorithm throws an error in that case). I also added an import step that reads the list of units to be tracked from a CSV file. Comments have been added describing the steps needed to get this running on a Windows PC.
library(RSelenium)
library(XML)
library(foreach)
### Ensure that the selenium-server-standalone.jar file and the Google Chrome driver are in the
### folder that the Windows Command Prompt is currently set to
### Open the Windows Command Prompt
### Type in "java -jar selenium-server-standalone.jar" and hit enter
# Work out of the project folder so the relative file names used by this
# script resolve there.
setwd("H:/heritage_units")

# Load the list of tracked units; every column is read as character.
hu <- read.table(
  file = "hu_tracked_101316.csv",
  header = TRUE,
  sep = ",",
  colClasses = "character"
)

# The first column holds the unit identifiers that get appended to the
# detail-page URL further down.
hu.c <- hu[[1]]
# Check for, then start, the Selenium server via the RSelenium helpers.
# NOTE(review): later RSelenium releases retired checkForServer() and
# startServer() -- confirm they still exist in the installed version.
checkForServer()
startServer()

# Describe the connection: server on localhost, port 4444, Chrome browser.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "chrome"
)

# Open the browser session that the rest of the script drives.
remDr$open()
# Scrape the "history" table of every tracked unit and write the combined
# rows to "hu_sel_date.csv".
#
# For each unit the detail page is opened, the number of history pages is
# read from the pagination control, and the "Next" button is pressed once
# for every remaining page so that each page of the table is captured.
# NOTE(review): the browser session opened earlier is not closed anywhere in
# the visible script; call remDr$close() once scraping is finished.

# Return the parsed HTML document for whatever the browser currently shows.
read_current_doc <- function(driver) {
  root <- driver$findElement(using = "xpath", "//*")
  html <- root$getElementAttribute("outerHTML")[[1]]
  # htmlParse() is the HTML parser used with getNodeSet()/readHTMLTable().
  htmlParse(html, encoding = "UTF-8")
}

# Return the table whose name is "history" from a parsed page, with the unit
# id appended as the "Heritage Unit" column, or NULL when there is no such
# table on the page.
read_history_table <- function(doc, unit_id) {
  history <- readHTMLTable(doc)[["history"]]
  if (is.null(history)) {
    return(NULL)
  }
  # rep() keeps the assignment valid even for a table with zero rows.
  history[["Heritage Unit"]] <- rep(unit_id, nrow(history))
  history
}

# Work out the number of the last history page from the text of the
# pagination control.
#
# The text is expected to end in the page labels followed by a word starting
# with "N" (presumably something like "Previous123Next", or
# "Previous12345...10Next" with a horizontal ellipsis -- verify on the site).
# If a non-alphanumeric separator is present, the last page is whatever
# follows the final separator; otherwise it is the single character just
# before the "N". Returns 1L, with a warning, when no page number can be
# recognised.
last_page_number <- function(pager_text) {
  fallback <- function(reason) {
    warning("Could not determine last page (", reason, "); assuming 1.",
            call. = FALSE)
    1L
  }
  if (length(pager_text) == 0L || is.na(pager_text[1])) {
    return(fallback("pagination control not found"))
  }
  # Drop whitespace so it is never mistaken for the ellipsis separator.
  pager_text <- gsub("[[:space:]]+", "", pager_text[1])
  next_at <- regexpr("N", pager_text, fixed = TRUE)
  if (next_at < 2L) {
    return(fallback("no page labels before 'N'"))
  }
  before_next <- substr(pager_text, 1L, next_at - 1L)
  if (grepl("[^[:alnum:]]", before_next)) {
    # A separator precedes the last page number: keep what follows it.
    candidate <- sub("^.*[^[:alnum:]]", "", before_next)
  } else {
    # No separator: the last label is taken to be a single character.
    last_pos <- nchar(before_next)
    candidate <- substr(before_next, last_pos, last_pos)
  }
  if (!grepl("^[0-9]+$", candidate)) {
    return(fallback("unexpected pagination text"))
  }
  as.integer(candidate)
}

# Collect every page of the history table for one unit.
#
# driver          An opened RSelenium remote driver.
# unit_id         Unit identifier appended to the detail-page URL.
# page_delay_secs Seconds to wait after pressing "Next" before reading the
#                 page again.
#
# Returns one data frame with all pages stacked, or NULL when the unit's
# page has no history table.
scrape_unit_history <- function(driver, unit_id, page_delay_secs = 0.50) {
  driver$navigate(
    paste0("https://www.heritageunits.com/Locomotive/Detail/", unit_id)
  )
  doc <- read_current_doc(driver)

  first_page <- read_history_table(doc, unit_id)
  if (is.null(first_page)) {
    warning("No history table found for unit ", unit_id, call. = FALSE)
    return(NULL)
  }

  pager_nodes <- getNodeSet(doc, '//*[(@id = "history_paginate")]')
  pager_text <- unlist(lapply(pager_nodes, xmlValue))
  last_page <- last_page_number(pager_text)

  pages <- vector("list", last_page)
  pages[1] <- list(first_page)
  # seq_len() yields nothing for a single-page unit, so "Next" is never
  # pressed there (2:last_page would count down to 1 and re-read the page).
  for (offset in seq_len(last_page - 1L)) {
    next_button <- driver$findElement("css selector", "#history_next")
    next_button$sendKeysToElement(list(key = "enter"))
    # Take it slow so each page is captured after the button press.
    Sys.sleep(page_delay_secs)
    # `[<-` with list() stores a NULL result instead of deleting the slot.
    pages[offset + 1L] <- list(
      read_history_table(read_current_doc(driver), unit_id)
    )
  }
  do.call(rbind, Filter(Negate(is.null), pages))
}

# Zero-row placeholder: it is what gets written when nothing is scraped.
# rbind() discards zero-row data frames, so these column names do not affect
# the output once real rows exist.
master <- data.frame('Spotted On'=factor(), 'Location'=factor(), 'Directon'=factor(), 'Train No'=factor(), 'Leading'=factor(), 'Spotter Reputation'=factor(), 'Heritage Unit'=character())

# Gather one data frame per unit, then bind once instead of growing `master`
# inside the loop.
unit_tables <- vector("list", length(hu.c))
for (u in seq_along(hu.c)) {
  print(hu.c[u])
  unit_tables[u] <- list(scrape_unit_history(remDr, hu.c[u]))
}
unit_tables <- Filter(Negate(is.null), unit_tables)
master <- do.call(rbind, c(list(master), unit_tables))

write.csv(master, "hu_sel_date.csv")