I am creating a scraper for this page: https://www.oddsportal.com. Since the page uses JavaScript rendering, I decided to use RSelenium. My aim is to scrape the odds for each match played this year. I log in through the page's login form because I have set my own bookmakers to show for these matches. I have already scraped 50,000 URLs for these matches, and now I use RSelenium to open each of these URLs and scrape specific data. I wonder if there is a better solution for this problem, since my script takes too long, mainly in the remDr$navigate(url) part. I also tried the splashr package, which is faster, but with it I am not able to log in and see the bookmakers I need. I tried the webdriver package as well, but couldn't set the user agent, which I need in order not to get a 404 error. I use findElements, but maybe rendering the page after navigating to the URL and scraping it with html_nodes could save some time. I also tried disabling CSS but could not find any working solution for PhantomJS or headless chromedriver in R. Thank you in advance for your replies. This is my script, so far tested on 20 URLs:
# Start a PhantomJS server process; `pjs` is the handle wdman returns for it.
pjs <- wdman::phantomjs()

# Extra capabilities handed to the PhantomJS session.
eCap <- list(
  # Browser-like user agent; the site reportedly answers 404 without one.
  phantomjs.page.settings.userAgent =
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
  # Do not download images.
  phantomjs.page.settings.loadImages = FALSE,
  # NOTE(review): the login relies on a session, which normally needs
  # cookies -- confirm this capability is honoured the way it is intended.
  phantomjs.phantom.cookiesEnabled = FALSE,
  phantomjs.phantom.javascriptEnabled = TRUE
)

# Connect to the PhantomJS server started above. The port is presumably the
# one wdman::phantomjs() listens on by default -- keep the two in sync.
remDr <- remoteDriver(
  browserName = "phantomjs",
  port = 4567L,
  extraCapabilities = eCap
)
remDr$open()
# Log in to the web page. Everything runs inside local() so the temporary
# helper and element handles do not end up in the calling environment.
local({
  # Type `text` into the first element matching the CSS selector `css`.
  fill_field <- function(css, text) {
    field <- remDr$findElement(using = "css selector", value = css)
    field$sendKeysToElement(list(text))
  }

  remDr$navigate("https://www.oddsportal.com/results/#soccer")

  # Click the element named "login-submit" (presumably brings up the login
  # form that holds the fields filled in below).
  login_trigger <- remDr$findElement(using = "name", value = "login-submit")
  login_trigger$clickElement()

  # Credentials (masked here).
  fill_field("#login-username1", "*****")
  fill_field("#login-password1", "*****")

  # Submit the form through its button.
  submit_button <- remDr$findElement(
    using = "css selector",
    value = "#col-content > div:nth-child(3) > div > form > div:nth-child(3) > button"
  )
  submit_button$clickElement()
})
# Loop through the match URLs and collect the odds, match details and result.
#
# Relies on objects created outside this block:
#   remDr  - RSelenium remote driver with an open (logged-in) session
#   links1 - its `links` element holds the match URLs
#   odds   - receives one value per scraped URL in each output column
#   i      - optional start index; set to 1 below when it does not exist yet

# Output column prefix -> bookmaker name as it appears in the odds table.
# Every bookmaker fills three columns: <prefix>1, <prefix>x and <prefix>2.
bookmakers <- c(
  bet         = "18bet",
  xBet        = "1xBet",
  Asianodds   = "Asianodds",
  betathome   = "bet-at-home",
  Bet365      = "bet365",
  bwin        = "bwin",
  Chance      = "Chance.cz",
  iFortuna    = "iFortuna.sk",
  Marathonbet = "Marathonbet",
  MAXITIP     = "MAXITIP.cz",
  Pinnacle    = "Pinnacle",
  SAZKAbet    = "SAZKAbet.cz",
  Tipsport    = "Tipsport.sk"
)

# Only the first `max_urls` URLs are processed (test run).
max_urls <- 20L

# Text of the first element matching `xpath`. Like findElement() itself, this
# raises an error when nothing matches.
element_text <- function(xpath) {
  # getElementText() returns a list of length one; unwrap it so the output
  # columns stay atomic vectors instead of silently becoming list columns.
  remDr$findElement(using = "xpath", value = xpath)$getElementText()[[1L]]
}

# Texts of cells td[2], td[3] and td[4] of one bookmaker's row on the page
# currently loaded, as a list of three values. When the bookmaker has no
# td[4] cell (i.e. no row on this page) all three values are 0.
bookmaker_odds <- function(bookmaker) {
  row_xpath <- sprintf(
    '//a[@class="name" and .="%s"]/ancestor::tr[contains(@class, "lo")]',
    bookmaker
  )
  last_cell <- remDr$findElements(
    using = "xpath",
    value = paste0(row_xpath, "//td[4]")
  )
  if (length(last_cell) == 0L) {
    return(list(0, 0, 0))
  }
  list(
    element_text(paste0(row_xpath, "//td[2]")),
    element_text(paste0(row_xpath, "//td[3]")),
    # Reuse the element found by the existence check rather than asking the
    # driver for td[4] a second time (one round-trip less per bookmaker).
    last_cell[[1L]]$getElementText()[[1L]]
  )
}

# Start at the first URL unless the caller already chose a start index.
if (!exists("i")) {
  i <- 1L
}

# Never run past the end of the URL vector.
last_index <- min(max_urls, length(links1$links))

while (i <= last_index) {
  # as.character() guards against the URLs being stored as a factor.
  url <- as.character(links1$links[i])
  remDr$navigate(url)

  # Odds of every configured bookmaker.
  for (prefix in names(bookmakers)) {
    cells <- bookmaker_odds(bookmakers[[prefix]])
    odds[[paste0(prefix, "1")]][i] <- cells[[1L]]
    odds[[paste0(prefix, "x")]][i] <- cells[[2L]]
    odds[[paste0(prefix, "2")]][i] <- cells[[3L]]
  }

  # Match title, country (krajina) and league (liga).
  odds$match[i] <- element_text('//*[@id="col-content"]/h1')
  odds$krajina[i] <- element_text('//*[@id="breadcrumb"]/a[3]')
  odds$liga[i] <- element_text('//*[@id="breadcrumb"]/a[4]')

  # Result; 0 when the page shows no result element.
  result_nodes <- remDr$findElements(
    using = "xpath",
    value = '//*[@id="event-status"]/p/strong'
  )
  if (length(result_nodes) > 0L) {
    odds$result[i] <- result_nodes[[1L]]$getElementText()[[1L]]
  } else {
    odds$result[i] <- 0
  }

  i <- i + 1L
}