5

I am creating a scraper for this page: https://www.oddsportal.com. Since the page uses JavaScript rendering, I decided to use RSelenium. My aim is to scrape the odds for each match played this year. I log in through the page's login form because I have configured my own set of bookmakers to be shown for these matches. I have already scraped 50,000 URLs for these matches, and now I use RSelenium to open each of these URLs and scrape specific data. I wonder if there is a better solution for this problem, since my script takes too long, mainly in the remDr$navigate(url) part. I also tried the splashr package, which is faster, but I am not able to log in and see the bookmakers I need. I tried the webdriver package as well, but I couldn't set the user agent, which I need in order not to get a 404 error. I use findElements, but maybe rendering the page after navigating to the URL and scraping it with html_nodes could save some time. I also tried disabling CSS but could not find any working solution for PhantomJS or headless chromedriver in R. Thank you in advance for your replies. This is my script, tested on 20 URLs so far:

# Launch PhantomJS through wdman; the returned handle is kept in `pjs`.
pjs <- wdman::phantomjs()

# Extra capabilities passed to the PhantomJS session: a custom user agent,
# image loading off, cookies off, JavaScript on.
eCap <- list(
  phantomjs.page.settings.userAgent =
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
  phantomjs.page.settings.loadImages = FALSE,
  phantomjs.phantom.cookiesEnabled = FALSE,
  phantomjs.phantom.javascriptEnabled = TRUE
)

# Connect to the driver and open a browser session.
# NOTE(review): port 4567L is presumably the port the phantomjs() call above
# listens on -- confirm if wdman is ever started with a different port.
remDr <- remoteDriver(
  browserName = "phantomjs",
  port = 4567L,
  extraCapabilities = eCap
)
remDr$open()

# Log in to the web page: open the results page, click the login control,
# fill in the credentials and submit the form.
remDr$navigate("https://www.oddsportal.com/results/#soccer")

login_control <- remDr$findElement(using = "name", value = "login-submit")
login_control$clickElement()

username_field <- remDr$findElement(
  using = "css selector",
  value = "#login-username1"
)
username_field$sendKeysToElement(list("*****"))

password_field <- remDr$findElement(
  using = "css selector",
  value = "#login-password1"
)
password_field$sendKeysToElement(list("*****"))

submit_button <- remDr$findElement(
  using = "css selector",
  value = "#col-content > div:nth-child(3) > div > form > div:nth-child(3) > button"
)
submit_button$clickElement()


# Loop through the URL addresses and get the odds with results

# Bookmakers to scrape. `label` is the exact link text matched in the odds
# table; `prefix` is the stem of the three `odds` columns that receive the
# values (<prefix>1, <prefix>x, <prefix>2).
bookmakers <- data.frame(
  label = c(
    "18bet", "1xBet", "Asianodds", "bet-at-home", "bet365", "bwin",
    "Chance.cz", "iFortuna.sk", "Marathonbet", "MAXITIP.cz", "Pinnacle",
    "SAZKAbet.cz", "Tipsport.sk"
  ),
  prefix = c(
    "bet", "xBet", "Asianodds", "betathome", "Bet365", "bwin",
    "Chance", "iFortuna", "Marathonbet", "MAXITIP", "Pinnacle",
    "SAZKAbet", "Tipsport"
  ),
  stringsAsFactors = FALSE
)

# Build the XPath for the `cell`-th <td> of the table row that contains the
# bookmaker link whose text is `label`.
bookmaker_cell_xpath <- function(label, cell) {
  sprintf(
    '//a[@class="name" and .="%s"]/ancestor::tr[contains(@class, "lo")]//td[%d]',
    label, cell
  )
}

# Read the three odds cells (td[2], td[3], td[4]) for one bookmaker from the
# page currently loaded in `remDr`.
#
# Returns a list with elements `one`, `x` and `two`. When the bookmaker's
# td[4] cell is not on the page, all three are the numeric sentinel 0.
read_bookmaker_odds <- function(remDr, label) {
  # findElements() returns an empty list when nothing matches, so it doubles
  # as the presence check; the matched td[4] element is reused below instead
  # of being looked up a second time.
  last_cells <- remDr$findElements("xpath", bookmaker_cell_xpath(label, 4L))
  if (length(last_cells) == 0) {
    return(list(one = 0, x = 0, two = 0))
  }

  # getElementText() returns a one-element list; [[1]] extracts the string so
  # the `odds` columns stay atomic instead of being coerced to list-columns.
  list(
    one = remDr$findElement(
      "xpath", bookmaker_cell_xpath(label, 2L)
    )$getElementText()[[1]],
    x = remDr$findElement(
      "xpath", bookmaker_cell_xpath(label, 3L)
    )$getElementText()[[1]],
    two = last_cells[[1]]$getElementText()[[1]]
  )
}

# Visit each match URL and store the odds, match details and result in row
# `i` of `odds`.
# NOTE(review): `i`, `odds` and `links1` are not defined in this part of the
# script -- they must exist (with `i` set to the first row to scrape) before
# the loop runs.
while (i <= 20) {
  url <- links1$links[i]
  remDr$navigate(url)

  # Odds for every configured bookmaker
  for (b in seq_len(nrow(bookmakers))) {
    prefix <- bookmakers$prefix[b]
    cells <- read_bookmaker_odds(remDr, bookmakers$label[b])

    odds[[paste0(prefix, "1")]][i] <- cells$one
    odds[[paste0(prefix, "x")]][i] <- cells$x
    odds[[paste0(prefix, "2")]][i] <- cells$two
  }

  # Country, league, match, results
  odds$match[i] <- remDr$findElement(
    "xpath", '//*[@id="col-content"]/h1'
  )$getElementText()[[1]]
  odds$krajina[i] <- remDr$findElement(
    "xpath", '//*[@id="breadcrumb"]/a[3]'
  )$getElementText()[[1]]
  odds$liga[i] <- remDr$findElement(
    "xpath", '//*[@id="breadcrumb"]/a[4]'
  )$getElementText()[[1]]

  # The result element is optional; fall back to 0 when it is absent.
  status_nodes <- remDr$findElements(
    "xpath", '//*[@id="event-status"]/p/strong'
  )
  if (length(status_nodes) != 0) {
    odds$result[i] <- status_nodes[[1]]$getElementText()[[1]]
  } else {
    odds$result[i] <- 0
  }

  i <- i + 1
}
Tomas
  • 3
  • 1
Tomas
  • 83
  • 5

0 Answers0