I am trying to pull Twitter follower data using the Selenium Chrome webdriver and BeautifulSoup for an account that has 80K followers. I am facing two issues in my script:
1) While scrolling to the bottom of the page so that all the followers are loaded before I grab the page source, my script does not scroll all the way down. It stops scrolling partway, after loading a random number of followers, and then starts traversing each follower's profile to get their data. I want it to load all the followers on the page first and only then start traversing the profiles (see the scroll-loop sketch after this list).
2) Every time I run the script it scrolls to the bottom step by step until all the followers are loaded, and then pulls data by parsing one follower profile at a time. In my case (80K followers) this would take 4 to 5 days to fetch all the follower data. Is there a better way to do this? (An API-based alternative is sketched at the end of this post.)
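For issue 1, this is the kind of scroll loop I have in mind: keep scrolling until the page height stops growing for several consecutive checks, instead of trusting a single unchanged height. This is only a sketch; the helper name, the retry count and the sleep interval are my own guesses, not tuned values:

    import time

    def scroll_to_bottom(driver, pause=5, max_retries=3):
        #scroll until document.body.scrollHeight stops growing max_retries times in a row
        last_height = driver.execute_script("return document.body.scrollHeight")
        retries = 0
        while retries < max_retries:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                retries += 1   #height unchanged, but it may just be a slow load
            else:
                retries = 0    #the page grew, so reset the counter
            last_height = new_height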
Here is my script:
from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join
print "Running for chrome."
chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
os.environ["webdriver.chrome.driver"]=chromedriver
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory" : download_path}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
driver.implicitly_wait(20)
driver.maximize_window()
except Exception as err:
    print "Error: Failed to open chrome."
    print "Error: ", err
    #driver may never have been created here, so exit instead of trying to close it
    sys.exit(1)
#opening the web page
try:
driver.get('https://twitter.com/login')
except Exception as err:
    print "Error: Failed to open url."
    print "Error: ", err
    driver.stop_client()
    driver.close()
    sys.exit(1)
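#log in first; the followers page only renders fully for an authenticated session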
username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")
username.send_keys("###########")
password.send_keys("###########")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
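#load the followers page of the target account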
driver.get('https://twitter.com/sadserver/followers')
#the page source is re-parsed after scrolling finishes, so no need to parse it here
output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        #the height stopped growing, so assume every follower card is loaded
        followers_link=driver.page_source #the followers page loads 18 profiles at a time
        soup=BeautifulSoup(followers_link,'html.parser')
        div = soup.find('div',{'class':'GridTimeline-items has-items'})
        bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
        for name in bref:
            name_list.append(name['href'])
        break
    lastHeight = newHeight
followers_link=''
print len(name_list)
for x in range(0,len(name_list)):
    driver.get('https://twitter.com'+name_list[x])
page_source=driver.page_source
each_soup=BeautifulSoup(page_source,'html.parser')
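    #the ProfileHeaderCard div holds the name, handle, location, bio, join date and url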
profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
try:
name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
if name:
output.write('"'+name.strip().encode('utf-8')+'"'+',')
else:
output.write(' '+',')
except Exception as e:
output.write(' '+',')
print 'Error in name:',e
try:
handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
if handle:
output.write('"'+handle.strip().encode('utf-8')+'"'+',')
else:
output.write(' '+',')
except Exception as e:
output.write(' '+',')
print 'Error in handle:',e
try:
location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
if location:
output.write('"'+location.strip().encode('utf-8')+'"'+',')
else:
output.write(' '+',')
except Exception as e:
output.write(' '+',')
print 'Error in location:',e
try:
bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
if bio:
output.write('"'+bio.strip().encode('utf-8')+'"'+',')
else:
output.write(' '+',')
except Exception as e:
output.write(' '+',')
print 'Error in bio:',e
try:
joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
if joinDate:
output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
else:
output.write(' '+',')
except Exception as e:
output.write(' '+',')
print 'Error in joindate:',e
try:
url = [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
if url:
output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
else:
output.write(' '+'\n')
except Exception as e:
output.write(' '+'\n')
print 'Error in url:',e
output.close()
os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")