
I am trying to pull Twitter followers data using the Selenium Chrome webdriver and BeautifulSoup for an account that has 80K followers. I am facing two issues with my script:

1) While scrolling to the bottom of the page to get the entire page source after all the followers are loaded, my script does not scroll all the way down. It stops scrolling partway through, after loading a random number of followers, and then starts traversing each follower's profile to get their data. I want it to load all the followers on the page first and only then start traversing the profiles.

2) My second issue is that every time I run the script it scrolls to the bottom one step at a time until all the followers are loaded, and only then starts pulling data by parsing one follower's profile at a time. In my case (80K followers) this would take 4 to 5 days to fetch all the followers' data. Is there a better way to do this?

Here is my script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
 os.environ["webdriver.chrome.driver"]=chromedriver
 chromeOptions = webdriver.ChromeOptions()
 prefs = {"download.default_directory" : download_path}
 chromeOptions.add_experimental_option("prefs",prefs)
 driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
 driver.implicitly_wait(20)
 driver.maximize_window()
except Exception as err:
 print "Error:Failed to open chrome."
 print "Error: ",err
 driver.stop_client()
 driver.close()
 
#opening the web page
try:
 driver.get('https://twitter.com/login')
except Exception as err:
 print "Error:Failed to open url."
 print "Error: ",err
 driver.stop_client()
 driver.close()

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("###########")
password.send_keys("###########")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadserver/followers')



followers_link=driver.page_source  # followers page, about 18 profiles loaded at a time
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")


# These two values were not defined above (the loop would raise a NameError);
# roughly 18 profiles load per scroll and the account has about 80K followers.
followers_per_page = 18
followers_count = 80000

for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # followers page, about 18 profiles loaded at a time
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)


for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()


os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")

3 Answers


There is a better way: use the Twitter API. Here's a quick GitHub script that I found: Github Script. Sorry, you probably feel like you've wasted a lot of time by using Selenium (there are pros to not using an API). Great post on automation and how this stuff works: Twitter API.
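If you go the API route, a minimal sketch with tweepy would look roughly like this (assuming tweepy 3.x; the credentials, screen name and 200-per-page size here are just placeholders, not from the linked script):

import tweepy

# Credential placeholders - create an app at apps.twitter.com and fill these in.
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_SECRET')
api = tweepy.API(auth, wait_on_rate_limit=True)

# Cursor pages through followers/list (up to 200 users per request), sleeping
# through rate-limit windows instead of scraping one profile page at a time.
for follower in tweepy.Cursor(api.followers, screen_name='sadoperator', count=200).items():
    print follower.screen_name, follower.name, follower.location, follower.created_at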

There is also a way to scroll many times, but you'd have to do some math or set a condition to stop it.

driver.execute_script("window.scrollTo(0, 10000);") 

Let's say you have 10K followers, the initial page displays 100 followers, and each scroll after that loads another 10 followers. You would then scroll another 990 times.
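As a rough sketch (reusing the driver and time imports from your script; the batch sizes are the assumed numbers above):

followers_count = 10000
initially_loaded = 100
loaded_per_scroll = 10
scrolls_needed = (followers_count - initially_loaded) / loaded_per_scroll  # 990

for _ in xrange(scrolls_needed):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the next batch of follower cards time to load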

Here's the exact usage for your case, from a Quora answer by (of course) alecxe:

html = driver.page_source

This .page_source can be grabbed once you have revealed all the followers (by scrolling), and the result can then be parsed with something like BeautifulSoup.
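For example, once the scrolling loop finishes (this reuses the same classes your script already parses):

soup = BeautifulSoup(driver.page_source, 'html.parser')
cards = soup.findAll('a', {'class': 'ProfileCard-bg js-nav'})
print len(cards)  # should match the follower count once everything is loaded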

– Yono
  • Can we load all the followers by scrolling manually, save the page source in a text file, and then loop through all the followers' data from the text file instead of going to Twitter's website? I don't know whether this would work. If it would, can you please provide the code for that part? I tried it and didn't succeed. Thanks. – Ashish Verma Apr 03 '17 at 15:01
  • Yes, there's a Selenium property for that: .page_source, e.g. html = driver.page_source – Yono Apr 03 '17 at 17:12

I did the implementation as mentioned by alecxe in his answer, but my script is still not parsing all the followers. It still loads a random number of followers, and I can't seem to get to the bottom of this. Can someone please try running this on their end to see if they are able to load all the followers? Here's the modified script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
 os.environ["webdriver.chrome.driver"]=chromedriver
 chromeOptions = webdriver.ChromeOptions()
 prefs = {"download.default_directory" : download_path}
 chromeOptions.add_experimental_option("prefs",prefs)
 driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
 driver.implicitly_wait(20)
 driver.maximize_window()
except Exception as err:
 print "Error:Failed to open chrome."
 print "Error: ",err
 driver.stop_client()
 driver.close()
 
#opening the web page
try:
 driver.get('https://twitter.com/login')
except Exception as err:
 print "Error:Failed to open url."
 print "Error: ",err
 driver.stop_client()
 driver.close()

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("*****************")
password.send_keys("*****************")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadoperator/followers')



followers_link=driver.page_source  # followers page, about 18 profiles loaded at a time
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")

followers_link=driver.page_source  # followers page, about 18 profiles loaded at a time
soup=BeautifulSoup(followers_link,'html.parser')

followers_per_page = 18
followers_count = 15777


for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, 7755000);")
        time.sleep(2)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # followers page, about 18 profiles loaded at a time
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)

'''
for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()
'''

os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")
– Ashish Verma
  1. In Firefox or another browser, open the developer console and copy the request that fires while scrolling the page down - you will use it to construct your own requests. The request will look something like https://twitter.com/DiaryofaMadeMan/followers/users?include_available_features=1&include_entities=1&max_position=1584951385597824282&reset_error_state=false. Also search the HTML source for data-min-position, which looks like data-min-position="1584938620170076301".
  2. Load the HTML using PhantomJS and parse it with BeautifulSoup. You need to get the first portion of followers and the data-min-position value. Save the followers to a list and data-min-position to a variable.
  3. Use the request copied at step 1 and the saved data-min-position to construct a new request, replacing only the digits of the max_position parameter with the saved data-min-position.
  4. Use Python requests (no webdriver any more) to send the request and receive the JSON response.
  5. Get the new followers and the new data-min-position from the response JSON.
  6. Repeat steps 3, 4 and 5 until data-min-position is 0.

This way is much better than the API because you can load a huge amount of data without any restrictions; a rough sketch of the request loop follows below.
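A minimal sketch of that loop, assuming you carry over the logged-in cookies from the browser session and that the JSON response exposes items_html and min_position keys (as the followers endpoint did at the time):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0',
                        'X-Requested-With': 'XMLHttpRequest'})
# session.cookies.update({...})  # copy the auth cookies from the logged-in browser

url = 'https://twitter.com/sadoperator/followers/users'
params = {'include_available_features': '1',
          'include_entities': '1',
          'reset_error_state': 'false',
          'max_position': 'FIRST_DATA_MIN_POSITION'}  # value scraped from the first page

handles = []
while True:
    data = session.get(url, params=params).json()
    chunk = BeautifulSoup(data['items_html'], 'html.parser')
    for a in chunk.findAll('a', {'class': 'ProfileCard-bg js-nav'}):
        handles.append(a['href'])
    min_position = data.get('min_position')
    if not min_position or min_position == '0':
        break
    params['max_position'] = min_position  # step back one page of followers

print len(handles)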