import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
# This segment collects headlines, article links and body snippets from a
# keyword search. CNN is used as the example; any other news source would
# need its own search URL and class names.
base_url = u'https://www.cnn.com/search?q=politics&size=200'
browser.get(base_url)
# Fixed one-second pause before querying the page (presumably to give the
# search results time to load -- a longer or explicit wait may be needed).
time.sleep(1)

# Container that holds every search result.
main_news_container = browser.find_element_by_class_name('cnn-search__results-list')
# One element per result headline.
headlines_list = main_news_container.find_elements_by_class_name('cnn-search__result-headline')
print(len(headlines_list))

headlines = []
links = []
for elem in headlines_list:
    # Calling get_attribute("href") on the headline element itself yields
    # None even though its text is available, so the href is read from the
    # <a> nested inside it instead.
    # NOTE(review): assumes the headline class sits on a wrapper around the
    # link -- confirm against the live page markup.
    anchors = elem.find_elements_by_tag_name('a')
    # Keep links and headlines the same length: store None when a headline
    # has no nested link rather than skipping the entry.
    links.append(anchors[0].get_attribute('href') if anchors else None)
    headlines.append(elem.text)

# Body/summary elements of the results, looked up inside the same container.
# The text of one of them is available as e.g. body_elements[1].text.
body_elements = main_news_container.find_elements_by_class_name("cnn-search__result-body")

# Guard against an empty result list before indexing.
if links:
    print(links[0])
print(len(headlines))
print(len(body_elements))
# The browser is not closed in this segment; call browser.quit() once all
# scraping is finished.
I am trying to get the href from each element of headlines_list, but get_attribute("href") returns None. However, I can still get the text of the headline with:
elem.text