0
import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Start a Chrome session driven by Selenium.
browser = webdriver.Chrome()
## In the following segment, I show how you can get a list of news article URLs based on a keyword search.
## I use CNN as an example, but any news source would work.

# CNN search-results page for the keyword "politics" (the URL requests size=200).
base_url = u'https://www.cnn.com/search?q=politics&size=200'

browser.get(base_url)
# Fixed one-second pause before querying the page.
time.sleep(1)

# Find the container that holds every news article.
main_news_container = browser.find_element_by_class_name('cnn-search__results-list')
# Collect the elements carrying the headline class. Per the answer that accompanies
# this snippet, that class sits on the <h3>, not on the <a> inside it.
headlines_list = main_news_container.find_elements_by_class_name('cnn-search__result-headline')
# Alternative that was tried: grab every 'a' with an href inside the main container.
#text_sections = main_news_container.find_elements_by_xpath("//a[@href]")
# NOTE(review): the only visible assignment to text_sections is commented out above,
# so unless it is defined elsewhere this line raises NameError.
print(len(text_sections))
headlines = [];
links = [];
for elem in headlines_list:
    #if "/2020/" in elem.get_attribute("href"):
    # Store the link. As described in the question, href comes back as None here
    # because elem is the headline element itself rather than its <a> child.
    #print(elem.get_attribute("href"))
    links.append(elem.get_attribute("href"))
    # Store the headline text (this part is reported to work).
    #print(elem.text)
    headlines.append(elem.text)


# Find the article-body elements inside the main_news_container.
body_elements = main_news_container.find_elements_by_class_name("cnn-search__result-body")

# This is how you would read the text of one body element:
#print(body_elements[1].text)
# Print the first collected link and how many headlines and bodies were found.
print(links[0])
print(len(headlines))
print(len(body_elements))

I am trying to get the href from the elements of headlines_list but it returns None. However I can still get the text of the headline by saying,

elem.text

This is the inspect output of the web page source

1 Answer

0

You are calling `get_attribute("href")` on the `h3` element, because the class `cnn-search__result-headline` is on the `h3`. The `href` belongs to the `a` element inside the `h3`, so you need to select that `a` element instead.

I changed the line that builds your headlines list so that it selects the `a` elements:

# Select the <a> inside each headline <h3>. Use find_elements (plural) so the
# result is a list that the following `for elem in headlines_list` loop can iterate.
headlines_list = main_news_container.find_elements_by_css_selector("h3[class='cnn-search__result-headline']>a")

and here is your code with it

import time

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## In the following segment, I show how you can get a list of news article URLs based on a keyword search.
## I use CNN as an example, but any news source would work.

# CNN search-results page for the keyword "politics" (the URL requests size=200).
base_url = u'https://www.cnn.com/search?q=politics&size=200'

# Upper bound, in seconds, to wait for the search results to appear.
RESULTS_TIMEOUT_SECONDS = 10

# The href lives on the <a> inside each headline <h3>, not on the <h3> itself.
HEADLINE_LINK_SELECTOR = "h3.cnn-search__result-headline > a"

browser = webdriver.Chrome()
try:
    browser.get(base_url)

    # A fixed one-second sleep proved too short for the results to show up.
    # Wait explicitly until at least one headline link is present instead;
    # WebDriverWait raises TimeoutException if none appears in time.
    WebDriverWait(browser, RESULTS_TIMEOUT_SECONDS).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, ".cnn-search__results-list " + HEADLINE_LINK_SELECTOR)
        )
    )

    # Find the container that holds every news article.
    main_news_container = browser.find_element(By.CLASS_NAME, 'cnn-search__results-list')

    # find_elements (plural) returns a list, so it can be iterated below.
    headlines_list = main_news_container.find_elements(By.CSS_SELECTOR, HEADLINE_LINK_SELECTOR)

    # Link target and visible headline text of every result.
    links = [elem.get_attribute("href") for elem in headlines_list]
    headlines = [elem.text for elem in headlines_list]

    # Find the article-body elements inside the main_news_container.
    body_elements = main_news_container.find_elements(By.CLASS_NAME, "cnn-search__result-body")

    # This is how you would read the text of one body element:
    #     body_elements[1].text
    if links:
        print(links[0])
    print(len(headlines))
    print(len(body_elements))
finally:
    # Always close the browser, even if the wait or a lookup fails.
    browser.quit()
nabais
  • 1,629
  • 1
  • 7
  • 14
  • Thank you so much!! The problem was that I wasn't waiting long enough at the beginning. When I increased the wait time to 3s, I got the actual result! – Mertay Dayanc Sep 29 '20 at 22:42