I am new to asks and trio in Python, and I have some sample code. Let me explain: I have a list of URLs, each of which is a news URL that has sub-URLs. The first URL is requested, all of the hrefs found on it are added to a list, and then the article for every href in that list is fetched. The issue is that the article is sometimes retrieved and at other times comes back empty.
I tried the sample code with a single URL, and in that case it works.
import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time
# Initialise asks with the 'trio' backend; this runs at import time, before
# any asks.Session is created further down.
asks.init('trio')
async def extractor(path, htmls, paths, session, timeout=2):
    """Download *path* and record the outcome in the shared result lists.

    Args:
        path: URL to request.
        htmls: list that receives the response body (``r.content``) on
            success, or ``str(exception)`` when the request fails.
        paths: list that receives *path*. It is appended right after
            *htmls*, with no ``await`` in between, so the two lists stay
            index-aligned.
        session: object exposing an awaitable ``get(url, timeout=...)``.
        timeout: forwarded unchanged to ``session.get``; defaults to 2,
            the value that used to be hard-coded here.
    """
    try:
        r = await session.get(path, timeout=timeout)
        out = r.content
    except Exception as e:
        # The error text is still stored in place of the page body (callers
        # rely on one entry per URL), but the failure is now reported so a
        # timed-out request is not mistaken for an article with no text.
        log.warning("request for %s failed: %r", path, e)
        out = str(e)
    htmls.append(out)
    paths.append(path)
async def main(path_list, session):
    """Fetch every URL in *path_list* concurrently through *session*.

    One ``extractor`` task per URL is started inside a trio nursery. The
    function returns after the ``async with`` block has been left.

    Returns:
        A ``(htmls, paths)`` tuple: the two lists that the extractor tasks
        appended to, in whatever order the tasks appended.
    """
    bodies, urls = [], []
    async with trio.open_nursery() as nursery:
        for url in path_list:
            nursery.start_soon(extractor, url, bodies, urls, session)
    return bodies, urls
async def run(urls, conns=50):
    """Download *urls* and extract article fields from each page with Goose.

    Args:
        urls: URLs handed to ``main`` for downloading.
        conns: value passed as ``connections`` to ``asks.Session``.

    Returns:
        A list with one dict per downloaded page. Every dict has ``url``.
        When extraction succeeds it also has ``goose_text``,
        ``goose_title`` and ``authors`` (the first author, or ``''`` when
        there is none). When Goose raises, the dict instead carries the raw
        page under ``goose_html`` plus empty/``None`` placeholders for
        ``goose_text``, ``goose_date``, ``goose_title`` and ``authors``.
        A page whose body is ``None`` gets only ``url``.
    """
    s = asks.Session(connections=conns)
    g = Goose()
    htmls, paths = await main(urls, s)
    print(htmls, " ", paths)
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {'url': path}
        if html is not None:
            try:
                article = g.extract(raw_html=html)
                authors = article.authors
                dic['goose_text'] = article.cleaned_text
                dic['goose_title'] = article.title
                dic['authors'] = authors[0] if authors else ''
            except Exception:
                # Previously a bare `raise` sat at the top of this handler,
                # which made the fallback below unreachable and let one bad
                # page abort the whole batch. Log the traceback and keep
                # going with placeholder fields instead.
                log.exception('goose found no text using html')
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] = ''
        cleaned.append(dic)
    return cleaned
async def real_main():
    """Scrape the story URL of each embedded sample crawl record.

    The JSON sample is parsed and pretty-printed, ``run`` is awaited once
    per record with a one-element URL list (in record order), and the
    collected results are pretty-printed.
    """
    raw_records = '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'
    records = json.loads(raw_records)
    pprint(records)
    # Each record is processed by its own run() call, awaited sequentially.
    articles = [await run([record['story_url']]) for record in records]
    pprint(articles)
if __name__ == "__main__":
    # Script entry point: hand the async real_main to trio to run.
    trio.run(real_main)
Expected result: get the article data for every URL, without any of it going missing.