I'm using Visual Studio Code as my IDE for Python. However, every time I try to print something in Mandarin, it keeps showing me this error: "SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0xc3 in position 0: invalid continuation byte". I've already searched Stack Overflow and Google for an answer to this question.
I added the code below to try to fix it, but the error still occurs.
# -*- coding: UTF-8 -*-
# The declaration above only tells the interpreter which encoding to use when
# reading this source file; the file must also actually be saved as UTF-8.
import io
import sys
# Replace sys.stdout with a text wrapper over its binary buffer that encodes
# everything printed as UTF-8.
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')
# -*- coding: UTF-8 -*-
# The line above declares the source encoding to the interpreter. It does not
# convert the file: the editor must save the file as UTF-8 for the non-ASCII
# string literals below to decode.
import io
import sys
# Rewrap stdout's underlying binary buffer so printed text is encoded as UTF-8.
sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='utf8')
import requests
from bs4 import BeautifulSoup
import json
import time
import re
def get_web_page(url):
    """Fetch *url* and return the response body as text.

    The request is sent with an ``over18=1`` cookie (presumably to get past
    an age-confirmation page -- confirm against the target site).

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the HTTP status code is 200; otherwise None,
        after printing the offending URL.
    """
    response = requests.get(url=url, cookies={'over18': '1'})
    if response.status_code == 200:
        return response.text

    print('Invalid url:', response.url)
    return None
def _parse_push_count(push_str):
    """Convert the text of an entry's 'nrec' div into an integer score.

    Numeric text is returned as an int, '爆' maps to 99, text starting with
    'X' maps to -10, and anything else (including empty text) maps to 0.
    """
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles dated *date* from a board index page.

    Args:
        dom: HTML text of the index page. None or an empty string (for
            example when the page could not be fetched) yields an empty
            result instead of raising.
        date: Date string compared for equality against the stripped text of
            each entry's 'date' div.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts with
        the keys 'title', 'href', 'push_count' and 'author'. ``prev_url`` is
        the href of the second link inside the paging div, or None when that
        link (or its href) is not present.
    """
    if not dom:
        return [], None

    soup = BeautifulSoup(dom, 'html5lib')

    # The second <a> of the paging group is used as the previous-page link;
    # guard each step so a page without it does not raise.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    paging_links = paging_div.find_all('a') if paging_div else []
    prev_url = paging_links[1].get('href') if len(paging_links) > 1 else None

    articles = []
    for entry in soup.find_all('div', 'r-ent'):
        date_div = entry.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue

        # Entries without a link carry no title/href and are skipped.
        link = entry.find('a')
        if not link:
            continue

        nrec_div = entry.find('div', 'nrec')
        author_div = entry.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(nrec_div.text if nrec_div else ''),
            'author': author_div.text if author_div else '',
        })
    return articles, prev_url
def get_ip(dom):
    """Return the dotted-quad address that follows the '來自' marker in *dom*.

    Args:
        dom: Page text to search. None or an empty string returns None.

    Returns:
        The first ``d.d.d.d`` style address found after '來自', a colon and
        optional surrounding whitespace, as a string; None when there is no
        match.
    """
    if not dom:
        return None
    # Raw string so the backslashes reach the regex engine untouched. Every
    # octet is \d+ (the earlier pattern had '\+d', which matches a literal
    # '+' followed by 'd' and therefore never matched a real address).
    match = re.search(r'來自\s*:\s*(\d+\.\d+\.\d+\.\d+)', dom)
    return match.group(1) if match else None
# NOTE(review): this is a credential embedded in source. Consider loading it
# from configuration or the environment and rotating this value.
API_KEY = '76ec8d187ce0d00ee3fed79ab1b8dc22'


def get_country(ip):
    """Look up the country name for *ip* through the ipstack HTTP API.

    Args:
        ip: IP address string. A falsy value (None, '') skips the lookup.

    Returns:
        The 'country_name' value from the JSON response, or None when *ip*
        is falsy or the response has no non-empty 'country_name'.
    """
    if not ip:
        return None
    url = 'http://api.ipstack.com/{}?access_key={}'.format(ip, API_KEY)
    data = requests.get(url).json()
    # .get() rather than indexing: a response without a 'country_name' key
    # (presumably an error payload -- verify against the API docs) should
    # yield None instead of raising KeyError.
    return data.get('country_name') or None
# Script body: collect today's articles from the board index pages, then look
# up the country of the poster's IP for the first 100 of them and print a
# per-country tally.
print('取得今日文章列表…')
PTT_URL = "https://www.ptt.cc"
current_page = get_web_page(PTT_URL+'/bbs/Gossiping/index.html')
if current_page:
    articles = []
    # '%m/%d' with leading zeros stripped from the left, e.g. '01/05' -> '1/05'.
    today = time.strftime('%m/%d').lstrip('0')
    current_articles, prev_url = get_articles(current_page, today)
    # Keep walking to the previous page for as long as a page still yields
    # articles dated today.
    while current_articles:
        articles += current_articles
        if not prev_url:
            # No previous-page link to follow.
            break
        current_page = get_web_page(PTT_URL + prev_url)
        if not current_page:
            # Fetch failed (get_web_page returned None); stop with what has
            # been collected instead of handing None to the parser.
            break
        current_articles, prev_url = get_articles(current_page, today)
    print('共 %d 篇文章' %(len(articles)))

    print('取得前 100 篇文章的IP')
    country_to_count = dict()
    for article in articles[:100]:
        print('查詢 IP:', article['title'])
        page = get_web_page(PTT_URL + article['href'])
        if page:
            ip = get_ip(page)
            country = get_country(ip)
            # country may be None when no IP or no country was found; those
            # are tallied under the None key.
            country_to_count[country] = country_to_count.get(country, 0) + 1

    print('各國 IP 分布')
    for k, v in country_to_count.items():
        print(k, v)
The output of the code above is "line 40 SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0xc3 in position 0: invalid continuation byte". Line 40 is the first line of my code that contains Mandarin.