import pypyodbc
from pypyodbc import *
import nltk
from nltk import *
import csv
import sys
import codecs
import re
#connect to the database
conn = pypyodbc.connect('Driver={Microsoft Access Driver (*.Mdb)};\
DBQ=C:\\TextData.mdb')
#create a cursor to control the datbase with
cur = conn.cursor()
cur.execute('''SELECT Text FROM MessageCreationDate WHERE Tags LIKE 'GHS - %'; ''')
TextSet = cur.fetchall()
ghsWordList = []
TextWords = list(TextSet)
for row in TextWords :
message = re.split('\W+',str(row))
for eachword in message :
if eachword.isalpha() :
ghsWordList.append(eachword.lower())
print(ghsWordList)
When I run this code, it's giving me an error:
'charmap' codec can't encode character '\u0161' in position 2742: character maps to <undefined>
I've looked at a number of other answers on here to similar questions, and googled the hell out of it; however I am not well versed enough in Python nor Character Encoding to know where I need to used the Codecs module to change the character set being used to present/append/create the list?
Could someone not only help me with the code but also point me in the direct of some good reading materials for understanding this sort of thing?