I have a lot of files (300–500) to read, and I want to speed this task up.
Ideally, I would write something like this:
from multiprocessing import Pool
import os
import _io
# Wrap every entry of the current directory in open(); map() is lazy, so the
# files are only opened once pool.map() consumes the iterator.
filelist = map(open,os.listdir())
if __name__ == '__main__':
with Pool() as pool:
# pool.map() must serialize each argument to hand it to a worker process.
# An open text file object cannot be serialized, which is what triggers the
# TypeError quoted right after this snippet.
a = pool.map(_io.TextIOWrapper.read,filelist)
Of course, I got an error:
TypeError: cannot serialize '_io.TextIOWrapper' object
The question is: can I speed up the I/O by parallelizing it? If so, how?
UPDATE conclusion:
I have now found a way to parallelize the reads and have tested my code:
I used 22 items, totalling 63.2 MB
from multiprocessing import Pool
import os
import _io
def my_read(file_name):
    """Return the whole text content of the file at ``file_name``.

    The file is opened in text mode with the default encoding and is closed
    again before the function returns.
    """
    with open(file_name) as handle:
        content = handle.read()
    return content
def mul():
    """Read every entry of the current directory using a process pool.

    Each name returned by ``os.listdir()`` is passed to ``my_read`` in a
    worker process of a default-sized ``Pool``. The contents are collected
    and then discarded; nothing is returned.
    """
    with Pool() as workers:
        names = os.listdir()
        # The result list is intentionally not kept: only the work matters.
        workers.map(my_read, names)
def single():
    """Read every entry of the current directory one after another.

    Sequential counterpart of the pooled version: each name returned by
    ``os.listdir()`` is opened in text mode and read completely. The contents
    are collected in a local list and then discarded; nothing is returned.
    """
    contents = []
    for name in os.listdir():
        with open(name) as handle:
            contents.append(handle.read())
if __name__ == '__main__':
    # Run exactly one strategy per invocation so the two can be timed
    # separately; move the comment marker to measure the other one.
    mul()
    # single()
Sadly, `single()` takes 0.4 s while `mul()` takes 0.8 s.
UPDATE 1:
Some people said that this is an I/O-bound task, so I cannot improve it with parallelism. However, I found this sentence in the Python docs:
However, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
The full code is here:
My goal is to convert an EPUB file to txt. I have already parallelized `char2text`, and now I want to speed up `readall`:
import zipfile
from multiprocessing import Pool
import bs4
def char2text(i):
    """Convert the markup of one chapter to plain text.

    Args:
        i: Markup of a single chapter, as a string.

    Returns:
        The text of the document body with line endings normalised to a
        single newline character, leading and trailing whitespace stripped,
        and one blank line appended so that chapters can be concatenated
        directly.
    """
    # Name the parser explicitly: letting BeautifulSoup guess emits a warning
    # and can produce different trees depending on which parsers are installed.
    soup = bs4.BeautifulSoup(i, "html.parser")
    # html.parser does not synthesise a <body>, so fall back to the whole
    # document instead of failing on a bare fragment.
    root = soup.body if soup.body is not None else soup
    lines = root.getText().splitlines()
    return "\n".join(lines).strip() + "\n\n"
class Epub(zipfile.ZipFile):
    """A ZIP archive that exposes EPUB metadata and plain-text extraction.

    When opened with ``mode='r'`` the package document
    ``OEBPS/content.opf`` is parsed and these attributes are set:

    * ``opf`` -- decoded text of the package document
    * ``author`` / ``title`` -- text of ``dc:creator`` / ``dc:title``
    * ``description`` / ``chrpattern`` -- text of ``dc:description`` /
      ``dc:chrpattern``, or ``''`` when the element is absent
    * ``cover`` -- raw bytes of ``OEBPS/images/cover.jpg``

    In any other mode the object behaves like a plain ``ZipFile``.
    """

    def __init__(self, file, mode='r', compression=0, allowZip64=False):
        """Open ``file`` as a ZIP archive and, in read mode, load metadata.

        Args:
            file, mode, compression, allowZip64: forwarded unchanged to
                ``zipfile.ZipFile``.

        Raises:
            KeyError: in read mode, if ``OEBPS/content.opf`` or
                ``OEBPS/images/cover.jpg`` is not in the archive.
            AttributeError: in read mode, if the package document has no
                ``dc:creator`` or ``dc:title`` element.
        """
        super().__init__(file, mode, compression, allowZip64)
        if mode != 'r':
            # Nothing to parse when the archive is not opened for reading.
            return
        self.opf = self.read('OEBPS/content.opf').decode()
        # Explicit parser: avoids the "no parser specified" warning and keeps
        # the result independent of which optional parsers are installed.
        opf_soup = bs4.BeautifulSoup(self.opf, 'html.parser')
        # Author and title are mandatory: a missing element raises.
        self.author = opf_soup.find(name='dc:creator').getText()
        self.title = opf_soup.find(name='dc:title').getText()
        # Description and chrpattern are optional and default to ''.
        self.description = self._optional_text(opf_soup, 'dc:description')
        self.chrpattern = self._optional_text(opf_soup, 'dc:chrpattern')
        self.cover = self.read('OEBPS/images/cover.jpg')

    @staticmethod
    def _optional_text(soup, name):
        """Return the text of the first ``name`` element, or '' if absent."""
        tag = soup.find(name=name)
        return '' if tag is None else tag.getText()

    def get_text(self):
        """Return the plain text of all chapters, in archive order.

        The chapters selected by ``readall`` are converted with
        ``char2text`` in a process pool and concatenated. The result is also
        stored on ``self.tempread``.
        """
        self.tempread = ""
        charlist = self.readall(self.namelist())
        with Pool() as pool:
            txtlist = pool.map(char2text, charlist)
        self.tempread = "".join(txtlist)
        return self.tempread

    def readall(self, namelist):
        """Return the decoded content of every chapter file in ``namelist``.

        Only members whose name starts with ``OEBPS/`` and ends with
        ``.xhtml`` are read; the order of ``namelist`` is preserved.
        """
        return [
            self.read(name).decode()
            for name in namelist
            if name.startswith('OEBPS/') and name.endswith('.xhtml')
        ]

    def epub2txt(self):
        """Write the book text to ``<title>.txt`` in the working directory.

        The title is used verbatim as the file name and the file is written
        as UTF-8.
        """
        tempread = self.get_text()
        with open(self.title + '.txt', 'w', encoding='utf8') as f:
            f.write(tempread)
if __name__ == "__main__":
    import cProfile

    # Profile one complete conversion. cProfile.run() executes its statement
    # in the __main__ namespace, so the name `e` inside the string must match
    # the variable bound here. The with-block closes the archive afterwards,
    # which the previous bare assignment never did.
    with Epub("assz.epub") as e:
        cProfile.run("e.epub2txt()")