I use the following code to convert a PDF to a text file. However, I am only interested in the main text of the document: no figures, page numbers, tables, captions, formulas, etc.
But it gives me a lot of text, some of which consists of short lines extracted from tables or formulas. I want the resulting text to be readable by a user, without the content that cannot be shown in text mode.
The other problem is that it breaks sentences across multiple lines, so I lose the sentence boundaries. I want complete sentences without line breaks.
I experimented with some options, such as 'all_texts = False', with no success.
import sys
from tqdm import tqdm
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
def pdfparser(pdf_file):
    """Extract the text of a PDF file using pdfminer's ``TextConverter``.

    Every page yielded by ``PDFPage.get_pages`` is processed (no page limit
    is applied) and the converter output is accumulated in an in-memory
    string buffer. A ``tqdm`` progress bar is shown while pages are processed.

    Args:
        pdf_file: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        The extracted text as a single ``str``, with any literal two-character
        ``\\n`` sequences replaced by real newline characters.

    Raises:
        OSError: If ``pdf_file`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()

    # Layout-analysis settings; see the pdfminer LAParams documentation for
    # the exact effect of each flag.
    laparams = LAParams()
    laparams.all_texts = False
    laparams.detect_vertical = False

    # The context manager guarantees the PDF file handle is released even if
    # parsing a page raises.
    with open(pdf_file, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page in tqdm(PDFPage.get_pages(fp)):
                interpreter.process_page(page)
            # Read the buffer before the device is closed.
            text = retstr.getvalue()
        finally:
            device.close()

    # Turn escaped "\n" sequences (backslash followed by 'n') into newlines.
    text = text.replace("\\n", "\n")
    return text
if __name__ == '__main__':
    # Script entry point: extract the text of the PDF named by the first
    # command-line argument and write it to standard output.
    pdf_path = sys.argv[1]
    print(pdfparser(pdf_path))