I'm trying to extract page and header data from a docx file. The file is several hundred pages, each with a table and a header. The header has pertinent information that needs to be paired with each table. I'm able to extract the header and table data, I just can't reliably pair them together.
Using win32com, this is what I've got so far:
# getting the table page number
WD_ACTIVE_END_PAGE_NUMBER = 3  # WdInformation.wdActiveEndPageNumber
app = Dispatch("Word.Application")
doc = app.Documents.Open(filename)
# Ask the first table's range which page its end falls on.
first_table_range = doc.Tables(1).Range
table_1_page = first_table_range.Information(WD_ACTIVE_END_PAGE_NUMBER)
The problem occurs because the headers are TextFrames and are duplicated on multiple pages, so when I call:
# getting the header page number
first_header = doc.Sections(1).Headers(1)
header_text_range = first_header.Shapes(1).TextFrame.TextRange
header_text_range.Information(3)  # 3 == wdActiveEndPageNumber
I get one of the pages that the TextFrame occurs on. The page it chooses seems somewhat arbitrary: sometimes it's the first, other times it's the last, but it's not predictable.
I've spent a bit of time reading over the object model here. Ultimately it would be nice to capture all of the items displayed per page without reinventing the wheel.
**EDIT 10/25/16:** Per request, here is some minimal working code:
# filename docx_parser.py
import pythoncom
class OpenDoc(object):
    """Read table and header text from a Word document through COM automation.

    Creating an instance starts (or attaches to) a hidden ``Word.Application``
    and opens ``docx_path`` in it. Call :meth:`close` when finished; ``__del__``
    calls it as a fallback.
    """

    # WdInformation.wdActiveEndPageNumber
    _WD_ACTIVE_END_PAGE_NUMBER = 3

    def __init__(self, docx_path):
        """Open ``docx_path`` in a hidden Word instance.

        Args:
            docx_path: Path of the document handed to ``Documents.Open``.
        """
        # Imported lazily so the module can be imported without pywin32.
        import win32com.client as win32

        self.path = docx_path
        self.word = win32.Dispatch("Word.Application")
        self.word.Visible = 0
        # Fix: the original passed the undefined name ``p`` here. Keep the
        # Document returned by Open() instead of relying on ActiveDocument,
        # which may be a different document if Word was already running.
        self.doc = self.word.Documents.Open(self.path)

    def get_table_count(self):
        """Return the number of tables in the document."""
        return self.doc.Tables.Count

    def count_table_rows(self, table):
        """Return ``table.Rows.Count``."""
        return table.Rows.Count

    def count_table_columns(self, table):
        """Return ``table.Columns.Count``."""
        return table.Columns.Count

    def get_headers(self):
        """Yield ``(text, page_num)`` for each shape in the first header.

        Only ``Sections(1).Headers(1)`` is inspected. ``page_num`` is whatever
        Word reports as the active end page number of the shape's text range.
        """
        headers = self.doc.Sections(1).Headers(1)
        shape_count = headers.Shapes.Count
        # COM collections are 1-based.
        for shape_num in range(1, shape_count + 1):
            t_range = headers.Shapes(shape_num).TextFrame.TextRange
            text = t_range.Text
            page_num = t_range.Information(self._WD_ACTIVE_END_PAGE_NUMBER)
            yield text, page_num

    def get_table_text(self, table):
        """Yield one list of cell strings per row of ``table``.

        Cells that raise a COM error when accessed are reported as ``""`` so
        every row still has one entry per column.
        """
        col_count = self.count_table_columns(table)
        row_count = self.count_table_rows(table)
        # Characters stripped from both ends of each cell's text.
        cell_end_chars = chr(7) + chr(13)
        for row in range(1, row_count + 1):
            row_data = []
            for col in range(1, col_count + 1):
                try:
                    cell_text = table.Cell(Row=row, Column=col).Range.Text
                    row_data.append(cell_text.strip(cell_end_chars))
                except pythoncom.com_error:
                    row_data.append("")
            yield row_data

    def get_all_table_text(self):
        """Yield, for each table in the document, a list of its row lists."""
        for table in self.get_tables():
            yield list(self.get_table_text(table))

    def get_tables(self):
        """Yield each table object in the document."""
        for table in self.doc.Tables:
            yield table

    def close(self):
        """Quit the Word instance; safe to call more than once."""
        # ``word`` is missing when __init__ failed before assigning it.
        word = getattr(self, "word", None)
        if word is not None:
            self.word = None
            word.Quit()

    def __del__(self):
        self.close()
if __name__ == "__main__":
    # Demo driver: dump every table, then every header shape, of sample.docx.
    from traceback import format_exc

    try:
        path = r"sample.docx"
        open_doc = OpenDoc(path)
        for table_num, table_text in enumerate(open_doc.get_all_table_text()):
            print("\n-------------- Table %s ----------------" % (table_num + 1))
            for row_data in table_text:
                print(", ".join(row_data))
        for header_text, page_num in open_doc.get_headers():
            print("header page number: %s, text: %s" % (page_num, header_text))
    except Exception:
        # Top-level boundary: show the traceback instead of letting the
        # console window close immediately.
        print(format_exc())

    # Wait for Enter before exiting. ``raw_input`` only exists on Python 2;
    # fall back to ``input`` on Python 3 so the pause does not raise NameError.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("")