According to the architecture of the DOCX document:
- Body text: doc > paragraph > run
- Body table: doc > table > row > col > cell > paragraph > run
- Header text: doc > sections > header > paragraph > run
- Header table: doc > sections > header > table > row > col > cell > paragraph > run
The footer has the same structure as the header. We could traverse each paragraph directly and replace the keywords in its text, but that resets the text formatting, so we have to replace the text inside the individual runs instead. However, a keyword may span more than one run, in which case a plain per-run search-and-replace cannot find it.
Therefore, I propose the following idea: first, taking the paragraph as the unit, find the position of every keyword occurrence in the paragraph text; then build a list that records, for every character of the paragraph, which run it belongs to and its index inside that run; finally, use this correspondence to delete and replace the keyword character by character, so that each run keeps its own formatting.
'''
-*- coding: utf-8 -*-
@Time : 2021/4/19 13:13
@Author : ZCG
@Site :
@File : Batch DOCX document keyword replacement.py
@Software: PyCharm
'''
from docx import Document
import os
import tqdm
def get_docx_list(dir_path):
    '''
    Collect the paths of all docx files below dir_path (subdirectories included).

    A file is kept when its name ends with ".docx" (compared case-insensitively)
    and does not start with "~", which excludes temporary files.

    :param dir_path: Root directory that is walked recursively with os.walk
    :return: List of paths (directory joined with file name) of the docx files found
    '''
    file_list = []
    for path, _dirs, files in os.walk(dir_path):
        for file in files:
            # Require the dot so that names merely ending in "docx" are not picked up.
            if file.lower().endswith(".docx") and not file.startswith("~"):
                # os.path.join uses the separator of the running platform
                # instead of a hard-coded backslash.
                file_list.append(os.path.join(path, file))
    print("The directory found a total of {0} related files!".format(len(file_list)))
    return file_list
class ParagraphsKeyWordsReplace:
    '''
    Keyword replacement inside a single paragraph, done run by run.

    The methods are called unbound, with the paragraph object passed as self
    (for example ParagraphsKeyWordsReplace.paragraph_keywords_replace(paragraph, ...)).
    The paragraph only needs a "runs" sequence whose items expose a readable
    and writable "text" attribute.
    '''

    def paragraph_keywords_replace(self, x, key, value):
        '''
        Replace every non-overlapping occurrence of key in the paragraph with value.

        :param x: paragraph index (unused, kept for interface compatibility)
        :param key: Key words to be replaced
        :param value: Replace the key words
        :return: None, the runs of the paragraph are modified in place
        '''
        if not key:
            # An empty key matches nothing useful; leave the paragraph untouched.
            return
        # Search the concatenation of the run texts: the character map below is
        # built from self.runs, so match offsets must be measured in that same string.
        text = "".join(run.text for run in self.runs)
        # Collect the start of every non-overlapping match, left to right.
        # The offsets are computed once, so a value that itself contains the key
        # cannot cause an endless loop.
        keywords_list = []
        start = text.find(key)
        while start != -1:
            keywords_list.append(start)
            start = text.find(key, start + len(key))
        while keywords_list:
            # Rebuild the (run, char) coordinates of every character; the run
            # texts change after each replacement, so the map must be fresh.
            index_list = [
                {"run": y, "char": z}
                for y, run in enumerate(self.runs)
                for z in range(len(run.text))
            ]
            # Work from the last match to the first so that earlier offsets
            # stay valid while later text changes length.
            start_i = keywords_list.pop()
            end_i = start_i + len(key)
            keywords_index_list = index_list[start_i:end_i]
            ParagraphsKeyWordsReplace.character_replace(self, keywords_index_list, value)

    def character_replace(self, keywords_index_list, value):
        '''
        Delete the characters listed in keywords_index_list and put value in
        place of the first one.

        :param keywords_index_list: A list of {"run": ..., "char": ...} dictionaries,
            in paragraph order, covering one keyword; the list is emptied by this call
        :param value: The new word after the replacement
        :return: None, the affected runs are modified in place

        The positions are consumed from last to first, so removing a character
        never shifts the index of a position that is still waiting in the list.
        '''
        while keywords_index_list:
            position = keywords_index_list.pop()
            run = self.runs[position["run"]]
            z = position["char"]
            # Every position except the first is deleted; the first receives value.
            replacement = value if not keywords_index_list else ""
            # Slice by index so only this one character is touched, even when
            # the same character occurs elsewhere in the run.
            run.text = run.text[:z] + replacement + run.text[z + 1:]
class DocxKeyWordsReplace:
    '''
    Keyword replacement over the parts of a docx document.

    The methods are called unbound, with the document object passed as self
    (for example DocxKeyWordsReplace.content(docx, replace_dict=...)).
    Each method takes replace_dict, whose keys are the words to be replaced and
    whose values are the new words, and applies the entries one after another
    in the iteration order of replace_dict.items().
    '''

    @staticmethod
    def _replace_in_paragraphs(paragraphs, key, value):
        '''Replace key with value in every paragraph of the given sequence.'''
        for x, paragraph in enumerate(paragraphs):
            ParagraphsKeyWordsReplace.paragraph_keywords_replace(paragraph, x, key, value)

    @staticmethod
    def _replace_in_tables(tables, key, value):
        '''Replace key with value in the paragraphs of every cell of the given tables.'''
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    DocxKeyWordsReplace._replace_in_paragraphs(cell.paragraphs, key, value)

    def content(self, replace_dict):
        '''
        Replace the keywords in the body paragraphs of the document.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the body content is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            DocxKeyWordsReplace._replace_in_paragraphs(self.paragraphs, key, value)

    def tables(self, replace_dict):
        '''
        Replace the keywords in the body tables of the document.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the body tables is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            DocxKeyWordsReplace._replace_in_tables(self.tables, key, value)

    def header_content(self, replace_dict):
        '''
        Replace the keywords in the header paragraphs of every section.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the header body content is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            for section in self.sections:
                DocxKeyWordsReplace._replace_in_paragraphs(section.header.paragraphs, key, value)

    def header_tables(self, replace_dict):
        '''
        Replace the keywords in the header tables of every section.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the header body tables is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            for section in self.sections:
                # Each item of header.tables is already one table; iterate its
                # rows and cells directly rather than indexing into it.
                DocxKeyWordsReplace._replace_in_tables(section.header.tables, key, value)

    def footer_content(self, replace_dict):
        '''
        Replace the keywords in the footer paragraphs of every section.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the footer body content is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            for section in self.sections:
                DocxKeyWordsReplace._replace_in_paragraphs(section.footer.paragraphs, key, value)

    def footer_tables(self, replace_dict):
        '''
        Replace the keywords in the footer tables of every section.

        :param replace_dict: {key words to be replaced: new words}
        '''
        print("Please wait for a moment, the footer body tables is processed...")
        for key, value in tqdm.tqdm(replace_dict.items()):
            for section in self.sections:
                # Each item of footer.tables is already one table; iterate its
                # rows and cells directly rather than indexing into it.
                DocxKeyWordsReplace._replace_in_tables(section.footer.tables, key, value)
def main():
    '''
    Replace the configured keywords in every docx file below file_dir and
    save each file back to its original path.

    How to use it: Modify the values in replace_dict and file_dir
    Replace_dict: The following dictionary corresponds to the format, with key as the content to be replaced and value as the new content
    File_dir: The directory where the docx file resides. Supports subdirectories
    '''
    # Input part
    replace_dict = {
        "MG life technology (shenzhen) co., LTD": "Shenzhen YW medical technology co., LTD",
        "MG-": "YW-",
        "2017-": "2020-",
        "Z18": "Z20",
    }
    # A raw string literal cannot end with a single backslash, so the
    # directory is written without a trailing separator.
    file_dir = r"D:\Working Files\SVN"
    # Call processing part
    for i, file in enumerate(get_docx_list(file_dir), start=1):
        print(f"{i}、Files in progress:{file}")
        document = Document(file)
        DocxKeyWordsReplace.content(document, replace_dict=replace_dict)
        DocxKeyWordsReplace.tables(document, replace_dict=replace_dict)
        DocxKeyWordsReplace.header_content(document, replace_dict=replace_dict)
        DocxKeyWordsReplace.header_tables(document, replace_dict=replace_dict)
        DocxKeyWordsReplace.footer_content(document, replace_dict=replace_dict)
        DocxKeyWordsReplace.footer_tables(document, replace_dict=replace_dict)
        # The processed document overwrites the file it was read from.
        document.save(file)
        print("This document has been processed!\n")
# Run the batch replacement only when this file is executed as a script.
if __name__ == "__main__":
    main()
    print("All complete processing!")