Python使用python-docx實現自動化處理Word文檔
一、引言
隨著辦公自動化需求的增長,Python通過python-docx庫實現了對Word文檔的深度操作。本文將展示如何通過代碼實現段落樣式復制、HTML表格轉Word表格以及動態生成可定制化模板的功能。
二、核心功能模塊解析
1. 段落樣式與圖片復制
def copy_inline_shapes(new_doc, img):
    """Append one paragraph to ``new_doc`` containing every picture in ``img``.

    ``img`` is an iterable of ``(image_bytes, width, height)`` triples. Each
    picture is placed in its own run of the same new paragraph, and the
    width and height are forwarded unchanged to ``add_picture``.
    """
    paragraph = new_doc.add_paragraph()
    for blob, width, height in img:
        run = paragraph.add_run()
        # add_picture takes a file-like object, so wrap the raw bytes.
        stream = io.BytesIO(blob)
        run.add_picture(stream, width=width, height=height)
功能說明:從舊文檔中提取圖片并復制至新文檔,支持自定義寬度和高度。
使用場景:適用于需要保留原始格式的圖文混排文檔。
2. HTML表格轉Word表格
def docx_table_to_html(word_table):
# Excerpt only: the table conversion logic, including merged-cell handling,
# is elided here.
# NOTE(review): the surrounding section describes HTML -> Word conversion, but
# this signature takes a Word table; confirm which direction is meant.
功能說明:將解析后的HTML表格結構轉換為Word文檔中的表格,支持橫向/縱向合并。
關鍵點:
- 使用BeautifulSoup解析HTML
- 處理單元格樣式、邊框和背景顏色
- 支持多級標題的樣式繼承
3. 模板生成與樣式動態化
def generate_template():
# Excerpt only: iterates every alignment / bold combination on a new Document.
doc = Document()
for align in [WD_ALIGN_PARAGRAPH.LEFT, WD_ALIGN_PARAGRAPH.RIGHT, WD_ALIGN_PARAGRAPH.CENTER, None]:
for blod_flag in [True, False]:
# Create the paragraphs for this style variant here (loop body elided).
功能說明:動態生成包含多種樣式(左、右、居中、無)的模板文檔。
優(yōu)勢:支持快速擴展新樣式,適應不同場景需求。
三、完整示例代碼
示例1:復制段落樣式與圖片
def clone_document(old_s, old_p, old_ws, new_doc_path):
    """Rebuild a document from extracted content and save it.

    Args:
        old_s: Style entries of the source document. Entries that carry an
            ``"image"`` key hold the pictures of one paragraph.
        old_p: Ordered content entries. Each is a one-item dict:
            ``{"Image_None": ...}`` for pictures, ``{"table": html}`` for a
            table, or ``{text: style_key}`` for a run of text. A bare
            ``"br"`` string marks a page break.
        old_ws: Style entries of the template document, looked up by the
            same style key.
        new_doc_path: Where the rebuilt document is written.

    Raises:
        IndexError: If a style key has no entry in ``old_s`` or ``old_ws``.
    """
    new_doc = Document()
    # One picture list per picture-bearing paragraph, consumed in order so
    # that every "Image_None" entry gets its own pictures, not the first set.
    image_sets = iter([entry["image"] for entry in old_s if "image" in entry])
    for para in old_p:
        if not isinstance(para, dict):
            if para == "br":
                new_doc.add_page_break()
            continue
        for k, v in para.items():
            if k == "Image_None":
                images = next(image_sets, None)
                if images:
                    copy_inline_shapes(new_doc, images)
            elif k == "table":
                # The value, not the whole entry, is the HTML string.
                html_table_to_docx(new_doc, v)
            else:
                # Check for the key before indexing so entries without a
                # "style" mapping are skipped instead of raising KeyError.
                style = [i for i in old_s if "style" in i and v in i["style"]]
                style_ws = [i for i in old_ws if "style" in i and v in i["style"]]
                clone_paragraph(style[0], k, new_doc, style_ws[0])
    new_doc.save(new_doc_path)
示例2:HTML表格轉Word
def html_table_to_docx(doc, html_content):
# Excerpt only: parses the HTML and visits every <table> element.
soup = BeautifulSoup(html_content, 'html.parser')
tables = soup.find_all('table')
for table in tables:
# Merged-cell handling and style conversion are elided here.
四、關鍵實現細節
1. 樣式復制策略
繼承機制:通過run_style和style字段傳遞字體、對齊等屬性。
分頁符處理:使用is_page_break判斷段落或表格后是否需要換頁。
2. 表格轉換優化
合并單元格檢測:通過tcPr元素識別橫向/縱向合并。
樣式遷移:保留邊框、背景色等視覺屬性。
3. 模板動態生成
多樣式支持:通過遍歷所有段落樣式,生成可擴展的模板。
靈活配置:允許用戶自定義分頁符位置和樣式參數。
五、應用場景
| 場景 | 解決方案 |
|---|---|
| 段落排版 | 自動復制樣式并保留格式 |
| 數據表導出 | HTML轉Word表格,支持合并單元格 |
| 報告模板生成 | 動態創建包含多種樣式的模板文件 |
六、總結
通過python-docx庫,我們實現了從樣式復制到表格轉換的完整流程。動態生成的模板功能進一步提升了文檔處理的靈活性。無論是處理復雜的圖文排版,還是需要快速生成多風格文檔的需求,這套解決方案都能提供高效的實現路徑。
建議:在實際應用中,可結合python-docx的Document對象特性,通過遍歷所有元素實現更精細的控制。同時,對異常情況的捕獲(如圖片格式錯誤)也是提升健壯性的重要部分。
七、知識擴展
使用模版樣式生成文檔
from docx import Document
from docx.oxml import OxmlElement
from docx.oxml.shared import qn
from wan_neng_copy_word import clone_document as get_para_style,html_table_to_docx
import io
# 剩余部分保持不變...
def copy_inline_shapes(new_doc, img):
    """Append one paragraph to ``new_doc`` containing every picture in ``img``.

    ``img`` is an iterable of ``(image_bytes, width, height)`` triples. Each
    picture is placed in its own run of the same new paragraph, and the
    width and height are forwarded unchanged to ``add_picture``.
    """
    paragraph = new_doc.add_paragraph()
    for blob, width, height in img:
        run = paragraph.add_run()
        # add_picture takes a file-like object, so wrap the raw bytes.
        stream = io.BytesIO(blob)
        run.add_picture(stream, width=width, height=height)
def copy_paragraph_style(run_from, run_to):
    """Copy character formatting from ``run_from`` onto ``run_to``.

    Despite the name, this copies run-level attributes: bold, italic and
    underline on the run itself, then size, RGB colour, name, all-caps,
    strike and shadow on its font.
    """
    for attr in ("bold", "italic", "underline"):
        setattr(run_to, attr, getattr(run_from, attr))

    src_font = run_from.font
    dst_font = run_to.font
    dst_font.size = src_font.size
    dst_font.color.rgb = src_font.color.rgb
    for attr in ("name", "all_caps", "strike", "shadow"):
        setattr(dst_font, attr, getattr(src_font, attr))
def _paragraph_has_page_break(paragraph):
    """Return True if any ``w:br`` inside ``paragraph`` has ``w:type="page"``."""
    # w:br sits inside a run (w:r), not directly under w:p, so search all
    # descendants. The attribute is namespaced; qn() needs the "w:" prefix.
    return any(
        br.get(qn('w:type')) == 'page'
        for br in paragraph.iter(qn('w:br'))
    )


def is_page_break(element):
    """Tell whether a body element carries, or is followed by, a page break.

    Args:
        element: An lxml element from the document body.

    Returns:
        True for a ``w:p`` that contains a page-break ``w:br``, or for a
        ``w:tbl`` whose next sibling is such a paragraph. False otherwise.
    """
    if element.tag == qn('w:p'):
        return _paragraph_has_page_break(element)
    if element.tag == qn('w:tbl'):
        # A table cannot hold the break itself; look at what follows it.
        following = element.getnext()
        return (
            following is not None
            and following.tag == qn('w:p')
            and _paragraph_has_page_break(following)
        )
    return False
def clone_paragraph(para_style, text, new_doc, para_style_ws):
    """Add a paragraph with ``text`` to ``new_doc`` using recorded styling.

    Args:
        para_style: Style entry of the source paragraph; its ``"style"`` and
            ``"alignment"`` mappings and ``"run_style"`` list are read.
        text: Text of the single run that is created.
        new_doc: Document that receives the paragraph.
        para_style_ws: Style entry of the template; the first value of its
            ``"style"`` mapping becomes the paragraph style.

    Returns:
        The newly created paragraph.
    """
    def first_value(mapping):
        return list(mapping.values())[0]

    paragraph = new_doc.add_paragraph()
    template_style = first_value(para_style_ws["style"])
    source_style = first_value(para_style["style"])
    # The template style object itself is updated with the source font size.
    template_style.font.size = source_style.font.size
    paragraph.style = template_style

    run = paragraph.add_run(text)
    # Only the first recorded run supplies the character formatting.
    copy_paragraph_style(para_style["run_style"][0], run)
    paragraph.alignment = first_value(para_style["alignment"])
    return paragraph
def copy_cell_borders(old_cell, new_cell):
    """Copy the border definitions of ``old_cell`` onto ``new_cell``.

    Only the cell's own ``w:tcPr/w:tcBorders`` is read, so borders of a table
    nested inside the cell are never picked up by mistake. The top, left,
    bottom, right, insideH and insideV edges are copied with all their
    attributes. Any ``w:tcBorders`` already on ``new_cell`` is replaced, so
    calling this twice does not leave duplicate elements. When ``old_cell``
    defines no borders, ``new_cell`` is left untouched.
    """
    found = old_cell._tc.xpath('./w:tcPr/w:tcBorders')
    if not found:
        return
    source_borders = found[0]

    new_borders = OxmlElement('w:tcBorders')
    for side in ('top', 'left', 'bottom', 'right', 'insideH', 'insideV'):
        source_edge = source_borders.find(qn(f'w:{side}'))
        if source_edge is None:
            continue
        new_edge = OxmlElement(f'w:{side}')
        for attr, value in source_edge.attrib.items():
            new_edge.set(attr, value)
        new_borders.append(new_edge)

    tc_pr = new_cell._tc.get_or_add_tcPr()
    for stale in tc_pr.findall(qn('w:tcBorders')):
        tc_pr.remove(stale)
    tc_pr.append(new_borders)
def clone_table(old_table, new_doc):
    """Append a copy of ``old_table`` to ``new_doc`` and return the new table.

    The copy has the same number of rows and columns. For every cell the
    placeholder paragraphs are removed, then each source paragraph is
    recreated run by run with its character formatting and alignment, and
    the cell borders are copied. Table style and explicit column widths are
    carried over when the source defines them.
    """
    row_count = len(old_table.rows)
    col_count = len(old_table.columns)
    new_table = new_doc.add_table(rows=row_count, cols=col_count)

    if old_table.style:
        new_table.style = old_table.style

    for row_idx, source_row in enumerate(old_table.rows):
        for col_idx, source_cell in enumerate(source_row.cells):
            target_cell = new_table.cell(row_idx, col_idx)

            # Drop the paragraph(s) a fresh cell starts with.
            for placeholder in target_cell.paragraphs:
                target_cell._element.remove(placeholder._element)

            for source_para in source_cell.paragraphs:
                target_para = target_cell.add_paragraph()
                for source_run in source_para.runs:
                    target_run = target_para.add_run(source_run.text)
                    copy_paragraph_style(source_run, target_run)
                target_para.alignment = source_para.alignment

            copy_cell_borders(source_cell, target_cell)

    for col_idx, source_col in enumerate(old_table.columns):
        if source_col.width is None:
            continue
        new_table.columns[col_idx].width = source_col.width

    return new_table
def clone_document(old_s, old_p, old_ws, new_doc_path):
    """Rebuild a document from extracted content and save it.

    Args:
        old_s: Style entries of the source document. Entries that carry an
            ``"image"`` key hold the pictures of one paragraph.
        old_p: Ordered content entries. Each is a one-item dict:
            ``{"Image_None": ...}`` for pictures, ``{"table": html}`` for a
            table, or ``{text: style_key}`` for a run of text. A bare
            ``"br"`` string marks a page break.
        old_ws: Style entries of the template document, looked up by the
            same style key.
        new_doc_path: Where the rebuilt document is written.

    Raises:
        IndexError: If a style key has no entry in ``old_s`` or ``old_ws``.
    """
    new_doc = Document()
    # One picture list per picture-bearing paragraph, consumed in order so
    # that every "Image_None" entry gets its own pictures, not the first set.
    image_sets = iter([entry["image"] for entry in old_s if "image" in entry])

    for para in old_p:
        if not isinstance(para, dict):
            # Non-dict entries are markers; "br" stands for a page break.
            if para == "br":
                new_doc.add_page_break()
            continue
        for k, v in para.items():
            if k == "Image_None":
                images = next(image_sets, None)
                if images:
                    copy_inline_shapes(new_doc, images)
            elif k == "table":
                html_table_to_docx(new_doc, v)
            else:
                # Check for the key before indexing so entries without a
                # "style" mapping are skipped instead of raising KeyError.
                style = [i for i in old_s if "style" in i and v in i["style"]]
                style_ws = [i for i in old_ws if "style" in i and v in i["style"]]
                clone_paragraph(style[0], k, new_doc, style_ws[0])

    new_doc.save(new_doc_path)
# Usage example: read styles from the template, content from the source
# report, then write the rebuilt document.
if __name__ == "__main__":
    template_styles = get_para_style('demo_template.docx')
    body_ws = template_styles[0]
    body_s, body_p = get_para_style("南山三防工作專報1.docx")
    clone_document(body_s, body_p, body_ws, 'cloned_example.docx')
模版樣式文本分離
from docx.enum.text import WD_BREAK
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from bs4 import BeautifulSoup
from docx.oxml.ns import qn
# Word border names mapped to the closest CSS border-style keyword.
_BORDER_CSS_STYLES = {
    'single': 'solid',
    'thick': 'solid',
    'double': 'double',
    'dotted': 'dotted',
    'dashed': 'dashed',
    'nil': 'none',
    'none': 'none',
}


def _cell_css(tc_pr):
    """Translate a cell's ``w:tcPr`` element into inline CSS declarations."""
    if tc_pr is None:
        return ''
    css = []

    shd = tc_pr.find(qn('w:shd'))
    if shd is not None:
        fill = shd.get(qn('w:fill'))
        # "auto" is not a colour value, so it cannot be emitted as #auto.
        if fill and fill != 'auto':
            css.append(f'background-color:#{fill};')

    jc = tc_pr.find(qn('w:jc'))
    if jc is not None:
        align = jc.get(qn('w:val'))
        css.append(f"text-align:{align if align in ('center', 'right') else 'left'};")

    borders = tc_pr.find(qn('w:tcBorders'))
    if borders is not None:
        for side in ('top', 'left', 'bottom', 'right'):
            border = borders.find(qn(f'w:{side}'))
            if border is None:
                continue
            color = border.get(qn('w:color'), '000000')
            if color == 'auto':
                color = '000000'
            # w:sz is measured in eighths of a point.
            width_pt = int(border.get(qn('w:sz'), '4')) / 8
            line = _BORDER_CSS_STYLES.get(border.get(qn('w:val'), 'single'), 'solid')
            css.append(f'border-{side}:{width_pt:g}pt {line} #{color};')

    return ''.join(css)


def docx_table_to_html(word_table):
    """Render a python-docx table as an HTML ``<table>`` string.

    Merged cells become ``colspan`` / ``rowspan`` attributes. python-docx
    returns the origin cell for every grid position of a merged region, so a
    span is measured by counting neighbouring positions that share the same
    underlying ``w:tc`` element. Reading ``w:vMerge`` on those cells would
    always see the origin cell's value and never detect a continuation.

    Args:
        word_table: The python-docx table to convert.

    Returns:
        The HTML markup of the table. Every ``<td>`` carries an inline
        ``style`` with background colour, alignment and borders when the cell
        defines them, followed by ``padding: 5px;``.
    """
    soup = BeautifulSoup(features='html.parser')
    html_table = soup.new_tag('table', style="border-collapse: collapse;")

    # Read each row's cells once; the property rebuilds its result per access.
    grid = [row.cells for row in word_table.rows]
    # Grid positions already emitted as part of an earlier cell's span.
    covered = [[False] * len(cells) for cells in grid]

    for row_idx, cells in enumerate(grid):
        html_tr = soup.new_tag('tr')
        col_idx = 0
        while col_idx < len(cells):
            if covered[row_idx][col_idx]:
                col_idx += 1
                continue

            cell = cells[col_idx]
            tc = cell._tc

            colspan = 1
            while col_idx + colspan < len(cells) and cells[col_idx + colspan]._tc is tc:
                colspan += 1

            rowspan = 1
            while (
                row_idx + rowspan < len(grid)
                and col_idx < len(grid[row_idx + rowspan])
                and grid[row_idx + rowspan][col_idx]._tc is tc
            ):
                rowspan += 1

            for r in range(row_idx, row_idx + rowspan):
                for c in range(col_idx, min(col_idx + colspan, len(grid[r]))):
                    covered[r][c] = True

            td = soup.new_tag('td')
            td.string = cell.text.strip()
            if colspan > 1:
                td['colspan'] = str(colspan)
            if rowspan > 1:
                td['rowspan'] = str(rowspan)
            td['style'] = _cell_css(tc.tcPr) + "padding: 5px;"
            html_tr.append(td)

            col_idx += colspan
        html_table.append(html_tr)

    soup.append(html_table)
    return str(soup)
def set_cell_background(cell, color_hex):
    """Set the background (shading fill) colour of a table cell.

    Args:
        cell: The python-docx cell to shade.
        color_hex: Colour as ``RRGGBB`` or ``RGB`` hex digits, with or
            without a leading ``#``.

    Raises:
        ValueError: If ``color_hex`` is not a 3- or 6-digit hex colour, so
            values such as ``"red"`` are never written into the document.
    """
    value = color_hex.strip().lstrip('#')
    if len(value) == 3:
        # Expand CSS shorthand (#abc -> aabbcc).
        value = ''.join(ch * 2 for ch in value)
    if len(value) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in value):
        raise ValueError(f"not a hex colour: {color_hex!r}")

    tc_pr = cell._tc.get_or_add_tcPr()
    # Replace any earlier shading instead of stacking a second w:shd.
    for stale in tc_pr.findall(qn('w:shd')):
        tc_pr.remove(stale)

    shading_elm = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd; "clear" means no pattern.
    shading_elm.set(qn('w:val'), 'clear')
    shading_elm.set(qn('w:color'), 'auto')
    shading_elm.set(qn('w:fill'), value.upper())
    tc_pr.append(shading_elm)
def _parse_inline_css(style):
    """Split an inline ``style`` attribute into a ``{property: value}`` dict."""
    declarations = {}
    for chunk in style.split(';'):
        name, sep, value = chunk.partition(':')
        if sep and name.strip():
            declarations[name.strip().lower()] = value.strip()
    return declarations


def html_table_to_docx(doc, html_content):
    """Append every ``<table>`` found in ``html_content`` to ``doc``.

    Each table is created with the ``Table Grid`` style and followed by an
    empty separator paragraph. ``colspan`` / ``rowspan`` become merged cells.
    Alignment comes from the ``align`` attribute or, failing that, from a
    ``text-align`` declaration in ``style``. The background colour comes from
    ``background-color`` (or ``background``) in ``style``. Tables without
    rows or cells are skipped.

    Args:
        doc: python-docx ``Document`` that receives the tables.
        html_content: HTML markup to parse.

    Returns:
        ``doc``, to allow chaining.
    """
    align_map = {
        'left': WD_ALIGN_PARAGRAPH.LEFT,
        'center': WD_ALIGN_PARAGRAPH.CENTER,
        'right': WD_ALIGN_PARAGRAPH.RIGHT,
    }
    soup = BeautifulSoup(html_content, 'html.parser')

    for html_table in soup.find_all('table'):
        trs = html_table.find_all('tr')
        rows = len(trs)
        # Widest row, counting colspan, gives the column count.
        cols = max(
            (
                sum(int(cell.get('colspan', 1)) for cell in tr.find_all(['td', 'th']))
                for tr in trs
            ),
            default=0,
        )
        if rows == 0 or cols == 0:
            continue

        table = doc.add_table(rows=rows, cols=cols)
        table.style = 'Table Grid'

        # Grid positions already taken by an earlier cell or its span.
        used_cells = [[False] * cols for _ in range(rows)]

        for row_idx, tr in enumerate(trs):
            col_idx = 0
            for cell in tr.find_all(['td', 'th']):
                while col_idx < cols and used_cells[row_idx][col_idx]:
                    col_idx += 1
                if col_idx >= cols:
                    break  # more cells than grid columns; ignore the rest

                colspan = int(cell.get('colspan', 1))
                rowspan = int(cell.get('rowspan', 1))
                text = cell.get_text(strip=True)

                css = _parse_inline_css(cell.get('style', ''))
                align = (cell.get('align') or css.get('text-align') or '').lower()
                bg_color = css.get('background-color') or css.get('background')

                word_cell = table.cell(row_idx, col_idx)
                if colspan > 1 or rowspan > 1:
                    # Clamp the span so it never leaves the table grid.
                    end_row = min(row_idx + rowspan - 1, rows - 1)
                    end_col = min(col_idx + colspan - 1, cols - 1)
                    word_cell = word_cell.merge(table.cell(end_row, end_col))

                para = word_cell.paragraphs[0]
                para.text = text
                if align in align_map:
                    para.alignment = align_map[align]

                if bg_color:
                    try:
                        set_cell_background(word_cell, bg_color)
                    except ValueError:
                        pass  # unsupported colour notation: leave unshaded

                for r in range(row_idx, min(row_idx + rowspan, rows)):
                    for c in range(col_idx, min(col_idx + colspan, cols)):
                        used_cells[r][c] = True
                col_idx += colspan

        # Empty paragraph so consecutive tables do not fuse together.
        doc.add_paragraph()
    return doc
def copy_inline_shapes(old_paragraph):
    """Collect the embedded pictures of a paragraph.

    Args:
        old_paragraph: The python-docx paragraph to inspect.

    Returns:
        A list of ``[image_bytes, width, height]`` entries, one per
        ``w:drawing`` that embeds a picture. Width and height are the size
        the picture is displayed at (the drawing's ``wp:extent``), falling
        back to the image's native size when no extent is present. Drawings
        that only link to an external picture, or whose relationship is not
        an image part, are skipped.
    """
    from docx.shared import Emu

    blip_tag = '{http://schemas.openxmlformats.org/drawingml/2006/main}blip'
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'
    extent_tag = '{http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing}extent'

    images = []
    for drawing in old_paragraph._element.xpath('.//w:drawing'):
        blip = drawing.find(f'.//{blip_tag}')
        if blip is None:
            continue
        r_id = blip.get(embed_attr)
        if r_id is None:
            continue  # linked picture: no bytes stored in the package
        image_part = old_paragraph.part.related_parts.get(r_id)
        image = getattr(image_part, 'image', None)
        if image is None:
            continue

        width, height = image.width, image.height
        extent = drawing.find(f'.//{extent_tag}')
        if extent is not None:
            try:
                # cx / cy are English Metric Units.
                width, height = Emu(int(extent.get('cx'))), Emu(int(extent.get('cy')))
            except (TypeError, ValueError):
                pass  # malformed extent: keep the native size
        images.append([image.blob, width, height])
    return images
def _paragraph_has_page_break(paragraph):
    """Return True if any ``w:br`` inside ``paragraph`` has ``w:type="page"``."""
    # w:br sits inside a run (w:r), not directly under w:p, so search all
    # descendants. The attribute is namespaced; qn() needs the "w:" prefix.
    return any(
        br.get(qn('w:type')) == 'page'
        for br in paragraph.iter(qn('w:br'))
    )


def is_page_break(element):
    """Tell whether a body element carries, or is followed by, a page break.

    Args:
        element: An lxml element from the document body.

    Returns:
        True for a ``w:p`` that contains a page-break ``w:br``, or for a
        ``w:tbl`` whose next sibling is such a paragraph. False otherwise.
    """
    if element.tag == qn('w:p'):
        return _paragraph_has_page_break(element)
    if element.tag == qn('w:tbl'):
        # A table cannot hold the break itself; look at what follows it.
        following = element.getnext()
        return (
            following is not None
            and following.tag == qn('w:p')
            and _paragraph_has_page_break(following)
        )
    return False


def clone_paragraph(old_para):
    """Extract the style record and content entries of one paragraph.

    Args:
        old_para: The python-docx paragraph to describe.

    Returns:
        A ``(style, paras)`` pair. ``style`` is a dict with ``"run_style"``
        (the paragraph's runs), ``"style"`` and ``"alignment"`` (one-item
        mappings keyed by ``"<style name>_<alignment>"``) and, only when the
        paragraph embeds pictures, ``"image"``. ``paras`` holds one
        ``{run_text: style_key}`` dict per run, plus a trailing
        ``{"Image_None": "Image_None"}`` when pictures were found.
    """
    # The key joins the style name and the first word of the alignment's
    # string form.
    style_key = old_para.style.name + "_" + str(old_para.alignment).split()[0]

    style = {
        "run_style": [],
        "style": {style_key: old_para.style},
    }
    paras = []
    for old_run in old_para.runs:
        style["run_style"].append(old_run)
        paras.append({old_run.text: style_key})
    style["alignment"] = {style_key: old_para.alignment}

    images = copy_inline_shapes(old_para)
    if images:
        style["image"] = images
        paras.append({"Image_None": "Image_None"})
    return style, paras


def clone_document(old_doc_path):
    """Extract styles and ordered content from a Word document.

    Body paragraphs and tables are visited in document order. Tables are
    converted to HTML. A paragraph that contains a page break is still
    extracted normally and is then followed by a ``"br"`` marker, so no
    content is dropped and paragraphs never fall out of step.

    Args:
        old_doc_path: Path of the document to read.

    Returns:
        ``(body_style, body_paras)``: the per-paragraph style records and the
        ordered content entries (dicts, plus ``"br"`` strings for page
        breaks). If anything fails, the error is printed and ``None`` is
        returned instead.
    """
    try:
        old_doc = Document(old_doc_path)
        # Fetch the wrapper lists once; each property access rebuilds them.
        paragraphs = iter(old_doc.paragraphs)
        tables = iter(old_doc.tables)

        body_style = []
        body_paras = []
        for element in old_doc.element.body:
            if element.tag == qn('w:p'):
                style, paras = clone_paragraph(next(paragraphs))
                body_style.append(style)
                body_paras += paras
                if is_page_break(element):
                    body_paras.append("br")
            elif element.tag == qn('w:tbl'):
                body_paras.append({"table": docx_table_to_html(next(tables))})
            # Other body children (e.g. section properties) carry no content.
    except Exception as e:
        print(f"復制文檔時發(fā)生錯誤:{e}")
    else:
        return body_style, body_paras
# Usage example: extract the style records and content entries of a report.
if __name__ == "__main__":
    extracted = clone_document('專報1.docx')
    body_s, body_p = extracted
生成可更改模版
from docx import Document
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH

# Build demo_template.docx: for every alignment / bold combination, add one
# sample paragraph per paragraph style available in a new document.
doc = Document()

# The style list does not change while paragraphs are added, so collect and
# report it once instead of once per combination.
paragraph_styles = [
    style for style in doc.styles if style.type == WD_STYLE_TYPE.PARAGRAPH
]
print(f"共找到 {len(paragraph_styles)} 種段落樣式:")
for style in paragraph_styles:
    print(f"- {style.name}")

for align in [WD_ALIGN_PARAGRAPH.LEFT, WD_ALIGN_PARAGRAPH.RIGHT, WD_ALIGN_PARAGRAPH.CENTER, None]:
    for bold_flag in [True, False]:
        for style in paragraph_styles:
            # Label line naming the style; only this run takes the bold flag.
            heading = doc.add_paragraph()
            run = heading.add_run(f"樣式名稱: {style.name}")
            run.bold = bold_flag
            # Sample paragraph in the style itself, with the alignment applied.
            para = doc.add_paragraph(f"這是一個應用了 '{style.name}' 樣式的段落示例。", style=style)
            para.alignment = align
            # Visual separator between samples.
            doc.add_paragraph("-" * 40)

doc.save("demo_template.docx")
print("\n? 已生成包含所有段落樣式的模板文件:demo_template.docx")
以上就是Python使用python-docx實現自動化處理Word文檔的詳細內容,更多關于Python自動化處理Word的資料請關注腳本之家其它相關文章!
相關文章
python使用matplotlib畫出的圖怎樣放到word中
這篇文章主要介紹了python使用matplotlib畫出的圖怎樣放到word中問題,具有很好的參考價值,希望對大家有所幫助,如有錯誤或未考慮完全的地方,望不吝賜教2023-09-09
python中Matplotlib實現繪制3D圖的示例代碼
本篇文章主要介紹了python中Matplotlib實現(xiàn)繪制3D圖的示例代碼,具有一定的參考價值,有興趣的可以了解一下2017-09-09
安裝pytorch報錯torch.cuda.is_available()=false問題的解決過程
最近想用pytorch,因此裝了pytorch,但是碰到了問題,下面這篇文章主要給大家介紹了關于安裝pytorch報錯torch.cuda.is_available()=false問題的解決過程,需要的朋友可以參考下2022-05-05
Python如何存儲和讀取ASCII碼形式的byte數據
這篇文章主要介紹了Python如何存儲和讀取ASCII碼形式的byte數據,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教 2022-05-05

