Python處理大量Excel文件的十個(gè)技巧分享

更新時(shí)間：2025年05月27日 15:33:19 作者：軟件測(cè)試雜談

每天被大量Excel文件折磨的你看過(guò)來(lái)！這是一份Python程序員整理的實(shí)用技巧,不說(shuō)廢話(huà),直接上干貨,文章通過(guò)代碼示例講解的非常詳細(xì),需要的朋友可以參考下

一、批量讀取多個(gè)Excel文件
二、選擇性讀取工作表和列
三、自動(dòng)調(diào)整格式和樣式
四、智能數(shù)據(jù)清洗
五、自動(dòng)生成數(shù)據(jù)透視表
六、自動(dòng)生成圖表
七、自動(dòng)發(fā)送郵件報(bào)告
八、定時(shí)自動(dòng)運(yùn)行
九、Excel文件比對(duì)
十、性能優(yōu)化技巧

一、批量讀取多個(gè)Excel文件

在實(shí)際工作中，經(jīng)常要處理多個(gè)Excel文件。用Python批量讀取特別方便：

import pandas as pd
import os

def batch_read_excel(folder_path):
    # 存儲(chǔ)所有數(shù)據(jù)框
    all_data = []
    
    # 遍歷文件夾中的所有Excel文件
    for file in os.listdir(folder_path):
        if file.endswith(('.xlsx', '.xls')):
            file_path = os.path.join(folder_path, file)
            # 讀取文件并添加來(lái)源列
            df = pd.read_excel(file_path)
            df['文件來(lái)源'] = file
            all_data.append(df)
    
    # 合并所有數(shù)據(jù)框
    return pd.concat(all_data, ignore_index=True)

# 使用示例
data = batch_read_excel('D:/工作/銷(xiāo)售數(shù)據(jù)/')

二、選擇性讀取工作表和列

有時(shí)候Excel文件很大，但我們只需要某些工作表和列，這樣可以節(jié)省內(nèi)存：

def smart_read_excel(filename):
    # 只讀取需要的工作表和列
    sheets_to_read = ['銷(xiāo)售數(shù)據(jù)', '客戶(hù)信息']
    useful_columns = ['日期', '產(chǎn)品', '銷(xiāo)量', '單價(jià)']
    
    all_data = {}
    for sheet in sheets_to_read:
        df = pd.read_excel(
            filename,
            sheet_name=sheet,
            usecols=useful_columns,
            dtype={
                '銷(xiāo)量': 'int32',  # 指定數(shù)據(jù)類(lèi)型優(yōu)化內(nèi)存
                '單價(jià)': 'float32'
            }
        )
        all_data[sheet] = df
    
    return all_data

三、自動(dòng)調(diào)整格式和樣式

數(shù)據(jù)處理完還要調(diào)整格式？這個(gè)函數(shù)幫你一鍵搞定：

from openpyxl.styles import PatternFill, Font, Alignment
from openpyxl.utils import get_column_letter

def style_excel(filename):
    # 讀取Excel
    wb = load_workbook(filename)
    ws = wb.active
    
    # 設(shè)置列寬
    for col in range(1, ws.max_column + 1):
        ws.column_dimensions[get_column_letter(col)].width = 15
    
    # 設(shè)置表頭樣式
    header_fill = PatternFill(start_color='FF92D050', end_color='FF92D050', fill_type='solid')
    header_font = Font(bold=True, color='FFFFFF', size=12)
    
    for cell in ws[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal='center')
    
    # 設(shè)置數(shù)據(jù)區(qū)域格式
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(horizontal='center')
            # 數(shù)字列右對(duì)齊
            if isinstance(cell.value, (int, float)):
                cell.alignment = Alignment(horizontal='right')
    
    wb.save(filename)

四、智能數(shù)據(jù)清洗

數(shù)據(jù)清洗是最耗時(shí)的工作，這個(gè)函數(shù)能自動(dòng)處理常見(jiàn)問(wèn)題：

def clean_excel_data(df):
    # 刪除全空的行
    df = df.dropna(how='all')
    
    # 填充空值
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
    df[numeric_columns] = df[numeric_columns].fillna(0)
    
    # 處理日期列
    date_columns = ['訂單日期', '發(fā)貨日期']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')
    
    # 刪除重復(fù)記錄
    df = df.drop_duplicates()
    
    # 處理異常值
    for col in numeric_columns:
        # 將超過(guò)3個(gè)標(biāo)準(zhǔn)差的值替換為均值
        mean = df[col].mean()
        std = df[col].std()
        df.loc[abs(df[col] - mean) > 3*std, col] = mean
    
    return df

五、自動(dòng)生成數(shù)據(jù)透視表

手動(dòng)制作數(shù)據(jù)透視表太麻煩？Python一行搞定：

def create_pivot_tables(df, filename):
    # 創(chuàng)建Excel寫(xiě)入器
    writer = pd.ExcelWriter(filename, engine='openpyxl')
    
    # 按產(chǎn)品類(lèi)別統(tǒng)計(jì)銷(xiāo)售額
    pivot1 = pd.pivot_table(
        df,
        values='銷(xiāo)售額',
        index='產(chǎn)品類(lèi)別',
        columns='月份',
        aggfunc='sum',
        margins=True,
        margins_name='總計(jì)'
    )
    
    # 按銷(xiāo)售區(qū)域分析銷(xiāo)量
    pivot2 = pd.pivot_table(
        df,
        values=['銷(xiāo)量', '銷(xiāo)售額'],
        index='銷(xiāo)售區(qū)域',
        columns='產(chǎn)品類(lèi)別',
        aggfunc={
            '銷(xiāo)量': 'sum',
            '銷(xiāo)售額': ['sum', 'mean']
        }
    )
    
    # 寫(xiě)入不同工作表
    pivot1.to_excel(writer, sheet_name='產(chǎn)品類(lèi)別分析')
    pivot2.to_excel(writer, sheet_name='區(qū)域分析')
    
    writer.save()

六、自動(dòng)生成圖表

數(shù)據(jù)可視化也能自動(dòng)化：

import matplotlib.pyplot as plt
import seaborn as sns

def create_sales_charts(df, save_path):
    # 設(shè)置中文字體
    plt.rcParams['font.sans-serif'] = ['SimHei']
    
    # 1. 銷(xiāo)售趨勢(shì)圖
    plt.figure(figsize=(12, 6))
    df.groupby('日期')['銷(xiāo)售額'].sum().plot(kind='line')
    plt.title('銷(xiāo)售趨勢(shì)分析')
    plt.tight_layout()
    plt.savefig(f'{save_path}/銷(xiāo)售趨勢(shì).png')
    
    # 2. 品類(lèi)占比餅圖
    plt.figure(figsize=(8, 8))
    df.groupby('產(chǎn)品類(lèi)別')['銷(xiāo)售額'].sum().plot(kind='pie', autopct='%1.1f%%')
    plt.title('產(chǎn)品類(lèi)別銷(xiāo)售占比')
    plt.savefig(f'{save_path}/品類(lèi)占比.png')
    
    # 3. 區(qū)域銷(xiāo)售熱力圖
    plt.figure(figsize=(10, 8))
    pivot = df.pivot_table(
        values='銷(xiāo)售額',
        index='銷(xiāo)售區(qū)域',
        columns='產(chǎn)品類(lèi)別',
        aggfunc='sum'
    )
    sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd')
    plt.title('區(qū)域產(chǎn)品銷(xiāo)售熱力圖')
    plt.tight_layout()
    plt.savefig(f'{save_path}/銷(xiāo)售熱力圖.png')

七、自動(dòng)發(fā)送郵件報(bào)告

處理完數(shù)據(jù)后自動(dòng)發(fā)送郵件：

import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication

def send_excel_report(file_path, recipients):
    # 郵件設(shè)置
    msg = MIMEMultipart()
    msg['Subject'] = '銷(xiāo)售數(shù)據(jù)分析報(bào)告'
    msg['From'] = '你的郵箱'
    msg['To'] = ', '.join(recipients)
    
    # 郵件正文
    content = '''
    各位好：
    
    附件是最新的銷(xiāo)售數(shù)據(jù)分析報(bào)告，請(qǐng)查收。
    
    以下是重要發(fā)現(xiàn)：
    1. 本月銷(xiāo)售額較上月增長(zhǎng)15%
    2. 華東區(qū)域表現(xiàn)最好，占比40%
    3. 新品類(lèi)增長(zhǎng)迅速，環(huán)比增長(zhǎng)50%
    
    如有問(wèn)題請(qǐng)及時(shí)反饋。
    '''
    msg.attach(MIMEText(content, 'plain', 'utf-8'))
    
    # 添加附件
    with open(file_path, 'rb') as f:
        attachment = MIMEApplication(f.read())
        attachment.add_header(
            'Content-Disposition',
            'attachment',
            filename=os.path.basename(file_path)
        )
        msg.attach(attachment)
    
    # 發(fā)送郵件
    with smtplib.SMTP('smtp.公司郵箱.com', 25) as server:
        server.login('你的郵箱', '密碼')
        server.send_message(msg)

八、定時(shí)自動(dòng)運(yùn)行

把上面的功能集成起來(lái)，設(shè)置定時(shí)運(yùn)行：

import schedule
import time

def daily_report_job():
    # 1. 讀取數(shù)據(jù)
    data = batch_read_excel('數(shù)據(jù)文件夾路徑')
    
    # 2. 清洗數(shù)據(jù)
    clean_data = clean_excel_data(data)
    
    # 3. 生成報(bào)表
    create_pivot_tables(clean_data, '分析報(bào)告.xlsx')
    
    # 4. 生成圖表
    create_sales_charts(clean_data, '圖表文件夾路徑')
    
    # 5. 發(fā)送郵件
    send_excel_report(
        '分析報(bào)告.xlsx',
        ['leader@company.com', 'team@company.com']
    )

# 設(shè)置每天早上9點(diǎn)運(yùn)行
schedule.every().day.at('09:00').do(daily_report_job)

while True:
    schedule.run_pending()
    time.sleep(60)

九、Excel文件比對(duì)

經(jīng)常要對(duì)比兩個(gè)Excel文件的差異：

def compare_excel_files(file1, file2):
    # 讀取兩個(gè)文件
    df1 = pd.read_excel(file1)
    df2 = pd.read_excel(file2)
    
    # 設(shè)置索引
    key_columns = ['訂單號(hào)', '產(chǎn)品編碼']
    df1.set_index(key_columns, inplace=True)
    df2.set_index(key_columns, inplace=True)
    
    # 找出不同的行
    diff_rows = df1.compare(df2)
    
    # 找出file2中新增的行
    new_rows = df2.loc[~df2.index.isin(df1.index)]
    
    # 找出file2中刪除的行
    deleted_rows = df1.loc[~df1.index.isin(df2.index)]
    
    # 保存結(jié)果
    with pd.ExcelWriter('文件對(duì)比結(jié)果.xlsx') as writer:
        diff_rows.to_excel(writer, sheet_name='數(shù)據(jù)變化')
        new_rows.to_excel(writer, sheet_name='新增數(shù)據(jù)')
        deleted_rows.to_excel(writer, sheet_name='刪除數(shù)據(jù)')

十、性能優(yōu)化技巧

處理大文件時(shí)的一些優(yōu)化技巧：

def process_large_excel(filename):
    # 1. 分塊讀取
    chunks = pd.read_excel(
        filename,
        chunksize=10000  # 每次讀取1萬(wàn)行
    )
    
    results = []
    for chunk in chunks:
        # 處理每個(gè)數(shù)據(jù)塊
        processed = process_chunk(chunk)
        results.append(processed)
    
    # 合并結(jié)果
    final_result = pd.concat(results)
    
    # 2. 使用更小的數(shù)據(jù)類(lèi)型
    optimized_types = {
        'int64': 'int32',
        'float64': 'float32',
        'object': 'category'  # 對(duì)于重復(fù)值多的字符串列
    }
    
    for col in final_result.columns:
        if final_result[col].dtype.name in optimized_types:
            final_result[col] = final_result[col].astype(optimized_types[final_result[col].dtype.name])
    
    return final_result

這些代碼都經(jīng)過(guò)實(shí)際項(xiàng)目驗(yàn)證，復(fù)制就能用。還有一些使用小技巧：

代碼運(yùn)行前最好先備份數(shù)據(jù)
處理大文件時(shí)注意內(nèi)存占用
多使用pandas的向量化操作，少用循環(huán)
善用datetime處理日期
記得處理異常情況

以上就是Python處理大量Excel文件的十個(gè)技巧分享的詳細(xì)內(nèi)容，更多關(guān)于Python處理Excel文件的資料請(qǐng)關(guān)注腳本之家其它相關(guān)文章！

您可能感興趣的文章:

亚洲乱码中文字幕综合,中国熟女仑乱hd,亚洲精品乱拍国产一区二区三区,一本大道卡一卡二卡三乱码全集资源,又粗又黄又硬又爽的免费视频

Python處理大量Excel文件的十個(gè)技巧分享

目錄

一、批量讀取多個(gè)Excel文件

二、選擇性讀取工作表和列

三、自動(dòng)調(diào)整格式和樣式

四、智能數(shù)據(jù)清洗

五、自動(dòng)生成數(shù)據(jù)透視表

六、自動(dòng)生成圖表

七、自動(dòng)發(fā)送郵件報(bào)告

八、定時(shí)自動(dòng)運(yùn)行

九、Excel文件比對(duì)

十、性能優(yōu)化技巧

相關(guān)文章

最新評(píng)論

大家感興趣的內(nèi)容

最近更新的內(nèi)容

常用在線(xiàn)小工具

亚洲乱码中文字幕综合,中国熟女仑乱hd,亚洲精品乱拍国产一区二区三区,一本大道卡一卡二卡三乱码全集资源,又粗又黄又硬又爽的免费视频

Python處理大量Excel文件的十個(gè)技巧分享

目錄

一、批量讀取多個(gè)Excel文件

二、選擇性讀取工作表和列

三、自動(dòng)調(diào)整格式和樣式

四、智能數(shù)據(jù)清洗

五、自動(dòng)生成數(shù)據(jù)透 視表

六、自動(dòng)生成圖表

七、自動(dòng)發(fā)送郵件報(bào)告

八、定時(shí)自動(dòng)運(yùn)行

九、Excel文件比對(duì)

十、性能優(yōu)化技巧

相關(guān)文章

最新評(píng)論

大家感興趣的內(nèi)容

最近更新的內(nèi)容

常用在線(xiàn)小工具

一、批量讀取多個(gè)Excel文件

三、自動(dòng)調(diào)整格式和樣式

四、智能數(shù)據(jù)清洗

五、自動(dòng)生成數(shù)據(jù)透視表

六、自動(dòng)生成圖表

七、自動(dòng)發(fā)送郵件報(bào)告

八、定時(shí)自動(dòng)運(yùn)行

九、Excel文件比對(duì)

十、性能優(yōu)化技巧