Python進(jìn)行數(shù)據(jù)拆分和合并的超詳細(xì)指南
一、數(shù)據(jù)拆分詳解
1. 按條件拆分?jǐn)?shù)據(jù)
1.1 單條件拆分
import pandas as pd
# 創(chuàng)建示例數(shù)據(jù)
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace'],
'Age': [25, 30, 35, 40, 45, 28, 33],
'Department': ['HR', 'IT', 'HR', 'Finance', 'IT', 'Marketing', 'HR'],
'Salary': [5000, 7000, 5500, 9000, 7500, 6000, 5800]
}
df = pd.DataFrame(data)
# 單條件拆分 - 篩選HR部門的員工
hr_employees = df[df['Department'] == 'HR']
print("HR部門員工:")
print(hr_employees)
# 等價寫法
hr_employees = df.query('Department == "HR"')
1.2 多條件組合拆分
# AND條件: 年齡大于30且薪資低于6000
condition = (df['Age'] > 30) & (df['Salary'] < 6000)
filtered_df = df[condition]
print("\n年齡>30且薪資<6000的員工:")
print(filtered_df)
# OR條件: HR部門或IT部門
condition = (df['Department'] == 'HR') | (df['Department'] == 'IT')
dept_filtered = df[condition]
print("\nHR或IT部門的員工:")
print(dept_filtered)
# NOT條件: 非HR部門
non_hr = df[~df['Department'].isin(['HR'])]
print("\n非HR部門的員工:")
print(non_hr)
1.3 使用isin()進(jìn)行多值篩選
# 篩選特定部門的員工
target_departments = ['HR', 'Finance']
dept_filter = df['Department'].isin(target_departments)
filtered_df = df[dept_filter]
print("\nHR和Finance部門的員工:")
print(filtered_df)
2. 按比例拆分?jǐn)?shù)據(jù)
2.1 簡單隨機(jī)拆分
from sklearn.model_selection import train_test_split
# 隨機(jī)拆分: 70%訓(xùn)練集, 30%測試集
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
print(f"\n訓(xùn)練集 ({len(train_df)}條):")
print(train_df)
print(f"\n測試集 ({len(test_df)}條):")
print(test_df)
2.2 分層抽樣拆分
# 按部門分層抽樣,保持各部門比例
stratified_split = train_test_split(
df,
test_size=0.3,
random_state=42,
stratify=df['Department']
)
train_strat, test_strat = stratified_split
print("\n分層抽樣后的部門分布:")
print("訓(xùn)練集部門分布:")
print(train_strat['Department'].value_counts(normalize=True))
print("\n測試集部門分布:")
print(test_strat['Department'].value_counts(normalize=True))
2.3 時間序列拆分
# 添加日期列
df['Join_Date'] = pd.to_datetime(['2020-01-15', '2019-05-20', '2021-03-10',
'2018-11-05', '2022-02-28', '2020-07-15', '2019-09-01'])
# 按時間點拆分
cutoff_date = pd.to_datetime('2021-01-01')
historical = df[df['Join_Date'] < cutoff_date]
recent = df[df['Join_Date'] >= cutoff_date]
print(f"\n歷史數(shù)據(jù)(2021年前加入, {len(historical)}條):")
print(historical)
print(f"\n近期數(shù)據(jù)(2021年后加入, {len(recent)}條):")
print(recent)
3. 按組拆分?jǐn)?shù)據(jù)
3.1 使用groupby拆分
# 按部門分組
department_groups = df.groupby('Department')
# 查看分組結(jié)果
print("\n按部門分組結(jié)果:")
for name, group in department_groups:
print(f"\n{name}部門:")
print(group)
# 獲取特定組
hr_group = department_groups.get_group('HR')
print("\nHR部門數(shù)據(jù):")
print(hr_group)
3.2 拆分為多個DataFrame
# 將每個部門的數(shù)據(jù)保存到單獨的DataFrame
department_dfs = {name: group for name, group in department_groups}
# 訪問特定部門的數(shù)據(jù)
print("\nIT部門數(shù)據(jù):")
print(department_dfs['IT'])
# 或者拆分為列表
department_list = [group for _, group in department_groups]
二、數(shù)據(jù)合并詳解
1. concat方法
1.1 垂直合并(行方向)
# 創(chuàng)建兩個相似結(jié)構(gòu)的DataFrame
df1 = pd.DataFrame({
'Name': ['Alice', 'Bob'],
'Age': [25, 30],
'Department': ['HR', 'IT']
})
df2 = pd.DataFrame({
'Name': ['Charlie', 'David'],
'Age': [35, 40],
'Department': ['Finance', 'IT']
})
# 垂直合并
combined = pd.concat([df1, df2], axis=0)
print("\n垂直合并結(jié)果:")
print(combined)
# 重置索引
combined_reset = pd.concat([df1, df2], axis=0, ignore_index=True)
print("\n重置索引后的合并結(jié)果:")
print(combined_reset)
1.2 水平合并(列方向)
# 創(chuàng)建兩個不同列的DataFrame
info_df = pd.DataFrame({
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Employee_ID': [101, 102, 103, 104]
})
salary_df = pd.DataFrame({
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Salary': [5000, 7000, 5500, 9000],
'Bonus': [500, 700, 550, 900]
})
# 水平合并
combined_cols = pd.concat([info_df, salary_df.drop('Name', axis=1)], axis=1)
print("\n水平合并結(jié)果:")
print(combined_cols)
1.3 處理不同索引
# 設(shè)置不同索引
df1_indexed = df1.set_index('Name')
df2_indexed = df2.set_index('Name')
# 合并時保留所有索引
combined_index = pd.concat([df1_indexed, df2_indexed], axis=0)
print("\n保留所有索引的合并:")
print(combined_index)
2. merge方法
2.1 基本合并操作
# 員工信息
employees = pd.DataFrame({
'Employee_ID': [101, 102, 103, 104, 105],
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'Dept_ID': [1, 2, 1, 3, 2]
})
# 部門信息
departments = pd.DataFrame({
'Dept_ID': [1, 2, 3, 4],
'Dept_Name': ['HR', 'IT', 'Finance', 'Marketing'],
'Location': ['Floor1', 'Floor2', 'Floor3', 'Floor4']
})
# 內(nèi)連接(默認(rèn))
inner_merge = pd.merge(employees, departments, on='Dept_ID')
print("\n內(nèi)連接結(jié)果:")
print(inner_merge)
# 左連接
left_merge = pd.merge(employees, departments, on='Dept_ID', how='left')
print("\n左連接結(jié)果:")
print(left_merge)
# 右連接
right_merge = pd.merge(employees, departments, on='Dept_ID', how='right')
print("\n右連接結(jié)果:")
print(right_merge)
# 全外連接
outer_merge = pd.merge(employees, departments, on='Dept_ID', how='outer')
print("\n全外連接結(jié)果:")
print(outer_merge)
2.2 多鍵合并
# 添加位置信息
employees['Location'] = ['Floor1', 'Floor2', 'Floor1', 'Floor3', 'Floor2']
# 按部門和位置合并
multi_key_merge = pd.merge(
employees,
departments,
left_on=['Dept_ID', 'Location'],
right_on=['Dept_ID', 'Location'],
how='left'
)
print("\n多鍵合并結(jié)果:")
print(multi_key_merge)
2.3 處理重復(fù)列名
# 兩個表都有'Name'列
departments['Manager'] = ['Alice', 'Bob', 'Charlie', 'David']
# 合并時處理重復(fù)列名
merge_with_suffix = pd.merge(
employees,
departments,
left_on='Dept_ID',
right_on='Dept_ID',
suffixes=('_Employee', '_Manager')
)
print("\n處理重復(fù)列名的合并:")
print(merge_with_suffix)
3. join方法
3.1 基于索引的合并
# 設(shè)置索引
employees_indexed = employees.set_index('Employee_ID')
salary_info = pd.DataFrame({
'Employee_ID': [101, 102, 103, 104, 105],
'Salary': [5000, 7000, 5500, 9000, 7500],
'Bonus': [500, 700, 550, 900, 750]
}).set_index('Employee_ID')
# 使用join合并
joined_df = employees_indexed.join(salary_info)
print("\n基于索引的join合并:")
print(joined_df)
3.2 不同join類型
# 創(chuàng)建不完整的數(shù)據(jù)
partial_salary = salary_info.drop(index=[104, 105])
# 內(nèi)連接
inner_join = employees_indexed.join(partial_salary, how='inner')
print("\n內(nèi)連接join結(jié)果:")
print(inner_join)
# 左連接
left_join = employees_indexed.join(partial_salary, how='left')
print("\n左連接join結(jié)果:")
print(left_join)
三、高級合并技巧
1. 合并時的沖突處理
# 創(chuàng)建有沖突的數(shù)據(jù)
df_conflict1 = pd.DataFrame({
'ID': [1, 2, 3],
'Value': ['A', 'B', 'C']
})
df_conflict2 = pd.DataFrame({
'ID': [2, 3, 4],
'Value': ['X', 'Y', 'Z']
})
# 合并時處理沖突
merged_conflict = pd.merge(
df_conflict1,
df_conflict2,
on='ID',
how='outer',
suffixes=('_left', '_right')
)
# 解決沖突 - 優(yōu)先使用右邊的值
merged_conflict['Value'] = merged_conflict['Value_right'].fillna(merged_conflict['Value_left'])
merged_conflict = merged_conflict.drop(['Value_left', 'Value_right'], axis=1)
print("\n沖突處理后的合并結(jié)果:")
print(merged_conflict)
2. 合并時的復(fù)雜條件
# 創(chuàng)建需要復(fù)雜條件合并的數(shù)據(jù)
orders = pd.DataFrame({
'Order_ID': [1, 2, 3, 4, 5],
'Customer_ID': [101, 102, 101, 103, 104],
'Order_Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
'Amount': [100, 200, 150, 300, 250]
})
customers = pd.DataFrame({
'Customer_ID': [101, 102, 103, 105],
'Join_Date': pd.to_datetime(['2022-01-01', '2022-05-15', '2022-11-20', '2023-01-01']),
'Tier': ['Gold', 'Silver', 'Silver', 'Bronze']
})
# 合并后篩選: 只保留下單日期晚于加入日期的記錄
merged_complex = pd.merge(
orders,
customers,
on='Customer_ID',
how='left'
)
merged_complex = merged_complex[merged_complex['Order_Date'] >= merged_complex['Join_Date']]
print("\n復(fù)雜條件合并結(jié)果:")
print(merged_complex)
3. 大型數(shù)據(jù)集的合并優(yōu)化
import numpy as np
# 創(chuàng)建大型數(shù)據(jù)集
large_df1 = pd.DataFrame({
'ID': range(1, 100001),
'Value1': np.random.rand(100000)
})
large_df2 = pd.DataFrame({
'ID': range(50000, 150001),
'Value2': np.random.rand(100000)
})
# 優(yōu)化合并方法1: 指定合并鍵的數(shù)據(jù)類型
large_df1['ID'] = large_df1['ID'].astype('int32')
large_df2['ID'] = large_df2['ID'].astype('int32')
# 優(yōu)化合并方法2: 使用更高效的合并方式
%timeit pd.merge(large_df1, large_df2, on='ID') # 測量執(zhí)行時間
# 優(yōu)化合并方法3: 先篩選再合并
filtered_df2 = large_df2[large_df2['ID'] <= 100000]
%timeit pd.merge(large_df1, filtered_df2, on='ID')
四、實際應(yīng)用案例
1. 電商數(shù)據(jù)分析
# 創(chuàng)建電商數(shù)據(jù)集
orders = pd.DataFrame({
'order_id': [1001, 1002, 1003, 1004, 1005],
'customer_id': [201, 202, 203, 204, 205],
'order_date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03', '2023-01-04']),
'amount': [150.0, 200.0, 75.5, 300.0, 125.0]
})
customers = pd.DataFrame({
'customer_id': [201, 202, 203, 204, 206],
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'join_date': pd.to_datetime(['2022-01-15', '2022-03-20', '2022-05-10', '2022-07-05', '2022-09-01']),
'tier': ['Gold', 'Silver', 'Silver', 'Bronze', 'Gold']
})
products = pd.DataFrame({
'order_id': [1001, 1001, 1002, 1003, 1004, 1004, 1005],
'product_id': [1, 2, 1, 3, 2, 3, 1],
'quantity': [1, 2, 1, 1, 3, 1, 2],
'price': [50.0, 50.0, 200.0, 75.5, 100.0, 100.0, 62.5]
})
# 合并訂單和客戶信息
order_customer = pd.merge(orders, customers, on='customer_id', how='left')
# 合并訂單詳情
full_data = pd.merge(order_customer, products, on='order_id', how='left')
# 計算擴(kuò)展金額
full_data['extended_price'] = full_data['quantity'] * full_data['price']
# 按客戶分析
customer_analysis = full_data.groupby(['customer_id', 'name', 'tier']).agg(
total_orders=('order_id', 'nunique'),
total_amount=('amount', 'sum'),
total_items=('quantity', 'sum')
).reset_index()
print("\n完整的電商合并數(shù)據(jù):")
print(full_data)
print("\n客戶分析:")
print(customer_analysis)
2. 學(xué)生成績分析
# 創(chuàng)建學(xué)生數(shù)據(jù)集
students = pd.DataFrame({
'student_id': [1, 2, 3, 4, 5],
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
'class': ['A', 'B', 'A', 'B', 'A']
})
grades_math = pd.DataFrame({
'student_id': [1, 2, 3, 4, 6],
'math_score': [90, 85, 78, 92, 88],
'math_rank': [1, 2, 3, 1, 2]
})
grades_english = pd.DataFrame({
'student_id': [1, 3, 4, 5, 7],
'english_score': [88, 76, 95, 82, 90],
'english_rank': [2, 3, 1, 4, 1]
})
# 合并所有成績
all_grades = pd.merge(
pd.merge(students, grades_math, on='student_id', how='left'),
grades_english,
on='student_id',
how='left'
)
# 計算平均分和排名
all_grades['average_score'] = all_grades[['math_score', 'english_score']].mean(axis=1)
all_grades['average_rank'] = all_grades[['math_rank', 'english_rank']].mean(axis=1)
# 按班級分析
class_analysis = all_grades.groupby('class').agg(
avg_math=('math_score', 'mean'),
avg_english=('english_score', 'mean'),
top_math=('math_score', 'max'),
top_english=('english_score', 'max')
).reset_index()
print("\n完整的學(xué)生成績數(shù)據(jù):")
print(all_grades)
print("\n班級分析:")
print(class_analysis)
五、最佳實踐和常見問題
1. 合并前的準(zhǔn)備工作
# 1. 檢查鍵的唯一性
print("\n客戶ID在customers表中的唯一性:", customers['customer_id'].is_unique)
print("訂單ID在orders表中的唯一性:", orders['order_id'].is_unique)
# 2. 檢查缺失值
print("\ncustomers表中customer_id的缺失值:", customers['customer_id'].isnull().sum())
print("orders表中customer_id的缺失值:", orders['customer_id'].isnull().sum())
# 3. 檢查數(shù)據(jù)類型
print("\ncustomers表中customer_id的類型:", customers['customer_id'].dtype)
print("orders表中customer_id的類型:", orders['customer_id'].dtype)
# 4. 預(yù)處理 - 填充缺失值或轉(zhuǎn)換類型
orders['customer_id'] = orders['customer_id'].fillna(0).astype(int)
customers['customer_id'] = customers['customer_id'].astype(int)
2. 合并后的驗證
# 合并數(shù)據(jù)
merged_data = pd.merge(orders, customers, on='customer_id', how='left')
# 1. 檢查合并后的行數(shù)
print("\n合并后的行數(shù):", len(merged_data))
print("左表行數(shù):", len(orders))
print("右表行數(shù):", len(customers))
# 2. 檢查匹配情況
print("\n成功匹配的記錄數(shù):", len(merged_data[~merged_data['name'].isnull()]))
print("未匹配的記錄數(shù):", len(merged_data[merged_data['name'].isnull()]))
# 3. 檢查重復(fù)列
print("\n合并后的列名:", merged_data.columns.tolist())
# 4. 抽樣檢查
print("\n合并數(shù)據(jù)抽樣檢查:")
print(merged_data.sample(3, random_state=42))
3. 性能優(yōu)化技巧
# 1. 指定合并鍵的數(shù)據(jù)類型
orders['customer_id'] = orders['customer_id'].astype('int32')
customers['customer_id'] = customers['customer_id'].astype('int32')
# 2. 減少合并前的數(shù)據(jù)量
# 只選擇需要的列
customers_filtered = customers[['customer_id', 'name', 'tier']]
# 3. 使用更高效的合并方法
# 對于大型數(shù)據(jù)集,可以考慮使用Dask或PySpark
# 4. 分塊合并
def chunk_merge(left, right, on, chunksize=10000, how='left'):
chunks = []
for i in range(0, len(left), chunksize):
chunk = pd.merge(
left.iloc[i:i+chunksize],
right,
on=on,
how=how
)
chunks.append(chunk)
return pd.concat(chunks, axis=0)
# 5. 使用索引加速
orders_indexed = orders.set_index('customer_id')
customers_indexed = customers.set_index('customer_id')
%timeit orders_indexed.join(customers_indexed, how='left')
4. 常見問題及解決方案
問題1: 合并后行數(shù)異常增多
- 原因: 合并鍵在其中一個表中不唯一
- 解決: 檢查鍵的唯一性
df.duplicated().sum()
問題2: 合并后出現(xiàn)大量NaN值
- 原因: 鍵不匹配或使用了外連接
- 解決: 檢查鍵的匹配情況或使用內(nèi)連接
問題3: 合并速度非常慢
- 原因: 數(shù)據(jù)集太大或鍵的數(shù)據(jù)類型不一致
- 解決: 優(yōu)化數(shù)據(jù)類型,分塊處理,或使用更高效的工具
問題4: 列名沖突
- 原因: 兩個表有相同列名但非合并鍵
- 解決: 使用suffixes參數(shù)或提前重命名列
問題5: 內(nèi)存不足
- 原因: 數(shù)據(jù)集太大
- 解決: 使用分塊處理,或者考慮使用Dask等工具
以上就是Python進(jìn)行數(shù)據(jù)拆分和合并的超詳細(xì)指南的詳細(xì)內(nèi)容,更多關(guān)于Python數(shù)據(jù)拆分和合并的資料請關(guān)注腳本之家其它相關(guān)文章!
相關(guān)文章
Python用selenium實現(xiàn)自動登錄和下單的項目實戰(zhàn)
本文主要介紹了Python用selenium實現(xiàn)自動登錄和下單的項目實戰(zhàn),文中通過示例代碼介紹的非常詳細(xì),對大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧2023-02-02
python 在右鍵菜單中加入復(fù)制目標(biāo)文件的有效存放路徑(單斜杠或者雙反斜杠)
這篇文章主要介紹了python 在右鍵菜單中加入復(fù)制目標(biāo)文件的有效存放路徑(單斜杠或者雙反斜杠),本文給大家介紹的非常詳細(xì),對大家的學(xué)習(xí)或工作具有一定的參考借鑒價值,需要的朋友可以參考下2020-04-04
pandas實現(xiàn)DataFrame顯示最大行列,不省略顯示實例
今天小編就為大家分享一篇pandas實現(xiàn)DataFrame顯示最大行列,不省略顯示實例,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧2019-12-12
Python中關(guān)于面向?qū)ο蟾拍畹脑敿?xì)講解
要了解面向?qū)ο笪覀兛隙ㄐ枰戎缹ο蟮降资鞘裁赐嬉鈨骸jP(guān)于對象的理解很簡單,在我們的身邊,每一種事物的存在都是一種對象??偨Y(jié)為一句話也就是:對象就是事物存在的實體2021-10-10

