Python?Pandas數(shù)據(jù)處理高頻操作詳解

更新時(shí)間：2022年06月30日 16:56:01 作者：數(shù)據(jù)不吹牛

這篇文章主要為大家整理了一些Python?Pandas數(shù)據(jù)處理高頻操作，文中的示例代碼講解詳細(xì)，對(duì)我們學(xué)習(xí)Python有一定的幫助，需要的可以參考一下

引入依賴
算法相關(guān)依賴
獲取數(shù)據(jù)
生成df
重命名列
增加列
缺失值處理
獨(dú)熱編碼
替換值
刪除列
數(shù)據(jù)篩選
差值計(jì)算
數(shù)據(jù)修改
時(shí)間格式轉(zhuǎn)換
設(shè)置索引列
折線圖
散點(diǎn)圖
柱狀圖
熱力圖
66個(gè)最常用的pandas數(shù)據(jù)分析函數(shù)

從各種不同的來源和格式導(dǎo)入數(shù)據(jù)
導(dǎo)出數(shù)據(jù)
創(chuàng)建測試對(duì)象
查看、檢查數(shù)據(jù)
數(shù)據(jù)選取
數(shù)據(jù)清理
篩選，排序和分組依據(jù)
數(shù)據(jù)合并
數(shù)據(jù)統(tǒng)計(jì)

16個(gè)函數(shù)，用于數(shù)據(jù)清洗

1.cat函數(shù)
2.contains
3.startswith/endswith
4.count
5.get
6.len
7.upper/lower
8.pad+side參數(shù)/center
9.repeat
10.slice_replace
11.replace
12.replace
13.split方法+expand參數(shù)
14.strip/rstrip/lstrip
15.findall
16.extract/extractall

引入依賴

#?導(dǎo)入模塊
import?pymysql
import?pandas?as?pd
import?numpy?as?np
import?time

#?數(shù)據(jù)庫
from?sqlalchemy?import?create_engine

#?可視化
import?matplotlib.pyplot?as?plt
#?如果你的設(shè)備是配備Retina屏幕的mac，可以在jupyter?notebook中，使用下面一行代碼有效提高圖像畫質(zhì)
%config?InlineBackend.figure_format?=?'retina'
#?解決?plt?中文顯示的問題?mymac
plt.rcParams['font.sans-serif']?=?['Arial?Unicode?MS']
#?設(shè)置顯示中文?需要先安裝字體?aistudio
plt.rcParams['font.sans-serif']?=?['SimHei']?#?指定默認(rèn)字體
plt.rcParams['axes.unicode_minus']?=?False??#?用來正常顯示負(fù)號(hào)
import?seaborn?as?sns
#?notebook渲染圖片
%matplotlib?inline
import?pyecharts

#?忽略版本問題
import?warnings
warnings.filterwarnings("ignore")??

#?下載中文字體
!wget?https://mydueros.cdn.bcebos.com/font/simhei.ttf?
#?將字體文件復(fù)制到?matplotlib'字體路徑
!cp?simhei.ttf?/opt/conda/envs/python35-paddle120-env/Lib/python3,7/site-packages/matplotib/mpl-data/fonts.

#?一般只需要將字體文件復(fù)制到系統(tǒng)字體田錄下即可,但是在?studio上該路徑?jīng)]有寫權(quán)限,所以此方法不能用?
#?!cp?simhei.?ttf?/usr/share/fonts/

#?創(chuàng)建系統(tǒng)字體文件路徑
!mkdir?.fonts
#?復(fù)制文件到該路徑
!cp?simhei.ttf?.fonts/
!rm?-rf?.cache/matplotlib

算法相關(guān)依賴

#?數(shù)據(jù)歸一化
from?sklearn.preprocessing?import?MinMaxScaler

#?kmeans聚類
from?sklearn.cluster?import?KMeans
#?DBSCAN聚類
from?sklearn.cluster?import?DBSCAN
#?線性回歸算法
from?sklearn.linear_model?import?LinearRegression
#?邏輯回歸算法
from?sklearn.linear_model?import?LogisticRegression
#?高斯貝葉斯
from?sklearn.naive_bayes?import?GaussianNB
#?劃分訓(xùn)練/測試集
from?sklearn.model_selection?import?train_test_split
#?準(zhǔn)確度報(bào)告
from?sklearn?import?metrics
#?矩陣報(bào)告和均方誤差
from?sklearn.metrics?import?classification_report,?mean_squared_error

獲取數(shù)據(jù)

from?sqlalchemy?import?create_engine
engine?=?create_engine('mysql+pymysql://root:root@127.0.0.1:3306/ry?charset=utf8')

#?查詢插入后相關(guān)表名及行數(shù)
result_query_sql?=?"use?information_schema;"
engine.execute(result_query_sql)
result_query_sql?=?"SELECT?table_name,table_rows?FROM?tables?WHERE?TABLE_NAME?LIKE?'log%%'?order?by?table_rows?desc;"
df_result?=?pd.read_sql(result_query_sql,?engine)

生成df

#?list轉(zhuǎn)df
df_result?=?pd.DataFrame(pred,columns=['pred'])
df_result['actual']?=?test_target
df_result

#?df取子df
df_new?=?df_old[['col1','col2']]

#?dict生成df
df_test?=?pd.DataFrame({<!--?-->'A':[0.587221,?0.135673,?0.135673,?0.135673,?0.135673],?
????????????????????????'B':['a',?'b',?'c',?'d',?'e'],
????????????????????????'C':[1,?2,?3,?4,?5]})

#?指定列名
data?=?pd.DataFrame(dataset.data,?columns=dataset.feature_names)

#?使用numpy生成20個(gè)指定分布(如標(biāo)準(zhǔn)正態(tài)分布)的數(shù)
tem?=?np.random.normal(0,?1,?20)
df3?=?pd.DataFrame(tem)

#?生成一個(gè)和df長度相同的隨機(jī)數(shù)dataframe
df1?=?pd.DataFrame(pd.Series(np.random.randint(1,?10,?135)))

重命名列

#?重命名列
data_scaled?=?data_scaled.rename(columns={<!--?-->'本體油位':?'OILLV'})

增加列

#?df2df
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?新增一列根據(jù)salary將數(shù)據(jù)分為3組
bins?=?[0,5000,?20000,?50000]
group_names?=?['低',?'中',?'高']
df['categories']?=?pd.cut(df['salary'],?bins,?labels=group_names)

缺失值處理

#?檢查數(shù)據(jù)中是否含有任何缺失值
df.isnull().values.any()

#?查看每列數(shù)據(jù)缺失值情況
df.isnull().sum()

#?提取某列含有空值的行
df[df['日期'].isnull()]

#?輸出每列缺失值具體行數(shù)
for?i?in?df.columns:
????if?df[i].count()?!=?len(df):
????????row?=?df[i][df[i].isnull().values].index.tolist()
????????print('列名："{}", 第{}行位置有缺失值'.format(i,row))

#?眾數(shù)填充
heart_df['Thal'].fillna(heart_df['Thal'].mode(dropna=True)[0],?inplace=True)

#?連續(xù)值列的空值用平均值填充
dfcolumns?=?heart_df_encoded.columns.values.tolist()
for?item?in?dfcolumns:
????if?heart_df_encoded[item].dtype?==?'float':
???????heart_df_encoded[item].fillna(heart_df_encoded[item].median(),?inplace=True)

獨(dú)熱編碼

df_encoded?=?pd.get_dummies(df_data)

替換值

#?按列值替換
num_encode?=?{<!--?-->
????'AHD':?{<!--?-->'No':0,?"Yes":1},
}
heart_df.replace(num_encode,inplace=True)

刪除列

df_jj2.drop(['coll_time',?'polar',?'conn_type',?'phase',?'id',?'Unnamed:?0'],axis=1,inplace=True)

數(shù)據(jù)篩選

#?取第33行數(shù)據(jù)
df.iloc[32]

#?某列以xxx字符串開頭
df_jj2?=?df_512.loc[df_512["transformer"].str.startswith('JJ2')]

df_jj2yya?=?df_jj2.loc[df_jj2["變壓器編號(hào)"]=='JJ2YYA']

#?提取第一列中不在第二列出現(xiàn)的數(shù)字
df['col1'][~df['col1'].isin(df['col2'])]

#?查找兩列值相等的行號(hào)
np.where(df.secondType?==?df.thirdType)

#?包含字符串
results?=?df['grammer'].str.contains("Python")

#?提取列名
df.columns

#?查看某列唯一值（種類）
df['education'].nunique()

#?刪除重復(fù)數(shù)據(jù)
df.drop_duplicates(inplace=True)

#?某列等于某值
df[df.col_name==0.587221]
#?df.col_name==0.587221?各行判斷結(jié)果返回值(True/False)

#?查看某列唯一值及計(jì)數(shù)
df_jj2["變壓器編號(hào)"].value_counts()

#?時(shí)間段篩選
df_jj2yyb_0501_0701?=?df_jj2yyb[(df_jj2yyb['r_time']?&gt;=pd.to_datetime('20200501'))?&amp;?(df_jj2yyb['r_time']?&lt;=?pd.to_datetime('20200701'))]

#?數(shù)值篩選
df[(df['popularity']?&gt;?3)?&amp;?(df['popularity']?&lt;?7)]

#?某列字符串截取
df['Time'].str[0:8]

#?隨機(jī)取num行
ins_1?=?df.sample(n=num)

#?數(shù)據(jù)去重
df.drop_duplicates(['grammer'])

#?按某列排序(降序)
df.sort_values("popularity",inplace=True,?ascending=False)

#?取某列最大值所在行
df[df['popularity']?==?df['popularity'].max()]

#?取某列最大num行
df.nlargest(num,'col_name')
#?最大num列畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

差值計(jì)算

# axis=0或index表示上下移動(dòng)， periods表示移動(dòng)的次數(shù)，為正時(shí)向下移，為負(fù)時(shí)向上移動(dòng)。
print(df.diff(?periods=1,?axis=‘index‘))
print(df.diff(?periods=-1,?axis=0))
# axis=1或columns表示左右移動(dòng)，periods表示移動(dòng)的次數(shù)，為正時(shí)向右移，為負(fù)時(shí)向左移動(dòng)。
print(df.diff(?periods=1,?axis=‘columns‘))
print(df.diff(?periods=-1,?axis=1))

#?變化率計(jì)算
data['收盤價(jià)(元)'].pct_change()

#?以5個(gè)數(shù)據(jù)作為一個(gè)數(shù)據(jù)滑動(dòng)窗口，在這個(gè)5個(gè)數(shù)據(jù)上取均值
df['收盤價(jià)(元)'].rolling(5).mean()

數(shù)據(jù)修改

#?刪除最后一行
df?=?df.drop(labels=df.shape[0]-1)

#?添加一行數(shù)據(jù)['Perl',6.6]
row?=?{<!--?-->'grammer':'Perl','popularity':6.6}
df?=?df.append(row,ignore_index=True)

#?某列小數(shù)轉(zhuǎn)百分?jǐn)?shù)
df.style.format({<!--?-->'data':?'{0:.2%}'.format})

#?反轉(zhuǎn)行
df.iloc[::-1,?:]

#?以兩列制作數(shù)據(jù)透視
pd.pivot_table(df,values=["salary","score"],index="positionId")

#?同時(shí)對(duì)兩列進(jìn)行計(jì)算
df[["salary","score"]].agg([np.sum,np.mean,np.min])

#?對(duì)不同列執(zhí)行不同的計(jì)算
df.agg({<!--?-->"salary":np.sum,"score":np.mean})

時(shí)間格式轉(zhuǎn)換

#?時(shí)間戳轉(zhuǎn)時(shí)間字符串
df_jj2['cTime']?=df_jj2['coll_time'].apply(lambda?x:?time.strftime("%Y-%m-%d?%H:%M:%S",?time.localtime(x)))

#?時(shí)間字符串轉(zhuǎn)時(shí)間格式
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?時(shí)間格式轉(zhuǎn)時(shí)間戳
dtime?=?pd.to_datetime(df_jj2yyb['r_time'])
v?=?(dtime.values?-?np.datetime64('1970-01-01T08:00:00Z'))?/?np.timedelta64(1,?'ms')
df_jj2yyb['timestamp']?=?v

設(shè)置索引列

df_jj2yyb_small_noise?=?df_jj2yyb_small_noise.set_index('timestamp')

折線圖

fig,?ax?=?plt.subplots()
df.plot(legend=True,?ax=ax)
plt.legend(loc=1)
plt.show()

plt.figure(figsize=(20,?6))
plt.plot(max_iter_list,?accuracy,?color='red',?marker='o',
?????????markersize=10)
plt.title('Accuracy?Vs?max_iter?Value')
plt.xlabel('max_iter?Value')
plt.ylabel('Accuracy')

散點(diǎn)圖

plt.scatter(df[:,?0],?df[:,?1],?c="red",?marker='o',?label='lable0')???
plt.xlabel('x')??
plt.ylabel('y')??
plt.legend(loc=2)??
plt.show()??

柱狀圖

df?=?pd.Series(tree.feature_importances_,?index=data.columns)
#?取某列最大Num行畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

熱力圖

df_corr?=?combine.corr()
plt.figure(figsize=(20,20))
g=sns.heatmap(df_corr,annot=True,cmap="RdYlGn")

66個(gè)最常用的pandas數(shù)據(jù)分析函數(shù)

df?#任何pandas?DataFrame對(duì)象?
s?#任何pandas?series對(duì)象

從各種不同的來源和格式導(dǎo)入數(shù)據(jù)

pd.read_csv(filename)?#?從CSV文件?
pd.read_table(filename)?#?從分隔的文本文件（例如CSV）中?
pd.read_excel(filename)?#?從Excel文件?
pd.read_sql(query,?connection_object)?#?從SQL表/數(shù)據(jù)庫中讀取?
pd.read_json(json_string)?#?從JSON格式的字符串，URL或文件中讀取。
pd.read_html(url)?#?解析html?URL，字符串或文件，并將表提取到數(shù)據(jù)幀列表?
pd.read_clipboard()?#?獲取剪貼板的內(nèi)容并將其傳遞給?read_table()?
pd.DataFrame(dict)?#?從字典中，列名稱的鍵，列表中的數(shù)據(jù)的值

導(dǎo)出數(shù)據(jù)

df.to_csv(filename)?#?寫入CSV文件?
df.to_excel(filename)?#?寫入Excel文件?
df.to_sql(table_name,?connection_object)?#?寫入SQL表?
df.to_json(filename)?#?以JSON格式寫入文件

創(chuàng)建測試對(duì)象

pd.DataFrame(np.random.rand(20,5))???????????????#?5列20行隨機(jī)浮點(diǎn)數(shù)?pd.Series(my_list)???????????????????????????????#?從一個(gè)可迭代的序列創(chuàng)建一個(gè)序列?my_list?
df.index?=?pd.date_range('1900/1/30',?periods=df.shape[0])?#?添加日期索引

查看、檢查數(shù)據(jù)

df.head(n)???????????????????????#?DataFrame的前n行?
df.tail(n)???????????????????????#?DataFrame的最后n行?
df.shape?????????????????????????#?行數(shù)和列數(shù)?
df.info()????????????????????????#?索引，數(shù)據(jù)類型和內(nèi)存信息?
df.describe()????????????????????#?數(shù)值列的摘要統(tǒng)計(jì)信息?
s.value_counts(dropna=False)?????#?查看唯一值和計(jì)數(shù)?
df.apply(pd.Series.value_counts)?#?所有列的唯一值和計(jì)數(shù)

數(shù)據(jù)選取

使用這些命令選擇數(shù)據(jù)的特定子集。
df[col]???????????????#?返回帶有標(biāo)簽col的列?
df[[col1,?col2]]??????#?返回列作為新的DataFrame?
s.iloc[0]?????????????#?按位置選擇?
s.loc['index_one']????#?按索引選擇?
df.iloc[0,:]??????????#?第一行?
df.iloc[0,0]??????????#?第一欄的第一元素

數(shù)據(jù)清理

df.columns?=?['a','b','c']??????????????????#?重命名列?
pd.isnull()?????????????????????????????????#?空值檢查，返回Boolean?Arrray?
pd.notnull()????????????????????????????????#?與pd.isnull()?相反?
df.dropna()?????????????????????????????????#?刪除所有包含空值的行?
df.dropna(axis=1)???????????????????????????#?刪除所有包含空值的列?
df.dropna(axis=1,thresh=n)??????????????????#?刪除所有具有少于n個(gè)非null值的行?
df.fillna(x)????????????????????????????????#?將所有空值替換為x?
s.fillna(s.mean())??????????????????????????#?用均值替換所有空值（均值可以用統(tǒng)計(jì)模塊中的幾乎所有函數(shù)替換?）?
s.astype(float)?????????????????????????????#?將系列的數(shù)據(jù)類型轉(zhuǎn)換為float?
s.replace(1,'one')??????????????????????????#?1?用?'one'?
s.replace([1,3],['one','three'])????????????#?替換所有等于的值?替換為所有1?'one'?，并?3?用?'three'?df.rename(columns=lambda?x:?x?+?1)??????????#?列的重命名?
df.rename(columns={<!--?-->'old_name':?'new_?name'})#?選擇性重命名?
df.set_index('column_one')??????????????????#?更改索引?
df.rename(index=lambda?x:?x?+?1)????????????#?大規(guī)模重命名索引

篩選，排序和分組依據(jù)

df[df[col]?&gt;?0.5]??????????????????????#?列?col?大于?0.5?df[(df[col]?&gt;?0.5)?&amp;?(df[col]?&lt;?0.7)]??#?小于?0.7?大于0.5的行?
df.sort_values(col1)???????????????????#?按col1升序?qū)χ颠M(jìn)行排序?
df.sort_values(col2,ascending=False)???#?按col2?降序?qū)χ颠M(jìn)行?排序?
df.sort_values([col1,col2],ascending=[True,False])?#按?col1?升序排序，然后?col2?按降序排序?
df.groupby(col)????????????????????????#從一個(gè)欄返回GROUPBY對(duì)象?
df.groupby([col1,col2])?#?返回來自多個(gè)列的groupby對(duì)象?
df.groupby(col1)[col2]?????????????????#?返回中的值的平均值?col2，按中的值分組?col1?（平均值可以用統(tǒng)計(jì)模塊中的幾乎所有函數(shù)替換?）?
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean)?#?創(chuàng)建一個(gè)數(shù)據(jù)透視表組通過?col1?，并計(jì)算平均值的?col2?和?col3?
df.groupby(col1).agg(np.mean)??????????#?在所有列中找到每個(gè)唯一col1?組的平均值?
df.apply(np.mean)??????????????????????#np.mean()?在每列上應(yīng)用該函數(shù)?
df.apply(np.max,axis=1)????????????????#?np.max()?在每行上應(yīng)用功能

數(shù)據(jù)合并

df1.append(df2)???????????????????#?將df2添加?df1的末尾?（各列應(yīng)相同）?
pd.concat([df1,?df2],axis=1)??????#?將?df1的列添加到df2的末尾?（行應(yīng)相同）?
df1.join(df2,on=col1,how='inner')?# SQL樣式將列 df1 與 df2 行所在的列col 具有相同值的列連接起來。'how'可以是一個(gè)?'left'，?'right'，?'outer'，?'inner'

數(shù)據(jù)統(tǒng)計(jì)

df.describe()????#?數(shù)值列的摘要統(tǒng)計(jì)信息?
df.mean()????????#?返回均值的所有列?
df.corr()????????#?返回DataFrame中各列之間的相關(guān)性?
df.count()???????#?返回非空值的每個(gè)數(shù)據(jù)幀列中的數(shù)字?
df.max()?????????#?返回每列中的最高值?
df.min()?????????#?返回每一列中的最小值?
df.median()??????#?返回每列的中位數(shù)?
df.std()?????????#?返回每列的標(biāo)準(zhǔn)偏差

16個(gè)函數(shù)，用于數(shù)據(jù)清洗

#?導(dǎo)入數(shù)據(jù)集
import?pandas?as?pd

df?={<!--?-->'姓名':['?黃同學(xué)','黃至尊','黃老邪?','陳大美','孫尚香'],
?????'英文名':['Huang?tong_xue','huang?zhi_zun','Huang?Lao_xie','Chen?Da_mei','sun?shang_xiang'],
?????'性別':['男','women','men','女','男'],
?????'身份證':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
?????'身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
?????'家庭住址':['湖北廣水','河南信陽','廣西桂林','湖北孝感','廣東廣州'],
?????'電話號(hào)碼':['13434813546','19748672895','16728613064','14561586431','19384683910'],
?????'收入':['1.1萬','8.5千','0.9萬','6.5千','2.0萬']}
df?=?pd.DataFrame(df)
df

1.cat函數(shù)

用于字符串的拼接

df["姓名"].str.cat(df["家庭住址"],sep='-'*3)

2.contains

判斷某個(gè)字符串是否包含給定字符

df["家庭住址"].str.contains("廣")

3.startswith/endswith

判斷某個(gè)字符串是否以…開頭/結(jié)尾

#?第一個(gè)行的“?黃偉”是以空格開頭的
df["姓名"].str.startswith("黃")?
df["英文名"].str.endswith("e")

4.count

計(jì)算給定字符在字符串中出現(xiàn)的次數(shù)

df["電話號(hào)碼"].str.count("3")

5.get

獲取指定位置的字符串

df["姓名"].str.get(-1)
df["身高"].str.split(":")
df["身高"].str.split(":").str.get(0)

6.len

計(jì)算字符串長度

df["性別"].str.len()

7.upper/lower

英文大小寫轉(zhuǎn)換

df["英文名"].str.upper()
df["英文名"].str.lower()

8.pad+side參數(shù)/center

在字符串的左邊、右邊或左右兩邊添加給定字符

df["家庭住址"].str.pad(10,fillchar="*")??????#?相當(dāng)于ljust()
df["家庭住址"].str.pad(10,side="right",fillchar="*")????#?相當(dāng)于rjust()
df["家庭住址"].str.center(10,fillchar="*")

9.repeat

重復(fù)字符串幾次

df["性別"].str.repeat(3)

10.slice_replace

使用給定的字符串，替換指定的位置的字符

df["電話號(hào)碼"].str.slice_replace(4,8,"*"*4)

11.replace

將指定位置的字符，替換為給定的字符串

df["身高"].str.replace(":","-")

12.replace

將指定位置的字符，替換為給定的字符串(接受正則表達(dá)式)

replace中傳入正則表達(dá)式，才叫好用；- 先不要管下面這個(gè)案例有沒有用，你只需要知道，使用正則做數(shù)據(jù)清洗多好用；

df["收入"].str.replace("\d+\.\d+","正則")

13.split方法+expand參數(shù)

搭配join方法功能很強(qiáng)大

#?普通用法
df["身高"].str.split(":")
#?split方法，搭配expand參數(shù)
df[["身高描述","final身高"]]?=?df["身高"].str.split(":",expand=True)
df
#?split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)

14.strip/rstrip/lstrip

去除空白符、換行符

df["姓名"].str.len()
df["姓名"]?=?df["姓名"].str.strip()
df["姓名"].str.len()

15.findall

利用正則表達(dá)式，去字符串中匹配，返回查找結(jié)果的列表

findall使用正則表達(dá)式，做數(shù)據(jù)清洗，真的很香！

df["身高"]
df["身高"].str.findall("[a-zA-Z]+")

16.extract/extractall

接受正則表達(dá)式，抽取匹配的字符串(一定要加上括號(hào))

df["身高"].str.extract("([a-zA-Z]+)")
#?extractall提取得到復(fù)合索引
df["身高"].str.extractall("([a-zA-Z]+)")
#?extract搭配expand參數(shù)
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True

以上就是Python Pandas數(shù)據(jù)處理高頻操作詳解的詳細(xì)內(nèi)容，更多關(guān)于Python Pandas數(shù)據(jù)處理的資料請關(guān)注腳本之家其它相關(guān)文章！

您可能感興趣的文章: