Python爬蟲入門案例之爬取二手房源數據
本文重點
- 系統分析網頁性質
- 結構化的數據解析
- csv數據保存
環境介紹
- python 3.8
- pycharm 專業版 >>> 激活碼
#模塊使用
- requests >>> pip install requests
- parsel >>> pip install parsel
- csv
【付費VIP完整版】只要看了就能學會的教程,80集Python基礎入門視頻教學
爬蟲代碼實現步驟: 發送請求 >>> 獲取數據 >>> 解析數據 >>> 保存數據
導入模塊
# Standard library
import csv
import re

# Third-party
import parsel  # HTML parsing: pip install parsel
import requests  # HTTP requests: pip install requests
發送請求, 對于房源列表頁發送請求
# First page (pg1) of the second-hand housing listing to crawl.
url = 'https://bj.lianjia.com/ershoufang/pg1/'

# Request headers: the User-Agent carries basic browser information, so the
# request sent to the server looks like it comes from a browser rather than
# from a Python script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    ),
}

# requests applies no timeout by default; without one a stalled connection
# would block the script indefinitely.
response = requests.get(url=url, headers=headers, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page further down.
response.raise_for_status()
獲取數據
# Show the raw HTML text returned for the listing page.
page_source = response.text
print(page_source)
解析數據
# Wrap the listing-page HTML in a Selector object so it can be queried with
# CSS selectors.
selector_1 = parsel.Selector(response.text)
# Collect the detail-page link of every listing title on the page.
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# NOTE: csv_writer (the csv.DictWriter built in the "save data" step) must
# already exist when this loop runs.
for link in href:
    # requests applies no timeout by default; set one so a stalled detail
    # page cannot hang the whole crawl.
    html_data = requests.get(url=link, headers=headers, timeout=10).text
    selector = parsel.Selector(html_data)

    try:
        title = selector.css('.title h1::text').get()  # title
        area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
        community_name = selector.css('.communityName .info::text').get()  # community
        room = selector.css('.room .mainInfo::text').get()  # layout
        room_type = selector.css('.type .mainInfo::text').get()  # orientation
        # The floor text has the form "<level>/共N層": split on '/' and keep
        # the last piece, then extract the number N with the regex.
        height = selector.css('.room .subInfo::text').get().split('/')[-1]
        height = re.findall(r'共(\d+)層', height)[0]
        sub_info = selector.css('.type .subInfo::text').get().split('/')[-1]  # decoration
        # Elevator is written as-is; it is None when the selector matches nothing.
        Elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
        house_area = selector.css('.content li:nth-child(3)::text').get().replace('㎡', '')  # floor area
        price = selector.css('.price .total::text').get()  # price (unit: 10k yuan)
        date = selector.css('.area .subInfo::text').get().replace('年建', '')  # build year
    except (AttributeError, IndexError):
        # .get() returns None when a selector matches nothing (AttributeError
        # on .split/.replace) and findall() returns [] when the floor text has
        # no number (IndexError). Skip this listing rather than abort the run.
        print(f'skipped {link}: a required field is missing on the detail page')
        continue

    dit = {
        '標(biāo)題': title,
        '市區(qū)': area,
        '小區(qū)': community_name,
        '戶型': room,
        '朝向': room_type,
        '樓層': height,
        '裝修情況': sub_info,
        '電梯': Elevator,
        '面積(㎡)': house_area,
        '價格(萬元)': price,
        '年份': date,
    }
    csv_writer.writerow(dit)
    print(title, area, community_name, room, room_type, height, sub_info,
          Elevator, house_area, price, date, sep='|')
保存數據
# Output file, opened in append mode so rows from earlier runs are kept.
# newline='' lets the csv module control line endings itself.
# The handle stays open because the scraping loop writes through csv_writer;
# call f.close() once all rows have been written.
f = open('二手房數(shù)據(jù).csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=[
    '標(biāo)題',
    '市區(qū)',
    '小區(qū)',
    '戶型',
    '朝向',
    '樓層',
    '裝修情況',
    '電梯',
    '面積(㎡)',
    '價格(萬元)',
    '年份',
])
# In append mode the stream is positioned at the end of the file, so a
# position of 0 means the file is new or empty. Write the header only then;
# otherwise every run would append another header row into the data.
if f.tell() == 0:
    csv_writer.writeheader()
數據可視化
導入所需模塊
# Data handling
import pandas as pd

# Chart types and chart options used by the visualisations below
from pyecharts import options as opts
from pyecharts.charts import Bar, Grid, Line, Map, Pie, Scatter
讀取數據
# Load the listing data into a DataFrame and show its first rows.
csv_path = '鏈家.csv'
df = pd.read_csv(csv_path, encoding='utf-8')
df.head()
各城區二手房數量北京市地圖
# `region` and `count` are not defined in this snippet; they are expected to
# exist already (presumably region names and per-region listing counts —
# verify against the cell that builds them).
new = [name + '區(qū)' for name in region]
# Pair every suffixed region name with its count as [name, value] lists.
region_counts = list(map(list, zip(new, count)))

m = Map()
m.add('', region_counts, '北京')
m.set_global_opts(
    title_opts=opts.TitleOpts(title='北京市二手房各區(qū)分布'),
    visualmap_opts=opts.VisualMapOpts(max_=3000),
)
m.render_notebook()
各城區二手房數量-平均價格柱狀圖
# Round every value of df_price to two decimals for display.
price_values = df_price.values.tolist()
price = [round(value, 2) for value in price_values]

# Additional value axis (200-900 in steps of 100) that the line series is
# attached to through yaxis_index=1.
price_axis = opts.AxisOpts(
    name="價格(萬元)",
    type_="value",
    min_=200,
    max_=900,
    interval=100,
    axislabel_opts=opts.LabelOpts(formatter="{value}"),
)

# Bar series: `count` per `region` (both defined outside this snippet).
bar = Bar()
bar.add_xaxis(region)
bar.add_yaxis('數(shù)量', count, label_opts=opts.LabelOpts(is_show=True))
bar.extend_axis(yaxis=price_axis)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='各城區(qū)二手房數(shù)量-平均價格柱狀圖'),
    tooltip_opts=opts.TooltipOpts(
        is_show=True,
        trigger="axis",
        axis_pointer_type="cross",
    ),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
    ),
    yaxis_opts=opts.AxisOpts(
        name='數(shù)量',
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
)

# Line series with the rounded prices, overlaid on the bar chart.
line2 = Line()
line2.add_xaxis(xaxis_data=region)
line2.add_yaxis(
    series_name="價格",
    yaxis_index=1,
    y_axis=price,
    label_opts=opts.LabelOpts(is_show=True),
    z=10,
)
bar.overlap(line2)

# Place the combined chart in a grid with explicit left/right margins.
grid = Grid()
grid.add(
    bar,
    opts.GridOpts(pos_left="5%", pos_right="20%"),
    is_control_axis_index=True,
)
grid.render_notebook()
# `top_price` is not defined in this snippet; two of its columns are read here.
area0 = top_price['小區(qū)'].values.tolist()
count = top_price['價格(萬元)'].values.tolist()

# NOTE(review): the series name and the x-axis name reuse the label of the
# count charts, while the plotted values come from the price column — confirm
# these labels are intended.
bar = Bar()
bar.add_xaxis(area0)
bar.add_yaxis('數(shù)量', count, category_gap='50%')
bar.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='價格(萬元)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
散點圖
# Scatter plot of two DataFrame columns: floor area on x, price on y.
x_values = df['面積(㎡)'].values.tolist()
y_values = df['價格(萬元)'].values.tolist()

s = Scatter()
s.add_xaxis(x_values)
s.add_yaxis('', y_values)
# Treat the x-axis as a numeric (value) axis.
s.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
s.render_notebook()
房屋朝向占比
# `df_direction` is not defined in this snippet; its index supplies the slice
# names and its values the slice sizes.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()

c1 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"
        ),
    )
    .set_series_opts(
        # ECharts pie label template: {b} = slice name, {c} = value,
        # {d} = percentage. The previous formatter had lost its {b} and {d}
        # placeholders and would have printed literal junk text on every label.
        # `position` is a label option, so it belongs inside LabelOpts rather
        # than being passed as a series-level keyword.
        label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)', position="outside"),
    )
)
c1.render_notebook()
裝修情況/有無電梯玫瑰圖(組合圖)
# `df_fitment` and `df_direction` are not defined in this snippet; the index
# of each supplies category names and the values supply the counts.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
# NOTE(review): the pie below is titled as the elevator chart but is fed from
# df_direction (the same source as the orientation pie) — confirm whether a
# separate elevator series was intended here.
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()

# Horizontal bar chart (axes swapped by reversal_axis) built from df_fitment.
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數(shù)量'),
        title_opts=opts.TitleOpts(
            title='裝修情況/有無電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"
        ),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"
        ),
    )
)

# Rose-type pie, centred at 75% / 65% of the canvas.
c2 = (
    Pie(init_opts=opts.InitOpts(width='800px', height='600px'))
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"
        ),
    )
    .set_series_opts(
        # ECharts pie label template: {b} = slice name, {c} = value,
        # {d} = percentage. The previous formatter had lost its {b} and {d}
        # placeholders. `position` is a label option, so it is set on
        # LabelOpts instead of being passed as a series-level keyword.
        label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)', position="outside"),
    )
)

bar.overlap(c2)
bar.render_notebook()
二手房樓層分布柱狀縮放圖
# `df_floor` is not defined in this snippet; its index gives the x-axis
# categories and its values the bar heights.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()

bar = Bar()
bar.add_xaxis(floor)
bar.add_yaxis('數(shù)量', count)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
    yaxis_opts=opts.AxisOpts(name='數(shù)量'),
    xaxis_opts=opts.AxisOpts(name='樓層'),
    # Slider-type data zoom control for the chart.
    datazoom_opts=opts.DataZoomOpts(type_='slider'),
)
bar.render_notebook()
房屋面積分布縱向柱狀圖
# `df_area` is not defined in this snippet; its index gives the categories
# and its values the bar lengths.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = Bar()
bar.add_xaxis(area)
bar.add_yaxis('數(shù)量', count)
# Swap the axes so the bars run horizontally, with value labels on the right.
bar.reversal_axis()
bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
    yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
    xaxis_opts=opts.AxisOpts(name='數(shù)量'),
)
bar.render_notebook()
到此這篇關于Python爬蟲入門案例之爬取二手房源數據的文章就介紹到這了,更多相關Python 爬取二手房數據內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持腳本之家!
相關文章
Python 如何創建一個簡單的REST接口
這篇文章主要介紹了Python 如何創建一個簡單的REST接口,文中講解非常細致,代碼幫助大家更好的理解和學習,感興趣的朋友可以了解下 2020-07-07
Django app配置多個數據庫代碼實例
這篇文章主要介紹了Django app配置多個數據庫代碼實例,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下 2019-12-12
Python一行代碼識別增值稅發票實現示例
這篇文章主要為大家介紹了Python一行代碼識別增值稅發票實現示例詳解,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2023-03-03
Python+tkinter使用40行代碼實現計算器功能
這篇文章主要為大家詳細介紹了Python+tkinter使用40行代碼實現計算器功能,具有一定的參考價值,感興趣的小伙伴們可以參考一下 2018-01-01