python爬蟲獲取淘寶天貓商品詳細(xì)參數(shù)
首先我是從淘寶進(jìn)去,按綜合、銷量分別爬取了所有(100頁(yè))女裝的列表信息,然后導(dǎo)出按銷量排序的前100商品的 link,爬取其詳細(xì)信息。這些商品有淘寶的,也有天貓的,這兩個(gè)平臺(tái)有些區(qū)別,處理的時(shí)候要注意。比如,有的說(shuō)“面料”、有的說(shuō)“材質(zhì)成分”,其實(shí)是一個(gè)意思,等等??梢匀〔煌逆溄幼鲆幌聹y(cè)試。
# -*- coding: utf-8 -*-
"""Scrape detail-page attributes of Taobao / Tmall women's-clothing items.

Reads product links from ``women_clothes_sales2.csv`` and writes one row per
item to ``clothes_detai.csv``.  Taobao and Tmall detail pages use different
CSS selectors and partly different attribute names (e.g. Taobao says '面料'
where Tmall says '材質(zhì)成分'), so each platform gets its own header parser
while ``get_Details`` normalizes the attribute list of both.
"""
import csv
import re
from collections import OrderedDict

import requests
from bs4 import BeautifulSoup
import bs4
import json
import urllib
import pymysql
from config import *  # NOTE(review): star import kept for compatibility — confirm what it provides
from pyquery import PyQuery as pq  # parses the rendered page source
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd

# A real browser is required because price/sales counters are rendered by
# JavaScript; plain requests only sees the static attribute list.
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)


def get_tianmao_header(url):
    """Return an OrderedDict of title/prices/sales/rating count from a Tmall page.

    Loads *url* in the shared selenium browser and scrapes the rendered DOM.
    """
    browser.get(url)
    doc = pq(browser.page_source)
    items = doc('#page')
    info = OrderedDict()  # all information scraped for this item
    info['寶貝'] = items.find('.tb-detail-hd').find('h1').text()
    info['促銷價(jià)'] = items.find('#J_PromoPrice').find('.tm-promo-price').find('.tm-price').text()
    info['原價(jià)'] = items.find('#J_StrPriceModBox').find('.tm-price').text()
    # The indicator panel text contains several numbers; keep only the first token.
    info['月銷量'] = items.find('.tm-ind-panel').find('.tm-indcon').find('.tm-count').text().split(' ', 2)[0]
    info['累計(jì)評(píng)價(jià)'] = items.find('#J_ItemRates').find('.tm-indcon').find('.tm-count').text()
    return info


def get_taobao_header(url):
    """Return an OrderedDict of title/prices/sales/rating count from a Taobao page.

    Same contract as :func:`get_tianmao_header`, but with Taobao's selectors.
    """
    browser.get(url)
    doc = pq(browser.page_source)
    items = doc('#page')
    info = OrderedDict()  # all information scraped for this item
    info['寶貝'] = items.find('#J_Title').find('h3').text()
    info['原價(jià)'] = items.find('#J_StrPrice').find('.tb-rmb-num').text()
    info['促銷價(jià)'] = items.find('#J_PromoPriceNum').text()
    info['月銷量'] = items.find('#J_SellCounter').text()
    info['累計(jì)評(píng)價(jià)'] = items.find('#J_RateCounter').text()
    return info


# "名稱: 值" — split each attribute <li> on the first colon; [\s]* eats the
# leading spaces that the pages put after the colon.
_ATTR_RE = re.compile(r'(.*?):[\s]*(.*)')

# Attributes copied verbatim into the result when present (else 'NA').
_DIRECT_KEYS = (
    '流行元素', '年份季節(jié)', '袖長(zhǎng)', '銷售渠道類型', '貨號(hào)', '服裝版型',
    '衣長(zhǎng)', '領(lǐng)型', '袖型', '品牌', '服裝款式細(xì)節(jié)', '適用年齡', '通勤',
    '裙長(zhǎng)', '裙型', '腰型', '尺碼', '組合形式', '褲長(zhǎng)',
)

# Output key -> candidate source keys; the first candidate found wins.
# Taobao and Tmall (and the中老年 sub-category) name these differently.
_ALIASED_KEYS = (
    ('材質(zhì)成分', ('材質(zhì)成分', '面料')),
    ('圖案', ('圖案', '中老年女裝圖案')),
    ('風(fēng)格', ('風(fēng)格', '中老年風(fēng)格')),
    # BUGFIX: the original code set info['主要顏色'] from '主要顏色' and then
    # unconditionally overwrote it with '顏色分類' (or 'NA'), losing the value
    # whenever only '主要顏色' was present.  Now first match wins.
    ('主要顏色', ('主要顏色', '顏色分類')),
)


def get_Details(attrs, info):
    """Merge the attribute <li> elements of a detail page into *info*.

    :param attrs: elements like ``<li title=" 薄">厚薄: 薄</li>`` (bs4 tags);
                  each element's text is split on the first colon.
    :param info:  OrderedDict to extend (the header info of the item).
    :returns: *info*, with every expected column present — attributes the
              page lacks are recorded as ``'NA'`` so all CSV rows align.
    """
    allattrs = OrderedDict()  # every name/value pair found on this page
    for attr in attrs:
        match = _ATTR_RE.search(attr.text)
        if match:  # skip malformed entries instead of crashing on .group(None)
            allattrs[match.group(1)] = match.group(2)

    for key in _DIRECT_KEYS:
        info[key] = allattrs.get(key, 'NA')

    for out_key, candidates in _ALIASED_KEYS:
        for cand in candidates:
            if cand in allattrs:
                info[out_key] = allattrs[cand]
                break
        else:
            info[out_key] = 'NA'
    return info


def main():
    """Scrape the first 100 links from women_clothes_sales2.csv into clothes_detai.csv."""
    fieldnames = [
        'Link', 'Brand', 'Title', 'Price', 'Sale price', 'Sales', 'Evaluations',
        'Component', 'Fashion elements', 'Sleeve', 'Seasons', 'Sales channels',
        'Number', 'Clothes_Style', 'Long', 'Collar type', 'Sleeve type',
        'Skirt type', 'Skirt length', 'Waist', 'Combining form', 'Outseam',
        'Design', 'Fashion pattern detail', 'Applicable age',
        'Style', 'Commuter', 'color', 'Size',
    ]
    with open('clothes_detai.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        f = pd.read_csv('women_clothes_sales2.csv')
        urls = f['link'][0:100]
        for i in urls:
            url = 'http:' + i  # stored links are protocol-relative ('//...')
            # Taobao item links end with '#detail'; Tmall links do not.
            if url.endswith('detail'):
                info = get_taobao_header(url)
                selector = '.attributes-list li'  # Taobao: class-based
            else:
                info = get_tianmao_header(url)
                selector = '#J_AttrUL li'         # Tmall: id-based
            # The attribute list is static HTML, so plain requests suffices here.
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")
            attrs = soup.select(selector)
            d = get_Details(attrs, info)
            print(d)
            writer.writerow({
                'Link': url, 'Brand': d['品牌'], 'Title': d['寶貝'],
                'Price': d['原價(jià)'], 'Sale price': d['促銷價(jià)'],
                'Sales': d['月銷量'], 'Evaluations': d['累計(jì)評(píng)價(jià)'],
                'Component': d['材質(zhì)成分'], 'Fashion elements': d['流行元素'],
                'Sleeve': d['袖長(zhǎng)'], 'Seasons': d['年份季節(jié)'],
                'Sales channels': d['銷售渠道類型'], 'Number': d['貨號(hào)'],
                'Clothes_Style': d['服裝版型'], 'Long': d['衣長(zhǎng)'],
                'Collar type': d['領(lǐng)型'], 'Sleeve type': d['袖型'],
                'Skirt type': d['裙型'], 'Skirt length': d['裙長(zhǎng)'],
                'Waist': d['腰型'], 'Combining form': d['組合形式'],
                'Outseam': d['褲長(zhǎng)'], 'Design': d['圖案'],
                'Fashion pattern detail': d['服裝款式細(xì)節(jié)'],
                'Applicable age': d['適用年齡'], 'Style': d['風(fēng)格'],
                'Commuter': d['通勤'], 'color': d['主要顏色'],
                'Size': d['尺碼'],
            })


if __name__ == '__main__':
    main()
以上就是本文的全部?jī)?nèi)容,希望對(duì)大家的學(xué)習(xí)有所幫助,也希望大家多多支持腳本之家。
- 一個(gè)簡(jiǎn)單的python爬蟲程序 爬取豆瓣熱度Top100以內(nèi)的電影信息
- python書籍信息爬蟲實(shí)例
- python爬蟲爬取網(wǎng)頁(yè)表格數(shù)據(jù)
- python爬蟲爬取淘寶商品信息(selenum+phontomjs)
- python爬蟲爬取淘寶商品信息
- Python爬蟲使用Selenium+PhantomJS抓取Ajax和動(dòng)態(tài)HTML內(nèi)容
- Python即時(shí)網(wǎng)絡(luò)爬蟲項(xiàng)目啟動(dòng)說(shuō)明詳解
- 零基礎(chǔ)寫python爬蟲之爬蟲編寫全記錄
- Python爬蟲框架Scrapy安裝使用步驟
- python2.7實(shí)現(xiàn)爬蟲網(wǎng)頁(yè)數(shù)據(jù)
相關(guān)文章
python 實(shí)現(xiàn)二維字典的鍵值合并等函數(shù)
今天小編就為大家分享一篇python 實(shí)現(xiàn)二維字典的鍵值合并等函數(shù),具有很好的參考價(jià)值,希望對(duì)大家有所幫助。一起跟隨小編過(guò)來(lái)看看吧(2019-12-12)
Python列表reverse()函數(shù)使用方法詳解
這篇文章主要詳細(xì)介紹了Python列表reverse()函數(shù)使用方法,文章通過(guò)代碼示例講解的非常詳細(xì),對(duì)我們的學(xué)習(xí)或工作有一定的幫助,需要的朋友可以參考下(2023-07-07)
python利用Appium實(shí)現(xiàn)自動(dòng)控制移動(dòng)設(shè)備并提取數(shù)據(jù)功能
這篇文章主要介紹了python利用Appium自動(dòng)控制移動(dòng)設(shè)備并提取數(shù)據(jù),本文以控制抖音app滑動(dòng)并獲取抖音短視頻發(fā)布者為例,通過(guò)實(shí)例代碼給大家介紹的非常詳細(xì),需要的朋友可以參考下(2021-09-09)
Django中針對(duì)基于類的視圖添加csrf_exempt實(shí)例代碼
這篇文章主要介紹了Django中針對(duì)基于類的視圖添加csrf_exempt實(shí)例代碼,分享了相關(guān)代碼示例,小編覺(jué)得還是挺不錯(cuò)的,具有一定借鑒價(jià)值,需要的朋友可以參考下(2018-02-02)