解決Python3 抓取微信賬單信息問題
這段時間有個朋友想導出微信裡面的賬單信息,後來發現微信的反爬蟲還是很厲害的,花了點時間去分析。
一、採用傳統模擬 HTTP 抓取
抓取的主要URL:https://wx.tenpay.com/userroll/userrolllist,後面帶上三個參數,具體參數見代碼。其中 exportkey 這個參數是會過期的;userroll_encryption 和 userroll_pass_ticket 這兩個參數需要從 Cookie 中獲得,應該是作為獲取數據的標識,通過抓包也看不出端倪,應該是微信程序內部生成的。如果使用微信開發者工具登錄後直接訪問網址,有的時候可以訪問並返回數據,但只在較短的時間內有效;而且當返回會話超時後,繼續使用網頁訪問就會被限制,一直提示會話超時,應該是網頁端和移動端的 exportkey 有不同的時間和訪問次數限制。
之後想通過破解 session 的方式,研究了一下,發現這是不可能的:想要破解 session 需要搞定 wx.login,而 wx.login 是微信提供的,破解難度應該不用我說了。
二、解決exportkey 這個key和Cookie的獲取
需要的工具:
1、安卓/蘋果手機
2、Fiddler(抓包工具)
搞過爬蟲的都知道 Fiddler,具體操作就不多說了。設置好代理並開啟 Fiddler 後,抓取 URL 中的 exportkey 和相應的 Cookie,用於接下來的數據抓取。
三、上代碼
代碼寫得不是很好,若有錯誤還望各位大大指正。
# coding:utf-8
import datetime
import time
import urllib
import urllib.request
import json
import sys
import io
import ssl
from DBController import DBController #數(shù)據庫
# Re-wrap stdout so that everything printed is encoded as GB18030
# (the original note calls this "setting the system encoding format").
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
# Work around "untrusted SSL certificate" errors on HTTPS requests by swapping
# the default HTTPS context factory for the unverified one.
# NOTE(review): this relies on private ssl attributes and turns off certificate
# verification for every default-context HTTPS request made by this process --
# presumably needed because traffic goes through the Fiddler proxy; confirm
# before reusing this outside a debugging setup.
ssl._create_default_https_context = ssl._create_unverified_context
class MainCode:
    """Download WeChat Pay bill records and store them through DBController.

    The session cookies (``userroll_encryption`` / ``userroll_pass_ticket``)
    are plain attributes so the caller can overwrite them after construction.
    ``last_item`` holds the pagination cursor (ids and timestamp of the last
    record seen) and ``num`` counts the records processed so far.
    """

    # Column order used for the INSERT statement; every name is also a key of
    # the item dict produced by _build_item().
    _COLUMNS = (
        "bill_id", "bill_type", "classify_type", "fee", "fee_type",
        "out_trade_no", "pay_bank_name", "payer_remark", "remark",
        "order_time", "title", "total_refund_fee", "trans_id", "fee_attr",
    )

    # Labels stored for the fee_attr values returned by the server.
    # "negtive" (sic) is the spelling the original code matched on; the
    # correctly spelled variant is accepted as well in case the API uses it.
    _FEE_ATTR_LABELS = {
        "positive": "收入",
        "negtive": "支出",
        "negative": "支出",
        "neutral": "提現",
    }

    # Characters removed from titles: ASCII comma, full stop and single quote.
    _TITLE_STRIP = str.maketrans("", "", ",.'")

    def __init__(self, url=""):
        """Create the scraper with default cookies and a DBController."""
        self.url = url
        self.dbController = DBController()  # database access helper
        self.userroll_encryption = "uoxQXsCenowxj0G0ppRKBg8iHRPZwZKaUZB0ka1Y5apUuQnKkZTsA/2RMhBPGyMdiHS8QXk8y2JeLgqTPqZPU9fkrCUp+TIQPkHH/uExAwKeBFLute0ztdHaC6GJUJ2+/R8NGWGe16hSKc6L1+LvAw=="
        self.userroll_pass_ticket = "V7oum4glDbdaAwibC8mcuTizGIKmC9A/Y/V12qASuDALdRMveHcRHv1QXamFk27Z"
        self.last_item = {}
        self.num = 0

    def get_html(self, url, maxTryNum=5):
        """Fetch *url* and return the decoded JSON object.

        The request is retried up to *maxTryNum* times, sleeping 5 seconds
        between attempts. Returns ``{}`` when every attempt fails or when the
        response body is not valid UTF-8 JSON.
        """
        # Local import: only this method needs it, and it keeps the block
        # independent of the file-level import list.
        import gzip

        header = {
            "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # urllib does not transparently decompress responses, so only
            # advertise the one encoding that is undone below.
            "Accept-Encoding": 'gzip',
            "Accept-Language": 'zh-CN,zh;q=0.8',
            "Cache-Control": 'max-age=0',
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 Safari/601.1 wechatdevtools/1.02.1810240 MicroMessenger/6.5.7 Language/zh_CN webview/15415760070117398 webdebugger port/32594",
            "Cookie": "userroll_encryption=" + self.userroll_encryption + "; userroll_pass_ticket=" + self.userroll_pass_ticket,
            "Host": "wx.tenpay.com",
            "Upgrade-Insecure-Requests": "1",
        }
        req = urllib.request.Request(url=url, headers=header)

        raw = None
        content_encoding = ""
        for tryNum in range(maxTryNum):
            try:
                with urllib.request.urlopen(req, timeout=5) as resp:
                    raw = resp.read()
                    content_encoding = resp.headers.get("Content-Encoding", "") or ""
                break
            except OSError:
                # HTTPError, URLError and socket timeouts all derive from
                # OSError, so every network-level failure is retried.
                if tryNum < (maxTryNum - 1):
                    print("嘗試連接請求" + str(tryNum + 1))
                    time.sleep(5)
                else:
                    print('Internet Connect Error!', "Error URL:" + url)

        if raw is None:
            print("--------------------------")
            return {}

        try:
            if content_encoding.lower() == "gzip":
                raw = gzip.decompress(raw)
            # UnicodeDecodeError and json.JSONDecodeError are ValueErrors.
            return json.loads(raw.decode('utf-8'))
        except (ValueError, OSError, EOFError) as e:
            print("get_html: response is not valid JSON:", e)
            return {}

    @staticmethod
    def _sql_quote(value):
        """Return str(value) with single quotes doubled for a SQL literal."""
        return str(value).replace("'", "''")

    def save_info_to_db(self, item):
        """Insert *item* into ``wx_order2`` unless its trans_id already exists.

        Insert errors are printed, not raised, so one bad record does not stop
        the run.
        """
        # NOTE(review): DBController is only known to accept a finished SQL
        # string, so values are escaped by doubling single quotes. Switch to
        # parameterized queries if DBController supports them; a backend that
        # treats backslash as an escape character is not covered here.
        select_sql = "SELECT count(*)as num FROM wx_order2 where trans_id = '%s'" % (
            self._sql_quote(item["trans_id"]),
        )
        results = self.dbController.ExecuteSQL_Select(select_sql)
        if int(results[0][0]) != 0:
            return  # already stored

        values = ", ".join(
            "'%s'" % self._sql_quote(item[column]) for column in self._COLUMNS
        )
        sql = "INSERT INTO wx_order2 (%s) VALUES (%s)" % (
            ", ".join(self._COLUMNS),
            values,
        )
        try:
            self.dbController.ExecuteSQL_Insert(sql)
        except Exception as e:
            print("save_info_to_db:", e)

    def _build_item(self, order):
        """Convert one raw record dict from the API into a database row dict."""
        # bill_type codes named by the original author:
        # 1 = payment, 2 = top-up, 4 = transfer, 6 = red packet.
        fee_attr = order['fee_attr']
        return {
            'bill_id': order['bill_id'],
            'bill_type': order['bill_type'],
            'classify_type': order['classify_type'],
            # Amount scaled by 1/100 and kept to two decimals (presumably the
            # API reports fen -- confirm).
            'fee': round(order['fee'] / 100, 2),
            'fee_type': order['fee_type'],
            'out_trade_no': order['out_trade_no'],
            'pay_bank_name': order['pay_bank_name'],
            'payer_remark': order['payer_remark'],
            'remark': order['remark'],
            # Unix timestamp -> naive local datetime.
            'order_time': datetime.datetime.fromtimestamp(order['timestamp']),
            'title': order['title'].translate(self._TITLE_STRIP),
            'total_refund_fee': "0",
            'trans_id': order['trans_id'],
            'fee_attr': self._FEE_ATTR_LABELS.get(fee_attr, fee_attr),
        }

    def get_data(self, url, delay=0.2):
        """Fetch one page of bills from *url*, store it, and return its size.

        Each record is converted with _build_item(), the pagination cursor in
        ``last_item`` is advanced, and the record is saved. *delay* is the
        pause in seconds before each save (default matches the original 0.2).
        Returns the number of records on the page, or 0 when the request
        failed or the server did not answer with ``ret_code == 0``.
        """
        res_obj = self.get_html(url)
        # get_html() returns {} on failure, so ret_code may be absent.
        if not res_obj or res_obj.get('ret_code') != 0:
            print(res_obj)
            return 0

        record_list = res_obj.get('record') or []
        self.last_bill_id = res_obj.get('last_bill_id')
        self.last_bill_type = res_obj.get('last_bill_type')
        self.last_create_time = res_obj.get('last_create_time')
        self.last_trans_id = res_obj.get('last_trans_id')

        for order in record_list:
            item = self._build_item(order)
            if item['bill_id'] != '':
                # Remember this record as the cursor for the next page.
                self.last_item['last_bill_id'] = item['bill_id']
                self.last_item['last_bill_type'] = item['bill_type']
                self.last_item['last_create_time'] = order['timestamp']
                self.last_item['last_trans_id'] = item['trans_id']
            try:
                print(str(self.num), self.last_item, end='\n')
                self.num += 1
                time.sleep(delay)
                self.save_info_to_db(item)
            except Exception as e:
                # Best effort: report the failure and continue with the next record.
                print(e, end='\n')
        return len(record_list)
# Create the scraper instance.
maincode = MainCode()
# Session cookies captured with Fiddler; they override the defaults set in __init__.
maincode.userroll_encryption = "6Ow68aKrAz70mEczqeevA2gOXbr9H2a7+2ite6uuyWFdB6j1+SLhlaCNpYA6RjmaOI7IfCi9PXjQsrZPFIs1SMn38Uxr04GJsxMuSO/9wG+eBFLute0ztdHaC6GJUJ2+vmo+JIw351su8RiFxSagwA=="
maincode.userroll_pass_ticket = "i0Co+55KSEjmFjfFZqMG14hasW4qtKFtbj0FiErcSzHY0afkFqHGib3YfsAZWcaG"
# To resume from a page other than the first, pre-seed the cursor, e.g.:
#maincode.last_item['last_bill_id'] = "2ce3d65b20a10700b2048d68"
#maincode.last_item['last_bill_type'] = "4"
#maincode.last_item['last_create_time'] = "1540809516"
#maincode.last_item['last_trans_id'] = "1000050201201810290100731805325"
# Number of records requested per page.
count = "20"
# exportkey must be captured with Fiddler and expires after a while.
# It is already percent-encoded, so it is concatenated as-is.
exportkey = "A%2BsIJaTGZksgZWPLtSKiyos%3D"
# URL of the first page; later pages append the cursor parameters to it.
url = "https://wx.tenpay.com/userroll/userrolllist?classify_type=0&count=" + count + "&exportkey=" + exportkey + "&sort_type=1"
first_page_url = url
cursor_keys = ('last_bill_id', 'last_bill_type', 'last_create_time', 'last_trans_id')
for page in range(0, 10):
    # From the second page on, the request carries the ids and timestamp of
    # the last record of the previous page.
    if page > 0:
        if not all(key in maincode.last_item for key in cursor_keys):
            # No usable cursor was recorded, so the next page cannot be requested.
            break
        url = (
            first_page_url
            + "&last_bill_id=" + str(maincode.last_item['last_bill_id'])
            + "&last_bill_type=" + str(maincode.last_item['last_bill_type'])
            + "&last_create_time=" + str(maincode.last_item['last_create_time'])
            + "&last_trans_id=" + str(maincode.last_item['last_trans_id'])
            + "&start_time=" + str(maincode.last_item['last_create_time'])
        )
        print(url)
    # Number of records returned for the current page.
    this_page_num = maincode.get_data(url)
    # A short page means there is nothing left to fetch.
    if this_page_num < int(count):
        break
    time.sleep(0.5)
print(maincode.last_item)
因為是幫朋友抓取的,能實現就可以了。之後若有需要再繼續優化代碼吧!
總結
以上所述是小編給大家介紹的Python3 抓取微信賬單信息,希望對大家有所幫助,如果大家有任何疑問請給我留言,小編會及時回復大家的。在此也非常感謝大家對腳本之家網站的支持!
如果你覺得本文對你有幫助,歡迎轉載,煩請注明出處,謝謝!
相關文章
Python中的yield關鍵字提高代碼執行效率場景實例探究
在Python編程語言中,yield是一個非常實用的關鍵字,它不僅可以幫助你編寫更加簡潔的代碼,還可以提高代碼的執行效率。本文將詳細介紹yield在Python中的使用方法,並通過示例代碼進行演示,讓我們一起來探索這個強大的關鍵字吧(2024-01-01)
使用PyInstaller將Pygame庫編寫的小游戲程序打包為exe文件及出現(xiàn)問題解決方法
這篇文章主要介紹了使用PyInstaller將Pygame庫編寫的小游戲程序打包為exe文件的方法,給大家介紹了通過Pyinstaller打包Pygame庫寫的小游戲程序出現(xiàn)的問題及解決方法,非常不錯,具有一定的參考借鑒價值,需要的朋友可以參考下2019-09-09
Python3 串口接收與發送16進制數據包的實例
今天小編就為大家分享一篇Python3 串口接收與發送16進制數據包的實例,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧(2019-06-06)
解決Python import .pyd 可能遇到路徑的問題
這篇文章主要介紹了解決Python import .pyd 可能遇到路徑的問題,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧2021-03-03
python常用的各種排序算法原理與實現方法小結
這篇文章主要介紹了python常用的各種排序算法原理與實現方法,結合實例形式總結分析了冒泡排序、插入排序、選擇排序、快速排序等排序算法的相關原理與實現方法,需要的朋友可以參考下(2023-04-04)

