一、采集步骤
(1)网页解析
(2)引入第三方模块
import requests #请求数据
import json #数据解析
from jsonpath import jsonpath #数据解析
import pandas as pd # 数据处理
import random # 随机函数
import time #间隔时间
(2)request请求
def __init__(self, sku, url_template=None):
    """Store the SKU, the endpoint URL and the request headers for one item.

    Args:
        sku: Identifier of the item to query.
        url_template: Optional endpoint URL containing a ``{sku}``
            placeholder. The original listing never assigned ``self.url``
            although ``get_data`` reads it, so the endpoint has to be
            supplied by the caller; when omitted ``self.url`` is ``None``.
    """
    # The SKU used to be dropped on the floor; keep it so the instance knows
    # which item it belongs to.
    self.sku = sku
    self.url = url_template.format(sku=sku) if url_template else None
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36',
        'referer': 'https://item.jd.com/',
    }
def get_data(self, timeout=10):
    """Download the response body for ``self.url`` and return it as text.

    Args:
        timeout: Seconds to wait for the server. Without a timeout a
            stalled connection would block the whole batch run forever.

    Returns:
        The response body decoded with the default codec of
        ``bytes.decode``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(self.url, headers=self.headers, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to
    # the parser.
    response.raise_for_status()
    return response.content.decode()
(3)数据解析
def parse_data(self, data):
    """Turn one JSONP response body into a single-row DataFrame.

    Args:
        data: Response text of the form ``callback({...})``.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns '页面', '赠品名'
        and '时间'. A field that cannot be found holds '无'.

    Raises:
        ValueError: If ``data`` has no parenthesised payload or the payload
            is not valid JSON.
    """
    missing = '无'

    # The payload mixes two layouts, so two lookup helpers are needed:
    # plain values reachable by a path, and lists of objects from which one
    # field is read.
    def first_match(doc, path):
        """Return the first value matching ``path``, or the placeholder."""
        try:
            return jsonpath(doc, path)[0]
        except Exception:
            # Deliberately best-effort: any lookup failure means "absent".
            return missing

    def last_entry_field(doc, path, field):
        """Return ``field`` of the last entry in the list at ``path``.

        NOTE(review): the indentation of the original loop was lost, so it
        is unclear whether the first or the last entry was intended; this
        keeps the last one. An empty list or a missing field yields the
        placeholder.
        """
        try:
            value = missing
            for entry in jsonpath(doc, path)[0]:
                value = entry[field]
            return value
        except Exception:
            # Deliberately best-effort: any lookup failure means "absent".
            return missing

    # Strip the JSONP wrapper by taking everything between the first '(' and
    # the last ')'. Splitting on '({' / '})' would cut the payload short
    # whenever those sequences occur inside a string value.
    payload = data[data.index('(') + 1:data.rindex(')')]
    json_data = json.loads(payload)

    data_price_p = first_match(json_data, '$.price.p')
    data_gift_value = last_entry_field(json_data, '$.promotion.gift', 'value')
    data_dict = {
        '页面': data_price_p,
        '赠品名': data_gift_value,
        # Local wall-clock time at which the row was parsed.
        '时间': time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    return pd.DataFrame(data_dict, index=[0])
(4)数据输入&保存
这一步主要就是实现批量输入和输出
# Batch driver: read the SKUs from the first column of the input workbook,
# collect one row per SKU and write everything gathered to the output workbook.
sku_list = pd.read_excel(r'data/输入文件.xlsx', header=None).iloc[:, 0].to_list()
print('需采集共{}个\n'.format(len(sku_list)))
df_list = []
# enumerate() supplies the running count directly; list.index() rescans the
# list on every call and reports the wrong position for a repeated SKU.
for count, sku in enumerate(sku_list, start=1):
    try:
        price = Price(sku)
        df = price.run()
        df_list.append(df)
        # Random pause of up to three seconds between requests.
        time.sleep(random.random() * 3)
        if count % 10 == 0:
            print('已采集{}条\n'.format(count))
    except (Exception, KeyboardInterrupt) as exc:
        # Stop at the first failure (or Ctrl+C) but keep what was collected;
        # show the cause so the failure is not silent.
        print('共采集{}条,剩余未采集成功\n'.format(count - 1))
        print('失败原因: {!r}\n'.format(exc))
        break
if df_list:
    # One concat over the whole list instead of growing the frame pairwise.
    df_tmp = pd.concat(df_list, join='outer')
    print('全部采集完成\n')
    df_tmp.to_excel(r'data/输出文件.xlsx', index=False)
else:
    # Nothing was collected: there is no frame to save.
    print('未采集到任何数据,未生成输出文件\n')
二、遇到的问题
问题1:response部分数据为空值
问题描述:浏览器页面是有显示满减优惠券的,且在抓包数据里也是存在的,但实际采集的时候返回的是空值
问题原因:经过多次测试后发现是登录问题。这是针对新用户的优惠券,而我在采集时没有设置cookie,所以采集不到数据;设置cookie之后应该可以采集成功(后续实践后更新)
三、源代码
以下是源代码
import requests
import random
import time
import json
from jsonpath import jsonpath
import pandas as pd
class Price(object):
    """Collect the price and promotion fields of one SKU as a one-row table.

    ``run()`` downloads the JSONP response for the SKU and flattens the
    fields listed in ``_FIELDS`` into a single-row ``pandas.DataFrame``.
    """

    # Written into a column when the payload does not provide the field.
    MISSING = '无'

    # Endpoint containing a ``{sku}`` placeholder. The original listing never
    # assigned ``self.url``, so the address is not known here.
    # TODO: fill in the endpoint, or pass ``url_template`` to the constructor.
    URL_TEMPLATE = None

    # (column, JSONPath, field) in extraction order. ``field`` is None for a
    # plain value; otherwise the path addresses a list of objects and
    # ``field`` is read from its entries.
    # NOTE(review): the dict that named the output columns was missing from
    # the original listing. '页面' and '赠品名' are the two names shown in the
    # walkthrough; the others are derived from the original local variable
    # names. Confirm against the expected output workbook.
    _FIELDS = (
        ('ad', '$.adText', None),
        ('coupon_ad', '$.couponInfo', 'addDays'),
        ('coupon_btime', '$.couponInfo', 'beginTime'),
        ('coupon_discount', '$.couponInfo', 'discountText'),
        ('coupon_etime', '$.couponInfo', 'endTime'),
        ('coupon_id', '$.couponInfo', 'couponId'),
        ('coupon_label', '$.couponInfo', 'labelTxt'),
        ('coupon_name', '$.couponInfo', 'name'),
        ('ms', '$.miaoshaInfo.miaosha', None),
        ('ms_title', '$.miaoshaInfo.title', None),
        ('ms_stime', '$.miaoshaInfo.startTime', None),
        ('ms_etime', '$.miaoshaInfo.endTime', None),
        ('ms_op', '$.miaoshaInfo.originPrice', None),
        ('ms_p', '$.miaoshaInfo.promo', None),
        ('price_id', '$.price.id', None),
        ('price_m', '$.price.m', None),
        ('price_op', '$.price.op', None),
        ('页面', '$.price.p', None),
        ('promotion', '$.promotion.activity', 'value'),
        ('gift_id', '$.promotion.gift', 'proId'),
        ('赠品名', '$.promotion.gift', 'value'),
        ('rank', '$.rankUnited.revertItem.name', None),
    )

    def __init__(self, sku, url_template=None):
        """Store the SKU, the endpoint URL and the request headers.

        Args:
            sku: Identifier of the item to query.
            url_template: Optional endpoint URL with a ``{sku}`` placeholder;
                falls back to ``URL_TEMPLATE``. When neither is set,
                ``self.url`` is ``None`` and ``get_data`` raises.
        """
        self.sku = sku
        template = url_template or self.URL_TEMPLATE
        self.url = template.format(sku=sku) if template else None
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36',
            'referer': 'https://item.jd.com/',
        }

    def get_data(self, timeout=10):
        """Download the response body for ``self.url`` and return it as text.

        Args:
            timeout: Seconds to wait for the server, so that a stalled
                connection cannot block the batch run forever.

        Returns:
            The response body decoded with the default codec of
            ``bytes.decode``.

        Raises:
            ValueError: If no endpoint URL was configured.
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        if not self.url:
            raise ValueError(
                'Price.url is not set; pass url_template or define '
                'Price.URL_TEMPLATE'
            )
        response = requests.get(self.url, headers=self.headers, timeout=timeout)
        response.raise_for_status()
        return response.content.decode()

    @staticmethod
    def _strip_jsonp(text):
        """Return the text between the first '(' and the last ')'.

        Splitting on '({' / '})' would cut the payload short whenever those
        sequences occur inside a string value.

        Raises:
            ValueError: If ``text`` contains no parenthesised payload.
        """
        return text[text.index('(') + 1:text.rindex(')')]

    @classmethod
    def _first_match(cls, doc, path):
        """Return the first value matching ``path``, or ``MISSING``."""
        try:
            return jsonpath(doc, path)[0]
        except Exception:
            # Deliberately best-effort: any lookup failure means "absent".
            return cls.MISSING

    @classmethod
    def _last_entry_field(cls, doc, path, field):
        """Return ``field`` of the last entry in the list at ``path``.

        NOTE(review): the indentation of the original loop was lost, so it
        is unclear whether the first or the last entry was intended; this
        keeps the last one. An empty list or a missing field yields
        ``MISSING``.
        """
        try:
            value = cls.MISSING
            for entry in jsonpath(doc, path)[0]:
                value = entry[field]
            return value
        except Exception:
            # Deliberately best-effort: any lookup failure means "absent".
            return cls.MISSING

    def parse_data(self, data):
        """Turn one JSONP response body into a single-row DataFrame.

        Args:
            data: Response text of the form ``callback({...})``.

        Returns:
            A one-row ``pandas.DataFrame`` with one column per entry of
            ``_FIELDS`` plus '时间' (local time of parsing).

        Raises:
            ValueError: If ``data`` has no parenthesised payload or the
                payload is not valid JSON.
        """
        json_data = json.loads(self._strip_jsonp(data))
        data_dict = {}
        for column, path, field in self._FIELDS:
            if field is None:
                data_dict[column] = self._first_match(json_data, path)
            else:
                data_dict[column] = self._last_entry_field(json_data, path, field)
        data_dict['时间'] = time.strftime("%Y-%m-%d %H:%M:%S")
        return pd.DataFrame(data_dict, index=[0])

    def run(self):
        """Fetch the payload for this SKU and return it as a one-row table."""
        raw = self.get_data()
        return self.parse_data(raw)
if __name__ == "__main__":
    # Batch driver: read the SKUs from the first column of the input
    # workbook, collect one row per SKU and write everything gathered to the
    # output workbook.
    sku_list = pd.read_excel(r'data/输入文件.xlsx', header=None).iloc[:, 0].to_list()
    print('需采集共{}个\n'.format(len(sku_list)))
    df_list = []
    # enumerate() supplies the running count directly; list.index() rescans
    # the list on every call and reports the wrong position for a repeated
    # SKU.
    for count, sku in enumerate(sku_list, start=1):
        try:
            price = Price(sku)
            df = price.run()
            df_list.append(df)
            # Random pause of up to three seconds between requests.
            time.sleep(random.random() * 3)
            if count % 10 == 0:
                print('已采集{}条\n'.format(count))
        except (Exception, KeyboardInterrupt) as exc:
            # Stop at the first failure (or Ctrl+C) but keep what was
            # collected; show the cause so the failure is not silent.
            print('共采集{}条,剩余未采集成功\n'.format(count - 1))
            print('失败原因: {!r}\n'.format(exc))
            break
    if df_list:
        # One concat over the whole list instead of growing the frame
        # pairwise.
        df_tmp = pd.concat(df_list, join='outer')
        print('全部采集完成\n')
        df_tmp.to_excel(r'data/输出文件.xlsx', index=False)
    else:
        # Nothing was collected: there is no frame to save.
        print('未采集到任何数据,未生成输出文件\n')
声明:本博客均用于个人学习&分享,有错误欢迎指正