继续上次的文章,我们找到了json的数据包,这就证明我们可以获取到它们的数据。
点击Headers,Request URL对应的就是json数据的url。
找到url之后我们就可以开始爬虫了。
import requests
import json
import jsonpath
if __name__ == '__main__':
    # URL of the JSON endpoint (found via the browser's network panel, see article above).
    url_ = 'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0'
    # Request headers: some sites are strict, so send them along.
    headers_ = {
        # Cookie: differs per device/browser session; you can reuse this one.
        'Cookie':'ll="118288"; bid=WNmW8xeE1WU; _vwo_uuid_v2=D518432420B0EFE08F30999B3E1F63F8A|3647f77f1dd3acd1718a37891ea18bce; _ga=GA1.2.1331951364.1626512396; gr_user_id=eebfffd2-255f-4ee9-9520-55358f3a07a3; __yadk_uid=WtfdGVdYe4Pqxe79Jj9YlozCai1av1xW; ct=y; dbcl2="244686947:nV4+FwTXi9o"; push_doumail_num=0; push_noty_num=0; __utmv=30149280.24468; __gads=ID=542d39b73c948918-2220fbcc0fcb0001:T=1626512409:RT=1629436017:S=ALNI_MaE_YW5Tpnl7s-9BTDiCmS4Jvb0xA; douban-fav-remind=1; ck=RR8z; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1631605636%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DK3049iyI9I-pP5SJ_IYWHAb80TU6ErcNeJVnX1v0tD_wBa1pKgPC_MOuXS67bX82%26wd%3D%26eqid%3Dd041a3df00020d6b0000000661405380%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1331951364.1626512396.1631591847.1631605636.25; __utmb=30149280.0.10.1631605636; __utmc=30149280; __utmz=30149280.1631605636.25.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.411956023.1626512401.1631591847.1631605636.24; __utmb=223695111.0.10.1631605636; __utmc=223695111; __utmz=223695111.1631605636.24.20.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=32db886b983a2094.1626512401.23.1631605642.1631592297.',
        # Referer: claims we navigated here from Douban itself (the server can't tell).
        'Referer':'https://movie.douban.com/',
        # User-Agent: identifies the browser/OS making the request.
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
    }
    # Send the request and fail fast on an HTTP error status instead of
    # trying to parse an error page as JSON.
    response_ = requests.get(url_, headers=headers_)
    response_.raise_for_status()
    # Parse the response body straight into Python objects.
    data_ = response_.json()
    # jsonpath.jsonpath returns False (not an empty list) when nothing
    # matches, which would crash len()/zip() below — normalize to [].
    # Extract the movie titles.
    title_name = jsonpath.jsonpath(data_, '$..title') or []
    # Extract the ratings.
    rate_ = jsonpath.jsonpath(data_, '$..rate') or []
    # Extract the movie links.
    url_data = jsonpath.jsonpath(data_, '$..url') or []
    # Save the parsed records locally, one JSON object per line.
    with open('douban.txt', 'w', encoding='utf-8') as f:
        for title, rate, url in zip(title_name, rate_, url_data):
            # Explicit keys: the original keyed the rating by title and the
            # URL by the loop index, producing records like
            # {"某电影": "8.5", "0": "https://..."} — an unusable schema.
            record = {'title': title, 'rate': rate, 'url': url}
            print(record)
            f.write(json.dumps(record, ensure_ascii=False) + ',\n')
下一篇讲如何用jsonpath解析json数据。