爬取目标:腾讯新闻,疫情数据。
确定信息。
在目标网页按 F12 打开开发者工具,先在全局数据中查找,确认其中包含目标数据;接着在 Network 面板中找到对应的请求,并在 Headers 中获取请求链接地址。也可以直接使用搜索功能查找(较为方便)。省份数据和全球数据也用同样的方法获取(注意图二)。
最终确定国内数据的接口地址为:https://api.inews.qq.com/newsqa/v1/query/pubished/daily/list?
编写代码
导入库
import json
import requests#爬虫模块
import pandas as pd #数据处理模块
爬取程序。
# All three endpoints share the same getOnsInfo base and differ only in the
# value of the `name` query parameter.
_ONS_INFO_BASE = "https://view.inews.qq.com/g2/getOnsInfo?name="

# Nationwide epidemic data.
chinatotal_url = _ONS_INFO_BASE + "disease_other"
chinatotal_h5_url = _ONS_INFO_BASE + "disease_h5"
# Request URL for province/region information.
province_city_url = _ONS_INFO_BASE + "wuwei_ww_city_list_order"
def getChinaTotalData(chinatotal_url, timeout=10):
    """Fetch the endpoint at *chinatotal_url* and return its decoded payload.

    The response body is parsed as JSON; its ``'data'`` field is itself a
    JSON-encoded string and is decoded a second time before being returned.

    Args:
        chinatotal_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The object obtained by JSON-decoding the response's ``'data'`` field.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(chinatotal_url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    # Do not call this repeatedly: overly frequent requests may get the
    # client identified as a crawler.
    return json.loads(response.json()['data'])
def getChinaAbsData(chinatotal_h5_url, timeout=10):
    """Fetch *chinatotal_h5_url* and return the ``'chinaTotal'`` entry.

    The response body is parsed as JSON; its ``'data'`` field is itself a
    JSON-encoded string and is decoded a second time.

    Args:
        chinatotal_h5_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The value stored under ``'chinaTotal'`` in the decoded payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(chinatotal_h5_url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    payload = json.loads(response.json()['data'])
    return payload['chinaTotal']
def getProvinceTotalData(chinatotal_h5_url, timeout=10):
    """Fetch *chinatotal_h5_url* and return the first area's child entries.

    The response body is parsed as JSON; its ``'data'`` field is itself a
    JSON-encoded string and is decoded a second time.

    Args:
        chinatotal_h5_url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        ``payload['areaTree'][0]['children']`` -- presumably one entry per
        province of the first (country-level) node; confirm against the API.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(chinatotal_h5_url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    payload = json.loads(response.json()['data'])
    return payload['areaTree'][0]['children']
# Download the raw payloads used by the rest of the script.
all_data = getChinaTotalData(chinatotal_url)
abs_data = getChinaAbsData(chinatotal_h5_url)

# Cumulative nationwide series. Original author's note: the series starts
# from Jan 03, and the site only serves the most recent thirty days.
chinaDayList = all_data["chinaDayList"]

# Never populated in this section; kept so the name still exists.
date_list1 = []

# Each record's 'date' is split into its first two characters and everything
# after the third character, then prefixed with the record's 'y' field.
year_day_list1 = [
    row["y"] + "/" + row['date'][:2] + "/" + row['date'][3:]
    for row in chinaDayList
]
total_confirm = [int(row['confirm']) for row in chinaDayList]
total_suspect = [int(row['suspect']) for row in chinaDayList]
total_dead = [int(row['dead']) for row in chinaDayList]
total_heal = [int(row['heal']) for row in chinaDayList]
total_importedCase = [int(row['importedCase']) for row in chinaDayList]
# Day-by-day increments (original author's note: the series starts from
# Jan 03).
chinaDayAddList = all_data["chinaDayAddList"]

# Dates are rebuilt as "<y>/<first two chars of date>/<rest after 3rd char>".
date_list2 = [
    rec["y"] + "/" + rec['date'][:2] + "/" + rec['date'][3:]
    for rec in chinaDayAddList
]
everyday_confirm = [int(rec['confirm']) for rec in chinaDayAddList]
everyday_suspect = [int(rec['suspect']) for rec in chinaDayAddList]
everyday_dead = [int(rec['dead']) for rec in chinaDayAddList]
everyday_heal = [int(rec['heal']) for rec in chinaDayAddList]
everyday_importedCase = [int(rec['importedCase']) for rec in chinaDayAddList]
# Per-province figures.
# NOTE(review): this requests chinatotal_h5_url a second time; the same URL
# was already fetched for abs_data.
areaTree = getProvinceTotalData(chinatotal_h5_url)

province_name = [entry['name'] for entry in areaTree]
province_total_confirm = [entry['total']['confirm'] for entry in areaTree]
# Suspect counts are not collected here; the list is intentionally left empty.
province_total_suspect = []
province_total_dead = [entry['total']['dead'] for entry in areaTree]
province_total_heal = [entry['total']['heal'] for entry in areaTree]
province_today_confirm = [entry['today']['confirm'] for entry in areaTree]
将数据转化为表格(pandas)。
# Nationwide cumulative table: (column label, values) pairs in display order.
cumulative_columns = [
    ('日期', year_day_list1),
    ('累计确诊', total_confirm),
    ('疑似病例', total_suspect),
    ('累计死亡', total_dead),
    ('累计治愈', total_heal),
    ('累计境外输入病例', total_importedCase),
]
china_total_data = pd.DataFrame(dict(cumulative_columns))
# Notebook-style preview of the last ten rows.
china_total_data.tail(10)
# Nationwide daily-increment table: (column label, values) pairs in display
# order.
daily_columns = [
    ('日期', date_list2),
    ('今日确诊', everyday_confirm),
    ('疑似病例', everyday_suspect),
    ('今日死亡', everyday_dead),
    ('今日_治愈', everyday_heal),
    ('今日_境外输入病例', everyday_importedCase),
]
china_daily_data = pd.DataFrame(dict(daily_columns))
# Notebook-style preview of the last rows (pandas default count).
china_daily_data.tail()
# Per-province table: (column label, values) pairs in display order.
# No suspect-count column is included.
province_columns = [
    ('省份', province_name),
    ('今日新增', province_today_confirm),
    ('累计确诊', province_total_confirm),
    ('死亡', province_total_dead),
    ('治愈', province_total_heal),
]
province_total_data = pd.DataFrame(dict(province_columns))
# Notebook-style preview of the first rows (pandas default count).
province_total_data.head()
文件展示