爬取数据1
1、数据来源:https://2020.cctv.com/medal_list/index.shtml
数据为下面图片的表格数据
2、具体代码
2.1需要提前下载好的pip install 库名
from selenium import webdriver
import lxml.html
import csv
2.2获取网页完整代码
#自动打开chrome,获取代码
driver = webdriver.Chrome('D:\\数据分析\\chromedriver_win32\\chromedriver.exe')
driver.get('https://2020.cctv.com/medal_list/index.shtml')
content=driver.page_source
driver.quit() #关闭chrome
2.3本地创建 .csv,存储爬下来的数据
f = open('nation_data.csv', 'w' , encoding='utf-8')
csv_writer = csv.writer(f)
#表头
csv_writer.writerow((["排名", "国家", "金牌", "银牌", "铜牌", "总数"]))
2.4定位xpath,写入文件
metree = lxml.html.etree
parser = metree.HTML(content)
td_list = parser.xpath("/html/body/div[3]/div/div/div/div[1]/div[3]/div[2]/div/div[3]/table/tbody[@id='medal_list1']//tr")
for td_item in td_list:
num_item= td_item.xpath('.//text()')
csv_writer.writerow(num_item)
f.close() #关闭文件
2.5.csv无乱码转换,请看我的另一篇文章https://blog.csdn.net/weixin_44394124/article/details/120097063?spm=1001.2014.3001.5501
爬取出来的部分数据如图所示
数据爬取2
1、爬取每个国家的金、银、铜牌奖牌榜,如下图(中国和美国)
页面来源:
https://2020.cctv.com/medal_list/details/index.shtml?countryid=USA
https://2020.cctv.com/medal_list/details/index.shtml?countryidCHN
https://2020.cctv.com/medal_list/details/index.shtml?countryidJPN
可以看出除了countryid={}不同,其他都相同
2、三字母国家代码
(http://api.cntv.cn/olympic/getOlyMedals?serviceId=pcocean&itemcode=GEN-------------------------------&t=jsonp&cb=banomedals)来源,如下图
复制到nation.txt文件
然后通过
import pandas as pd
file = open('D:\\Olympic_Games\\nation.txt', 'r', encoding='utf-8')
strfile = file.read()
s1=strfile.split('},{')
nat_list=[]
for i in s1:
nat_list.append(i[-4:-1])
file.close()
#nation_data最加一列
csv_data = pd.read_csv('nation_data.csv', low_memory = False)#example.csv是需要被追加的CSV文件,low_memory防止弹出警告
csv_df = pd.DataFrame(csv_data)
csv_df['nationid'] = nat_list
csv_df.to_csv('nations_data.csv', index = None) #追加后生成的文件为out_csv
nat_list复制到nat.txt,文本内容为
‘USA’, ‘CHN’, ‘JPN’, ‘GBR’, ‘ROC’, ‘AUS’, ‘NED’, ‘FRA’, ‘GER’, ‘ITA’, ‘CAN’, ‘BRA’, ‘NZL’, ‘CUB’, ‘HUN’, ‘KOR’, ‘POL’, ‘CZE’, ‘KEN’, ‘NOR’, ‘JAM’, ‘ESP’, ‘SWE’, ‘SUI’, ‘DEN’, ‘CRO’, ‘IRI’, ‘SRB’, ‘BEL’, ‘BUL’, ‘SLO’, ‘UZB’, ‘GEO’, ‘TPE’, ‘TUR’, ‘GRE’, ‘UGA’, ‘ECU’, ‘ISR’, ‘IRL’, ‘QAT’, ‘KOS’, ‘BAH’, ‘UKR’, ‘BLR’, ‘ROU’, ‘VEN’, ‘IND’, ‘HKG’, ‘PHI’, ‘SVK’, ‘RSA’, ‘AUT’, ‘EGY’, ‘INA’, ‘POR’, ‘ETH’, ‘TUN’, ‘EST’, ‘THA’, ‘FIJ’, ‘LAT’, ‘BER’, ‘PUR’, ‘MAR’, ‘COL’, ‘AZE’, ‘DOM’, ‘ARM’, ‘KGZ’, ‘MGL’, ‘ARG’, ‘SMR’, ‘JOR’, ‘MAS’, ‘NGR’, ‘TKM’, ‘MKD’, ‘NAM’, ‘LTU’, ‘BRN’, ‘KSA’, ‘KAZ’, ‘MEX’, ‘FIN’, ‘KUW’, ‘CIV’, ‘GHA’, ‘SYR’, ‘BUR’, ‘GRN’, ‘MDA’, ‘BOT’
并把这些数据插入到nations_data.csv,如图
3、`把每个国家的金银铜名单汇总成三个表
from selenium import webdriver
import lxml.html
import csv
#先创建好gold.csv、silver.csv、bronze.csv
for str in [‘gold.csv’,‘silver.csv’,‘bronze.csv’]:
g = open(str, ‘w’, encoding=‘utf-8’)
csv_writer = csv.writer(g)
csv_writer.writerow(([“日期”, “项目”, “获得者”, ‘名次’, ‘countryid’]))
g.close()
file1 = open(‘D:\Olympic_Games\nat.txt’, ‘r’, encoding=‘utf-8’)
file1_list=file1.read()
file1.close()
p = file1_list.replace(’’’, ‘’)
c = p.replace(’ ‘, ‘’)
ls = c.split(’,’)
for i in ls:
print(i)
driver = webdriver.Chrome(‘D:\数据分析\chromedriver_win32\chromedriver.exe’)
#获奖金银铜名单
driver.get(‘https://2020.cctv.com/medal_list/details/index.shtml?countryid={}’.format(i))
content = driver.page_source
driver.quit()
metree = lxml.html.etree
parser = metree.HTML(content)
#金牌
gold_list = parser.xpath("/html/body/div[3]/div/div/div/div[1]/div[3]/div[2]/div/div[2]/div[1]/table/tbody[@id='gold']//tr")
f1 = open('gold.csv', 'a+', encoding='utf-8')
csv_writer = csv.writer(f1)
for td_item in gold_list:
num_item = td_item.xpath('.//text()')
csv_writer.writerow([num_item[0], num_item[1], num_item[2], 1, i])
f1.close()
#银牌
silver_list = parser.xpath("/html/body/div[3]/div/div/div/div[1]/div[3]/div[2]/div/div[2]/div[2]/table/tbody[@id='silver']//tr")
f2 = open('silver.csv', 'a+', encoding='utf-8')
csv_writer = csv.writer(f2)
for td_item in silver_list:
num_item = td_item.xpath('.//text()')
csv_writer.writerow([num_item[0], num_item[1], num_item[2], 2, i])
f2.close()
#铜牌
bronze_list = parser.xpath("/html/body/div[3]/div/div/div/div[1]/div[3]/div[2]/div/div[2]/div[3]/table/tbody[@id='bronze']//tr")
f3 = open('bronze.csv', 'a+', encoding='utf-8')
csv_writer = csv.writer(f3)
for td_item in bronze_list:
num_item = td_item.xpath('.//text()')
csv_writer.writerow([num_item[0], num_item[1], num_item[2], 3, i])
f3.close()
表1金牌gold.csv
表2银牌silver .csv
表3铜牌bronze.csv
略
数据处理
1、三个金银铜表,通过表拼接,形成一个大表
(csv转化xlsx,方便之间excel表处理数据)
2、大表和nations.xlsx左链接(nation_data.csv转化为nations.xlsx)
import pandas as pd
gold=pd.read_excel('golds.xlsx')
silver=pd.read_excel('silvers.xlsx')
bronze=pd.read_excel('bronzes.xlsx')
#清洗空行
gold=gold.dropna(axis=0,how='any')
silver=silver.dropna(axis=0,how='any')
bronze=bronze.dropna(axis=0,how='any')
#写进表里面
gold.to_excel('gold.xlsx')
silver.to_excel('silver.xlsx')
bronze.to_excel('bronze.xlsx')
#三个表行拼接为
su=gold.append(silver).append(bronze) #三个表行拼接
#存进表里面
su.to_excel('./su.xlsx')
#国家金牌表
nations=pd.read_excel('D:\\Olympic_Games\\nations.xlsx')
#左外连接
itemsum=pd.merge(nations,su,left_on="countryid",right_on="countryid",how="left")#左表,右表,左表标识列,右表标识列,左/右
#存进一个大数据表
itemsum.to_excel('./itemtoge.xlsx')
如图
3、
由于世界地图需要知道国家的英文名称,所以通过查询百度百科以及excel一系统操作,得到表china_eng.xlsx
2、
china_eng.xlsx和itemtoge.xlsx链接
#世界各国和地区名称代码、百度百科
left_table = pd.read_excel('D:\\Olympic_Games\\datas\\china-eng.xlsx')
right_table = pd.read_excel('D:\\Olympic_Games\\nations.xlsx')
#清洗空行
left_table=left_table.dropna(axis=0, how='any')
#链接
left_right = pd.merge(
left_table,
right_table,
left_on='三字母代码',
right_on='countryid',
)
print(left_right)
left_right.to_excel('nation-eng.xlsx')
生成nation-eng.xlsx
数据可视化
1、世界地图
准备
from pyecharts import options as opts
from pyecharts.charts import Map
import pandas as pd
import os
datas=pd.read_excel('nation-eng.xlsx')
datas['总数'] = datas['总数'].astype('float')
# 基础数据
value = datas['总数']
attr = datas['英文简称']
data = []
for index in range(len(attr)):
city_ionfo = [attr[index], value[index]]
data.append(city_ionfo)
# 打开html
c = (
Map()
.add("世界地图", data, "world")
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(
title_opts=opts.TitleOpts(title="奖牌总数"),
visualmap_opts=opts.VisualMapOpts(max_=200),
)
.render()
)
# 打开html
os.system("render.html")
地图展示
2、多系列柱状图
排行前五的国家,金牌数量、银牌数量、铜牌数量拼接展示
import matplotlib.pyplot as plt
import pandas as pd
datas =pd.read_excel('D:\\Olympic_Games\\nations.xlsx')
x = datas['国家'][0:5]
y1 = datas['金牌'][0:5]
y2 = datas['银牌'][0:5]
y3 = datas['铜牌'][0:5]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
bar_width=0.2
plt.bar(y3.index,y3,color='orange',width=bar_width)
plt.bar(y2.index+bar_width,y2,color='yellow',width=bar_width)
plt.bar(y1.index+bar_width+bar_width,y1,color='red',width=bar_width)
plt.legend(('银牌','铜牌','金牌'))
plt.xticks(x.index+ 3 * bar_width / 3,x)
plt.show()
3、美国获取金牌数量、银牌数量、铜牌数量按时间累计
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('D:\\Olympic_Games\\datas\\itemtoge.xlsx')
usa_gold=data[(data['名次']==1)&(data['国家']=='美国')]
usa_silver=data[(data['名次']==2)&(data['国家']=='美国')]
usa_bronze=data[(data['名次']==3)&(data['国家']=='美国')]
plt.rcParams['font.sans-serif']=['SimHei']#用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False#用来正常显示负号
#x
x=usa_gold.sort_values(by='日期')['日期'].drop_duplicates(keep="first")
x2=usa_bronze.sort_values(by='日期')['日期'].drop_duplicates(keep="first")
x3=usa_silver.sort_values(by='日期')['日期'].drop_duplicates(keep="first")
#日期所对应的累加的银牌
def count_num(usa_three):
group = usa_three.groupby('日期')
s = group.size()
sum = 0
y = []
for n in range(len(s)):
sum = s[n] + sum
y.append(sum)
return y
#日期所对应的累加的金牌、银牌、铜牌
data_gold=count_num(usa_gold)
data_silver=count_num(usa_silver)
data_bronze=count_num(usa_bronze)
plt.figure(figsize=(15,10))
#折线图
plt.title('美国')
plt.xlabel('时间')
plt.ylabel('数量')
plt.plot(x, data_gold, marker='o', markersize=3) # 绘制折线图,添加数据点,设置点的大小
plt.plot(x2, data_bronze, marker='o', markersize=3)
plt.plot(x3, data_silver, marker='o', markersize=3)
plt.legend(['金牌', '银牌', '铜牌']) # 设置折线名称
plt.show() # 显示折线图