import requests
from lxml import etree
import xlwt
# HTTP request headers: send a desktop Chrome User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
}

# One listing URL per page: the `start` query parameter advances in steps of
# 25 (0, 25, ..., 225), which yields ten URLs in total.
urls = [
    'https://movie.douban.com/top250?start={}&filter='.format(offset)
    for offset in range(0, 250, 25)
]

# Accumulators filled by the scraping loop further down the script.
titles = []  # movie titles
point = []   # rating text
count = []   # rating-count text
remark = []  # quote / remark text
def _first_text(node, path):
    """Return the text of the first element matching *path* under *node*.

    An empty string is returned when nothing matches or the matched element
    has no text, so every movie contributes exactly one value to each list.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return matches[0].text or ''


# Fetch every listing page and collect title, rating, rating count and quote
# for each movie on it.  One Session is shared by all requests and is closed
# when the `with` block exits.
with requests.Session() as session:
    for url in urls:
        # The timeout keeps a stalled connection from hanging the script;
        # raise_for_status() surfaces a blocked or failed request instead of
        # silently parsing an error page into zero rows.
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        tree = etree.HTML(response.text)

        # Parse movie by movie (one `div.info` per entry) instead of running
        # four page-wide queries, so an entry without a quote cannot shift
        # the entries after it and the four lists stay index-aligned.
        for info in tree.xpath("//ol[@class='grid_view']//div[@class='info']"):
            # Title: first text-bearing <span> inside the entry's link.
            titles.append(_first_text(info, ".//a/span[text()][1]"))
            # Rating: first text-bearing <span> inside the star block.
            point.append(_first_text(info, ".//div[@class='star']//span[text()][1]"))
            # Rating count: second text-bearing <span> inside the star block.
            count.append(_first_text(info, ".//div[@class='star']//span[text()][2]"))
            # Quote / remark: '' when the entry has none.
            remark.append(_first_text(info, ".//p[@class='quote']//span"))

# Sanity check: the four lengths are expected to be equal.
print(len(titles), len(point), len(count), len(remark))
# Create the workbook and a single sheet named 'Input'.  The second
# add_sheet() parameter is the boolean `cell_overwrite_ok`; it is passed by
# keyword here instead of the truthy file-mode-like string 'w+b'.
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('Input', cell_overwrite_ok=True)

# Header row (row 0): movie title, Douban rating, number of ratings.
column_headers = ['电影名称', '豆瓣评分', '评价人数']
for col, heading in enumerate(column_headers):
    worksheet.write(0, col, heading)

# zip() stops at the shortest list, so a length mismatch can no longer raise
# IndexError halfway through; report it so the truncation is not silent.
if not (len(titles) == len(point) == len(count)):
    print('warning: column lengths differ (titles={}, point={}, count={}); '
          'only complete rows are exported'.format(
              len(titles), len(point), len(count)))

# Data rows start at row 1, directly below the header row.  The quote text
# collected in `remark` is not exported; the sheet has only these three
# columns.
for row, record in enumerate(zip(titles, point, count), start=1):
    for col, value in enumerate(record):
        worksheet.write(row, col, value)

workbook.save(r"E:\PythonExcel\豆瓣top250.xls")
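# Notice: this content was contributed by site users and the copyright belongs to the original author; the hosting site assumes no legal responsibility for it. If you find content suspected of plagiarism or infringement, please contact hwhale#tublm.com (replace # with @ before use).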