# -*- coding: utf-8 -*-
import requests
import re,os
# Scrape the Douban Top 250 movie ranking.
def douban_ranking(start):
    """Fetch one ranking page and return regex matches for the movies on it.

    Relies on the module-level names ``url`` and ``headers`` being defined
    before the call (the ``__main__`` section of this script sets them).

    Args:
        start: Value sent as the ``start`` query parameter (the offset of
            the first entry on the requested page).

    Returns:
        An iterator of ``re.Match`` objects, each exposing the named groups
        ``video_name``, ``director``, ``year`` and ``score``. ``year`` may
        carry surrounding whitespace; callers strip it.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Build the request.
    param = {
        "start": start,
        "filter": "",
    }
    # A timeout keeps a stalled connection from hanging the script forever.
    respon = requests.get(url, headers=headers, params=param, timeout=10)
    try:
        page_content = respon.text
    finally:
        # Release the connection even if decoding the body fails.
        respon.close()
    # Regular expression; re.S lets ``.`` span the newlines inside one <li>.
    obj = re.compile(r'<li>.*?<span class="title">(?P<video_name>.*?)</span>.*?导演:(?P<director>.*?) .*?<br>(?P<year>.*?) .*?property="v:average">(?P<score>.*?)</span>', re.S)
    # finditer is lazy, but it scans the already-downloaded text, so the
    # closed response does not matter to the caller.
    return obj.finditer(page_content)
if __name__ == '__main__':
    # Output file; each run starts from an empty file.
    path = r'C:\Users\wyb\Desktop\python\BBtest\aa.html'
    # url and headers must stay module-level: douban_ranking() reads them.
    url = "https://movie.douban.com/top250"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"}
    # Opening once in 'w' mode truncates any previous output, replacing the
    # separate exists/remove step and the per-page reopen in append mode.
    with open(path, mode='w', encoding="utf-8") as f:
        # Request offsets 0, 25, ..., 225.
        for start in range(0, 250, 25):
            for movie in douban_ranking(start):
                # One movie per line: name, director, year, score.
                fields = (
                    movie.group("video_name"),
                    movie.group("director"),
                    movie.group("year").strip(),
                    movie.group("score"),
                )
                f.write(' '.join(fields) + '\n')
    print("over")
# Generated output, one movie per line: movie title, director, year, rating