在学习python爬虫一段时间后,进行第一个项目的实践。
本文使用的库为:requests库、time库、os库,以及 lxml 库(其 etree 模块提供 XPath 解析),
目的:实现对彼岸图4k图片的下载
源码:
import requests
import time
from lxml import etree
import os
# Shared HTTP request headers sent with every requests.get() call below:
# a desktop Chrome User-Agent plus a captured session Cookie, so the site
# treats the crawler like a normal browser.
# NOTE(review): the Cookie is a hard-coded session capture and will expire;
# confirm whether the site still requires it at all.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Cookie': '_xsrf=2|564c23e9|a46b8cac21ef08bf52221f1b840f5462|1565579147; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1565579182; BAIDU_SSP_lcr=https://cn.bing.com/; _qqq_uuid_="2|1:0|10:1565579148|10:_qqq_uuid_|56:NGVlZTIxMjdiYjYxMTQ5ZTU2NDk2YjhkMmM1ODM2Njg2N2I0Y2JjYw==|520099f3c590773f531a9c5f1eb1d82adba8893e730d3b508c3045904e1cfd9c"; _ga=GA1.2.349374621.1565579182; _gid=GA1.2.361209414.1565579182; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1565580312'
}
def handle_request(url, page):
    """Fetch one listing page of a category and return its HTML text.

    The site serves page 1 at the category root; later pages live at
    ``<category>/index_<page>.html``.

    :param url: category base URL, with or without a trailing slash
    :param page: 1-based page number
    :return: response body as text
    """
    if page > 1:
        # Bug fix: the original appended 'index_N.html' directly to the base
        # URL, which yields e.g. '.../4kfengjingindex_2.html' for categories
        # whose URL has no trailing slash. Normalise the slash first.
        url = url.rstrip('/') + '/index_' + str(page) + '.html'
    request = requests.get(url=url, headers=headers)
    # NOTE(review): the site appears to serve GBK; if non-ASCII text is ever
    # extracted from request.text, setting request.encoding may be needed.
    return request.text
#分析界面,生成下载的界面的链接,并且返回
def parse_content(content):
    """Parse one category listing page and follow every picture detail link.

    :param content: HTML text of a listing page
    """
    listing_tree = etree.HTML(content)
    # Each <li><a href="..."> under #main links to a picture detail page;
    # the page uses lazy loading, so only the hrefs are reliable here.
    detail_hrefs = listing_tree.xpath('//*[@id="main"]/div[3]/ul/li/a/@href')
    # hrefs are site-relative — prefix the host before following them.
    for href in detail_hrefs:
        parse_secondcontent('http://pic.netbian.com' + href)
def parse_secondcontent(image_src):
    """Fetch one picture detail page and download the full-size image on it.

    :param image_src: absolute URL of a picture detail page
    """
    response = requests.get(url=image_src, headers=headers)
    detail_tree = etree.HTML(response.text)
    # The detail page exposes the image at <div id="img"><img src="...">;
    # the src is site-relative, so prefix the host before downloading.
    for relative_src in detail_tree.xpath('//*[@id="img"]/img/@src'):
        download_image('http://pic.netbian.com' + relative_src)
def download_image(image_url):
    """Download one image file into the local '壁纸图片' directory.

    The saved file keeps the basename of the image URL.

    :param image_url: absolute URL of the image file
    """
    print('开始下载图片---请稍后')
    dirpath = '壁纸图片'
    # Fix: makedirs(..., exist_ok=True) avoids the check-then-create race of
    # the original os.path.exists()/os.mkdir() pair.
    os.makedirs(dirpath, exist_ok=True)
    # Use the last URL path component as the local file name.
    filename = os.path.basename(image_url)
    filepath = os.path.join(dirpath, filename)
    # Fetch the image bytes and persist them; 'wb' because images are binary.
    request = requests.get(url=image_url, headers=headers)
    with open(filepath, 'wb') as fp:
        fp.write(request.content)
    print('图片下载结束')
def main():
    """Interactive entry point: choose a category and page range, then crawl."""
    number = int(input('请输入分类的代码:1.4k风景 2.4k美女 3.4k游戏 4.4k动漫 5.'))
    # Menu code -> category URL; any other input falls back to the site root,
    # exactly as the original if/elif chain did.
    category_urls = {
        1: 'http://pic.netbian.com/4kfengjing',
        2: 'http://pic.netbian.com/4kmeinv',
        3: 'http://pic.netbian.com/4kyouxi',
        4: 'http://pic.netbian.com/4kdongman/',
    }
    url = category_urls.get(number, 'http://pic.netbian.com/')
    start_page = int(input('请输入你的起始页面'))
    end_page = int(input('请输入你的终止界面'))
    for page in range(start_page, end_page + 1):
        print('开始下载%s页的内容' % page)
        # Fetch this page's listing HTML, then parse it for image links.
        listing_html = handle_request(url, page)
        parse_content(listing_html)
        print('第%s页的图片下载完成' % page)
        # Pause between pages to avoid hammering the server.
        time.sleep(3)
    print('谢谢你的使用,吃屎类')
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
实现结果: