import requests
import threading
import os
from bs4 import BeautifulSoup
from queue import Queue
from threading import Thread
"""表情包爬虫"""
#创建一个多线程类
# Worker-thread class for the meme-image crawler.
class DownLoad_Images(Thread):
    """Worker thread: repeatedly takes a listing-page URL from the queue
    and downloads every image on that page via ``download_images``.

    Intended to run as a daemon so it dies with the main thread once
    ``queue.join()`` returns.
    """

    def __init__(self, queue, path):
        """
        :param queue: Queue of listing-page URLs to process.
        :param path: directory the images are saved into; created if missing.
        """
        Thread.__init__(self)
        self.queue = queue
        self.path = path
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race when
        # several workers could be constructed concurrently, and also
        # creates intermediate directories if the path is nested.
        os.makedirs(path, exist_ok=True)

    def run(self):
        # Loop forever; as a daemon thread this ends when the process exits.
        while True:
            url = self.queue.get()
            try:
                download_images(url, self.path)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are no longer swallowed; page-level failures
                # are still best-effort (log and move on).
                print('下载失败')
            finally:
                # Always mark the task done, even on failure, so the main
                # thread's queue.join() can complete.
                self.queue.task_done()
def download_images(url, path):
    """Download all meme images found on one listing page.

    :param url: listing-page URL to scrape.
    :param path: directory the image files are written into.

    Failures on individual images are reported and skipped so one bad
    image does not abort the rest of the page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }
    # Fetch the page HTML; a timeout keeps a worker thread from hanging forever.
    response = requests.get(url=url, headers=headers, timeout=10).text
    soup = BeautifulSoup(response, 'lxml')
    # Images on this site are lazy-loaded: the class is "ui image lazy" and
    # the real URL lives in the `data-original` attribute, not `src`.
    img_list = soup.find_all('img', class_='ui image lazy')
    for img in img_list:
        image_name = img['title']
        image_url = img['data-original']
        # Keep the remote file's extension (.jpg/.gif/...) on the local name.
        file_path = os.path.join(path, image_name + os.path.splitext(image_url)[-1])
        try:
            # Download FIRST, then open the file: the original opened the
            # file before requesting, leaving empty files on failed downloads.
            image = requests.get(image_url, headers=headers, timeout=10).content
            print("正在保存图片:", image_name)
            with open(file_path, 'wb') as f:
                f.write(image)
            print('保存成功:', image_name)
        except (requests.RequestException, OSError):
            # Narrowed from bare `except: pass`: network errors and invalid
            # filename characters (OSError from open) are reported, not hidden.
            print('下载失败:', image_name)
if __name__ == "__main__":
    # Build the 200 listing-page URLs to crawl.
    base = 'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
    page_urls = [base.format(page=n) for n in range(1, 201)]

    task_queue = Queue()
    save_dir = './threading_images/'

    # Start ten daemon workers; daemons exit together with the main thread.
    for _ in range(10):
        consumer = DownLoad_Images(task_queue, save_dir)
        consumer.daemon = True
        consumer.start()

    # Feed every page URL to the pool, then block until all are processed.
    for page_url in page_urls:
        task_queue.put(page_url)
    task_queue.join()
    print("下载完成")
# 网站比较简单,就没必要分析了,也可直接使用requests直接请求,先拿到一个页面,再拿到详情页,拿到所有的url,保存二进制数据就行了,这个爬虫的主要优点是使用多线程加快爬取效率。