1.以豆瓣网为例分享一下scrapy使用中需要注意的地方:
2.注意点:
- response.xpath方法的返回结果是一个类似list的类型,其中包含的是selector对象,操作和列表一样,但是有一些额外的方法
- extract() 返回一个包含有字符串的列表
- extract_first() 返回列表中的第一个字符串,列表为空没有返回None
- spider中的parse方法必须有
- 需要抓取的url地址必须属于allowed_domains,但是start_urls中的url地址没有这个限制
- 启动爬虫的时候注意启动的位置,是在项目路径下启动,就是项目文件需要单独打开
- 单独打开项目文件的方法:
(1)找到file->open
(2) 点击file->open
(3)选好自己的项目文件以后选择打开方式为newwindow
3.豆瓣项目示例
import scrapy
class DoubanTestSpider(scrapy.Spider):
name = 'douban_test'
allowed_domains = ['movie.douban.com']
start_urls = ['https://movie.douban.com/top250?start=%s&filter='%i for i in range(0,226,25)]
def parse(self, response):
urls = response.xpath('//*[@id="content"]/div/div[1]/ol/li[*]/div/div[1]/a/@href').getall()
# 获取所有详情页的url
for url in urls:
item = {}
yield scrapy.Request(url,callback=self.parse_detail,meta={'item':item})
def parse_detail(self,response):
title_ = response.xpath('//*[@id="content"]/h1/span[1]/text()').get()
score_ = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong//text()').get()
time_ = response.xpath('//div[@class="subjectwrap clearfix"]/div/div/span[@property="v:runtime"]/@content').get()
type_ = response.xpath('//div[@class="subject clearfix"]/div/span[@property="v:genre"][1]/text()').getall()
num_ = response.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').get()
propotion_ = response.xpath('//*[@id="interest_sectl"]/div[1]/div[3]/div[1]/span[2]/text()').get()
item = response.meta.get('item')
item['title'] = title_
item['score'] = score_
item['time'] = time_
item['type'] = ''.join(type_)
item['num'] = num_
item['propotion'] = propotion_
yield item
4.开启pipeline
from itemadapter import ItemAdapter
import pymysql
class DoubanPipeline:
def process_item(self, item, spider):
conn = pymysql.connect(user='root',password='081228ljf',
charset='utf8',database='douban')
cur = conn.cursor()
cur.execute('insert into films value("%s","%s","%s","%s","%s","%s")'%
(item['title'],item['score'],item['time'],item['type'],item['num'],item['propotion']))
conn.commit()
cur.close()
conn.close()
return item
5.注意pipeline保存数据前要先在settings中开启管道接口
6.setting s设置请求头
# Scrapy settings for Douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Douban'
SPIDER_MODULES = ['Douban.spiders']
NEWSPIDER_MODULE = 'Douban.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'Cookie': 'douban-fav-remind=1; ll="118163"; bid=vxwT8bzpjHA; __gads=ID=44c122f089ed5da4-2281cc7b31d000bf:T=1643175816:RT=1643175816:S=ALNI_MYbpA_3kd91woNUI_rhk323TW9zYQ; _ga=GA1.2.2065482662.1604846529; __yadk_uid=vdmf7Lm3cIWf9yGoGJVNWSi0CRqEctcp; __gpi=UID=00000485465e7c61:T=1649171757:RT=1649171757:S=ALNI_Mas4xcxVTd-ydhslNdy3PLs3RTjmQ; _vwo_uuid_v2=D13DCA7A0C2EF878ED1AAE1B7FD8861CF|d9847755add0d5530c50bdef513d7a6d; _vwo_uuid_v2=D13DCA7A0C2EF878ED1AAE1B7FD8861CF|d9847755add0d5530c50bdef513d7a6d; gr_user_id=084374e4-236c-43d3-94d4-8837ad999083; ap_v=0,6.0; __utma=30149280.2065482662.1604846529.1651568744.1651572480.34; __utmc=30149280; __utmz=30149280.1651572480.34.13.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1651572480; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1651572481%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.436626640.1604928846.1651568745.1651572481.28; __utmb=223695111.0.10.1651572481; __utmc=223695111; __utmz=223695111.1651572481.28.12.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_id.100001.4cf6=d8b22cf303aa9232.1604928846.28.1651572487.1651568768.'
}