scrapy基本结构、爬取流程、定义随机请求头、抓取异步网页请参考:scrapy框架–基础结构加爬取异步加载数据的网址项目完整实例
- items.py
class BooksItem(scrapy.Item):
    """Container for the fields scraped for a single Douban book entry."""
    book_name = scrapy.Field()      # book title
    book_star = scrapy.Field()      # rating score
    book_pl = scrapy.Field()        # number of reviews
    book_author = scrapy.Field()    # author(s)
    book_publish = scrapy.Field()   # publisher
    book_date = scrapy.Field()      # publication date
    book_price = scrapy.Field()     # price
- spider.py
import scrapy
from scrapy import Selector
from books.items import BooksItem
class BookspiderSpider(scrapy.Spider):
    """Crawl Douban book-tag listing pages and yield one BooksItem per book.

    Pagination is driven by the "next page" link found in parse(), so
    start_requests only needs to seed the first listing page.
    """
    name = 'bookspider'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/tag/%E7%BB%8F%E6%B5%8E%E5%AD%A6']

    def start_requests(self):
        # BUG FIX: the original version yielded pages 2-3 of an unrelated
        # site (http://www.abckg.com/index_N.html), a leftover from another
        # project, so the configured Douban tag URL was never crawled.
        # Seed the crawl from start_urls; parse() follows the pagination.
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract book fields from one listing page and follow pagination.

        Yields:
            BooksItem for each book entry, then a Request for the next page
            (if a "next" link is present).
        """
        sel = Selector(response)
        book_list = sel.css('#subject_list > ul > li')
        for book in book_list:
            item = BooksItem()
            try:
                # strip() removes surrounding whitespace from the raw text nodes
                item['book_name'] = book.xpath('div[@class="info"]/h2/a/text()').extract()[0].strip()
                item['book_star'] = book.xpath("div[@class='info']/div[2]/span[@class='rating_nums']/text()").extract()[
                    0].strip()
                item['book_pl'] = book.xpath("div[@class='info']/div[2]/span[@class='pl']/text()").extract()[0].strip()
                # The "pub" line is "author1 / author2 / publisher / date / price";
                # pop from the right, the remainder (possibly several parts) is the author.
                pub = book.xpath('div[@class="info"]/div[@class="pub"]/text()').extract()[0].strip().split('/')
                item['book_price'] = pub.pop()
                item['book_date'] = pub.pop()
                item['book_publish'] = pub.pop()
                item['book_author'] = '/'.join(pub)
            except IndexError:
                # BUG FIX: the original bare `except: pass` swallowed every
                # exception. Only skip entries that are missing a field
                # (rating/review/pub text absent → extract()[0] fails).
                continue
            yield item
        # BUG FIX: the original called extract()[0] unconditionally, which
        # raises IndexError on the last page (no "next" link) — the
        # `if nextPage` guard never ran. extract_first() returns None instead.
        next_page = sel.xpath(
            '//div[@id="subject_list"]/div[@class="paginator"]/span[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request('https://book.douban.com' + next_page.strip(), callback=self.parse)
- pipelines.py
import pymysql as db
class BooksPipeline(object):
def __init__(self):
self.con = db.connect(user="root", passwd="123456", host="localhost", db="spiderdbtushu", charset="utf8")
self.cur = self.con.cursor()
self.cur.execute('drop table if exists douban_books')
self.cur.execute(
"create table douban_books(id int auto_increment primary key,"
"book_name varchar(200),book_star varchar(244),book_pl varchar(244),"
"book_author varchar(200),book_publish varchar(200),book_date varchar(200),"
"book_price varchar(200))")
def process_item(self, item, spider):
self.cur.execute(
"insert into douban_books(id,book_name,book_star,book_pl,book_author,book_publish,book_date,book_price) values(NULL,%s,%s,%s,%s,%s,%s,%s)",
(item['book_name'], item['book_star'], item['book_pl'], item['book_author'], item['book_publish'],
item['book_date'], item['book_price']))
self.con.commit()
return item
- run.py
from scrapy import cmdline

# Helper script so the crawl can be launched from inside the IDE (PyCharm)
# instead of the command line.
# BUG FIX: the original ran `scrapy crawl one1`, but this project's spider
# is named 'bookspider' (BookspiderSpider.name) — 'one1' would fail with
# "Spider not found".
if __name__ == '__main__':
    cmdline.execute('scrapy crawl bookspider'.split())