Scrapy 爬虫框架实例二：当当图书信息

2023-11-06

spider.py

import scrapy
from DD.items import DdItem

class DdSpider(scrapy.Spider):
    """Crawl Dangdang search results for the keyword "python".

    Every ``<li>`` under the ``search_nature_rg`` container becomes one
    ``DdItem``. Each field holds the raw list returned by ``extract()``;
    book name, author, publisher and publication date fall back to a
    single-element placeholder list when the XPath matches nothing.
    """

    name = 'dd'
    # Scrapy expects bare domain names here, not URLs (a URL entry is
    # ignored with a warning and may cause requests to be treated as offsite).
    allowed_domains = ['search.dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=python&act=input&page_index=1']

    def start_requests(self):
        """Yield one request for each of the result pages 1 through 100.

        Because this method is overridden, Scrapy does not schedule
        ``start_urls`` by itself, so page 1 has to be requested here too.
        """
        for page in range(1, 101):
            url = 'http://search.dangdang.com/?key=python&act=input&page_index=' + str(page)
            yield scrapy.Request(url, self.parse)

    @staticmethod
    def _extract_or_default(node, query, default):
        """Return every string matched by *query* under *node*.

        The XPath is evaluated once; when it matches nothing, ``[default]``
        is returned so that the result always has a first element.
        """
        values = node.xpath(query).extract()
        return values if values else [default]

    def parse(self, response):
        """Yield one ``DdItem`` per book listed on a search result page."""
        for book in response.xpath('//div[@id="search_nature_rg"]/ul/li'):
            item = DdItem()

            # Book title
            item["book_name"] = self._extract_or_default(
                book, './a/@title', "无简介信息")

            # Current price (no placeholder: may be an empty list)
            item["search_now_price"] = book.xpath('./p[3]/span[1]/text()').extract()

            # Author
            item["author"] = self._extract_or_default(
                book, './p[5]/span[1]/a[1]/@title', "无作者信息")

            # Publisher
            item["house"] = self._extract_or_default(
                book, './p[5]/span[3]/a/@title', "无出版社信息")

            # Publication date (the item field is named "data")
            item["data"] = self._extract_or_default(
                book, './p[5]/span[2]/text()', "无出版日期")

            # Review count (no placeholder: may be an empty list)
            item["review"] = book.xpath('./p[4]/a/text()').extract()

            yield item

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DdItem(scrapy.Item):
    """Container for the attributes scraped for a single book."""

    book_name = scrapy.Field()         # book title
    search_now_price = scrapy.Field()  # current price
    author = scrapy.Field()            # author
    house = scrapy.Field()             # publisher
    data = scrapy.Field()              # publication date
    review = scrapy.Field()            # review count

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class DdPipeline:
    """Store every scraped book as one row of the MySQL table ``dd``.

    One connection is opened per crawl (``open_spider``) and closed when the
    crawl ends (``close_spider``) instead of reconnecting for every item.
    Values are handed to the driver as query parameters, so quotes or other
    special characters in scraped text cannot break or alter the statement.
    """

    # Columns of ``dd`` in insert order; each is also the item key it is read from.
    FIELDS = ("book_name", "search_now_price", "author", "house", "data", "review")

    # ``%s`` here are driver placeholders, not Python string formatting.
    INSERT_SQL = (
        "insert into dd(book_name,search_now_price,author,house,data,review)"
        "values (%s,%s,%s,%s,%s,%s)"
    )

    # Open database connection, or None while no connection exists.
    conn = None

    @staticmethod
    def _connect():
        """Open a new connection to the target database."""
        # NOTE(review): host and credentials are hard-coded; consider moving
        # them into the project settings.
        return pymysql.connect(host="localhost", user="root", db="qu", passwd="123456", charset="utf8")

    @staticmethod
    def _first(values, default=""):
        """Return the first element of *values*, or *default* if it is empty or None."""
        return values[0] if values else default

    def open_spider(self, spider):
        """Open the database connection when the crawl starts."""
        self.conn = self._connect()

    def close_spider(self, spider):
        """Close the database connection when the crawl ends."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def process_item(self, item, spider):
        """Insert *item* into ``dd`` and return it unchanged.

        Each field is expected to be a list of strings; only its first
        element is stored, and an empty or missing field is stored as an
        empty string. On a database error the transaction is rolled back
        and the exception is re-raised.
        """
        # Connect lazily so the method also works when called without open_spider.
        if self.conn is None:
            self.conn = self._connect()

        params = tuple(self._first(item.get(field)) for field in self.FIELDS)

        try:
            with self.conn.cursor() as cur:
                cur.execute(self.INSERT_SQL, params)
            self.conn.commit()
        except Exception:
            # Leave the shared connection in a clean state for the next item.
            self.conn.rollback()
            raise
        return item

settings.py

# Scrapy settings for DD project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Project name and the packages in which Scrapy looks for / creates spiders.
BOT_NAME = 'DD'

SPIDER_MODULES = ['DD.spiders']
NEWSPIDER_MODULE = 'DD.spiders'

# Only log messages of level ERROR and above.
LOG_LEVEL = "ERROR"

# Encoding used for exported feeds. The setting name must be spelled exactly
# FEED_EXPORT_ENCODING; Scrapy silently ignores unknown setting names.
FEED_EXPORT_ENCODING = "UTF-8"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'

# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately not honoured here; confirm this is
# acceptable for the target site.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DD.middlewares.DdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'DD.middlewares.DdDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enabled item pipelines, keyed by import path; the value is the order number.
ITEM_PIPELINES = {'DD.pipelines.DdPipeline': 300}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

scrapy

xpath

mysql

Scrapy

scrapy爬虫框架实例二当当图书信息的相关文章

mysql 准备好的语句错误：MySQLSyntaxErrorException

我使用准备好的语句编写了选择语句每次尝试运行都会出现此错误我如何克服这个错误我的jdbc连接器是mysql connector java 5 1 13 bin jar 我的代码 public Main add ad to getAdD
学说 - 获取下一个和上一个记录

这样我就已经获取了一些记录我已创建日期字段现在我想按日期获取下一条和上一条记录通过以下方式让它工作 qb this gt createQueryBuilder a next qb gt expr gt gt a created dat
PDO 和 MySQL 全文搜索

我正在将所有站点代码从使用 mysql 函数转换为 PDO 关于 PDO 的 PHP 文档对于我的需求来说并不清楚它为您提供了可以使用的功能但没有详细解释它们在不同场景下的情况基本上我有一个 mysql 全文搜索 sql SELEC
Visual Studio 2015：SQL 数据源：无法检索架构。确保 ConnectionString 和 SelectCommand 属性有效

我有以下ASP Net网页的开发环境 VS 2015专业版使用 Net Framework 4 0 MySQL服务器5 6 MySQL Net 连接器 6 9 5 MySQL for Visual Studio 2 0 2 我能够从 Vi
需要在 select 语句中连接子查询的结果

我有三张桌子 Table1 Users Columns User ID int FirstName LastName Values 1 Jane Doe 2 John Doe 3 Mike Smith Table2 User Groups
如何在Sequelize中配置一对多关系？

我目前正在使用 Express 和 Sequelize MySQL 想知道解决这个问题的最佳方法是什么如果这是一个基本问题我很抱歉因为我对 Sequelize 甚至 SQL 数据库都很陌生我有一个模型User像这样 export d
如何在我的查询中使用日期格式？

这适用于 phpmyadmin 但是当我在代码上使用时给我一个错误错误说解析错误语法错误意外的我的语法有什么问题 gt
更改 MySQL Workbench 上的默认字符集

我正在尝试使用连接到我的 MYSQL 数据库utf8mb4字符集请注意数据库字符集的全局设置已经是 utf8mb4 我可以使用 CLI 轻松完成此操作如下所示 mysql h myhostname u myuser p default
复制具有不同列名的 MySQL 表

我需要将 table1 中与特定列匹配的所有行复制到具有不同列名称的 table2 中例如 table1 name oldAddressBook table1 的列 name Name Surname Number table2 name
MySQL Workbench：如何将 mysql 数据库导出到 .sql 文件？

我需要将 mysql 工作台中的数据库导出到文件 sql 该怎么办在 MySql Workbench 版本 8 0 中您只需按照以下步骤操作即可 Go to Server tab Go to 数据库导出这会打开类似这样的东西在中选择
如何反转散列和加盐密码？ [关闭]

Closed 这个问题不符合堆栈溢出指南 help closed questions 目前不接受答案我正在使用 vBulletin 登录使用它来交叉引用数据库 md5 md5 pass salt 我如何制作一个 PHP 脚本以便每个密码
Codeigniter，为MySQL创建表和用户

我想以编程方式使用 CI 创建数据库和用户到目前为止我有这 2 个简单的 MySQL 语句 CREATE DATABASE testdb DEFAULT CHARACTER SET utf8 COLLATE utf8 general c
在表中添加新列，其值取决于同一表中另一列的值

我是 Mysql 的新手我想在表中添加一列其值取决于同一个表中的另一个列值例如我有company table如下 fldId companyName date 1 adeco 2012 01 12 2 mic 2001 03 09 3
MySQL 列数据以逗号分隔列表形式返回

我目前有一个 MySQL 表例如 id friend 1 2 1 5 1 10 3 6 15 19 21 4 我试图获取某个特定用户的所有好友 ID 并将它们排列到一个以逗号分隔的列表中例如抓取了 user1 的好友它将返回为 fr
PHP 中的 XPath 删除 HTML 标签

我在 PHP 中使用 XPath 来检索 HTML 文档的一部分假设我的 HTML 文档如下所示 div a href some link address com Hello a p Some text here p div div p
mysql 分区错误主键必须包含表分区函数中的所有列

在 MySQL 8 上我有这个表 CREATE TABLE float values id bigint UNSIGNED NOT NULL attribute id bigint UNSIGNED NOT NULL value doubl
选择 MYSQL 行，但将行转换为列，将列转换为行

我想选择数据库中的所有行但我希望它们按相反的顺序排列意思是我想使用第一列数据作为新实体并将实体作为第一列我想你明白我的意思这是一个例子 id name marks 1 Ram 45 2 Shyam 87 to id 1 2 Na
需要 php pdo 内爆数组并在 mysql 中插入多行

基于this https stackoverflow com questions 4629022 how to insert an array into a single mysql prepared statement w php and
将 1 添加到字段

如何将以下 2 个查询变成 1 个查询 sql SELECT level FROM skills WHERE id id LIMIT 1 result db gt sql query sql level int db gt sql fetc
MySQL为什么在插入时我的自动增量不是从1开始？

为什么当我使用 jdbc 向数据库中插入数据时我的表 auto increment 会被提升填充完全空表的示例狗桌 DogId DogName 3 Woofer 4 Kujo 5 Spike 所有者表 OwnerId DogID Ow

随机推荐

C语言的排序函数qsort()详解

一 qsort 函数的用法及使用说明目录一 qsort 函数的用法及使用说明二使用qsort 函数来求关于各种类型的降序排序 1 int类型的数组进行排序 2 char类型的数组进行排序 3 double类型的数组排序与前两个
js数组去重的4种方法：

js数组去重的4种方法
常用符号大全

卐
华为OD机试 C++ 【座位调整】

题目由于疫情原因学生之间的座位要保持一定距离每个学生的左右都要至少有一个空座给定一个代表座位情况的数组desk 其中1代表有学生坐在那个位置 0代表该位置为空问你在保持疫情安全距离的前提下我们还能安排多少学生输入一个整数数组
WPS授权过期问题解决方案——编程方法

在使用WPS时有时会遇到授权已到期的提示这意味着您的WPS软件无法再正常使用然而通过编程我们可以采取一些方法来解决这个问题本文将介绍一种通过编程来解决WPS授权过期问题的方法解决WPS授权过期问题的一种常见方法是修改系统的ho
python中mean的用法_Python Pandas dataframe.mean()用法及代码示例

Python是进行数据分析的一种出色语言主要是因为以数据为中心的python软件包具有奇妙的生态系统 Pandas是其中的一种使导入和分析数据更加容易 Pandas dataframe mean 函数返回所请求轴的平均值如果将方法应用
C语言实现二叉树（链式存储结构）+ 遍历

C语言实现链式存储结构二叉树遍历结构体定义及三种遍历方法 1 结构体定义 2 先序遍历先序遍历的递归实现先序遍历的非递归实现 3 中序遍历中序遍历的递归实现中序遍历的非递归实现 4 后续遍历后序遍历的递归实现 5 二叉树的递归建
九款实用的在线画图工具(那些可以替代Visio的应用)

Visio是付费软件通常会遇到下载安装以及授权等各种问题今天介绍的几款在线作图工具帮你抛开下载安装授权等各种烦恼 1 LucidChart LucidChart是一个基于HTML5的在线流程图绘制和协作应用平台用户可以方便
冒号等于(:=)在Python语言中是什么意思？

Python 3 8中提供了此语法在Python语言中支持运算符以允许在表达式中进行变量赋值此符号是Python语言中的赋值运算符主要称为海象运算符简而言之海象操作符压缩了我们的代码以使其更短下面是一个非常简单的例子 wi
WSL编译linux-5.16.9 时出现 fatal error: libelf.h: No such file or directory

make时出现两个错误第一个是
Mac安装Android Studio并配置环境变量

Mac安装Android Studio并配置环境变量文章目录 Mac安装Android Studio并配置环境变量安装JDK 下载并安装Android Studio 配置环境变量安装JDK 检查 JDK 版本在终端输入 java v
echarts地图自定义tooltip样式

效果图自定义tooltip样式 tooltip position 50 50 trigger item backgroundColor rgba 0 0 0 0 borderColor rgba 0 0 0 0 extraCssText
SQL学习笔记——REGEXP运算符

REGEXP运算符是正则表达式 regular expression 的缩写正则表达式在搜索字符串时非常强大下面是关于它的应用 1 查找名字中包含field的顾客 select from customers where last na
pytorch从0开始安装

文章目录一安装anaconda 1 安装pytorch前需要先安装anaonda 首先进入官网 Anaconda The World s Most Popular Data Science Platform 进行安装相应的版本 2 接着
java编辑文件FileUtils

FileUtiles 进行获取文件把每行添加到字符串数组里然后对每行进行替换最后写回文件里 import org apache commons io FileUtils try str FileUtils readFileToStri
Python 学习资源 ( 整理日期2010-02-24 )

Python 简明教程入门必看在线浏览 http www woodpecker org cn 9081 doc abyteofpython cn chinese index html PDF http bbs chinaunix ne
小酌Django4——博客文章展示

小酌Django4 博客文章展示文章列表页已发布的文章列表展示页面展示文章标题交互模式下的数据读取 blog models py中创建数据模型后 Django会自动提供数据库抽象的API ORM 进行增删改查操作使用命令pytho
TCP/IP 通信

学习资料来源正点原子STM32 目录 TCP IP TCP连接 TCP终止连接 MAC LAN8720 DMA LWIP内存分配内存池内存堆数据包管理 pbuf介绍数据包申请与释放网络接口管理 ARP协议 TCP IP TCP是
input 去除边框/设置placeholder样式--SCSS

input 去除边框设置placeholder样式 SCSS el input edit v deep input webkit input placeholder font size 22px el input inner border
scrapy爬虫框架实例二当当图书信息

spider py import scrapy from DD items import DdItem class DdSpider scrapy Spider name dd allowed domains http search dan

scrapy爬虫框架实例二 当当图书信息

scrapy爬虫框架实例二 当当图书信息 的相关文章

随机推荐

热门标签

scrapy爬虫框架实例二当当图书信息

scrapy爬虫框架实例二当当图书信息的相关文章