Scrapy 的 Item 在存储到 CouchDB 时无法进行 JSON 序列化

2024-01-02

items.py classes

import scrapy
from scrapy.item import Item, Field
import json


class Attributes(scrapy.Item):
    """Descriptive details of a book: description, page count and author."""

    description = Field()
    pages = Field()
    author = Field()
class Vendor(scrapy.Item):
    """Seller of a book: the vendor's title and the URL used to order."""

    title = Field()
    order_url = Field()

class bookItem(scrapy.Item):
    """A scraped book record.

    Declares the fields for the book's title, page URL, prices, images,
    scrape time, and the ``attributes`` / ``vendor`` sub-records.
    """

    title = Field()
    url = Field()
    marketprice = Field()
    images = Field()
    price = Field()
    attributes = Field()
    vendor = Field()
    time_scraped = Field()

my scraper

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapper.items import bookItem,Attributes,Vendor
import couchdb
import logging
import json
import time
from couchdb import Server


class libertySpider(CrawlSpider):
    """Spider that follows links from libertybooks.com and stores book pages in CouchDB.

    Responses reached through ``rules`` are passed to ``parse_item``; those
    whose URL contains ``"pid"`` are turned into a ``bookItem`` and saved via
    ``saveindb``.
    """
   

    # CouchDB handles are created once, when the class body executes, and are
    # shared by all instances. No server URL is passed, so the couchdb
    # library's default is used.
    couch = couchdb.Server()
    db = couch['python-tests']
    name = "libertybooks"
    allowed_domains = ["libertybooks.com"]
    # NOTE(review): unvisited_urls, visited_urls and url (below) are not
    # referenced anywhere else in the visible code.
    unvisited_urls = []
    visited_urls = []
    start_urls = [
        "http://www.libertybooks.com"
    ]
    url=["http://www.kaymu.pk"]
    # The link extractor is created without any filters; each response it
    # leads to is handed to parse_item, and follow=True keeps the crawl going
    # from those pages.
    rules = [Rule(SgmlLinkExtractor(),  callback='parse_item', follow=True)]
    
    # Counters. Only `total` is ever incremented (in parse_item); the other
    # two keep their initial value in the visible code.
    total=0
    productpages=0
    exceptionnum=0



    def parse_item(self,response):
        """Build a ``bookItem`` from a product page, save it and return it.

        Only responses whose URL contains ``"pid"`` are processed; for any
        other response the method falls through and implicitly returns None.
        Each scraped value falls back to a placeholder if its extraction
        raises for any reason.
        """
        if response.url.find("pid")!=-1:
            # Progress file: opened in "w" mode, so it is overwritten on every
            # product page with "<total>,<productpages>".
            with open("number.html","w") as w:
                self.total=self.total+1
                w.write(str(self.total)+","+str(self.productpages))
            itm=bookItem()
            attrib=Attributes()
            ven=Vendor()
            images=[]
            # Every field below follows the same pattern: take the first XPath
            # match and encode it as UTF-8. The bare `except:` catches any
            # error (e.g. the IndexError from [0] when nothing matched) and
            # substitutes a placeholder value.
            try:
                name=response.xpath('//span[@id="pagecontent_lblbookName"]/text()').extract()[0]
                name=name.encode('utf-8')
                
            except:
                name="name not found"
            try:
                price=response.xpath('//span[@id="pagecontent_lblPrice"]/text()').extract()[0]
                price=price.encode('utf-8')
            except:
                price=-1
            try:
                marketprice=response.xpath('//span[@id="pagecontent_lblmarketprice"]/text()').extract()[0]
                marketprice=marketprice.encode('utf-8')
            except:
                marketprice=-1
            try:
                pages=response.xpath('//span[@id="pagecontent_spanpages"]/text()').extract()[0]
                pages=pages.encode('utf-8')
            except:
                pages=-1
            try:
                author=response.xpath('//span[@id="pagecontent_lblAuthor"]/text()').extract()[0]
                author=author.encode('utf-8')
            except:
                author="author not found"
            try:
                description=response.xpath('//span[@id="pagecontent_lblbookdetail"]/text()').extract()[0]
                description=description.encode('utf-8')
            except:
                description="des: not found"
            # NOTE(review): `image` is extracted here but never appended to
            # `images`, so itm['images'] below is always an empty list.
            try:
                image=response.xpath('//img[@id="pagecontent_imgProduct"]/@src').extract()[0]
                image=image.encode('utf-8')
            except:
                image="#"


            # The vendor is stored as a nested Vendor item, not a plain dict.
            ven['title']='libertybooks'
            ven['order_url']=response.url
            itm['vendor']=ven
           
            # Human-readable local timestamp string from time.ctime().
            itm['time_scraped']=time.ctime()
            



            itm['title']=name
            itm['url']=response.url




            itm['price']=price
            itm['marketprice']=marketprice
            itm['images']=images

            # The attributes are likewise stored as a nested Attributes item.
            attrib['pages']=pages
            attrib['author']=author
            attrib['description']=description
            itm['attributes']=attrib
 
            # The item object itself (with its nested items) is handed to
            # CouchDB before being returned to Scrapy.
            self.saveindb(itm)
            return itm

    def saveindb(self,obj):
        """Log ``obj`` at debug level and pass it unchanged to ``self.db.save``."""
        logging.debug(obj)
        self.db.save(obj)

堆栈跟踪

2014-12-09 13:57:37-0800 [libertybooks] ERROR: Spider error processing <GET http://www.libertybooks.com/bookdetail.aspx?pid=16532>
    Traceback (most recent call last):
      File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
        taskObj._oneWorkUnit()
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
        result = next(self._iterator)
      File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
        for x in result:
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
        cb_res = callback(response, **cb_kwargs) or ()
      File "/home/asad/Desktop/scraper/scraper/spiders/liberty_spider.py", line 107, in parse_item
        self.saveindb(itm)
      File "/home/asad/Desktop/scraper/scraper/spiders/liberty_spider.py", line 112, in saveindb
        self.db.save(obj)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/client.py", line 431, in save
        _, _, data = func(body=doc, **options)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 514, in post_json
        **params)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 533, in _request_json
        headers=headers, **params)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 529, in _request
        credentials=self.credentials)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 244, in request
        body = json.encode(body).encode('utf-8')
      File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 69, in encode
        return _encode(obj)
      File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 135, in <lambda>
        dumps(obj, allow_nan=False, ensure_ascii=False)
      File "/usr/lib/python2.7/json/__init__.py", line 250, in dumps
        sort_keys=sort_keys, **kw).encode(obj)
      File "/usr/lib/python2.7/json/encoder.py", line 207, in encode
        chunks = self.iterencode(o, _one_shot=True)
      File "/usr/lib/python2.7/json/encoder.py", line 270, in iterencode
        return _iterencode(o, 0)
      File "/usr/lib/python2.7/json/encoder.py", line 184, in default
        raise TypeError(repr(o) + " is not JSON serializable")
    exceptions.TypeError: {'attributes': {'author': 'Tina Fey',
     'description': "Once in a generation a woman comes along who changes everything. Tina Fey is not that woman, but she met that woman once and acted weird around her.\r\n\r\nBefore 30 Rock, Mean Girls and 'Sarah Palin', Tina Fey was just a young girl with a dream: a recurring stress dream that she was being chased through a local airport by her middle-school gym teacher.\r\n\r\nShe also had a dream that one day she would be a comedian on TV. She has seen both these dreams come true.\r\n\r\nAt last, Tina Fey's story can be told. From her youthful days as a vicious nerd to her tour of duty on Saturday Night Live; from her passionately halfhearted pursuit of physical beauty to her life as a mother eating things off the floor; from her one-sided college romance to her nearly fatal honeymoon - from the beginning of this paragraph to this final sentence.\r\n\r\nTina Fey reveals all, and proves what we've all suspected: you're no one until someone calls you bossy.",
     'pages': '304 Pages'},
     'images': [],
     'marketprice': '1,095',
     'price': '986',
     'time_scraped': 'Tue Dec  9 13:57:37 2014',
     'title': 'Bossypants',
     'url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532',
     'vendor': {'order_url': 'http://www.libertybooks.com/bookdetail.aspx?pid=16532',
     'title': 'libertybooks'}} is not JSON serializable

我是 Scrapy 和 CouchDB 的初学者。我还尝试过使用 `json.dumps(itm, default=lambda o: o.__dict__, sort_keys=True, indent=4)`，但得到了同样的错误。请问有没有办法让我的这些类可以被 JSON 序列化，以便把它们存储到 CouchDB 中？

简短的回答是：直接使用 ScrapyJSONEncoder（https://github.com/scrapy/scrapy/blob/master/scrapy/utils/serialize.py#L11）：

from scrapy.utils.serialize import ScrapyJSONEncoder
_encoder = ScrapyJSONEncoder()

    ...

    def saveindb(self,obj):
        """Store a scraped item in CouchDB as a plain JSON document.

        ``obj`` may contain nested Scrapy items, which the JSON encoder used
        by couchdb cannot serialize. ``ScrapyJSONEncoder`` knows how to encode
        them, so the item is encoded with it and then decoded back into plain
        dicts/lists before being saved.
        """
        logging.debug(obj)
        # Database.save() expects a mutable mapping (it stores the assigned
        # '_id'/'_rev' back into the document), so pass the decoded dict
        # rather than the encoded JSON string itself.
        self.db.save(json.loads(_encoder.encode(obj)))

较长的回答是：如果您打算让这个爬虫（spider）继续发展（而不只是一次性使用），最好使用 Item Pipeline（http://doc.scrapy.org/en/latest/topics/item-pipeline.html）把 Item 存储到 CouchDB 中，从而保持关注点分离（爬取/抓取逻辑放在 spider 代码中，写入数据库的逻辑放在 pipeline 代码中）。

乍一看，这可能像是过度设计，但当项目规模开始增长时它确实很有帮助，而且能让测试变得更容易。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

scrapy 项目在存储到 couchdb 时不可 JSON 序列化的相关文章

通过列表理解压平列表列表

我正在尝试使用 python 中的列表理解来展平列表我的清单有点像 1 2 3 4 5 6 7 8 只是为了打印这个列表列表中的单个项目我编写了这个函数 def flat listoflist for item in listoflis
Django 模型在模板中不可迭代

我试图迭代模型以获取列表中的第一个图像但它给了我错误即模型不可迭代以下是我的模型和模板的代码我只需要获取与单个产品相关的列表中的第一个图像模型 py class Product models Model title models
填充两个函数之间的区域

import matplotlib pyplot as plt import numpy as np def domain x np arange 0 10 0 001 f1 lambda x 2 x x 2 0 5 plt plot x
如何创建一个语句来打印以特定单词开头的单词？ [关闭]

Closed 这个问题需要多问focused help closed questions 目前不接受答案如何在 python 中打印从特定字母开始的单词而不使用函数而是使用方法或循环 1 我有一个字符串想要打印以 m 开头的单词 S
python suds SOAP 请求中的名称空间前缀错误

我使用 python suds 来实现客户端并且在发送的 SOAP 标头中得到了错误的命名空间前缀用于定义由element ref 在 wsdl 中 wsdl 正在引用数据类型 xsd 文件请参见下文问题出在函数上GetRecord
如何通过在 Python 3.x 上按键来启动和中断循环

我有这段代码当按下 P 键时会中断循环但除非我按下非 P 键否则循环不会工作 def main openGame while True purchase imageGrab if a sum gt 1200 fleaButton ti
Python urllib.request.urlopen：AttributeError：'bytes'对象没有属性'data'

我正在使用 Python 3 并尝试连接到dstk 我收到错误urllib包裹我对SO进行了很多研究但找不到与这个问题类似的东西 api url self api base street2coordinates api body jso
从 python 发起 SSH 隧道时出现问题

目标是在卫星服务器和集中式注册数据库之间建立 n 个 ssh 隧道我已经在我的服务器之间设置了公钥身份验证因此它们只需直接登录而无需密码提示怎么办我试过帕拉米科它看起来不错但仅仅建立一个基本的隧道就变得相当复杂尽管代码示例将受
使用鼻子获取设置中当前测试的名称

我目前正在使用鼻子编写一些功能测试我正在测试的库操作目录结构为了获得可重现的结果我存储了一个测试目录结构的模板并在执行测试之前创建该模板的副本我在测试中执行此操作 setup功能这确保了我在测试开始时始终具有明确定义的状态现在
奇怪的 MySQL Python mod_wsgi 无法连接到 'localhost' (49) 上的 MySQL 服务器问题

StackOverflow上也有类似的问题但我还没有发现完全相同的情况这是在使用 MySQL 的 OS X Leopard 机器上一些起始信息 MySQL Server version 5 1 30 Apache 2 2 13 Uni
Seaborn Pairplot 图例不显示颜色

我一直在学习如何在Python中使用seaborn和pairplot 这里的一切似乎都工作正常但由于某种原因图例不会显示相关的颜色我无法找到解决方案因此如果有人有任何建议请告诉我 x sns pairplot stats2 hue
python Soap zeep模块获取结果

我从 SOAP API 得到如下结果 client zeep Client wsdl self wsdl transport transport auth header lb E authenticate self login res cl
Tkinter - 浮动窗口 - 调整大小

灵感来自this https stackoverflow com a 22424245 13629335问题我想为我的根窗口编写自己的调整大小函数但我刚刚注意到我的代码显示了一些性能问题如果你快速调整它的大小你会发现窗口没有像我希望
如何为每个屏幕添加自己的 .py 和 .kv 文件？

我想为每个屏幕都有一个单独的 py 和 kv 文件应通过 main py main kv 中的 ScreenManager 选择屏幕设计应从文件 screen X kv 加载类等应从文件 screen X py 加载 Screens
当鼠标悬停在上面时，intellisense vscode 不显示参数或文档

我正在尝试将整个工作流程从 Eclipse 和 Jupyter Notebook 迁移到 VS Code 我安装了 python 扩展它应该带有 Intellisense 但它只是部分更糟糕我在输入句点后收到建议但当将鼠标悬停在其上方
Ubuntu 上的 Python 2.7

我是 Python 新手正在 Linux 机器 Ubuntu 10 10 上工作它正在运行 python 2 6 但我想运行 2 7 因为它有我想使用的功能有人敦促我不要安装 2 7 并将其设置为我的默认 python 我的问题是如
限制 django 应用程序模型中的单个记录？

我想使用模型来保存 django 应用程序的系统设置因此我想限制该模型使其只能有一条记录极限怎么办尝试这个 class MyModel models Model onefield models CharField The fiel
具有自定义值的 Django 管理外键下拉列表

我有 3 个 Django 模型 class Test models Model pass class Page models Model test models ForeignKey Test class Question model M
如何读取Python字节码？

我很难理解 Python 的字节码及其dis module import dis def func x 1 dis dis func 上述代码在解释器中输入时会产生以下输出 0 LOAD CONST 1 1 3 STORE FAST 0 x
Elastic Beanstalk 中的 enum34 问题

我正在尝试在 Elastic Beanstalk 中设置 django 环境当我尝试通过requirements txt 文件安装时我遇到了python3 6 问题 File opt python run venv bin pip li

随机推荐

scipy.integrate.solve_ivp 不清楚如何求解形式 0=F(t, y(t), y'(t)) 的隐式 ODE

目前我确实使用assimulos 求解器套件 https jmodelica org assimulo tutorial imp html求解 0 F t y t y t 形式的隐式微分方程我想使用本机 scipy 安装附带的求解器并
Android页面卷曲动画

有没有简单的方法来做Curl翻页动画卷曲动画是页面翻转的动画包括上面的页面滚动和下面的页面阴影一次显示两页的画廊就像一本书一样的推荐方法是什么 Is it 让适配器一次显示两个图像的线性布局它不会让我像书一样显示一页翻过另一页
为什么 echo 不返回与没有 echo 相同的结果

我有以下案例 regex OK space alnum alnum text OK AAA BBBBBB aaabbbcccdddfffed asdadadadadadsada OK CCC KKKKKKK some text here O
C# 刷新 StreamWriter 和 MemoryStream

我使用以下代码片段我不确定是否需要调用Flush方法一旦StreamWriter 一旦开启MemoryStream converts an xsd object to the corresponding xml string using
如何在 Perl 中运行子命令正确导入环境？

在从子命令导入环境时我想将从 bash 脚本导出的所有环境变量添加到哈希中什么时候program运行后它将设置一些变量并导出它们我想将这些变量保存在 Perl 脚本中供以后使用但是我不想采用子命令中定义的 bash 函数目前我
如何从 Java 获取 JanusGraphManagement

我无法理解如何从使用ConfiguredGraphFactory 创建的图表中获取JanusGraphManagement 实例我尝试做这样的事情 JanusGraphFactory Builder config JanusGraphFa
更新 Popup.Animated 以播放 gif 直到外部任务完成 (PYSimpleGUI)

我希望创建一个 UI 在执行另一项任务时显示动画弹出窗口完成后将退出我正在使用 PYSimpleGUI 并使用列出的示例here https github com PySimpleGUI PySimpleGUI blob master
自定义注释不适用于 spring Bean

我创建了新的自定义注释 MyCustomAnnotation Target ElementType METHOD ElementType TYPE ElementType FIELD Retention RUNTIME public int
com4j 与 jacob 从 Java 调用 COM 方法

我维护一个遗留的 Java 应用程序它使用Jacob http danadler com jacob 或Java COM Bridge 通过MS VBA和MS Word的COM接口进行调用我一直在看com4j https com4j d
实体框架中内容的国际化

我不断遇到 i18n 要求其中我的数据而不是 UI 需要国际化 public class FooEntity public long Id get set public string Code get set Some values m
Groupby、移位和前向填充

我有这个 df ID Date Time Lat Lon A 07 16 2019 08 00 29 39291 98 50925 A 07 16 2019 09 00 29 39923 98 51256 A 07 16 2019 10 0
为应用程序操作创建自定义内置意图

有一个可用的内置意图列表可以在应用程序操作中使用谷歌开发者网站 https developers google com actions reference built in intents 我们有什么方法可以创建自定义的内置意图吗不可以
Django Restframework、Django 通道、Ionic 2 - websocket 握手错误

我目前正在开发一个使用线程标题中提到的技术的项目我从浏览器中运行了这一切该应用程序托管在 heroku 上但是当我尝试从 Ionic 2 应用程序连接到 websockets 时我总是在建立握手时遇到错误 2016 09 17T15
Excel 2007 及更高版本之间的 Range.Interior.Color 不同

我发现 Range Interior Color 在某些情况下会为相同颜色返回不同的数字具体取决于它是否在 Excel 2007 Excel 2010 或 2013 中运行这是预期的吗我很惊讶 Range Interior Color
Nginx 从旧 URL 重定向到新 URL

我们正在更换房产搜索供应商每个供应商的 URL 格式都略有不同我们已经对 40 000 多个 URL 建立了索引并希望将用户 301 重定向到新 URL URL 中的唯一区别是从下划线切换为连字符以及从 idx 切换为 proper
更新 Vector 中对象的属性

我有一个包含对象的向量这些对象有一个称为名字的属性我想更新属性中的名字为了做到这一点我必须传递保存对象的向量唯一标识每个对象的员工编号最后是从用户输入中获取的新名称我的问题是它在循环中显示更新名称我用它来设置新名称但如果我
Android：FastScrolling SectionIndexer getSections() 仅被调用一次

我创建了一个ListView正在使用FastScroll 见图当用户单击以下任何按钮即所有曲目艺术家专辑时每次都会调用以下自定义 ArrayAdapter ArrayAdapter
Yii2：Ajax调用多个参数

我使用此代码通过 ajax 调用自动填充没有任何问题我的视图文件中的代码 this gt registerJs dailywardentry doctor visit name on change function ajax url y
我们可以使用函数作为 useEffect 中的第二个参数吗

我有以下功能 function handleEnterPress e if e keyCode 13 if value let toAdd true chips forEach chip gt if chip value value toA
scrapy 项目在存储到 couchdb 时不可 JSON 序列化

items py classes import scrapy from scrapy item import Item Field import json class Attributes scrapy Item description F

scrapy 项目在存储到 couchdb 时不可 JSON 序列化

scrapy 项目在存储到 couchdb 时不可 JSON 序列化 的相关文章

随机推荐

热门标签

scrapy 项目在存储到 couchdb 时不可 JSON 序列化的相关文章