使用 Python 抓取 Google 购物

2024-02-17

我需要抓取 Google Shopping,例如此链接

但在服务器的响应中,我只收到了不包含商品条目的文本。即使在 Google Chrome 的源代码查看器中,我也看不到商品的详细信息。应该发送什么请求才能获得所有商品的详细数据?


您可以使用 parsel https://parsel.readthedocs.io/en/latest/ 和 requests https://requests.readthedocs.io/en/latest/user/quickstart/ 这两个库来实现此目的,而无需使用 selenium,因为您需要的一切都已包含在 HTML 中(不是通过 JavaScript 渲染的)。

请确保在请求头中设置 user-agent,因为 requests 默认的 user-agent 是 python-requests https://github.com/psf/requests/blob/589c4547338b592b1fb77c65663d8aa6fbb7e38b/requests/utils.py#L808-L814 ,Google 会据此识别出这是由脚本发送的请求,并可能将其拦截。您可以在这里检查自己的 user-agent: https://www.whatismybrowser.com/detect/what-is-my-user-agent/ 。user-agent 列表: https://developers.whatismybrowser.com/useragents/explore/ (如果您需要为每个请求轮换 user-agent,请参阅 https://serpapi.com/blog/how-to-reduce-chance-of-being-blocked-while-web/#rotate-user-agents )。

如果很难弄清楚如何使用 CSS 选择器来提取正确的数据,请查看 SelectorGadget Chrome 扩展程序 https://selectorgadget.com/ ,它允许您在浏览器中单击所需的元素,并返回对应的 CSS 选择器。

以下代码使用正则表达式从内联 JSON https://dev.to/serpapi/13-ways-to-scrape-any-public-data-from-any-website-1bn9#inline-json 中提取图像数据,并使用 CSS 选择器提取其他数据。在线 IDE 中的完整示例 https://replit.com/@DimitryZub1/Google-Scrape-Inline-Shopping-pythonserpapi#main.py :

import requests, json, re
from parsel import Selector

# A desktop-browser user agent is sent with every request; the stock
# python-requests one makes it easy for Google to recognise a script.
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

# Query-string arguments for the search endpoint.
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "minecraft",  # search query
    "hl": "en",  # language
    "gl": "us",  # country of the search, US -> USA
    "tbm": "shop",  # google search shopping tab
}

html = requests.get(
    "https://www.google.com/search",
    params=params,
    headers=headers,
    timeout=30,
)

# Parsed response document that the extraction functions read from.
selector = Selector(text=html.text)

def get_original_images():
    """Look up the original image URL for every product card on the page.

    Reads the module-level ``selector``. For each
    ``.Qlx7of .sh-dgr__grid-result`` element, the element's ``data-pck``
    attribute is searched for in the page's inline scripts, in an assignment
    of the form ``var _u='<url>';var _i='<data-pck>';``.

    Returns:
        A list with exactly one entry per matched element, in document order:
        the decoded image URL, or ``None`` when the element has no
        ``data-pck`` attribute or no matching script assignment exists.
        Keeping a placeholder for misses lets callers pair this list
        positionally with the product cards without shifting later entries.
    """
    # Each closing tag is followed by a newline; since ``.`` does not match
    # newlines, a match can never extend past the end of a script element.
    all_script_tags = "".join(
        script.replace("</script>", "</script>\n")
        for script in selector.css("script").getall()
    )

    image_urls = []

    for result in selector.css(".Qlx7of .sh-dgr__grid-result"):
        product_id = result.attrib.get("data-pck")
        if product_id is None:
            image_urls.append(None)
            continue

        # https://regex101.com/r/udjFUq/1
        # The id is escaped so it is always matched literally.
        match = re.search(
            rf"var\s?_u='(.*?)';var\s?_i='{re.escape(product_id)}';",
            all_script_tags,
        )
        if match is None:
            image_urls.append(None)
            continue

        # The URL is stored with backslash escapes (e.g. \x3d); decode them.
        image_urls.append(bytes(match.group(1), "ascii").decode("unicode-escape"))

    return image_urls

def _absolute_google_url(path):
    """Prefix *path* with the Google origin; ``None``/empty passes through unchanged."""
    return "https://www.google.com" + path if path else path


def _search_group(pattern, text, group=0):
    """Return *group* of the first *pattern* match in *text*.

    A falsy *text* is returned unchanged; ``None`` is returned when *text*
    does not match.
    """
    if not text:
        return text
    match = re.search(pattern, text)
    return match.group(group) if match else None


def get_suggested_search_data():
    """Extract the shopping results from the page and print them as JSON.

    Reads the module-level ``selector``. Each ``.Qlx7of .i0X6df`` element is
    paired positionally with the URLs returned by ``get_original_images()``.
    Any field whose element is missing from a card is reported as ``None``
    (``null`` in the output) instead of raising.

    Returns:
        None. The collected list of dicts is printed to stdout as indented,
        non-ASCII-escaped JSON.
    """
    google_shopping_data = []

    for result, thumbnail in zip(selector.css(".Qlx7of .i0X6df"), get_original_images()):
        title = result.css(".tAxDx::text").get()
        product_link = _absolute_google_url(result.css(".Lq5OHe::attr(href)").get())
        product_rating = result.css(".NzUzee .Rsc7Yb::text").get()
        product_reviews = result.css(".NzUzee > div::text").get()
        price = result.css(".a8Pemb::text").get()
        store = result.css(".aULzUe::text").get()
        store_link = _absolute_google_url(result.css(".eaGTj div a::attr(href)").get())
        delivery = result.css(".vEjMR::text").get()

        # https://regex101.com/r/kAr8I5/1
        store_rating = _search_group(
            r"^\S+",
            result.css(".zLPF4b .XEeQ2 .QIrs8::text").get(),
        )

        # https://regex101.com/r/axCQAX/1
        # Group 1 is taken so that an optional leading "(" is not part of the value.
        store_reviews = _search_group(
            r"^\(?(\S+)",
            result.css(".zLPF4b .XEeQ2 .ugFiYb::text").get(),
            group=1,
        )

        store_reviews_link = _absolute_google_url(
            result.css(".zLPF4b .XEeQ2 .QhE5Fb::attr(href)").get()
        )
        compare_prices_link = _absolute_google_url(
            result.css(".Ldx8hd .iXEZD::attr(href)").get()
        )

        google_shopping_data.append({
            "title": title,
            "product_link": product_link,
            "product_rating": product_rating,
            "product_reviews": product_reviews,
            "price": price,
            "store": store,
            "thumbnail": thumbnail,
            "store_link": store_link,
            "delivery": delivery,
            "store_rating": store_rating,
            "store_reviews": store_reviews,
            "store_reviews_link": store_reviews_link,
            "compare_prices_link": compare_prices_link,
        })

    print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))

部分输出:

[
  {
    "title": "Minecraft Mini Mob 4-Piece Figure Mood Light Set | Battery Operated",
    "product_link": "https://www.google.com/shopping/product/15256303704867209410?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wII7xY",
    "product_rating": "5.0",
    "product_reviews": null,
    "price": "$29.99",
    "store": "Oriental Trading Company",
    "thumbnail": "https://encrypted-tbn1.gstatic.com/shopping?q=tbn:ANd9GcS7Xddy5pF2gPiRFpF0E1YumatHuyBW3HYiltvZrimFoP_r3yAGWWMcYcnhaRrb7prHSAc93lWBEGQEGJ9NUCBkvQuvMCfxFXWXjY6oqrLebAmDtqcwpY6l&usqp=CAE",
    "store_link": "https://www.google.com/url?url=https://www.orientaltrading.com/minecraft-mini-mob-4-piece-figure-mood-light-set-battery-operated-a2-14260956.fltr%3Fsku%3D14260956%26cm_mmc%3DGooglePLA-_-Free-_-Google-_-14260956%26BP%3DPS544&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECPEW&usg=AOvVaw0KxuR61pE4aEt37xEXBI2O",
    "delivery": "Delivery by Wed, Dec 7",
    "store_rating": "4.7",
    "store_reviews": "45",
    "store_reviews_link": "https://www.google.com/url?url=https://www.google.com/shopping/ratings/account/metrics%3Fq%3Dorientaltrading.com%26c%3DUS%26v%3D19%26hl%3Den&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ9-wCCPkW&usg=AOvVaw2WL-Mo7EBJ9N8C4NlQEJ_n",
    "compare_prices_link": "https://www.google.com/shopping/product/15256303704867209410/offers?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECPoW"
  }, # other results
  {
    "title": "Minecraft Explorer Kit - Build Minecraft in The Real World",
    "product_link": "https://www.google.com/shopping/product/10073223339448590299?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wIIzRg",
    "product_rating": null,
    "product_reviews": null,
    "price": "$99.99",
    "store": "Make-A-Fort",
    "thumbnail": "https://encrypted-tbn2.gstatic.com/shopping?q=tbn:ANd9GcTJ55chkN9FYuwRQbupSWJRdSS70Y8XHKxQEUvOOuwHKbuBaSekHcWo9wndDFA-5_ZMlIdJFpWqMwpyMd9RDmUEiQ_DpaSaigwmPHBceO5rg885VEh_YbacBw&usqp=CAE",
    "store_link": "https://www.google.com/url?url=https://www.makeafort.fun/shop/original-fort-kits/1mek%3Futm_source%3Dgoogle-shopping%26utm_medium%3Dcpc&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECM4Y&usg=AOvVaw3ZzxgI8ILnCg0-Nd78JH7F",
    "delivery": "Delivery by Thu, Dec 8",
    "store_rating": null,
    "store_reviews": null,
    "store_reviews_link": null,
    "compare_prices_link": "https://www.google.com/shopping/product/10073223339448590299/offers?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECNEY"
  }
]

或者,您可以使用 SerpApi 提供的 Google 购物结果 API https://serpapi.com/shopping-results :

from serpapi import GoogleSearch
import requests, lxml, os, json

# Search parameters forwarded to SerpApi.
params = {
    "q": "minecraft",  # search query
    "tbm": "shop",  # shop results
    "location": "Dallas",  # location from where search comes from
    "hl": "en",  # language of the search
    "gl": "us",  # country of the search
    # The key is read from the environment:
    # https://docs.python.org/3/library/os.html#os.getenv
    "api_key": os.getenv("API_KEY"),  # your serpapi api key
}

# The data extraction itself happens on the SerpApi backend.
search = GoogleSearch(params)
results = search.get_dict()  # JSON -> Python dict

google_shopping_data = results["shopping_results"]

print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))

部分输出:

[
  # other results
  {
    "position": 80,
    "title": "Minecraft Steve Vacuform Mask",
    "link": "https://www.fun.com/minecraft-steve-vacuform-mask.html?mpid=191051&srsltid=AYJSbAfU8d_TRhvnvhvi9-U79_BB8bgh_dTHGkD75Dt6mq8nK0apj3hUOjY",
    "product_link": "https://www.google.com/shopping/product/15914996745618368243?gl=us",
    "product_id": "15914996745618368243",
    "serpapi_product_api": "https://serpapi.com/search.json?device=desktop&engine=google_product&gl=us&google_domain=google.com&hl=en&location=Dallas&product_id=15914996745618368243",
    "source": "Fun.com",
    "price": "$12.99",
    "extracted_price": 12.99,
    "rating": 4.1,
    "reviews": 40,
    "extensions": [
      "15% OFF"
    ],
    "thumbnail": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQe1LOeSKWgFvhVt_bct6rRohpAvl2023AqbnqE78dxwocrz7Sbre-tQ5s9M26_4q8bp86eRzI9PvfXwaBLmaESZlXwxH5HF9monqhr7jyChYqSLHWo9PcUFmU&usqp=CAE",
    "tag": "15% OFF",
    "delivery": "$4.99 delivery"
  }
]

如果您想更好地了解上述代码的作用,可以阅读专门的博客文章《使用 Python 抓取 Google Shopping Tab》: https://serpapi.com/blog/web-scraping-google-shopping-tab-in-python/#what_will_be_scraped 。

免责声明:我在 SerpApi 工作。

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

使用 Python 抓取 Google 购物 的相关文章

随机推荐