您可以使用 parsel (https://parsel.readthedocs.io/en/latest/) 和 requests (https://requests.readthedocs.io/en/latest/user/quickstart/) 库来实现此目的,而无需使用 selenium,
因为您需要的一切都在 HTML 中(不是通过 JavaScript 呈现)。
请确保在请求头中设置用户代理(user-agent),因为 requests 的默认用户代理是 python-requests (https://github.com/psf/requests/blob/589c4547338b592b1fb77c65663d8aa6fbb7e38b/requests/utils.py#L808-L814),谷歌会据此识别出这是一个发送请求的脚本,并可能将其阻止。您可以在这里检查您的用户代理是什么:https://www.whatismybrowser.com/detect/what-is-my-user-agent/ 。如果您需要针对每个请求轮换用户代理 (https://serpapi.com/blog/how-to-reduce-chance-of-being-blocked-while-web/#rotate-user-agents),可以参考用户代理列表:https://developers.whatismybrowser.com/useragents/explore/ 。
如果很难弄清楚如何使用 CSS 选择器来提取正确的数据,请查看 SelectorGadget Chrome 扩展程序 (https://selectorgadget.com/),它允许您在浏览器中单击所需的元素并返回对应的 CSS 选择器。
下面的代码使用正则表达式从内联 JSON (https://dev.to/serpapi/13-ways-to-scrape-any-public-data-from-any-website-1bn9#inline-json) 中提取图像数据,并使用 CSS 选择器提取其他数据。在线 IDE 中的完整示例 (https://replit.com/@DimitryZub1/Google-Scrape-Inline-Shopping-pythonserpapi#main.py):
import requests, json, re
from parsel import Selector
# Query-string parameters for the search request, see
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "minecraft",
    "hl": "en",     # language
    "gl": "us",     # country of the search, US -> USA
    "tbm": "shop",  # google search shopping tab
}

# Custom request headers, see
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/103.0.0.0 Safari/537.36"
    ),
}

# Fetch the results page once; the parsed selector is shared by the
# extraction functions defined below.
html = requests.get(
    url="https://www.google.com/search",
    params=params,
    headers=headers,
    timeout=30,
)
selector = Selector(text=html.text)
def get_original_images():
    """Collect the original image URL of each shopping result.

    Each ``.Qlx7of .sh-dgr__grid-result`` element carries a ``data-pck``
    id.  The page's inline scripts contain assignments of the form
    ``var _u='<escaped url>';var _i='<id>';``; the URL paired with each id
    is looked up there and its escape sequences are decoded.

    Reads the module-level ``selector``.

    Returns:
        list[str]: decoded image URLs, in the order the result elements
        are selected.  Results without a ``data-pck`` attribute, or whose
        id has no matching script assignment, are skipped.
    """
    # Put every <script> element on its own line: "." in the pattern below
    # does not match a newline, so a match cannot run from one script into
    # the next.
    all_script_tags = "".join(
        script.replace("</script>", "</script>\n")
        for script in selector.css("script").getall()
    )

    image_urls = []
    for result in selector.css(".Qlx7of .sh-dgr__grid-result"):
        image_id = result.attrib.get("data-pck")
        if image_id is None:
            # No id to look up -- skip instead of raising KeyError.
            continue

        # https://regex101.com/r/udjFUq/1
        # The id is escaped so regex metacharacters in it are matched
        # literally instead of being interpreted as pattern syntax.
        match = re.search(
            rf"var\s?_u='(.*?)';var\s?_i='{re.escape(image_id)}';",
            all_script_tags,
        )
        if match is None:
            continue

        # "backslashreplace" keeps a stray non-ASCII character from raising
        # UnicodeEncodeError; it is written as an escape sequence that the
        # "unicode-escape" decode below turns back into the same character.
        url_decode = (
            match.group(1)
            .encode("ascii", "backslashreplace")
            .decode("unicode-escape")
        )
        image_urls.append(url_decode)

    return image_urls
def _absolute_url(href):
    """Prefix a site-relative *href* with the Google origin; falsy values pass through."""
    return "https://www.google.com" + href if href else href


def _search_group(pattern, text, group=0):
    """Return *group* of the first *pattern* match in *text*.

    A falsy *text* (``None`` or ``""``) is returned unchanged; ``None`` is
    returned when *text* does not match.
    """
    if not text:
        return text
    match = re.search(pattern, text)
    return match.group(group) if match else None


def get_suggested_search_data():
    """Extract the shopping results from the page and print them as JSON.

    Reads the module-level ``selector`` and calls ``get_original_images()``
    for the thumbnails.  Every field that is missing from a result is
    reported as ``null`` instead of aborting the whole run.

    Returns:
        None.  The collected list of dicts is printed to stdout.
    """
    google_shopping_data = []

    # NOTE(review): zip() pairs results and thumbnails purely by position
    # and stops at the shorter sequence.  get_original_images() skips
    # results it finds no image for, so a thumbnail may be paired with the
    # wrong product when that happens -- confirm against real pages.
    for result, thumbnail in zip(selector.css(".Qlx7of .i0X6df"), get_original_images()):
        store_rating_value = result.css(".zLPF4b .XEeQ2 .QIrs8::text").get()
        store_reviews_value = result.css(".zLPF4b .XEeQ2 .ugFiYb::text").get()

        google_shopping_data.append({
            "title": result.css(".tAxDx::text").get(),
            # Links are site-relative; a missing href stays None rather
            # than raising TypeError on str + None.
            "product_link": _absolute_url(result.css(".Lq5OHe::attr(href)").get()),
            "product_rating": result.css(".NzUzee .Rsc7Yb::text").get(),
            "product_reviews": result.css(".NzUzee > div::text").get(),
            "price": result.css(".a8Pemb::text").get(),
            "store": result.css(".aULzUe::text").get(),
            "thumbnail": thumbnail,
            "store_link": _absolute_url(result.css(".eaGTj div a::attr(href)").get()),
            "delivery": result.css(".vEjMR::text").get(),
            # https://regex101.com/r/kAr8I5/1
            "store_rating": _search_group(r"^\S+", store_rating_value),
            # https://regex101.com/r/axCQAX/1
            # Group 1 is used so the optional leading "(" that the pattern
            # keeps outside the capture is not part of the value.
            "store_reviews": _search_group(r"^\(?(\S+)", store_reviews_value, group=1),
            "store_reviews_link": _absolute_url(
                result.css(".zLPF4b .XEeQ2 .QhE5Fb::attr(href)").get()
            ),
            "compare_prices_link": _absolute_url(
                result.css(".Ldx8hd .iXEZD::attr(href)").get()
            ),
        })

    print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))
部分输出:
[
{
"title": "Minecraft Mini Mob 4-Piece Figure Mood Light Set | Battery Operated",
"product_link": "https://www.google.com/shopping/product/15256303704867209410?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wII7xY",
"product_rating": "5.0",
"product_reviews": null,
"price": "$29.99",
"store": "Oriental Trading Company",
"thumbnail": "https://encrypted-tbn1.gstatic.com/shopping?q=tbn:ANd9GcS7Xddy5pF2gPiRFpF0E1YumatHuyBW3HYiltvZrimFoP_r3yAGWWMcYcnhaRrb7prHSAc93lWBEGQEGJ9NUCBkvQuvMCfxFXWXjY6oqrLebAmDtqcwpY6l&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.orientaltrading.com/minecraft-mini-mob-4-piece-figure-mood-light-set-battery-operated-a2-14260956.fltr%3Fsku%3D14260956%26cm_mmc%3DGooglePLA-_-Free-_-Google-_-14260956%26BP%3DPS544&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECPEW&usg=AOvVaw0KxuR61pE4aEt37xEXBI2O",
"delivery": "Delivery by Wed, Dec 7",
"store_rating": "4.7",
"store_reviews": "45",
"store_reviews_link": "https://www.google.com/url?url=https://www.google.com/shopping/ratings/account/metrics%3Fq%3Dorientaltrading.com%26c%3DUS%26v%3D19%26hl%3Den&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ9-wCCPkW&usg=AOvVaw2WL-Mo7EBJ9N8C4NlQEJ_n",
"compare_prices_link": "https://www.google.com/shopping/product/15256303704867209410/offers?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECPoW"
}, # other results
{
"title": "Minecraft Explorer Kit - Build Minecraft in The Real World",
"product_link": "https://www.google.com/shopping/product/10073223339448590299?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wIIzRg",
"product_rating": null,
"product_reviews": null,
"price": "$99.99",
"store": "Make-A-Fort",
"thumbnail": "https://encrypted-tbn2.gstatic.com/shopping?q=tbn:ANd9GcTJ55chkN9FYuwRQbupSWJRdSS70Y8XHKxQEUvOOuwHKbuBaSekHcWo9wndDFA-5_ZMlIdJFpWqMwpyMd9RDmUEiQ_DpaSaigwmPHBceO5rg885VEh_YbacBw&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.makeafort.fun/shop/original-fort-kits/1mek%3Futm_source%3Dgoogle-shopping%26utm_medium%3Dcpc&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECM4Y&usg=AOvVaw3ZzxgI8ILnCg0-Nd78JH7F",
"delivery": "Delivery by Thu, Dec 8",
"store_rating": null,
"store_reviews": null,
"store_reviews_link": null,
"compare_prices_link": "https://www.google.com/shopping/product/10073223339448590299/offers?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECNEY"
}
]
或者,您可以使用 SerpApi 提供的 Google Shopping Results API (https://serpapi.com/shopping-results):
from serpapi import GoogleSearch
import requests, lxml, os, json
# Search parameters passed to SerpApi.
params = {
    "q": "minecraft",       # search query
    "tbm": "shop",          # shop results
    "location": "Dallas",   # location from where search comes from
    "hl": "en",             # language of the search
    "gl": "us",             # country of the search
    # https://docs.python.org/3/library/os.html#os.getenv
    "api_key": os.getenv("API_KEY"),  # your serpapi api
}

# where data extraction happens on the SerpApi backend
search = GoogleSearch(params)
# JSON -> Python dict
results = search.get_dict()

google_shopping_data = results["shopping_results"]
print(
    json.dumps(
        google_shopping_data,
        indent=2,
        ensure_ascii=False,
    )
)
部分输出:
[
# other results
{
"position": 80,
"title": "Minecraft Steve Vacuform Mask",
"link": "https://www.fun.com/minecraft-steve-vacuform-mask.html?mpid=191051&srsltid=AYJSbAfU8d_TRhvnvhvi9-U79_BB8bgh_dTHGkD75Dt6mq8nK0apj3hUOjY",
"product_link": "https://www.google.com/shopping/product/15914996745618368243?gl=us",
"product_id": "15914996745618368243",
"serpapi_product_api": "https://serpapi.com/search.json?device=desktop&engine=google_product&gl=us&google_domain=google.com&hl=en&location=Dallas&product_id=15914996745618368243",
"source": "Fun.com",
"price": "$12.99",
"extracted_price": 12.99,
"rating": 4.1,
"reviews": 40,
"extensions": [
"15% OFF"
],
"thumbnail": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQe1LOeSKWgFvhVt_bct6rRohpAvl2023AqbnqE78dxwocrz7Sbre-tQ5s9M26_4q8bp86eRzI9PvfXwaBLmaESZlXwxH5HF9monqhr7jyChYqSLHWo9PcUFmU&usqp=CAE",
"tag": "15% OFF",
"delivery": "$4.99 delivery"
}
]
如果您想更好地了解所示代码的作用,可以阅读专门的博客文章《使用 Python 抓取 Google Shopping Tab》:https://serpapi.com/blog/web-scraping-google-shopping-tab-in-python/#what_will_be_scraped
免责声明:我为 SerpApi 工作。