// Pauses for a fixed 5000 ms no matter how quickly or slowly the page actually loads.
await page.waitFor(5000);
这种写法会导致竞争条件。如果页面在 5 秒内未加载完成,您可能会得到漏报(假阴性)。如果页面加载用时不到 5 秒,那么您就白白浪费了时间。只有在万不得已,或者该延迟本身就是应用程序逻辑的预期部分时,才应使用任意的固定延迟。
更好的方法是使用 page.waitForSelector(https://devdocs.io/puppeteer/#pagewaitforselectorselector-options)或 page.waitForNavigation(https://devdocs.io/puppeteer/#pagewaitfornavigationoptions)。
其次,我用选择器 .rc .r 没有得到任何结果。我不确定 Google 的 CSS 选择器有多稳定,但 .LC20lb 目前粗略看来是安全的。
综合起来,可以得到:
const puppeteer = require("puppeteer"); // ^19.6.3

// Declared outside the async function so the trailing .finally() can still
// reach the browser handle when a later step throws.
let browser;

// Types a query into Google's search box, submits it, and logs the title and
// link of every result heading found on the results page.
(async () => {
  const searchQuery = "stack overflow";
  const searchBoxSelector = 'input[aria-label="Search"]';
  const resultTitleSelector = ".LC20lb";

  browser = await puppeteer.launch();
  const [page] = await browser.pages();

  // Let only "document" requests through; every other resource type is aborted.
  await page.setRequestInterception(true);
  page.on("request", req => {
    if (req.resourceType() === "document") {
      req.continue();
    } else {
      req.abort();
    }
  });

  await page.goto("https://www.google.com/", {waitUntil: "domcontentloaded"});

  // Wait for the search box to be visible before typing instead of sleeping.
  await page.waitForSelector(searchBoxSelector, {visible: true});
  await page.type(searchBoxSelector, searchQuery);

  // The navigation wait is created before Enter is pressed, so the navigation
  // triggered by the key press cannot be missed.
  const navigated = page.waitForNavigation({waitUntil: "domcontentloaded"});
  const submitted = page.keyboard.press("Enter");
  await Promise.all([navigated, submitted]);

  await page.waitForSelector(resultTitleSelector, {visible: true});

  // Runs in the page context: each heading's text is the title and its parent
  // node's href is the link.
  const searchResults = await page.$$eval(resultTitleSelector, headings =>
    headings.map(heading => {
      const title = heading.innerText;
      const link = heading.parentNode.href;
      return {title, link};
    })
  );

  console.log(searchResults);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
输出(您的输出可能会有所不同,具体取决于运行脚本时 Google 显示的内容):
[
{
title: 'Stack Overflow - Where Developers Learn, Share, & Build ...',
link: 'https://stackoverflow.com/'
},
{
title: 'Stack Overflow - Wikipedia',
link: 'https://en.wikipedia.org/wiki/Stack_Overflow'
},
{
title: 'Stack Overflow Blog - Essays, opinions, and advice on the act ...',
link: 'https://stackoverflow.blog/'
},
{
title: 'The Stack Overflow Podcast - Stack Overflow Blog',
link: 'https://stackoverflow.blog/podcast/'
},
{
title: 'Stack Overflow | LinkedIn',
link: 'https://www.linkedin.com/company/stack-overflow'
}
]
另一种方法是将搜索词编码为 URL 查询参数,并直接导航到 https://www.google.com/search?q=your+query+here ,从而省去一次导航并避免潜在的选择器问题。
与许多抓取任务一样,由于目标只是从文档中获取简单的 href,您可以尝试改用 fetch/cheerio 来处理静态 HTML。在我的机器上,以下脚本的运行速度比包含两次导航的 Puppeteer 快约 5 倍,比直接导航到搜索结果的 Puppeteer 快约 3 倍。
const cheerio = require("cheerio"); // 1.0.0-rc.12

// Fetches the static HTML of a Google results page for a fixed query and logs
// the title and link of every result heading found in it.
const query = "stack overflow";
const url = `https://www.google.com/search?q=${encodeURIComponent(query)}`;

fetch(url, { // Node 18 or install node-fetch
  headers: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
  }
})
  .then(res => {
    // A non-2xx response would otherwise be parsed like a results page and
    // silently produce an empty list, so fail loudly instead.
    if (!res.ok) {
      throw new Error(`Request failed: ${res.status} ${res.statusText}`);
    }
    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    // Each heading's text is the title; its parent node's href attribute is the link.
    const searchResults = [...$(".LC20lb")].map(e => ({
      title: $(e).text().trim(),
      link: e.parentNode.attribs.href,
    }));
    console.log(searchResults);
  })
  .catch(err => {
    // Report network, HTTP and parsing failures rather than leaving the
    // rejection unhandled, and keep a non-zero exit status.
    console.error(err);
    process.exitCode = 1;
  });
另请参阅:使用 Puppeteer 单击第一个 Google 搜索结果上的元素(https://stackoverflow.com/questions/64470495/click-an-element-on-first-google-search-result-using-puppeteer/67523820#67523820)。