我正在尝试获取一个使用延迟加载的网页的完整 HTML。我尝试过一直滚动到底部，然后调用 page.content()；也尝试过在滚动到底部后再滚动回页面顶部，然后调用 page.content()。两种方法都只能抓取到表格的部分行，而不是全部行，而获取全部行正是我的主要目标。我认为该网页使用了 React.js 的延迟加载。
const puppeteer = require('puppeteer');
const url = 'https://www.torontopearson.com/en/departures';
const fs = require('fs');
// Launches a browser, opens `url`, scrolls through the page with autoScroll()
// so that scroll-triggered content gets a chance to load, then writes the
// resulting page HTML to scrape.html.
puppeteer.launch().then(async browser => {
  // Timer-based pause. Used instead of page.waitFor(), which is deprecated
  // and not available in current Puppeteer releases.
  const delay = ms => new Promise(resolve => setTimeout(resolve, ms));

  try {
    const page = await browser.newPage();
    await page.goto(url);
    await delay(300);
    //scroll to bottom
    await autoScroll(page);
    // Pause after reaching the bottom before reading the DOM.
    await delay(2500);
    //scroll to top of page
    await page.evaluate(() => window.scrollTo(0, 50));
    const html = await page.content();
    // Promise-based write: the callback form of fs.writeFile returns nothing
    // to await, so the browser could be closed before the write had finished
    // and a write error would be thrown outside this promise chain.
    await fs.promises.writeFile('scrape.html', html);
    console.log("Successfully Written to File.");
  } finally {
    // Always shut the browser down, even when a step above throws.
    await browser.close();
  }
}).catch(err => {
  // Report the failure and signal it through the process exit code instead of
  // leaving the rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});
//method used to scroll to bottom, referenced from user visualxcode on https://github.com/GoogleChrome/puppeteer/issues/305
/**
 * Scrolls the page downwards in fixed steps on a timer until the accumulated
 * scroll distance reaches the current scroll height of the document body.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`; here a
 *   Puppeteer page.
 * @param {object} [options] - Optional tuning knobs.
 * @param {number} [options.distance=300] - Pixels scrolled per step; must be
 *   greater than zero.
 * @param {number} [options.delay=100] - Milliseconds between steps. A larger
 *   value leaves more time between steps for content to be added to the page.
 * @returns {Promise<void>} Resolves once the scroll loop has stopped.
 * @throws {RangeError} If `distance` is not a positive number (the loop would
 *   otherwise never terminate).
 */
async function autoScroll(page, { distance = 300, delay = 100 } = {}) {
  if (!(distance > 0)) {
    throw new RangeError('autoScroll: distance must be a positive number');
  }

  // The settings are handed to evaluate() as an argument rather than captured
  // from the enclosing scope, because the callback is executed via
  // page.evaluate and not as an ordinary closure.
  await page.evaluate(async ({ step, intervalMs }) => {
    await new Promise((resolve) => {
      let totalHeight = 0;
      const timer = setInterval(() => {
        // Read the height on every tick so growth of the page while
        // scrolling is taken into account.
        const scrollHeight = document.body.scrollHeight;
        window.scrollBy(0, step);
        totalHeight += step;
        if (totalHeight >= scrollHeight) {
          clearInterval(timer);
          resolve();
        }
      }, intervalMs);
    });
  }, { step: distance, intervalMs: delay });
}
我在这方面并不擅长，但经过长时间的搜索，我找到了一种能够很好地满足我需求的解决方案。下面是我用来处理延迟加载场景的代码片段。
// Scrolls through the page one viewport height at a time, returns to the top,
// and then takes a full-page screenshot.
// NOTE(review): `page` and `wait` are not defined in this snippet; it assumes
// a Puppeteer page and a promise-returning delay helper are already in scope.

// Measure the height of <body> once, before scrolling starts.
const bodyHandle = await page.$('body');
let bodyBox;
try {
  bodyBox = await bodyHandle.boundingBox();
} finally {
  // Release the element handle even if measuring fails.
  await bodyHandle.dispose();
}
if (bodyBox === null) {
  // boundingBox() resolves to null when the element is not visible; fail with
  // a clear message instead of an opaque destructuring TypeError.
  throw new Error('Unable to measure <body>: boundingBox() returned null');
}
const { height } = bodyBox;

console.log('Handling viewport...');
const viewportHeight = page.viewport().height;
let viewportIncr = 0;
// Advance by one viewport height per iteration until the measured body
// height has been covered.
while (viewportIncr + viewportHeight < height) {
  await page.evaluate(_viewportHeight => {
    window.scrollBy(0, _viewportHeight);
  }, viewportHeight);
  // Short pause between scroll steps.
  await wait(30);
  viewportIncr = viewportIncr + viewportHeight;
}

console.log('Handling Scroll operations');
// Return to the top before capturing the screenshot.
await page.evaluate(_ => {
  window.scrollTo(0, 0);
});
await wait(100);
await page.screenshot({path: 'GoogleHome.jpg', fullPage: true});
这样我甚至可以截取整页的长截图。希望对你有帮助。
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)