开始编写代码之前,你应了解 Ajax 与异步加载,掌握 Python 的基础语法和常用库,并熟悉 HTML、JavaScript。本人使用的 IDE 是 VS Code,浏览器是 Chrome,Python 版本为 3.7,主要用到了 Selenium 自动化测试工具。
一:先看效果(这里以“手机”为例。为了简单起见,也为了提高运行速度,不加载图片,只爬取商品描述信息)
……海量的数据,未完……
二:引用的库
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
完整代码如下:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import time
# Home page the crawler starts from.
url = "https://www.jd.com"

# Build the options BEFORE creating the browser and hand them to it; options
# configured after (or never passed to) webdriver.Chrome() have no effect.
options = webdriver.ChromeOptions()
# Content-setting value 2 blocks images, so pages load without pictures.
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})

# Shared browser instance used by the crawler functions below.
driver = webdriver.Chrome(options=options)
def _wait_for_products(timeout):
    """Block until elements with class ``gl-item`` are present in the DOM."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'gl-item'))
    )


def _scroll_down(limit=20000, step=100):
    """Scroll the window down in ``step``-pixel increments up to ``limit``.

    Presumably this triggers the page's lazy loading of further items;
    confirm against the site's behaviour.
    """
    js = "window.scrollTo(0,{num})"
    for offset in range(10, limit, step):
        try:
            driver.execute_script(js.format(num=offset))
        except Exception:
            # Scrolling is best-effort: stop at the first failure and let the
            # caller parse whatever has been rendered so far.
            break


def splider(keyword='手机', timeout=1000):
    """Search the site for *keyword* and print the text of every product title.

    Uses the module-level ``driver`` to open the module-level ``url``, types
    the keyword into the element with id ``key``, submits it, reads the total
    number of result pages from the pager (``span.p-skip em b``) and then
    visits each page in turn, printing the text of every ``.gl-item a em``
    element.

    Args:
        keyword: Search term to submit. Defaults to the previously
            hard-coded value.
        timeout: Maximum number of seconds to wait for the product list to
            appear on each page.

    Raises:
        ValueError: If the pager text is not an integer.
        Exception: Selenium errors raised while loading the first result
            page or while locating/clicking the "next page" link propagate
            to the caller.
    """
    # Request the home page and submit the search.
    driver.get(url)
    search_box = driver.find_element(By.ID, "key")
    search_box.send_keys(keyword)
    search_box.send_keys(Keys.ENTER)
    _wait_for_products(timeout)

    # Total number of result pages. int() instead of eval(): the text comes
    # from a remote page and must never be executed as code.
    all_page = int(driver.find_element(By.CSS_SELECTOR, 'span.p-skip em b').text)

    # A bounded loop guarantees termination even when a page fails; the old
    # "count == all_page" check could be skipped past after an error.
    for page in range(1, all_page + 1):
        try:
            _wait_for_products(timeout)
            # Short pause before scrolling (kept from the original flow).
            time.sleep(2)
            _scroll_down()
            doc = pq(driver.page_source)
            for title in doc('.gl-item a em').items():
                print(title.text())
        except Exception as e:
            # Report the failure instead of swallowing it, then move on.
            print("第{page}页抓取失败:{error}".format(page=page, error=str(e)))

        # Advance via the "next page" link unless this was the last page.
        if page < all_page:
            driver.find_element(By.CSS_SELECTOR, 'a.pn-next').click()
def main():
    """Program entry point: run the crawler once."""
    splider()
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Top-level boundary: report any uncaught crawler error to the user.
        print("入口异常{error}".format(error = str(e)))
    finally:
        # Always close the browser so no Chrome/chromedriver process is left
        # running after the script ends, whether it succeeded or failed.
        driver.quit()
三:总结
后续还会有数据分析与数据处理的内容(统计图、词云),敬请期待!