# python3.7+win10
import requests
from lxml import etree
import xlwt
from spider_seting import *
import time
from selenium import webdriver
import random
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Excel:
    """Thin wrapper around an xlwt workbook for collecting scraped goods rows.

    One header row is written on construction; each ``write_content`` call
    appends one data row below it.
    """

    def __init__(self, sheet_name='sheet1'):
        # Column headers: serial number, name, image URL, price, shop, detail URL.
        title_label = ['商品编号', '商品名称', '图片路径', '价格', '商家', '商品详情地址']
        self.write_work = xlwt.Workbook(encoding='ascii')
        self.write_sheet = self.write_work.add_sheet(sheet_name)
        # Next free row. Kept per-instance (was a class attribute, which would
        # make two Excel objects interfere with each other's row counters).
        self._current_row = 1
        for col, label in enumerate(title_label):
            self.write_sheet.write(0, col, label=label)

    def write_content(self, content):
        """Write one row of cell values and advance the row cursor.

        content: sequence of cell values, one per column.
        """
        for col, value in enumerate(content):
            self.write_sheet.write(self._current_row, col, label=value)
        self._current_row += 1

    def save_file(self, file_url='./dj_data.xls'):
        """Persist the workbook to *file_url*, printing success or failure."""
        try:
            self.write_work.save(file_url)
            print("文件保存成功!文件路径为:" + file_url)
        except IOError:
            print("文件保存失败!")
def get_html(url,
             driver_path="C:/Users/Administrator/Desktop/TEM/chromedriver.exe",
             browser_binary="G:/360browser/360Chrome/Chrome/Application/360chrome.exe"):
    """Load *url* in a Chrome-driven browser and return the parsed lxml tree.

    Scrolls to the bottom of the page so lazily-loaded list items render
    before the page source is captured.

    Parameters:
        url: page to fetch.
        driver_path: chromedriver executable (defaults to the original
            hard-coded local path; now overridable per call).
        browser_binary: Chrome-compatible browser binary (here a 360 Chrome
            install) — same default as before, now a parameter.

    Returns:
        lxml.etree element tree of the rendered page source.
    """
    option = webdriver.ChromeOptions()
    option.binary_location = browser_binary
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-gpu')
    # Hide the "controlled by automated software" banner (mild anti-bot evasion).
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # NOTE(review): executable_path/chrome_options are deprecated in Selenium 4
    # (use Service/options=); kept for compatibility with the Selenium 3 API
    # this script was written against.
    browser = webdriver.Chrome(executable_path=driver_path, chrome_options=option)
    try:
        browser.get(url)
        time.sleep(random.randint(1, 2))  # crude randomized anti-bot delay
        # Scroll to the bottom so lazy-loaded goods images/items get rendered.
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(random.randint(1, 3))
        source = browser.page_source
    finally:
        # Always quit, even on error, so a failed fetch doesn't leak a
        # browser/chromedriver process (the original leaked on exception).
        browser.quit()
    return etree.HTML(source)
def save_data(search_url):
    """Scrape one JD search-result page and append each item to the workbook.

    Relies on the module-level ``excel`` instance created under ``__main__``.
    Saves the workbook once after processing the whole page.
    """
    html = get_html(search_url)
    goods_list = html.xpath(r'.//li[@class="gl-item"]')
    serial_num_list = html.xpath(r"//li[@class='gl-item']/@data-sku")
    for li, serial_num in zip(goods_list, serial_num_list):
        name = "".join(li.xpath(r"div/div[@class='p-name p-name-type-2']/a/em/text()"))
        # Lazily-loaded images keep the real URL in @data-lazy-img; once the
        # image has loaded that attribute reads "done" and @src holds the URL.
        lazy = "".join(li.xpath(r'.//div/div/a/img/@data-lazy-img'))
        if lazy == "done":
            img_url = "https:" + "".join(li.xpath(r'.//div/div/a/img/@src'))
        else:
            img_url = "https:" + lazy
        # xpath() returns lists; join to plain strings so xlwt can write the
        # cells (Worksheet.write raises "Unexpected data type" on a list —
        # the original passed the raw lists for price and shop).
        price = "".join(li.xpath(r".//div/div/strong/i/text()"))
        shop = "".join(li.xpath(r".//div/span/a/text()"))
        detail_addr = "https:" + "".join(li.xpath(r".//div/div[1]/a/@href"))
        excel.write_content([serial_num, name, img_url, price, shop, detail_addr])
    # Persist once per page rather than once per item.
    excel.write_work.save("./dj_data.xls")
if __name__ == '__main__':
    # Module-level workbook shared with save_data().
    excel = Excel()
    keyword = input("输入搜索的关键词=")
    # JD search results use odd page numbers (1, 3, 5, ...), hence step 2.
    page = 1
    while page < 200:
        search_url = ('https://search.jd.com/Search?keyword=' + keyword
                      + "&page=" + str(page) + '&enc=utf-8')
        print(search_url)
        time.sleep(2)  # throttle requests between pages
        save_data(search_url)
        page += 2
# 本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)