这是打开和关闭选项卡/浏览器的方法。
from datetime import datetime
from scrape_linkedin import ProfileScraper
import random #new import made
from selenium import webdriver #new import made
import pandas as pd
import json
import os
import re
import time
# LinkedIn profile URLs to scrape; each one gets its own browser session.
my_profile_list = [
    'https://www.linkedin.com/in/williamhgates/',
    'https://www.linkedin.com/in/christinelagarde/',
    'https://www.linkedin.com/in/ursula-von-der-leyen/',
]
# Value of the `li_at` session cookie, passed to ProfileScraper as `cookie`.
myLI_AT_Key = 'INSERT LI_AT Key'


def profile_name_from_url(link):
    """Return the profile slug of a LinkedIn profile URL.

    The ``https://www.linkedin.com/in/`` prefix, any query string
    (``?locale=...``, ``?originalSubdomain=...``, ...) and all slashes are
    removed, so the result can be used as a file name.

    Args:
        link: Profile URL, e.g. ``https://www.linkedin.com/in/williamhgates/``.

    Returns:
        The slug, e.g. ``williamhgates``.
    """
    name = link.replace('https://www.linkedin.com/in/', '')
    # Drop every query string instead of a hard-coded list of known ones.
    name = name.split('?', 1)[0]
    return name.replace('/', '')


def scrape_profile(link, output_dir):
    """Scrape one profile in its own browser and dump it to ``<slug>.json``.

    Args:
        link: Profile URL to scrape.
        output_dir: Existing directory that receives the JSON file.
    """
    # If chromedriver is not on PATH, build the driver with an explicit
    # executable path instead.
    my_driver = webdriver.Chrome()
    try:
        # Hand our own driver to scrape_linkedin; scroll increment and pause
        # are slightly randomised on every profile.
        ps = ProfileScraper(
            cookie=myLI_AT_Key,
            scroll_increment=random.randint(10, 50),
            scroll_pause=0.8 + random.uniform(0.8, 1),
            driver=my_driver,
        )
        print('Currently scraping: ', link, 'Time: ', datetime.now())
        profile = ps.scrape(url=link)
        dataJSON = profile.to_dict()
        out_path = os.path.join(output_dir, profile_name_from_url(link) + '.json')
        with open(out_path, 'w', encoding='utf-8') as json_file:
            json.dump(dataJSON, json_file)
        # Randomised pause between profiles.
        time.sleep(10 + random.randint(0, 5))
    finally:
        # Close the browser after every profile, even when scraping failed,
        # so no Chrome process is leaked.
        my_driver.quit()


def main():
    """Scrape every URL in ``my_profile_list`` and print a short summary."""
    output_dir = os.path.join(os.getcwd(), 'ScrapedLinkedInprofiles')
    # open() does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    for link in my_profile_list:
        scrape_profile(link, output_dir)
    if my_profile_list:
        # Index, don't slice: [0:] is the whole list and [-1:] a 1-item list.
        print('The first observation scraped was:', my_profile_list[0])
        print('The last observation scraped was:', my_profile_list[-1])
    print('END')


if __name__ == '__main__':
    main()
该抓取器默认使用 Chrome 作为浏览器，您也可以在所有支持的地方（例如 CompanyScraper、ProfileScraper 等）自由选择想要使用的浏览器。
我只是更改了初始化 ProfileScraper() 类时传入的默认参数，让您自己的驱动程序（而不是默认驱动程序）来打开并关闭浏览器，并按照您的要求在等待/睡眠间隔中加入了一些随机时间（您可以根据需要调整；这些随机噪声是为了方便您而添加的，可以自行更改）。
没有必要像我在评论中建议的那样使用 scrape_in_parallel()；但如果您愿意，可以指定要运行的浏览器实例数量（num_instances），并传入您自己的驱动程序字典，驱动程序的选项也可以单独指定（放在另一个字典中）：
from scrape_linkedin import scrape_in_parallel, CompanyScraper
from selenium import webdriver
# One Chrome window per parallel scraper instance.
driver1 = webdriver.Chrome()
driver2 = webdriver.Chrome()
driver3 = webdriver.Chrome()
driver4 = webdriver.Chrome()
my_drivers = [driver1, driver2, driver3, driver4]

# Company identifiers to scrape; append more names as needed. Do not leave a
# literal `...` in this list: inside a list it is the Ellipsis object and
# would be handed to the scraper as if it were a company.
companies = ['facebook', 'google', 'amazon', 'microsoft']

# Map 1-based instance numbers to the drivers created above.
driver_dict = dict(enumerate(my_drivers, start=1))

# Scrape all companies, output to 'companies.json', one browser instance per
# driver (4 here).
# NOTE(review): passing a dict of driver instances as `driver` follows the
# original example; confirm against scrape_in_parallel's signature.
scrape_in_parallel(
    scraper_type=CompanyScraper,
    items=companies,
    output_file="companies.json",
    num_instances=len(my_drivers),
    driver=driver_dict,
)
它是开源代码，并且由于完全用 Python 编写，您可以很容易地读懂源代码。这是一个非常有趣的抓取器，谢谢你让我知道它！
NOTE:
正如 GitHub 的 Issues 选项卡中所述，此模块还有一些尚未解决的问题。如果它不能正常工作，换作是我，会等待更多的分叉（fork）和更新。