爬取前程无忧招聘信息
本文是关于招聘数据爬取,我们选取的网站是前程无忧。
百度直接搜索前程无忧,或者51job。我们将看到搜索栏,在搜索栏中输入“数据分析师”将可以看到工作信息。
至于分析网站在这里就不再解释了,本爬虫只是简单爬取一点数据,所以并没有怎么做出伪装爬虫机制。所以本文仅供参考学习。如果真的想要爬取这个网站,请联系博主,我会详细写出一篇来,下面是代码和数据仅供参考。
# !/usr/bin/python
# -*- coding: utf-8 -*-
'''
@File : qianchengwu_crab.py
@Time : 2020/03/15 21:21:18
@Author : Qingxiang Zhang
@Version : 1.0
@Contact : 344285081@qq.com
@Desc :
@Software: Vscode
'''
import csv
import json
import re
import urllib
import urllib.error
import urllib.request

import requests
def main():
    """Crawl 51job "数据分析师" search results (pages 1-59) into ./result.csv.

    For each result page, extracts the JSON payload that the page embeds in
    a ``window.__SEARCH_RESULT__`` <script> tag and appends one CSV row per
    job: [job_name, company_name, providesalary_text, jobwelf].
    """
    # Open the output file once for the whole crawl instead of re-opening it
    # for every single row (the original opened it inside the inner loop).
    with open("./result.csv", "a", encoding="utf-8", newline="") as f:
        csv_write = csv.writer(f)
        for i in range(1, 60):
            print('正在爬取第{}页信息'.format(i))
            # URL-encoded query: nationwide + keyword "数据分析师", page {i}.
            baseurl = "https://search.51job.com/list/000000,000000,0130%252c7501%252c7506%252c7502,01%252c32%252c38,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html".format(i)#全国+keyword
            html = askURL(baseurl)
            re_soup = re.search(r'window.__SEARCH_RESULT__ =(.*?)</script>', html)
            if re_soup is None:
                # askURL returns "" on network errors; without this guard
                # re_soup.group(1) would raise AttributeError. Skip the page.
                print('page {} returned no data, skipping'.format(i))
                continue
            json_data = json.loads(re_soup.group(1))
            for items in json_data["engine_search_result"]:
                csv_write.writerow([
                    items["job_name"],
                    items["company_name"],
                    items["providesalary_text"],
                    items["jobwelf"],
                ])
def askURL(url):
    """Fetch *url* and return the response body decoded as GBK text.

    51job serves its pages GBK-encoded, hence the explicit codec (with
    ``'ignore'`` so stray bytes cannot abort the crawl). On a network error
    the HTTP code and/or reason are printed and an empty string is returned.
    """
    head = {
        # Plain browser User-Agent so the request is not rejected as an
        # obvious script client.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # Context manager closes the connection (the original leaked it);
        # timeout prevents a stalled connection from hanging the crawl.
        with urllib.request.urlopen(request, timeout=30) as response:
            html = response.read().decode('gbk', 'ignore')
    except urllib.error.URLError as e:
        # HTTPError carries .code; plain URLError carries only .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()
数据样式: