Crawler Reverse-Engineering 06: bs4, xpath, and pyquery in Practice
I had originally planned to compare these three modules in detail, but while practicing I found that as long as you follow one routine, scraping static pages (pages whose data sits directly in the HTML source) is fairly straightforward. The finer usage details differ for everyone; the only way to really master these three tools is to practice with them yourself.
The routine is the one shown in the 《xpath实战案例_猪八戒》 and 《pyquery实战案例》 videos in Chapter 2 (data parsing) of the 路飞学城 reverse-engineering crawler course. I won't restate it here; go practice it yourself, because just watching without writing code gets you nowhere.
The code is below, for learning purposes only. It is also time-sensitive: once a site's HTML structure changes, these selectors will stop working.
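Before the full scripts, here is the shared routine in miniature: fetch the HTML, load it into a parser, locate the nodes, extract text or attributes. The one-line snippet and its class names below are made up for illustration; only the three library calls matter.

from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery

# A made-up static snippet standing in for a downloaded page
html = '<div class="item"><span class="price">100元</span></div>'

# bs4: search by tag name plus attributes
print(BeautifulSoup(html, "html.parser").find("span", attrs={"class": "price"}).text)
# xpath: path expressions against the element tree
print(etree.HTML(html).xpath('//span[@class="price"]/text()')[0])
# pyquery: jQuery-style CSS selectors
print(PyQuery(html)("span.price").text())

All three lines print 100元; everything after that is just bookkeeping around this core.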
Scraping outsourcing task listings from the 猪八戒 (ZBJ) bidding hall with bs4
"""
目标:爬猪八戒招标大厅中的 价格,任务标题,任务详情
url: https://task.zbj.com/page1.html
"""
import requests
from bs4 import BeautifulSoup
import time
def get_html_source(url):
resp = requests.get(url)
resp.encoding = "utf-8"
# with open("source.html", mode="w", encoding="utf-8") as f:
# f.write(resp.text)
return resp.text
def get_data_from_html(html):
page = BeautifulSoup(html, "html.parser")
div_list = page.find_all("div", attrs={"class": "result-search-item"})
# with open("result-search-item.html", mode="w", encoding="utf-8") as f:
# f.write(str(div_list))
for div in div_list:
h4 = div.find("h4")
work_title = h4.get("title")
div_detail = div.find("div", attrs={"class": "pub-desc text-line-overflow-two"})
work_detail = div_detail.text
span_price = div.find("span", attrs={"class": "price"})
price = span_price.text
work_detail = work_detail.replace("\n", "")
# print(f"{price},{work_title},{work_detail}")
with open("result.csv", mode="a", encoding="utf-8") as f:
f.write(f"{price},{work_title},{work_detail}\n")
if __name__ == "__main__":
for i in range(3400): # 根据招标大厅下面的总页数来填写
url = f"https://task.zbj.com/page{i+1}.html"
html = get_html_source(url)
get_data_from_html(html)
time.sleep(3)
print("猪八戒招标大厅信息爬取完成!")
Scraping developer profiles from 程序员客栈 with xpath
"""
目标:爬程序员客栈上程序员的信息
url: https://www.proginn.com/cat/page/1/
"""
import requests
from lxml import etree
import time
def get_html_source(url):
headers = {
# 添加一个请求头信息UA,如果没有请求头,目标服务器会拒绝我们访问,这是一个最简单的反爬手段,只需要在http请求头中添加浏览器信息,就可以骗过目标服务器。
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
}
resp = requests.get(url, headers=headers)
resp.encoding = "utf-8"
# with open("source.html", mode="w", encoding="utf-8") as f:
# f.write(resp.text)
return resp.text
def get_data_from_html(html):
et = etree.HTML(html)
user_info = '/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="user-info fl"]'
name_list = et.xpath(user_info + '/div[@class="title"]/a/span/text()')
skill_list = et.xpath(f"{user_info}/div[2]/p[2]/span/text()|{user_info}/div[2]/p[2]/span[not(text())]")
workspace_list = et.xpath(f"{user_info}/div[2]/div/div[1]/span[2]/text()|{user_info}/div[2]/div/div[1]/span[not(text())]")
worktime_list = et.xpath(user_info + '/div[2]/div/div[2]/span[2]/text()')
salary_list = et.xpath('/html/body/div[@class="main"]/div[@class="main_body"]/div/div[@class="ui divided items proginn-user-list"]/div[@class="item J_user"]/div[@class="hire-info fl"]/p[1]/span/text()')
href = et.xpath(user_info + '/div[@class="title"]/a/@href')
for idx in range(15):
detail_html = get_html_source(href[idx])
detail_et = etree.HTML(detail_html)
try:
detail = detail_et.xpath('/html/head/meta[@name="description"]/@content')[0]
detail = detail.replace("\n","")
detail = detail.replace("- ","")
detail = detail.replace(" ","")
except Exception as e:
print("没有详情!")
if type(skill_list[idx]) != type(salary_list[idx]):
skill_list[idx] = "无"
else:
skill_list[idx] = skill_list[idx].replace(","," ")
if type(workspace_list[idx]) != type(salary_list[idx]):
workspace_list[idx] = "无"
print(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]}")
with open("程序员客栈程序员信息.csv", mode="a", encoding="utf-8") as f:
f.write(f"{salary_list[idx]},{workspace_list[idx]},{worktime_list[idx]},{name_list[idx]},{skill_list[idx]},{detail}\n")
if __name__ == "__main__":
for i in range(1,101): # 根据程序员客栈下面的总页数来填写
url = f"https://www.proginn.com/cat/page/{i}/"
html = get_html_source(url)
get_data_from_html(html)
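The five parallel lists above stay aligned only because of the span[not(text())] union trick; one unexpected card layout can still shift every later row. A more defensive variant (my own sketch, not from the course) iterates over each listing node and runs relative XPath queries inside it:

from lxml import etree


def get_data_from_html_v2(html):
    # Hypothetical alternative to get_data_from_html; takes the same page HTML.
    et = etree.HTML(html)
    for card in et.xpath('//div[@class="item J_user"]'):
        name = card.xpath('.//div[@class="user-info fl"]/div[@class="title"]/a/span/text()')
        salary = card.xpath('.//div[@class="hire-info fl"]/p[1]/span/text()')
        # A missing field now yields an empty list for this card only,
        # instead of desynchronizing indexes across the whole page.
        print(name[0] if name else "无", salary[0] if salary else "无")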
Scraping crawler job postings from 猎聘 (Liepin) with pyquery
"""
目标:爬猎聘网站上的爬虫岗位信息
url: https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB¤tPage=0&scene=page
"""
from pyquery import PyQuery
import requests
import time
def get_html_source(url):
headers = {
# 添加一个请求头信息UA,如果没有请求头,目标服务器会拒绝我们访问,这是一个最简单的反爬手段,只需要在http请求头中添加浏览器信息,就可以骗过目标服务器。
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57"
}
resp = requests.get(url, headers=headers)
resp.encoding = "utf-8"
# with open("source.html", mode="w", encoding="utf-8") as f:
# f.write(resp.text)
return resp.text
def get_data_from_html(html):
# 加载html内容
doc = PyQuery(html)
doc = doc(".left-list-box > ul:nth-child(1) li").items()
for item in doc:
work_href = item("div.job-detail-box > a:nth-child(1)").attr("href")
job_name = item("div.job-title-box > div:nth-child(1)").text()
area = item("div.job-title-box > div:nth-child(2) > span:nth-child(2)").text()
salary = item("span.job-salary").text()
gener_skill = item("span.labels-tag").items()
request = []
for skill in gener_skill:
request.append(skill.text())
request = "|".join(request)
company_href = item("div.job-detail-box > a:nth-child(2)").attr("href")
company_name = item("span.company-name").text()
area_people = []
spans = item("div.company-tags-box > span").items()
for span in spans:
area_people.append(span.text())
area_people = " ".join(area_people)
result = f"{salary},{area},{job_name},,{request},{company_name},{area_people},{work_href},{company_href}"
with open("猎聘爬虫岗位信息.csv", mode="a", encoding="utf-8") as f:
f.write(f"{result}\n")
print(result)
if __name__ == "__main__":
print("pyquery execise")
for i in range(10): # 猎聘只能查10页
url = f"https://www.liepin.com/zhaopin/?headId=1bd035b6a73e295eaafa5aedf960fe32&ckId=23fhmys0ecze35t8oork8bqoa4zydf9a&oldCkId=1bd035b6a73e295eaafa5aedf960fe32&fkId=tonyue22m6ifnzptvbka94m9o3x1nyha&skId=tonyue22m6ifnzptvbka94m9o3x1nyha&sfrom=search_job_pc&key=%E7%88%AC%E8%99%AB¤tPage={i}&scene=page"
html = get_html_source(url)
get_data_from_html(html)
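All three scripts issue a bare requests.get per page, so a single timeout or 5xx response kills a long run partway through. A small hardening sketch (my addition, not from the course; the retry numbers are illustrative) reuses a Session with automatic retries:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times on connection errors and 5xx responses,
# backing off exponentially between attempts.
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retry))
resp = session.get("https://task.zbj.com/page1.html", timeout=10)

Swapping session.get in for requests.get inside each get_html_source keeps the rest of the scripts unchanged.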