- When the number of pages can be determined directly, build the page URLs straight from that number.
- When a category's home page in the navigation bar gives no page count, the last page has to be found by following the "next page" link.

This post covers the second case: if a "next page" link exists, the current page is not the last one, so the next page is appended to the pagination list; once the link disappears, the last page has been reached.
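The core of the second approach, shown first as a minimal standalone sketch (it assumes, like the full script below, a WordPress-style theme where page N of a category lives at {category_url}page/N/ and the "next" link carries the classes next page-numbers; collect_pages and has_next_page are illustrative names, not part of the script):

import requests
from lxml import etree

def has_next_page(url):
    # The page is the last one exactly when it has no <a class="next page-numbers"> link
    res = requests.get(url, timeout=10)
    res.encoding = "utf-8"
    html = etree.HTML(res.text)
    return len(html.xpath('//a[@class="next page-numbers"]/@href')) > 0

def collect_pages(category_url):
    pages = [category_url]
    n = 1
    # If page n still links to a next page, page n + 1 exists; stop once the link disappears
    while has_next_page(pages[-1]):
        n += 1
        pages.append(category_url + "page/{}/".format(n))
    return pages

The full script: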
import requests
import time
import ssl
import re

import urllib3
from lxml import etree
from w3lib.html import remove_tags

from Database import Database
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
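# Skip HTTPS certificate verification globally and silence urllib3's InsecureRequestWarning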
ssl._create_default_https_context = ssl._create_unverified_context
urllib3.disable_warnings()
# Fetch the category URLs from the site's navigation bar
def get_category_url_list(url):
res = session.get(url, headers=headers, verify=False, timeout=100)
res.encoding = "utf-8"
html = etree.HTML(res.text)
category_url_list = html.xpath('//nav[@id="site-nav"]//a/@href')
    # Drop the first two nav links (likely the home/landing entries) so only categories remain
    if len(category_url_list) >= 2:
        category_url_list.pop(0)
        category_url_list.pop(0)
    # If category_url_list is empty, the site has a single category: fall back to the start url
    if len(category_url_list) == 0:
        category_url_list.append(url)
return category_url_list
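# Return the href of the "next page" link as a list; an empty list means this is the last page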
def get_page_next(url):
res = session.get(url, headers=headers, verify=False, timeout=100)
res.encoding = "utf-8"
html = etree.HTML(res.text)
page_next = html.xpath('//a[@class="next page-numbers"]/@href')
return page_next
# Build the pagination list for one category by probing for "next page" links
def get_page_url_list(url):
    page_url_list = [url]
    page_template = url + "page/{}/"
    i = 1
    page_next = get_page_next(url)
    # As long as page i has a "next page" link, page i + 1 exists: probe it and record it
    while len(page_next) > 0:
        i = i + 1
        page_next = get_page_next(page_template.format(i))
        page_url_list.append(page_template.format(i))
return page_url_list
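# Example with a hypothetical category URL:
#   get_page_url_list("http://www.fufeicui.com/feicui/")
#   -> ["http://www.fufeicui.com/feicui/", "http://www.fufeicui.com/feicui/page/2/", ...]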
# For every page of a category, collect its detail-page URLs and pass them on for scraping
def get_detail_url_list(page_url_list, sleeptime, db, sql, domain):
exception_page_url_list = []
for i in page_url_list:
try:
res = session.get(i, headers=headers, verify=False, timeout=100)
res.encoding = "utf-8"
time.sleep(sleeptime)
html = etree.HTML(res.text)
detail_url_list = html.xpath('//h2[@class="entry-title"]//a/@href')
print(detail_url_list)
get_content_url_list(detail_url_list, db, sql, domain)
except Exception as e:
exception_page_url_list.append(i)
# print(exception_page_url_list)
print(e)
continue
# Replace HTML character entities left in the text
def replace_entity(text):
    CHAR_ENTITIES_dict = {
        "&nbsp;": "",
        "&ensp;": "",
        "&emsp;": "",
        "&lt;": "<",
        "&gt;": ">",
        "&amp;": "&",
        "&quot;": '"',
        "&ldquo;": "“",
        "&rdquo;": "”",
        "&copy;": "©",
        "&reg;": "®",
        "&times;": "×",
        "&divide;": "÷",
        "&mdash;": "—"
    }
    for entity, char in CHAR_ENTITIES_dict.items():
        if entity in text:
            text = text.replace(entity, char)
    return text
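# For reference, the standard library's html.unescape covers every named entity
# (a stdlib alternative, not what this script uses):
#   import html
#   html.unescape("&lt;p&gt;&copy;")  # -> "<p>©"
# Note it turns "&nbsp;" into a real non-breaking space rather than deleting it,
# which is why the table above maps the space entities to "".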
def filter_tag(html_str):
    # Strip <script> blocks entirely
    re_script = r'<script[^>]*?>[\s\S]*?</script>'
    html_str = re.sub(re_script, "", html_str, flags=re.S)
    # Keep only <p>, <span> and <div> tags; every other tag is removed
    html_str = remove_tags(html_str, which_ones=(), keep=("p", "span", "div"))
    html_str = html_str.lower()
    # Whitespace: newlines, tabs, spaces
    re_blank = r'\s+'
    # Inline styles
    re_style = r'style=".*?"'
    # class attributes
    re_class = r'class=[\"\'].*?[\"\']'
    # id attributes
    re_id = r'id=".*?"'
    # align attributes
    re_align = r'align=".*?"'
    re_data_width = r'data-width=".*?"'
    # Collapse runs of consecutive <p> tags (e.g. "<p><p>", "<p><p><p>") into a single one
    re_p_pre_repeat = "<p[><p]+p>"
    re_p_next_repeat = "</p[/p<>]+/p"
    html_str = re.sub(re_style, "", html_str)
    html_str = re.sub(re_blank, "", html_str)
    html_str = re.sub(re_class, "", html_str)
    html_str = re.sub(re_id, "", html_str)
    html_str = re.sub(re_data_width, "", html_str)
    html_str = re.sub(re_align, "", html_str)
    # Swap the source site's branding for our own and turn <div> into <p>
    html_str = html_str.replace("翡翠王朝", "九玉网").replace(
        "www.jaadee.com", "www.91yu.com").replace(
        "<div>", "<p>").replace("</div>", "</p>")
    html_str = re.sub(re_p_pre_repeat, "<p>", html_str)
    html_str = re.sub(re_p_next_repeat, "</p>", html_str)
    html_str = replace_entity(html_str)
    return html_str.replace(">>", ">").replace("<span></span>", "").replace(
        "<p></p>", "").replace("翡翠产业网", "九玉网").replace(
        "http://fccyw.99114.com/", "www.91yu.com").strip()
def get_data(content_url_list, domain):
    info = {}
    info["url"] = content_url_list[0]
    info["askrecord"] = 0
    info["domain"] = domain
    content_all_list = []
    title_list = []
    for i in content_url_list:
        try:
            resp = session.get(i, headers=headers, verify=False, timeout=100)
            resp.encoding = "utf-8"
            html = etree.HTML(resp.text)
            title = html.xpath('//h1/text()')[0]
            title = re.sub(r'\s+', "", title)
            title_list.append(title)
            content_list = re.findall(
                r'id="js_content".*?>(.*?)<div id="js_sponsor_ad_area"',
                resp.text, re.S)
            # Fall back to the second page template if the first pattern finds nothing
            if len(content_list) == 0:
                content_list = re.findall(
                    r'class="single-content">(.*?)<div class="s-weixin">',
                    resp.text, re.S)
            content_all_list.append(content_list)
        except Exception as e:
            print(e)
            continue
    # Guard against every request having failed; an empty title is skipped at insert time
    info["title"] = title_list[0] if title_list else ""
    # Flatten the list of per-page content lists into one list of strings
    content_all_list = [i for k in content_all_list for i in k]
    # Join the fragments, then strip the stray English commas
    info["content"] = ','.join(content_all_list).replace(",", "")
    info_list = [info["title"], info["content"], info["url"], info["domain"], info["askrecord"]]
    info_list[1] = filter_tag(info_list[1])
    return info_list
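# get_data returns [title, content, url, domain, askrecord], matching the column order
# of insert_sql defined in __main__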
def insert_into_database(db, sql, values):
is_exits = judge_title_is_exits(db, title=values[0])
    # Before inserting, make sure the title is not already in the table; skip rows whose title or content is empty
if is_exits == 0 and (values[0] != "" and values[1] != ""):
print(values)
values = tuple(values)
sql = sql.format(values)
try:
db.insert(sql)
except Exception as e:
print(e)
# Check whether a title is already present in the database
def judge_title_is_exits(db, title):
judge_title_exits_sql = 'select 1 from cj_article WHERE cj_title = "{}" limit 1;'
judge_title_exits_sql = judge_title_exits_sql.format(title)
data = db.select(judge_title_exits_sql)
    # An empty result tuple means the title does not exist yet, so is_exits stays 0
if data == ():
is_exits = 0
else:
is_exits = data[0][0]
return is_exits
def get_content_url_list(detail_url_list, db, sql, domain):
    for i in detail_url_list:
        content_url_list = []
        try:
            response = session.get(i, headers=headers, verify=False, timeout=100)
            response.encoding = "utf-8"
            html = etree.HTML(response.text)
            # Number of sub-pages of this article, read from its pager; empty for single-page articles
            content_num = html.xpath('//div[@class="page5"]/a[1]/b[2]/text()')
            if len(content_num) == 0:
                content_url_list.append(i)
            else:
                # Multi-page article: build xxx_1.html, xxx_2.html, ... but keep page 1 as the base url
                for num in range(int(content_num[0])):
                    content_url_list.append(i.split(".html")[0] + "_" + str(num + 1) + ".html")
                content_url_list[0] = i
            record = get_data(content_url_list, domain)
            insert_into_database(db, sql, record)
        except Exception as e:
            print(e)
            continue
if __name__ == '__main__':
session = requests.Session()
db = Database()
insert_sql = "insert into cj_article (cj_title,cj_content,cj_url,cj_domain,cj_askrecord) values {}"
url = 'http://www.fufeicui.com/'
domain = url.replace("http://", "").split("/")[0]
print(domain)
category_url_list = get_category_url_list(url)
print(category_url_list)
sleeptime = 0
for i in category_url_list:
page_url_list = get_page_url_list(i)
print(page_url_list)
get_detail_url_list(page_url_list, sleeptime, db, insert_sql, domain)
    # Close the database connection
db.close()
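A closing note on the database layer: judge_title_is_exits and insert_into_database both build SQL with str.format, which breaks on titles containing quotes and is open to SQL injection. If the custom Database class wraps a DB-API driver such as pymysql (an assumption; the class is not shown here), a parameterized insert is safer, sketched below with a hypothetical db.conn attribute for the underlying connection:

def insert_article(db, values):
    sql = ("insert into cj_article "
           "(cj_title, cj_content, cj_url, cj_domain, cj_askrecord) "
           "values (%s, %s, %s, %s, %s)")
    # db.conn is assumed to be the raw DB-API connection; adapt to the real Database class
    with db.conn.cursor() as cursor:
        cursor.execute(sql, tuple(values))  # the driver escapes every value
    db.conn.commit()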