1.网页分析。进入https://www.qidian.com/,点击全部,进行翻页,你就会发现一个规律,
url=https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=0(1,2,3,……)
那么我们可以这样写
url = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=' + str(start)
if __name__ == '__main__':
    # Crawl the first 5 listing pages (page=1..5); `i` is already the
    # page number, so no arithmetic is needed.
    for i in range(1, 6):
        gethtml(start=i)
2.获取小说列表页面源码。
import requests
from lxml import etree
import os
def gethtml(start):
    """Fetch one page of qidian.com's "all books" listing and crawl every novel on it.

    start: 1-based page number appended to the listing URL.
    """
    url = ('https://www.qidian.com/all?orderId=&style=1&pageSize=20'
           '&siteid=1&pubflag=0&hiddenField=0&page=' + str(start))
    html = requests.get(url)
    page = etree.HTML(html.text)
    # Novel titles and their protocol-relative detail-page links, in lockstep order.
    titlelist = page.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    titlelinklist = page.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    for title, titlelink in zip(titlelist, titlelinklist):
        # One folder per novel, named after it; exist_ok avoids the
        # racy exists()-then-mkdir() pattern.
        os.makedirs(title, exist_ok=True)
        get_son_html(title, titlelink)
3.获取小说页面源码。
def get_son_html(title, titlelink):
    """Fetch a novel's detail page and crawl every chapter listed on it.

    title: novel name (also the destination folder name).
    titlelink: protocol-relative URL ('//...') of the novel's page.
    """
    html = requests.get('https:' + titlelink)
    page = etree.HTML(html.text)
    # Chapter names and their protocol-relative links, in lockstep order.
    son_titlelist = page.xpath('//ul[@class="cf"]/li/a/text()')
    son_linklist = page.xpath('//ul[@class="cf"]/li/a/@href')
    for son_title, son_link in zip(son_titlelist, son_linklist):
        save(son_title, son_link, title)
4.获取小说内容页源码并保存。
def save(son_title, son_link, title):
    """Fetch one chapter page and save its text as <title>/<son_title>.txt.

    son_title: chapter name (used as the file name).
    son_link: protocol-relative URL ('//...') of the chapter page.
    title: novel name, used as the destination folder.
    """
    html = requests.get('https:' + son_link)
    page = etree.HTML(html.text)
    content = '\n'.join(page.xpath('//div[@class="read-content j_readContent"]/p/text()'))
    # Store the chapter inside the novel's folder; os.path.join is
    # portable, unlike hard-coding a '\\' separator.
    filename = os.path.join(title, son_title + '.txt')
    print('正在保存小说……', filename)
    # `with` guarantees the file handle is closed even if write() raises.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
5.完整代码。
import requests
from lxml import etree
import os
def gethtml(start):
    """Fetch one page of qidian.com's "all books" listing and crawl every novel on it.

    start: 1-based page number appended to the listing URL.
    """
    url = ('https://www.qidian.com/all?orderId=&style=1&pageSize=20'
           '&siteid=1&pubflag=0&hiddenField=0&page=' + str(start))
    html = requests.get(url)
    page = etree.HTML(html.text)
    # Novel titles and their protocol-relative detail-page links, in lockstep order.
    titlelist = page.xpath('//div[@class="book-mid-info"]/h4/a/text()')
    titlelinklist = page.xpath('//div[@class="book-mid-info"]/h4/a/@href')
    for title, titlelink in zip(titlelist, titlelinklist):
        # One folder per novel, named after it; exist_ok avoids the
        # racy exists()-then-mkdir() pattern.
        os.makedirs(title, exist_ok=True)
        get_son_html(title, titlelink)
def get_son_html(title, titlelink):
    """Fetch a novel's detail page and crawl every chapter listed on it.

    title: novel name (also the destination folder name).
    titlelink: protocol-relative URL ('//...') of the novel's page.
    """
    html = requests.get('https:' + titlelink)
    page = etree.HTML(html.text)
    # Chapter names and their protocol-relative links, in lockstep order.
    son_titlelist = page.xpath('//ul[@class="cf"]/li/a/text()')
    son_linklist = page.xpath('//ul[@class="cf"]/li/a/@href')
    for son_title, son_link in zip(son_titlelist, son_linklist):
        save(son_title, son_link, title)
def save(son_title, son_link, title):
    """Fetch one chapter page and save its text as <title>/<son_title>.txt.

    son_title: chapter name (used as the file name).
    son_link: protocol-relative URL ('//...') of the chapter page.
    title: novel name, used as the destination folder.
    """
    html = requests.get('https:' + son_link)
    page = etree.HTML(html.text)
    content = '\n'.join(page.xpath('//div[@class="read-content j_readContent"]/p/text()'))
    # Store the chapter inside the novel's folder; os.path.join is
    # portable, unlike hard-coding a '\\' separator.
    filename = os.path.join(title, son_title + '.txt')
    print('正在保存小说……', filename)
    # `with` guarantees the file handle is closed even if write() raises.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
if __name__ == '__main__':
    # Crawl the first 5 listing pages (page=1..5); `i` is already the
    # page number, so no arithmetic is needed.
    for i in range(1, 6):
        gethtml(start=i)