1. 案例:爬取糗事百科中的段子内容和作者名称
from lxml import etree
# Scrape the author name and joke text of every post on the Qiushibaike
# text page and print them.
# NOTE(review): `requests` and `headers` are not defined in this snippet; they
# are assumed to be set up earlier in the file/notebook -- confirm.
url = 'https://www.qiushibaike.com/text/'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# Select the <div> elements carrying the post class string.
div_list = tree.xpath('//div[@class="article block untagged mb15 typs_long"]')
for div in div_list:
    # The leading "." keeps each query relative to the current <div>.
    # A path starting with a bare "//" is evaluated from the document root
    # even when called on a sub-element, which made every iteration print the
    # first <h2> of the whole page together with the text of all posts.
    author = div.xpath('.//h2/text()')[0]
    content = ''.join(div.xpath('.//a[1]/div/span//text()'))
    print(author, content)
2. 案例:爬取 http://pic.netbian.com/4kmeinv/ 中的图片,并处理中文乱码
# Download the image referenced by each list item on the first 10 listing
# pages of http://pic.netbian.com/4kmeinv/ into ./meinvLibs, using the
# (re-decoded) item title as the file name.
# NOTE(review): `requests`, `os` and `headers` are not defined in this snippet;
# they are assumed to be set up earlier in the file/notebook -- confirm.
dirName = './meinvLibs'
# exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(dirName, exist_ok=True)

# Page 1 lives at the bare directory URL; pages 2+ follow this pattern.
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 11):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page

    page_text = requests.get(new_url, headers=headers).text
    tree = etree.HTML(page_text)
    a_list = tree.xpath('//div[@class="slist"]/ul/li/a')
    for a in a_list:
        # @src is site-relative, so prefix the host to get a fetchable URL.
        img_src = 'http://pic.netbian.com' + a.xpath('./img/@src')[0]
        img_name = a.xpath('./b/text()')[0]
        # Repair the garbled Chinese title: re-encoding as ISO-8859-1 recovers
        # the raw bytes, which are then decoded as GBK. This only works if the
        # page text was decoded as ISO-8859-1 in the first place
        # (presumably what `.text` did here -- verify).
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_data = requests.get(img_src, headers=headers).content
        imgPath = dirName + '/' + img_name + '.jpg'
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(img_name, '下载成功!!!')
3. 案例:爬取 https://www.aqistudy.cn/historydata/ 页面中的所有城市名称
# Collect the link texts (city names) listed on the aqistudy historical-data
# page.
# NOTE(review): `requests` and `headers` are not defined in this snippet; they
# are assumed to be set up earlier in the file/notebook -- confirm.
response = requests.get('https://www.aqistudy.cn/historydata/', headers=headers)
page_text = response.text
tree = etree.HTML(page_text)
# One XPath union ("|") gathers <a> text from both layouts under the
# "bottom" div: <li> items nested in the second <div> of the <ul>, and <li>
# items placed directly in the <ul>.
city_xpath = '//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()'
cities = tree.xpath(city_xpath)
# Bare expression kept from the original (presumably for notebook-cell
# display; it has no effect in a plain script).
cities
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)