电影天堂爬虫之网页分析
from lxml import etree
import requests
# Movie Heaven (dytt) crawler — page-analysis step.
# Fetch the "newest movies" index page and print the absolute URL of every
# movie detail page linked from it.
BASE_DOMAIN = 'http://www.ygdy8.net'
url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
}

# requests guesses an encoding and uses it to build response.text; for this
# site the guess is wrong and yields mojibake, so decode the raw bytes
# (response.content) as GBK explicitly instead.
response = requests.get(url, headers=headers)
text = response.content.decode('gbk')

# Each movie listing sits in a <table class="tbspan">; the <a> tags inside it
# carry site-relative hrefs pointing at the detail pages ("//" matches at any
# depth, "@href" extracts the attribute value).
tree = etree.HTML(text)
for relative_url in tree.xpath("//table[@class='tbspan']//a/@href"):
    print(f"{BASE_DOMAIN}{relative_url}")
电影天堂爬虫之爬取详情页url
from lxml import etree
import requests
# Site root; prepended to the site-relative hrefs scraped from the index page.
BASE_DOMAIN = 'http://www.ygdy8.net'
# Index page of the "newest movies" (dyzz) category to crawl.
url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
# Request headers: a desktop Chrome User-Agent string sent with every request.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
def get_detail_urls(url):
#response.text
#response.context
#requests库,默认会使用自己猜测是编码方式将抓取下来
# 的网页进行解码,然后存储到text属性中
#在电影天堂的网页中,因为编码方式,requests库猜错了,所