自学的Python和爬虫技术。使用到Redis、MySQL数据库;requests请求模块;re、XPath解析模块;dumps、loads序列化和反序列化。还可以配合代理池使用。
爬取的是https://weixin.sogou.com/。网站只能微信扫码登录,不登录只能访问十页,这里
使用Session保持会话实现登录状态爬取100页,cookie是自己添加的。
第一次发文,试一试页面效果hhhh,有机会再具体解析思路。
算是我自学的一个阶段性成果,因为我是外行,代码中肯定有些愚蠢的地方,贴出源代码来供大家批判指正。
python3.6环境,运行确保安装相关python库以及数据库
import re
import time
from pickle import dumps, loads
from urllib.parse import urlencode

import pymysql
import requests
from lxml import etree
from redis import StrictRedis
from requests import Request, ConnectionError, ReadTimeout
from requests import Session

# controls whether the proxy pool is started
from run import main
# ---- scheduling / retry -------------------------------------------------
TIMEOUT = 10            # per-request timeout (seconds), stored on each WeixinRequest
MAX_FAILED_TIME = 3     # a request is dropped after this many failures

# ---- Redis (request queue) ----------------------------------------------
REDIS_KEY = 'weixinrequests_liu'    # list key holding the pickled requests
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_PASSWORD = None

# ---- MySQL (result storage) ---------------------------------------------
MYSQL_HOST = "localhost"
MYSQL_NAME = 'root'
MYSQL_PASSWORD = '*****'
MYSQL_DATABASE = 'samp_db'
MYSQL_PORT = 3306

# Proxy pool endpoint (commercial xdaili API; the local pool is the
# commented alternative below).
PROXY_POOL_URL = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=2f80314147e34f04bf5705006d837a86&orderno=YZ2018112477906eltR9&returnType=1&count=10'
# PROXY_POOL_URL = 'http://localhost:5555/random'

KEY_WORDS = "杨超越"    # search keyword crawled on weixin.sogou.com

# Session cookie copied from a logged-in browser (WeChat QR login); lets
# the spider see pages beyond the anonymous 10-page limit.
COOKIE = 'SUV=0081272D655E8569592959DA71D95636; SUID=8AB1E7652F20910A00000000595B0BDA; ld=4Zllllllll2bLZA5lllllVsKxm1lllllBqf3vZllll9lllllxllll5@@@@@@@@@@; ABTEST=3|1543134403|v1; IPLOC=CN3100; weixinIndexVisited=1; sct=1; ppinf=5|1543150392|1544359992|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTYlOUQlOEUlRTUlOEQlOUF8Y3J0OjEwOjE1NDMxNTAzOTJ8cmVmbmljazoxODolRTYlOUQlOEUlRTUlOEQlOUF8dXNlcmlkOjQ0Om85dDJsdUgyb3R2dks1bFoydFpTU1Q3MDJWY0VAd2VpeGluLnNvaHUuY29tfA; pprdig=QPtri4HHDEm4Gz9hxTvUj8MO9ymKgOe2EgkwA3uuYG0JIjd1IM8NnWkE6f1vrIt4mlMoC1Nmomb6ntUbGAjANhGeEkJOid0_Yk4g0yBHTA0FQ3_WMsbYhS0SQN_Sbmvj66AVXN93ZhbqAzmflqUoyNdw7YGswhD3tZ7J0xu0i0U; sgid=12-38075013-AVv6mzhP6ltMX5qfqx8JPJc; SUIR=AC6DB982E7ED939ECB44C2A1E85C066B; SNUID=AA6ABE84E0E49B9DDE6C0D11E196941C; JSESSIONID=aaaZoBR-hzstpadzRe7Cw; ppmdig=1543197423000000309d9113ade27bf0e5a2598fd5f1caba'

START_ID = 10   # first search-result page number to request
DIC_TIME = 0    # running count of article URLs discovered

HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': COOKIE,
    # 'Host':'weixin.sogou.com',
    # BUG FIX: was misspelled 'Upgrade-Insure-Request'; the real HTTP
    # header name is 'Upgrade-Insecure-Requests'.
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

TABLE_NAME = 'seshi3'   # MySQL table the articles are inserted into
MAX_PAGE = 100          # stop constructing next-page URLs beyond this page
PROXY_ON = False        # when False, get_proxy() always returns None
class MySQL():
    """Thin wrapper around a pymysql connection for dict-based row inserts."""

    def __init__(self, host=MYSQL_HOST, username=MYSQL_NAME, password=MYSQL_PASSWORD, port=MYSQL_PORT, database=MYSQL_DATABASE):
        """Open the connection.

        On failure the error is printed (original best-effort behavior kept)
        and both handles are left as None instead of undefined, so a later
        insert() fails with a clear "NoneType" error rather than a
        missing-attribute error.
        """
        self.db = None
        self.cursor = None
        try:
            self.db = pymysql.connect(host=host, user=username, password=password, db=database, port=port)
            self.cursor = self.db.cursor()
        except Exception as e:
            print(e)

    def insert(self, table, data):
        """Insert one row into *table*; *data* maps column name -> value.

        Values are passed as query parameters; identifiers (table/column
        names) cannot be parameterized, so they are backtick-quoted instead.
        Commits on success, rolls back and prints the error on failure.
        """
        keys = ','.join('`{}`'.format(k) for k in data.keys())
        placeholders = ','.join(len(data) * ['%s'])
        sql_query = 'INSERT INTO `{}`({}) values ({})'.format(table, keys, placeholders)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
            print('真的插入成功了')
        except Exception as e:
            print(e, '插入失败')
            self.db.rollback()
class WeixinRequest(Request):
    """A requests.Request enriched with scheduling metadata.

    Carries the parse callback, a proxy flag, a failure counter and a
    timeout so the whole unit can be pickled into the Redis queue and
    retried later.
    """

    def __init__(self, url, callback, method='GET', headers=None, need_proxy=False, fail_time=0, timeout=TIMEOUT):
        super().__init__(method, url, headers)
        # scheduling metadata consumed by Spider.schedule()/error()
        self.callback = callback
        self.need_proxy = need_proxy
        self.fail_time = fail_time
        self.timeout = timeout
class RedisQueue():
    """FIFO queue of pickled WeixinRequest objects backed by a Redis list."""

    def __init__(self):
        self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)

    def add(self, request):
        """Push a WeixinRequest onto the tail; reject anything else."""
        if not isinstance(request, WeixinRequest):
            print("meiyoujiaaa")
            return False
        return self.db.rpush(REDIS_KEY, dumps(request))

    def pop(self):
        """Pop and unpickle the oldest request, or False when empty."""
        return loads(self.db.lpop(REDIS_KEY)) if self.db.llen(REDIS_KEY) else False

    def empty(self):
        """True when the Redis list holds no requests."""
        return self.db.llen(REDIS_KEY) == 0
def get_proxy():
    """Fetch one proxy address from the proxy pool API.

    Returns the raw response body (expected to contain the proxy address)
    on HTTP 200, or None when the pool is disabled (PROXY_ON is False),
    the status is not 200, or the request raises.

    BUG FIX: the original called requests.get() without ever importing the
    `requests` module (only names from it), raising NameError whenever
    PROXY_ON was True; `import requests` is now in the import block. A
    stray no-op `False` expression statement was also removed.
    """
    if not PROXY_ON:
        return None
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            print('获得代理', response.text)
            return response.text
        return None
    except Exception as e:
        print(e)
        return None
class Spider():
    """Crawls sogou weixin search results for a keyword and stores each
    article (title/date/author/content) into MySQL.

    All fetches flow through a Redis-backed FIFO queue of pickled
    WeixinRequest objects, so a failed request can be re-queued and
    retried up to MAX_FAILED_TIME times.
    """

    base_url = r'https://weixin.sogou.com/weixin'
    # Search-results URL template: first placeholder is the keyword, the
    # second is the page number. Hoisted here because start() and
    # parse_index() previously duplicated the same literal.
    search_url_template = 'https://weixin.sogou.com/weixin?query={}&_sug_type_=&sut=4722&lkt=1%2C1542979381375%2C1542979381375&s_from=input&_sug_=y&type=2&sst0=1542979381478&page={}&ie=utf8&w=01019900&dr=1'
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def __init__(self, dictime, start_id, table_name, headers, key_words, max_page):
        # rtime counts result pages requested; dictime counts article URLs.
        self.rtime = start_id
        self.dictime = dictime
        self.start_id = start_id
        self.table_name = table_name
        self.headers = headers
        self.key_words = key_words
        self.max_page = max_page

    def start(self):
        """Seed the queue with the first search-results page."""
        self.session.headers.update(self.headers)
        start_url = self.search_url_template.format(self.key_words, self.start_id)
        wexin_request = WeixinRequest(start_url, callback=self.parse_index, need_proxy=False)
        self.queue.add(wexin_request)
        print(wexin_request.url, "是开始获取的url")

    def parse_index(self, r):
        """Yield one WeixinRequest per article link on a search-results
        page, followed by one request for the next page: the page's own
        "next" link when present, otherwise a constructed URL (capped at
        max_page)."""
        html = etree.HTML(r.text)
        for item in html.xpath("//ul[@class='news-list']/li"):
            url = item.xpath(".//h3/a/@href")[0]
            print('parese_index获取到这个微信url(dictime,rtime)', (self.dictime, self.rtime), url)
            self.dictime += 1
            yield WeixinRequest(url=url, callback=self.parse_detail, need_proxy=True, headers=self.headers, method='GET')
        # The "next page" anchor's id contains 'sogou_next'; grab its href
        # fragment, which is relative to base_url.
        fragments = re.findall(r'''sogou_next.*?="(.*?)"''', r.text)
        next_fragment = fragments[0] if fragments else 0
        if next_fragment:
            next_url = self.base_url + next_fragment
            self.rtime += 1
            print('哈哈哈parese_index自动获取到这个下一页url(dictime,rtime)', (self.dictime, self.rtime), next_url)
            yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True, headers=self.headers)
        else:
            # No usable next link (e.g. anti-bot page): build the URL.
            next_url = self.search_url_template.format(self.key_words, self.rtime + 1)
            self.rtime += 1
            if self.rtime <= self.max_page:
                print('啊啊啊自动构造这个下一页url(dictime,rtime)', (self.dictime, self.rtime), next_url)
                yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True, headers=self.headers)

    def parse_detail(self, response):
        """Extract one article's fields from its page and yield them as a
        dict ready for MySQL.insert(). Each field falls back to a Chinese
        "...出错" sentinel string when extraction fails."""
        text = response.text
        html = etree.HTML(text)

        dates = re.findall(r'publish_time = "(.*?)"', text)
        date = dates[0] if dates else "日期出错"

        titles = html.xpath("//h2[@class='rich_media_title']//text()")
        title = titles[0].strip() if titles else "标题出错"

        # The meta list holds 2 spans (date, account) or 3 (plus an
        # "original content" marker).
        meta_spans = "//div[contains(@id,'meta_content') and @class='rich_media_meta_list']/span"
        lend = len(html.xpath(meta_spans))

        wechats = html.xpath(meta_spans + "[last()]/a/text()")
        wechat = wechats[0].strip('\n').strip() if wechats else "wechat出错"

        # NOTE(review): the original source line was truncated in the paste
        # (unterminated r''' string); this pattern is reconstructed from the
        # surrounding code — the element with id "js_name" holds the account
        # nickname, and re.S is presumed because the capture is stripped of
        # newlines. Confirm against a live article page.
        nicknames = re.findall(r'''js_name.*?>(.*?)<''', text, re.S)
        nickname = nicknames[0].strip('\n').strip() if nicknames else 'nickname出错'

        if lend == 3:
            originals = html.xpath(meta_spans + "[last()-2]/text()")
            original = originals[0].strip('\n').strip() if originals else "original出错"
        else:
            original = ''

        content_nodes = html.xpath("//div[@id='img-content']/div[2]//p//text()")
        content_list = content_nodes if content_nodes else "内容出错"
        neirong = "".join(piece.strip() for piece in content_list)

        yield {
            'title': title,
            'original': original,
            'nickname': nickname,
            'date': date,
            'wechat': wechat,
            'content': neirong,
        }

    def schedule(self):
        """Main loop: pop a request, fetch it, run its callback, then
        enqueue yielded requests and insert yielded dicts into MySQL.
        Anything else (bad response, non-200 status, empty parse result)
        goes through error() for retry accounting."""
        while not self.queue.empty():
            print("开始调度次", "\n", "=" * 20)
            weixin_request = self.queue.pop()
            print("调度选出了这个url", weixin_request.url, '选出了这个url')
            callback = weixin_request.callback
            response = self.request(weixin_request)
            print(response, '获得相应*********')
            if hasattr(response, 'status_code'):
                print(response.status_code, '上面url的状态')
            # A requests Response is falsy for 4xx/5xx, and request()
            # returns False on connection errors, so this covers both.
            if response and (response.status_code == 200):
                results = list(callback(response))
                if results:
                    for result in results:
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                            print('解析结果,搜索页文章列表', result.url)
                        if isinstance(result, dict):
                            self.mysql.insert(table=self.table_name, data=result)
                            print("插入到MYSQL", result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def error(self, weixin_request):
        """Record one failure for the request, back off, and re-queue it
        unless it has already failed MAX_FAILED_TIME times."""
        print("休息二秒")
        weixin_request.fail_time += 1
        print("请求失败", weixin_request.fail_time, "次", weixin_request.url)
        time.sleep(20)  # NOTE: message says 2 seconds but the actual back-off is 20
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def request(self, weixin_request):
        """Send the request through the shared session, via a proxy when
        one is available; return False on connection/timeout errors.

        BUG FIX: the per-request timeout stored on WeixinRequest was never
        used; it is now passed to session.send(), and the resulting
        ReadTimeout is already handled below.
        """
        try:
            proxy = get_proxy()
            print("代理状态,", proxy)
            if proxy:
                proxies = {
                    'http': 'http://' + proxy,
                    'https': 'https://' + proxy,
                }
                return self.session.send(weixin_request.prepare(), proxies=proxies, timeout=weixin_request.timeout)
            time.sleep(3)  # be polite when hitting the site without a proxy
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout)
        except (ConnectionError, ReadTimeout) as e:
            print(e)
            return False

    def run(self):
        """Seed the queue, then drain it."""
        self.start()
        self.schedule()
if __name__ == '__main__':
    # NOTE: the MySQL table/content encoding should be utf8mb4 (so emoji
    # in article bodies can be stored).
    spider = Spider(
        dictime=DIC_TIME,
        start_id=START_ID,
        table_name=TABLE_NAME,
        key_words=KEY_WORDS,
        headers=HEADERS,
        max_page=MAX_PAGE
    )
    spider.run()
    print(spider.rtime, "请求页数")    # number of result pages requested
    print(spider.dictime, "请求篇数")  # number of article URLs requested
    # results are stored in MySQL
    # (the original post showed its run output here)