在这之前从未了解过小红书,然后习惯性地百度了一下。发现是这样的
研究发现,这玩意没有pc端的接口,也就是说没办法直接从pc端抓取数据。好吧,放弃。不过pc端还是有用处的
打开社区精选,点开几个推送详情页看了看,发现所有的文章url都是https://www.xiaohongshu.com/discovery/item/ + 文章绑定的一串字符,这个很关键。然后pc端不行,就只能从手机端想办法,下载了小红书的app,又看了一下微信的小红书小程序,试着用Fiddler抓包,然后发现小程序的更好抓,还是得借马爸爸的门啊!
采集路径有了,第一个问题顺利解决,然后开始用Fiddler抓包。
json文件找到了。观察一下json文件里的参数,会发现。跟每一个title的同级都有一个绑定的id。看起来有点眼熟啊,是不是跟刚才社区精选看到的url的id很像?于是复制一个到pc端测试,果然
好了,第二个问题解决,现在就要思考如何拿到json文件的数据。然后批量访问
试了一下添加cookie,ua,referer。结果失败
然后一股脑把所有参数都带上了去访问,结果成功
没挨个测试反正能成功就行,有兴趣的朋友可以挨个测试,看看哪些是必要参数。
交个底,大部分参数都是固定的,只有请求头里的keyword跟X-Sign是变动的,需要手动获取。自此大部分问题解决
还有个最操蛋的问题,就是小红书对访问频率太敏感,正常的套路去爬基本上两三篇文章最多就给你弹验证,然后程序就挂了。试过添加随机请求头,不过不管用。唯一的两个办法是使用代理,这样的话基本上你要爬多少篇文章就得准备多少代理,免费ip不好用,又没钱买付费代理,这条路我不走了,有条件的朋友可以去试试,我是用的第二个方法,selenium+time降低访问速度。反正不要求效率,让程序慢慢跑就完事了,省钱省事不香吗?哈哈哈
好了,直接贴代码。
import json
import re
import time

import pymysql
import requests
import urllib3
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
class Fuck_xhs(object):
    """Scrape Xiaohongshu ("Little Red Book") notes.

    Workflow:
      1. Query the WeChat mini-program search API (captured with Fiddler)
         for pages 1-5 of a keyword, using per-page ``X-Sign`` values.
      2. Build each note's PC detail URL, skip URLs already recorded in
         ``小红书url.txt``, and load new ones in headless Chrome.
      3. Parse author/title/images/likes/etc. and insert a row into the
         ``xhs_article`` MySQL table.

    NOTE(review): the ``X-Sign`` values and the Authorization /
    Device-Fingerprint headers expire; refresh them by re-capturing the
    mini-program traffic when requests start failing.
    """

    def __init__(self):
        # We call requests with verify=False below; silence the TLS warning.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        # TODO: fill in your own MySQL connection parameters.
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='change_me', database='change_me',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        # Mini-program search endpoint; query params are added by requests.
        self.url = 'https://www.xiaohongshu.com/fe_api/burdock/weixin/v2/search/notes?'
        # page number -> X-Sign header value captured with Fiddler.
        # These are tied to the keyword/page and must be refreshed manually.
        self.key_dict = {
            '1': 'Xd44b23783ad8f18c2e41c045a0cda867',
            '2': 'Xe8b3f71b7585c080e9ca55e7d1b034e0',
            '3': 'X2351ff0514bb05145e8171975fe1d96d',
            '4': 'X2422fd5312cf50b12c722e1d63b2f9aa',
            '5': 'X44d5cf63fb658c609be10404b77291d5',
        }
        # URLs scraped on previous runs, used for de-duplication.
        with open('小红书url.txt', 'r', encoding='utf-8') as f:
            # Strip a possible UTF-8 BOM left by Windows editors.
            seen = f.read().replace('\ufeff', '')
        self.old_list = seen.split('\n')
        print(self.old_list)
        options = Options()
        options.add_argument('--headless')
        self.chrome = Chrome(options=options)

    def get_detail_url(self):
        """Fetch search-result pages, derive each note's detail URL and
        scrape new ones via :meth:`get_detail`.

        Releases the DB connection and the browser when done (or on error).
        """
        try:
            for page, x_sign in self.key_dict.items():
                headers = {
                    'Host': 'www.xiaohongshu.com',
                    'Connection': 'keep-alive',
                    'Authorization': 'wxmp.4aad8f54-3422-4d76-b440-5f4cce8d0907',
                    'Device-Fingerprint': 'WHJMrwNw1k/Ff2NfArpikjizTJkAdQe2Y1P0AQTa74gJcSlBSWoMjTXYq+VUDRGsE9VCMBXrfD5W9YT2GqNMbnISuxoWerClbdCW1tldyDzmauSxIJm5Txg==1487582755342',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
                    'X-Sign': x_sign,
                    'content-type': 'application/json',
                    'Referer': 'https://servicewechat.com/wxffc08ac7df482a27/378/page-frame.html',
                    'Accept-Encoding': 'gzip, deflate, br',
                }
                params = {
                    'keyword': '揭阳周边游',
                    'sortBy': 'general',
                    'page': page,
                    'pageSize': '20',
                    'needGifCover': 'true',
                }
                res = requests.get(self.url, headers=headers, params=params,
                                   verify=False).text
                print(res)
                notes = json.loads(res)['data']['notes']
                for note in notes:
                    note_id = note['id']  # renamed from `id` (shadowed builtin)
                    print(note_id)
                    self.detail_url = 'https://www.xiaohongshu.com/discovery/item/' + note_id
                    print(self.detail_url)
                    if self.detail_url in self.old_list:
                        print('链接已存在。')
                        continue
                    # Remember the URL both on disk (for future runs) and in
                    # memory (so a duplicate later in THIS run is skipped too).
                    with open('小红书url.txt', 'a', encoding='utf-8') as w:
                        w.write('\n')
                        w.write(self.detail_url)
                    self.old_list.append(self.detail_url)
                    self.get_detail()
        finally:
            # Always release resources, even if a page fails mid-run.
            self.conn.close()
            self.chrome.quit()

    def get_detail(self):
        """Load ``self.detail_url`` in headless Chrome, extract the note's
        fields into instance attributes, then persist them with
        :meth:`deposit_mysql`.  Video notes are skipped entirely.
        """
        self.chrome.get(self.detail_url)
        time.sleep(1.5)  # let the page render before querying the DOM
        # Skip video notes: they have no image carousel to harvest.
        try:
            if self.chrome.find_element_by_xpath('//div[@class="videoframe"]'):
                return None
        except NoSuchElementException:
            pass  # no video frame -> ordinary image note, carry on
        # Grab the image-carousel markup and rewrite it into plain <img> HTML.
        self.content_pic = '<ul>' + str(
            self.chrome.find_element_by_class_name('slide').get_attribute('innerHTML')) + '</ul>'
        print(self.content_pic)
        # Each slide carries its image URL in an inline background-image style.
        styles = re.findall(r'style="background-image.*?;"', self.content_pic, re.DOTALL)
        for style_attr in styles:
            print('ur的值为%s' % style_attr)
            raw = ''.join(re.findall(r'url\((.*?)\)', style_attr))
            # Normalise scheme and strip quote characters (both fullwidth and
            # ASCII variants appear in the scraped markup).
            img_url = 'http:' + raw.replace('"', '').replace('"', '').replace(
                'https:', '').replace('http:', '') + '.jpg'
            print(img_url)
            # Turn the styled <span>/<i> slides into <img> tags with src=.
            self.content_pic = str(self.content_pic).replace(
                style_attr, 'src=' + '"' + img_url + '"').replace(
                'span', 'img').replace('<i data', '<img data').replace('</i>', '</img>')
        print(self.content_pic)
        self.content = self.chrome.find_element_by_class_name('content').get_attribute('innerHTML')
        try:
            self.author = self.chrome.find_element_by_class_name('name-detail').text
            print(self.author)
        except NoSuchElementException:
            self.author = ' '
        try:
            self.title = self.chrome.find_element_by_class_name('title').text
            if not self.title:
                # Some notes render the title in an "as-p" element instead.
                self.title = self.chrome.find_element_by_class_name('as-p').text
            print(self.title)
        except NoSuchElementException:
            self.title = ' '
        try:
            # Like / comment / favourite counters, in DOM order.
            span = self.chrome.find_elements_by_xpath('//div[@class="operation-block"]/span')
            self.like = span[0].find_element_by_xpath('./span').text
            self.comment = span[1].find_element_by_xpath('./span').text
            self.star = span[2].find_element_by_xpath('./span').text
            print(self.like, self.comment, self.star)
        except (NoSuchElementException, IndexError):
            self.like = ' '
            self.comment = ' '
            self.star = ' '
        try:
            self.b_q = self.chrome.find_elements_by_xpath(
                '//div[@class="keywords"]/a[@class="keyword category"]')
            print(self.b_q)
            tags = [tag.text for tag in self.b_q]
            # Join tag names with a fullwidth comma, dropping list punctuation.
            self.a_l = str(tags).replace('[', '').replace(']', '').replace("'", '').replace(',', ',')
            print(self.a_l)
        except NoSuchElementException:
            self.a_l = ' '
        try:
            self.pub_time = str(self.chrome.find_element_by_xpath(
                '//div[@class="publish-date"]/span').text).replace('发布于', '')
            print(self.pub_time)
        except NoSuchElementException:
            self.pub_time = ' '
        try:
            self.author_img = self.chrome.find_element_by_xpath(
                '//div[@class="left-img"]/img').get_attribute('src')
            print(self.author_img)
        except NoSuchElementException:
            self.author_img = ' '
        # Deliberate throttling: Xiaohongshu rate-limits aggressively and
        # serves a captcha after a few fast requests.
        time.sleep(5)
        self.create_time = time.strftime("%Y-%m-%d %H:%M:%S")
        print(self.create_time)
        self.is_import = '0'
        time.sleep(3)
        self.deposit_mysql()

    def deposit_mysql(self):
        """Insert the scraped note into ``xhs_article``.

        Uses parameterized SQL, so scraped field values cannot inject into
        the statement.  Commits immediately.
        """
        sql = "insert into xhs_article(id, author, author_img, title, text_img, content, like_count, review_count, collect_count, org_url, publish_time, keyword_tag, create_time, is_import, import_time) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,null)"
        self.cursor.execute(sql, (
            str(self.author), str(self.author_img), str(self.title),
            str(self.content_pic), str(self.content),
            str(self.like), str(self.comment),
            str(self.star), str(self.detail_url), str(self.pub_time),
            str(self.a_l), str(self.create_time),
            str(self.is_import)))
        self.conn.commit()
        return None
if __name__ == '__main__':
    # Entry point: build the scraper and walk all search pages.
    scraper = Fuck_xhs()
    scraper.get_detail_url()
至此,抓取到的数据已成功入库。
代码2020/08/14下午刚做了优化,测试可用,分享。
2020-10-12记 如果报错,先考虑参数过期情况,毕竟都两月了,自行更换最新的cookie再尝试!