小红书爬取实战指南

一、小红书爬取特点与难点

小红书(Xiaohongshu)作为流行的社交电商平台，具有以下特点：

内容以图文/短视频为主
强用户互动属性（点赞、收藏、评论）
严格的反爬机制（包括但不限于）：

请求头验证
行为指纹检测
频繁弹验证码
账号封禁策略

二、合法合规前提

重要提示：

严格遵守小红书用户协议
不爬取用户隐私数据
控制请求频率（建议每 3 秒不超过 1 次请求）
仅用于学习研究目的

三、实战爬取方案

3.1 方案一：Web端模拟（需登录）

import requests
from bs4 import BeautifulSoup
import time
import random

# Cookies of a logged-in session (must be obtained manually).
cookies = {
    'xhsTrackerId': 'your_tracker_id',
    'xhsTrackerId.sig': 'your_sig',
    'a1': 'your_a1_cookie',
}

# Request headers sent together with the cookies above.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.xiaohongshu.com/',
}

def get_note_detail(note_id):
    """Fetch a note's web page and print its title and author.

    The page is requested with the module-level ``headers`` and ``cookies``.
    Any exception raised while fetching or parsing is caught and reported on
    stdout instead of propagating. A random 2-5 second pause follows every
    attempt, including failed ones, to keep the request rate low.

    Args:
        note_id: Note identifier appended to the ``/explore/`` URL.
    """
    url = f'https://www.xiaohongshu.com/explore/{note_id}'
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Look each element up once and fall back to a placeholder when
            # the page does not contain it.
            title_tag = soup.find('h1', class_='title')
            author_tag = soup.find('a', class_='nickname')
            title = title_tag.text if title_tag else '无标题'
            author = author_tag.text if author_tag else '匿名'

            print(f'标题: {title}')
            print(f'作者: {author}')
        elif response.status_code == 403:
            print('触发反爬，需要验证码或更换cookie')
        else:
            # Report unexpected statuses instead of returning silently.
            print(f'请求失败: HTTP {response.status_code}')
    except Exception as e:
        print(f'请求失败: {e}')

    time.sleep(random.uniform(2, 5))


# Example: fetch a specific note
get_note_detail('63fd8e4b000000001f03cd15')

3.2 方案二：移动端API逆向（推荐）

通过抓包获取APP接口：

import requests
import json

# API request parameters (the values need to be refreshed periodically).
params = {
    'note_id': '63fd8e4b000000001f03cd15',
    'source': 'note',
    'app_version': '7.25.0',
    'deviceId': 'your_device_id',
    't': str(int(time.time() * 1000)),  # current time in milliseconds
}


def generate_x_sign(params):
    """Return the request signature for ``params``.

    Placeholder: the real signing algorithm is not implemented here, so a
    fixed string is returned regardless of ``params``.
    """
    # Your own signing algorithm implementation belongs here.
    return 'generated_signature'


def download_image(url):
    """Download the image at ``url`` (stub, not implemented)."""
    pass


# Built after generate_x_sign is defined: the signature is computed while this
# dict literal is evaluated, so the helper must already exist at this point.
headers = {
    'User-Agent': 'Xiaohongshu/7.25.0 (iPhone; iOS 15.4; Scale/3.00)',
    'Authorization': 'your_auth_token',
    'X-Sign': generate_x_sign(params),
}


def get_note_api():
    """Request the note endpoint and print the note's title, description and likes.

    Uses the module-level ``headers`` and ``params``. Every image listed in
    the response is passed to ``download_image``. Any exception (network
    error, invalid JSON, missing key) is caught and reported on stdout.
    """
    url = 'https://edith.xiaohongshu.com/api/sns/web/v1/note'
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        if response.status_code != 200:
            # Report the failure instead of returning silently.
            print(f'API请求失败: HTTP {response.status_code}')
            return

        data = response.json()
        note_info = data['data']['items'][0]['note']

        print(f"标题: {note_info['title']}")
        print(f"描述: {note_info['desc']}")
        print(f"点赞数: {note_info['liked_count']}")

        # Download the images
        for img in note_info['images']:
            download_image(img['url'])

    except Exception as e:
        print(f'API请求失败: {e}')

3.3 方案三：Selenium模拟（高仿真浏览器操作）

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import random

# Chrome options used for the Selenium session below.
chrome_options = Options()
# Command-line switches and experimental options are passed to Chrome as-is.
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Overrides the User-Agent string with an iPhone-style value.
chrome_options.add_argument("user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15")

# Module-level driver used by human_like_interaction() and scrape_xhs();
# it is created as soon as this code runs.
driver = webdriver.Chrome(options=chrome_options)

def human_like_interaction():
    """Scroll the page a few times, then move the mouse pointer around.

    Operates on the module-level ``driver``. Scrolls between 2 and 5 times
    with a random pause after each scroll, then performs three small random
    pointer moves through a single shared action chain.
    """
    # Scroll phase
    scroll_rounds = random.randint(2, 5)
    for _round in range(scroll_rounds):
        scroller = ActionChains(driver)
        distance = random.randint(300, 800)
        scroller.scroll_by_amount(0, distance).perform()
        pause = random.uniform(0.5, 2)
        time.sleep(pause)

    # Pointer phase
    pointer = ActionChains(driver)
    moves_left = 3
    while moves_left > 0:
        dx = random.randint(-50, 50)
        dy = random.randint(-50, 50)
        pointer.move_by_offset(dx, dy).perform()
        time.sleep(0.2)
        moves_left -= 1

def scrape_xhs():
    """Open the explore page and print title and author of up to five notes.

    Uses the module-level ``driver``. The driver is always quit when the
    function finishes, whether it succeeded or raised.
    """
    try:
        # Imported here because ``By`` is not brought into scope by the file's
        # import lines; kept inside the try so the driver is still quit if the
        # import fails.
        from selenium.webdriver.common.by import By

        driver.get('https://www.xiaohongshu.com/explore')
        time.sleep(5)

        # Run the simulated user interaction
        human_like_interaction()

        # Collect the note elements from the page
        notes = driver.find_elements(By.CSS_SELECTOR, '.note-item')
        for note in notes[:5]:  # cap the number of notes processed
            title = note.find_element(By.CSS_SELECTOR, '.title').text
            author = note.find_element(By.CSS_SELECTOR, '.nickname').text
            print(f'发现笔记: {title} - 作者: {author}')

            time.sleep(random.uniform(1, 3))

    finally:
        driver.quit()


scrape_xhs()

四、关键反反爬策略

4.1 设备指纹伪装

// JS code to be executed inside the Selenium session
// Redefines the navigator.webdriver getter so that it returns undefined.
Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined})
// Redefines the navigator.plugins getter so that it returns a three-element array.
Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3]})
// Redefines the navigator.languages getter so that it returns ['zh-CN', 'zh'].
Object.defineProperty(navigator, 'languages', {
            get: () => ['zh-CN', 'zh']})

4.2 验证码处理方案

# Handle a captcha through the 2Captcha service
def solve_captcha(site_key, url):
    """Request a reCAPTCHA solution from the solver and return its code.

    Args:
        site_key: Value passed to the solver as ``sitekey``.
        url: Value passed to the solver as ``url``.

    Returns:
        The ``'code'`` entry of the solver result, or ``None`` when the
        solver call raises (the error is printed to stdout).
    """
    # NOTE(review): TwoCaptcha is not imported anywhere in the visible source;
    # it must be in scope before this function is called.
    api_key = "YOUR_2CAPTCHA_KEY"
    solver = TwoCaptcha(api_key)

    try:
        result = solver.recaptcha(
            sitekey=site_key,
            url=url
        )
        return result['code']
    except Exception as e:
        print(f"验证码解决失败: {e}")
        return None

4.3 请求参数加密逆向

小红书APP的X-Sign生成算法示例（需逆向工程）：

def generate_x_sign(params):
    """Build an illustrative request signature from ``params``.

    This is an example only, not the real algorithm. The parameters are
    sorted by key, joined as ``key=value`` pairs separated by ``&``, a
    placeholder salt is appended, and the MD5 hex digest of the UTF-8
    encoded result is returned.

    Args:
        params: Mapping of parameter names to values; values are formatted
            with their default string representation.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    # Local import: hashlib is not imported by the file's import lines.
    import hashlib

    param_str = '&'.join(f'{key}={params[key]}' for key in sorted(params))
    salt = 'xhssaltvalue'  # placeholder, not a real salt value
    return hashlib.md5((param_str + salt).encode()).hexdigest()

五、数据存储方案

5.1 MongoDB存储示例

from pymongo import MongoClient
from datetime import datetime

# Handles for the local MongoDB server, the database and the notes collection.
client = MongoClient('mongodb://localhost:27017/')
db = client['xiaohongshu']
notes_collection = db['notes']


def save_to_mongodb(note_data):
    """Stamp ``note_data`` with the crawl time and insert it into the collection.

    The dict is modified in place: a ``'crawl_time'`` key holding the current
    local time is added before the insert. The id of the inserted document is
    printed to stdout.

    Args:
        note_data: Dict describing one note.
    """
    note_data['crawl_time'] = datetime.now()
    result = notes_collection.insert_one(note_data)
    print(f'插入成功，ID: {result.inserted_id}')

# Sample document used to demonstrate save_to_mongodb().
note_example = {
    'note_id': '63fd8e4b000000001f03cd15',
    'title': '夏日穿搭分享',
    'author': '时尚达人',
    'likes': 2450,
    'tags': ['穿搭', '夏季', '时尚'],
}
save_to_mongodb(note_example)

六、实战建议

账号管理

准备多个低活跃度账号轮换
每个账号每日请求<100次
模拟真实用户行为（浏览、点赞间隔）

IP策略

使用高质量住宅代理
每个IP每小时请求<30次
避免突然大量请求

监控机制

def check_anti_spider(response):
    """Raise when the response looks like an anti-crawling reaction.

    The checks run in order and the first match wins: a body containing the
    captcha marker text, then an HTTP 403 status.

    Args:
        response: Object exposing ``text`` and ``status_code``.

    Raises:
        Exception: With a message naming the condition that matched.
    """
    checks = (
        (lambda resp: '验证码' in resp.text, '触发验证码'),
        (lambda resp: resp.status_code == 403, 'IP被封禁'),
    )
    for is_triggered, message in checks:
        if is_triggered(response):
            raise Exception(message)

数据去重

from hashlib import md5

def get_content_hash(content):
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8."""
    digest = md5()
    digest.update(content.encode())
    return digest.hexdigest()

# Process the content only when its hash is not already in existing_hashes.
# NOTE(review): new_content, existing_hashes and process_content are not
# defined in the visible source; the surrounding code must provide them.
if get_content_hash(new_content) not in existing_hashes:
    process_content(new_content)