import json
import re
from csv import DictWriter, DictReader
import requests
from bs4 import BeautifulSoup


def notes_request(url):
    """Fetch a single Xiaohongshu note by its note ID and append its title, content and stats to notes.csv."""
    url = 'https://www.xiaohongshu.com/explore/' + url
    headers = {
        'authority': 'edith.xiaohongshu.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'text/html; charset=utf-8',
        'Origin': 'https://www.xiaohongshu.com',
        'Referer': 'https://www.xiaohongshu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    cookies = {
        'abRequestId': 'a3c8a855-ed85-57f6-91fe-3e0acedffde8',
        'a1': '18f9b67ecdbh41bri2la0dgguminrd7dmmrk4pxf750000215521',
        'webId': '6d97ebd3e6051489537e7a6aa1d1cf99',
        'gid': 'yYijDKYydJ6JyYijDKWdS6UCfDx4yDF3J108FiWfkk763T28IFfWf4888Jy22Jy8WYf0SyjD',
        'webBuild': '4.16.1',
        'web_session': '040069b3643d32585fff79ee79344bd4b89bc6',
        'acw_tc': '25f299cc55249ae472ab6eab4f8404ce338ad4b4544c7f87a592375d787ede3a',
        'websectiga': '16f444b9ff5e3d7e258b5f7674489196303a0b160e16647c6c2b4dcb609f4134',
        'sec_poison_id': '9e6926ce-0bb8-4335-992a-0ec7875fff69',
        'xsecappid': 'xhs-pc-web'
    }
    # Append scraped rows to notes.csv; columns are title, content, publish time,
    # collect count, like count, share count
    f = open('notes.csv', mode='a+', encoding='utf-8', newline='')
    csv_writer = DictWriter(f, fieldnames=['标题', '内容', '发布时间', '收藏数', '喜欢数', '分享数'])
    if f.tell() == 0:
        # Write the header row only when the file is still empty
        csv_writer.writeheader()
    response = requests.get(url=url, headers=headers, cookies=cookies)
    soup = BeautifulSoup(response.text, 'lxml')
    if '请输入验证码' in soup.text:
        # The page returned a CAPTCHA challenge instead of the note
        print('需要输入验证码或更新Cookie')  # CAPTCHA required, or the cookies need refreshing
    else:
        long_string = soup.prettify()
        # Pull the embedded "interactInfo" JSON object out of the page source
        pattern = r'"interactInfo":{[^}]*}'
        match = re.search(pattern, long_string)
        # Drop the leading '"interactInfo":' so only the JSON object remains
        js = match.group(0)[len('"interactInfo":'):]
        # Convert the JSON string into a dict
        data_dict = json.loads(js)
        title = soup.find('title').text
        publish_time = soup.find('span', class_='date').text
        collectedCount = data_dict['collectedCount']
        likedCount = data_dict['likedCount']
        shareCount = data_dict['shareCount']
        # Find the target div and grab all of its contents (including <br> tags)
        desc_div = soup.find('div', class_='desc')
        # Use a regular expression to match the span tags inside it
        spans = re.findall(r'<span [^>]*="">.*?</span>', str(desc_div.contents))
        if spans:
            for text_inside in spans:
                # Skip user-mention spans, whitespace entities and short fragments
                if '&nbsp' not in text_inside and len(text_inside) > 50 and 'class="note-content-user"' not in text_inside:
                    pattern = r'="">(.*?)</span>'
                    text_inside = re.findall(pattern, text_inside)
                    dic = {
                        '标题': title.replace('- 小红书', ''),
                        '内容': text_inside[0],
                        '发布时间': publish_time,
                        '收藏数': collectedCount,
                        '喜欢数': likedCount,
                        '分享数': shareCount
                    }
                    csv_writer.writerow(dic)
                    print('笔记拉取成功')  # note scraped successfully
                    break  # only the first matching span holds the note body
    f.close()
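

# DictReader is imported above but never used; presumably the note IDs were meant
# to be read from a CSV file. A minimal sketch of that idea, assuming a
# hypothetical ids.csv with a 'note_id' column (both names are assumptions, not
# part of the original script):
def notes_from_csv(path='ids.csv'):
    with open(path, mode='r', encoding='utf-8', newline='') as id_file:
        for row in DictReader(id_file):
            notes_request(row['note_id'])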


if __name__ == '__main__':
    notes_request('654a541e000000001e029a6a')