You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

76 lines
3.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import re
from csv import DictWriter, DictReader
import requests
from bs4 import BeautifulSoup
def notes_request(url):
    """Fetch one xiaohongshu.com note page and append its data to ``notes.csv``.

    The page is downloaded, the counters are read from the ``"interactInfo"``
    JSON object embedded in the page, and the first ``<span>`` inside the
    ``div.desc`` element that passes the content filter is used as the note
    text.  One row (title, content, publish time, collected / liked / share
    counts) is appended to ``notes.csv``.  No header row is written.

    Args:
        url: Note id; it is appended to
            ``https://www.xiaohongshu.com/explore/`` to build the page URL.

    Returns:
        None.  Success and failure reasons are reported with ``print``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    page_url = 'https://www.xiaohongshu.com/explore/' + url
    headers = {
        'authority': 'edith.xiaohongshu.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'text/html; charset=utf-8',
        'Origin': 'https://www.xiaohongshu.com',
        'Referer': 'https://www.xiaohongshu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # SECURITY NOTE(review): these look like real session cookies hard-coded
    # in source. Consider loading them from configuration or the environment
    # and rotating the values that have already been committed.
    cookies = {
        'abRequestId': 'a3c8a855-ed85-57f6-91fe-3e0acedffde8',
        'a1': '18f9b67ecdbh41bri2la0dgguminrd7dmmrk4pxf750000215521',
        'webId': '6d97ebd3e6051489537e7a6aa1d1cf99',
        'gid': 'yYijDKYydJ6JyYijDKWdS6UCfDx4yDF3J108FiWfkk763T28IFfWf4888Jy22Jy8WYf0SyjD',
        'webBuild': '4.16.1',
        'web_session': '040069b3643d32585fff79ee79344bd4b89bc6',
        'acw_tc': '25f299cc55249ae472ab6eab4f8404ce338ad4b4544c7f87a592375d787ede3a',
        'websectiga': '16f444b9ff5e3d7e258b5f7674489196303a0b160e16647c6c2b4dcb609f4134',
        'sec_poison_id': '9e6926ce-0bb8-4335-992a-0ec7875fff69',
        'xsecappid': 'xhs-pc-web',
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(
        url=page_url, headers=headers, cookies=cookies, timeout=10
    )
    soup = BeautifulSoup(response.text, 'lxml')

    # Check for the captcha page BEFORE parsing: a captcha page carries no
    # "interactInfo" object, so parsing first would crash and the hint below
    # would never be shown.
    if '请输入验证码' in soup.text:
        print('需要输入验证码或更新Cookie')
        return

    interact_info = _extract_interact_info(soup.prettify())
    if interact_info is None:
        print('未找到interactInfo数据，页面结构可能已变化或需要更新Cookie')
        return

    note_text = _extract_note_text(soup.find('div', class_='desc'))
    if note_text is None:
        print('未找到笔记正文，未写入数据')
        return

    title_tag = soup.find('title')
    date_tag = soup.find('span', class_='date')
    row = {
        '标题': title_tag.text.replace('- 小红书', '') if title_tag else '',
        '内容': note_text,
        '发布时间': date_tag.text if date_tag else '',
        '收藏数': interact_info.get('collectedCount', ''),
        '喜欢数': interact_info.get('likedCount', ''),
        '分享数': interact_info.get('shareCount', ''),
    }

    # The file is opened only once there is a row to write, and the context
    # manager guarantees it is closed even if writing raises.
    with open('notes.csv', mode='a', encoding='utf-8', newline='') as csv_file:
        csv_writer = DictWriter(
            csv_file,
            fieldnames=['标题', '内容', '发布时间', '收藏数', '喜欢数', '分享数'],
        )
        csv_writer.writerow(row)
    print('笔记拉取成功')


def _extract_interact_info(page_text):
    """Return the embedded ``"interactInfo"`` object as a dict, or None."""
    # The capture group isolates the JSON object itself, so no fixed-length
    # slicing of the key prefix is needed.
    found = re.search(r'"interactInfo":(\{[^}]*\})', page_text)
    if found is None:
        return None
    try:
        return json.loads(found.group(1))
    except json.JSONDecodeError:
        return None


def _extract_note_text(desc_div):
    """Return the first span text in *desc_div* that passes the filter, or None."""
    if desc_div is None:
        return None
    span_pattern = r'<span [^>]*="">.*?<\/span>'
    for span_html in re.findall(span_pattern, str(desc_div.contents)):
        # Skip spans containing '&nbsp', short spans (<= 50 characters of
        # markup), and spans carrying the "note-content-user" class.
        if (
            '&nbsp' not in span_html
            and len(span_html) > 50
            and 'class="note-content-user"' not in span_html
        ):
            return re.findall(r'="">(.*?)<\/span>', span_html)[0]
    return None
# Script entry point: when run directly, scrape one sample note by its id.
if __name__ == '__main__':
    sample_note_id = '654a541e000000001e029a6a'
    notes_request(sample_note_id)