import json
import re
from csv import DictWriter, DictReader
import requests
from bs4 import BeautifulSoup
def notes_request(url, timeout=10):
    """Fetch one Xiaohongshu note page and append its data to ``notes.csv``.

    The page ``https://www.xiaohongshu.com/explore/<url>`` is downloaded, the
    collect/like/share counters are read from the ``"interactInfo"`` JSON
    fragment embedded in the HTML, and the first description fragment that
    passes the content filter is appended as one CSV row. No header row is
    written. Progress and failures are reported with ``print``; the function
    returns ``None``.

    Args:
        url: Note identifier appended to the ``/explore/`` base URL (despite
            the name, it is not a full URL).
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the caller forever.

    Raises:
        requests.RequestException: Network failures and timeouts raised by
            ``requests.get`` propagate to the caller.
        json.JSONDecodeError: If the ``interactInfo`` fragment is not valid
            JSON.
        KeyError: If one of the expected counters is absent from that JSON.
    """
    page_url = 'https://www.xiaohongshu.com/explore/' + url
    headers = {
        'authority': 'edith.xiaohongshu.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'text/html; charset=utf-8',
        'Origin': 'https://www.xiaohongshu.com',
        'Referer': 'https://www.xiaohongshu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # NOTE(review): session cookies are hard-coded in source. The captcha
    # branch below tells the user to update them, so they are expected to go
    # stale; consider loading them from configuration instead.
    cookies = {
        'abRequestId': 'a1d1855c-4c32-5307-a9f1-9d3f34a192cf',
        'a1': '1900bc790f5y2g9t1mkcnviv0sy4y0pfv4afgtuj250000186611',
        'webId': 'b7d9160688e1684d896a4dc0dab65e36',
        'gid': 'yj88DSWfdJhjyj88DSWj8497i2vJkj9y6TSI0VJh3h8Mvi284v8Uik888yYKKyy8Wq48fY4i',
        'webBuild': '4.20.1',
        'web_session': '040069b3643d32585fffe2225c344b136354be',
        'acw_tc': '5a6bd4ac6c786b806cb96212649c76bb7a0ac62792834403c50f508645293c50',
        'websectiga': '3633fe24d49c7dd0eb923edc8205740f10fdb18b25d424d2a2322c6196d2a4ad',
        'sec_poison_id': 'b2862aa3-62f2-48ce-9b27-2a44baa4b714',
        'xsecappid': 'xhs-pc-web',
    }
    fieldnames = ['标题', '内容', '发布时间', '收藏数', '喜欢数', '分享数']

    # The context manager closes the file even if the request or the parsing
    # below raises; the file is still opened (and created) up front.
    with open('notes.csv', mode='a+', encoding='utf-8', newline='') as f:
        csv_writer = DictWriter(f, fieldnames=fieldnames)
        response = requests.get(
            url=page_url, headers=headers, cookies=cookies, timeout=timeout
        )
        soup = BeautifulSoup(response.text, 'lxml')

        # Detect the captcha page before touching any note data: that page
        # has no interactInfo fragment, so parsing it first would crash
        # instead of printing the hint.
        if '请输入验证码' in soup.text:
            print('需要输入验证码或更新Cookie')
            return

        interact_prefix = '"interactInfo":'
        interact_match = re.search(r'"interactInfo":{[^}]*}', soup.prettify())
        title_tag = soup.find('title')
        date_tag = soup.find('span', class_='date')
        # Target div holding the note description, including its child tags.
        desc_div = soup.find('div', class_='desc')
        required = (interact_match, title_tag, date_tag, desc_div)
        if any(part is None for part in required):
            # Page layout differs from what the scraper expects; report it
            # rather than failing with AttributeError on None.
            print('页面结构异常,未能解析笔记数据')
            return

        # Drop the '"interactInfo":' key so only the JSON object remains,
        # then convert that string into a dict.
        data_dict = json.loads(interact_match.group(0)[len(interact_prefix):])
        title = title_tag.text
        publish_time = date_tag.text
        collectedCount = data_dict['collectedCount']
        likedCount = data_dict['likedCount']
        shareCount = data_dict['shareCount']

        # Use a regular expression to pull the text fragments out of the
        # description markup.
        fragments = re.findall(r']*="">.*?<\/span>', str(desc_div.contents))
        for fragment in fragments:
            # Skip fragments that contain a space, are 50 characters or
            # shorter (markup included), or carry the note-content-user class.
            if ' ' in fragment:
                continue
            if len(fragment) <= 50:
                continue
            if 'class="note-content-user"' in fragment:
                continue
            inner_texts = re.findall(r'="">(.*?)<\/span>', fragment)
            row = {
                '标题': title.replace('- 小红书', ''),
                '内容': inner_texts[0],
                '发布时间': publish_time,
                '收藏数': collectedCount,
                '喜欢数': likedCount,
                '分享数': shareCount,
            }
            csv_writer.writerow(row)
            print('笔记拉取成功')
            # Only the first qualifying fragment is recorded per note.
            break
if __name__ == '__main__':
    # Script entry point: fetch a single hard-coded sample note.
    sample_note_id = '654a541e000000001e029a6a'
    notes_request(sample_note_id)