import json
import re
from csv import DictWriter, DictReader

import requests
from bs4 import BeautifulSoup
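
# Scrape a single Xiaohongshu (小红书) note page and append its title, body,
# publish date, and interaction counts (collects / likes / shares) to
# notes.csv. The hard-coded cookies below must belong to a valid session.
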

def notes_request(url):
    """Fetch one note page and append its fields to notes.csv."""
    url = 'https://www.xiaohongshu.com/explore/' + url
    headers = {
        'authority': 'edith.xiaohongshu.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Content-Type': 'text/html; charset=utf-8',
        'Origin': 'https://www.xiaohongshu.com',
        'Referer': 'https://www.xiaohongshu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    # These cookies are tied to one logged-in browser session and expire;
    # copy fresh values from your own browser when requests start coming
    # back as CAPTCHA pages.
    cookies = {
        'abRequestId': 'a3c8a855-ed85-57f6-91fe-3e0acedffde8',
        'a1': '18f9b67ecdbh41bri2la0dgguminrd7dmmrk4pxf750000215521',
        'webId': '6d97ebd3e6051489537e7a6aa1d1cf99',
        'gid': 'yYijDKYydJ6JyYijDKWdS6UCfDx4yDF3J108FiWfkk763T28IFfWf4888Jy22Jy8WYf0SyjD',
        'webBuild': '4.17.2',
        'web_session': '040069b3643d32585fff79ee79344bd4b89bc6',
        'acw_tc': '459a6388281f21d6215ff80f1402afbfdf47eb5a1ef949b426ace99565a6fc50',
        'websectiga': '7750c37de43b7be9de8ed9ff8ea0e576519e8cd2157322eb972ecb429a7735d4',
        'sec_poison_id': 'b1e1329b-735f-427c-bfd0-698a912c0366',
        'xsecappid': 'xhs-pc-web',
    }

    f = open('notes.csv', mode='a+', encoding='utf-8', newline='')
    csv_writer = DictWriter(f, fieldnames=['标题', '内容', '发布时间', '收藏数', '喜欢数', '分享数'])
    # In append mode the file may already hold rows; write the header only
    # when the file is still empty.
    if f.tell() == 0:
        csv_writer.writeheader()

    response = requests.get(url=url, headers=headers, cookies=cookies)
    soup = BeautifulSoup(response.text, 'lxml')

    # '请输入验证码' is the "please enter the verification code" prompt on the
    # anti-bot page; when it appears there is no note data to parse, so check
    # for it before touching the page content.
    if '请输入验证码' in soup.text:
        print('CAPTCHA required - update the cookies and retry')
    else:
        # The interaction counters live in the page's embedded state JSON;
        # pull the "interactInfo" object out with a regex.
        long_string = soup.prettify()
        pattern = r'"interactInfo":{[^}]*}'
        match = re.search(pattern, long_string)
        # Strip the leading '"interactInfo":' (15 characters) so only the
        # JSON object remains, then convert the string into a dict.
        js = match.group(0)[15:]
        data_dict = json.loads(js)

        title = soup.find('title').text
        publish_time = soup.find('span', class_='date').text
        collectedCount = data_dict['collectedCount']
        likedCount = data_dict['likedCount']
        shareCount = data_dict['shareCount']

        # Locate the note-body div and keep all of its contents (including
        # <br> tags).
        desc_div = soup.find('div', class_='desc')
        # Match the text wrapped in <span ...=""> tags inside the body.
        match = re.findall(r'<span [^>]*="">.*?</span>', str(desc_div.contents))
        if match:
            for text_inside in match:
                # Heuristic filter from the original script: skip spans whose
                # raw HTML contains a space, is 50 characters or shorter, or
                # belongs to a note-content-user (@-mention) span.
                if ' ' not in text_inside and len(text_inside) > 50 and 'class="note-content-user"' not in text_inside:
                    pattern = r'="">(.*?)</span>'
                    text_inside = re.findall(pattern, text_inside)
                    dic = {
                        '标题': title.replace('- 小红书', ''),  # strip the site-name suffix from <title>
                        '内容': text_inside[0],
                        '发布时间': publish_time,
                        '收藏数': collectedCount,
                        '喜欢数': likedCount,
                        '分享数': shareCount,
                    }
                    csv_writer.writerow(dic)
                    print('Note fetched successfully')
                    break

    f.close()
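
# --- Optional batch driver (a sketch, not part of the original script) ---
# DictReader is imported above but never used, which suggests the note IDs
# were meant to come from a CSV. One plausible wiring is below; the file
# name 'note_ids.csv' and its 'note_id' column are hypothetical.
def batch_notes_request(id_file='note_ids.csv'):
    with open(id_file, encoding='utf-8') as id_f:
        for row in DictReader(id_f):
            # Each row is expected to expose the note ID under a 'note_id'
            # column (hypothetical schema).
            notes_request(row['note_id'])
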

if __name__ == '__main__':
    # The argument is the note ID, i.e. the trailing path segment of a note's
    # /explore/ URL.
    notes_request('654a541e000000001e029a6a')