import collections
import csv
import json
import logging
import re

import jieba
import matplotlib
import numpy
import openpyxl
import pandas as pd
import PIL
import requests
import wordcloud
from bs4 import BeautifulSoup


def get_bvid(page, pos, keyword='2024巴黎奥运会'):
    """Return the bvid of one video from Bilibili's mixed-search API.

    Queries ``https://api.bilibili.com/x/web-interface/search/all/v2`` for
    result page ``page + 1`` and picks the video at index *pos* inside
    result bucket 11 of the response.

    Args:
        page: Zero-based page offset; the API receives ``page + 1``.
        pos: Index of the video within that page's result list.
        keyword: Search keyword. Defaults to the query that was previously
            hard-coded into the URL ('2024巴黎奥运会'); note the original
            docstring claimed a different keyword — the code is authoritative.

    Returns:
        The selected video's bvid string.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError, IndexError: if the response JSON lacks the expected shape.
    """
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2'
    params = {"page": page + 1, "keyword": keyword}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/"
    }
    # NOTE(review): a live SESSDATA session token hard-coded in source is a
    # credential leak — it should come from the environment or a config file.
    cookies = {
        "SESSDATA": "8b46ceda%2C1741779927%2C40dae%2A92CjD5jbBJ-_80j97J0rskP9tHUPaAjKq6h3Uw5DEMNjROxJsJxX9_K5brff6wdSLk1-cSVi1tRjVNRVU4YjItVVJOOXNZSE4wOWU4WllwSmJsemJPb1RZekEzTkRzVXpDNkdXd1dWWnRWSVRkc0h1cXJVMjBkemFNbEtqWEdzU3RJRUI2Q2RBQ3VRIIEC"
    }
    # timeout prevents the scraper from hanging forever on a dead connection
    response = requests.get(url, params=params, headers=headers,
                            cookies=cookies, timeout=10)
    json_dict = response.json()
    # result[11] appears to be the "video" bucket in the mixed search
    # response — TODO confirm the index is stable across API versions.
    return json_dict["data"]["result"][11]["data"][pos]["bvid"]


def get_cid(bvid):
    """Return the cid of the first page (part) of the video *bvid*.

    Args:
        bvid: Bilibili video id string (e.g. 'BV1xx411c7mD').

    Returns:
        The integer cid of the video's first page.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError, IndexError: if the response JSON lacks the expected shape.
    """
    # pagelist API maps a bvid to its per-part cid values
    url = 'https://api.bilibili.com/x/player/pagelist?bvid=' + str(bvid) + '&jsonp=jsonp'
    response = requests.get(url, timeout=10)
    json_dict = response.json()
    # was a leftover debug print of the whole payload; keep it at debug level
    logging.debug("pagelist response for %s: %s", bvid, json_dict)
    return json_dict["data"][0]["cid"]


def get_data(cid):
    """Fetch the danmaku (bullet-comment) texts for a video part *cid*.

    Args:
        cid: Bilibili cid identifying one video part.

    Returns:
        A list of comment strings. On any failure an empty list is returned
        (the original implicitly returned None, which crashed callers that
        iterate the result) and the error is printed.
    """
    try:
        # legacy XML danmaku endpoint for this cid
        final_url = "https://api.bilibili.com/x/v1/dm/list.so?oid=" + str(cid)
        final_res = requests.get(final_url, timeout=10)
        final_res.encoding = 'utf-8'  # the XML payload is UTF-8 encoded
        # each danmaku is wrapped in a <d ...>text</d> element
        pattern = re.compile(r'<d.*?>(.*?)</d>')
        return pattern.findall(final_res.text)
    except Exception as e:
        print("执行get_data失败:", e)
        return []  # keep the return type stable for callers


def save_to_file(data):
    """Append each string in *data* as its own row of 'danmu_data.csv'.

    Bug fix: ``csv`` was never imported, so the original always raised
    NameError, which the broad except silently swallowed — no data was
    ever written to disk.

    Args:
        data: Iterable of danmaku strings; each becomes one single-column row.
    """
    try:
        # append mode so repeated calls accumulate rows across videos;
        # newline='' is required by the csv module on all platforms
        with open('danmu_data.csv', 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            # one single-column row per danmaku
            writer.writerows([d] for d in data)
    except Exception as e:
        print("执行保存文件报错:", e)