You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
2024-Summer-Olympics/Scraping Bilibili Danmaku (...

287 lines
13 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
import re
import pandas as pd
from collections import Counter
import jieba
import wordcloud
def extract_ids_from_url(url, head, output_file='aid.txt', timeout=10):
    """Fetch one search-result page and append every result's ``id`` to a file.

    The response is expected to be JSON shaped like
    ``{"data": {"result": [{"id": ...}, ...]}}``. Any failure is reported on
    stdout and swallowed, so a bad page never stops the caller's loop.

    Args:
        url (str): URL to request.
        head (dict): HTTP request headers.
        output_file (str): File the ids are appended to, one per line.
            Defaults to 'aid.txt'.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.
    """
    try:
        response = requests.get(url=url, headers=head, timeout=timeout)
        # Raise for any 4xx/5xx status so it is reported as a request error.
        response.raise_for_status()
        data = response.json()

        # Guard against "data" being missing, null or not a mapping; all of
        # these are simply an unexpected response format.
        payload = data.get('data') if isinstance(data, dict) else None
        if not isinstance(payload, dict) or 'result' not in payload:
            print("Unexpected response format")
            return

        # Collect all ids first so that a malformed entry (KeyError) leaves
        # the output file untouched for this page.
        ids = [item['id'] for item in payload['result']]
        with open(output_file, 'a') as file:
            file.writelines(f"{aid}\n" for aid in ids)
        print(f"IDs have been saved to {output_file}")
    except requests.RequestException as e:
        # Network failures, timeouts and HTTP error statuses.
        print(f"Request error: {e}")
    except KeyError as e:
        # A result entry without an 'id' field.
        print(f"Key error: {e}")
    except Exception as e:
        # Best-effort boundary: report anything else and carry on.
        print(f"An error occurred: {e}")
def process_urls(urls1, headers1, output_file='aid.txt'):
    """Run ``extract_ids_from_url`` once for every URL in ``urls1``.

    Args:
        urls1 (list): URLs to process, in order.
        headers1 (dict): HTTP request headers passed to every request.
        output_file (str): File the extracted ids are appended to.
            Defaults to 'aid.txt'.
    """
    pending = iter(urls1)
    while True:
        try:
            target = next(pending)
        except StopIteration:
            break
        extract_ids_from_url(target, headers1, output_file)
def process_aid_and_cid(aid_file_path, cid_file_path, headers, timeout=10):
    """Resolve each aid to its cids, append them to a file, then de-duplicate it.

    Every non-blank line of ``aid_file_path`` is treated as an aid and looked
    up through the ``/x/player/pagelist`` endpoint. Each ``cid`` found in the
    response's ``data`` list is appended to ``cid_file_path``. Finally the
    cid file is rewritten with duplicate lines removed (first occurrence wins).

    Args:
        aid_file_path (str): Text file with one aid per line.
        cid_file_path (str): Text file the cids are appended to.
        headers (dict): HTTP request headers.
        timeout (float): Seconds to wait for each request before giving up.
    """

    def remove_duplicates(file_path):
        # Rewrite the file keeping only the first occurrence of each
        # non-blank line; dict.fromkeys preserves insertion order.
        with open(file_path, 'r') as file:
            cids = [line.strip() for line in file if line.strip()]
        unique_cids = list(dict.fromkeys(cids))
        with open(file_path, 'w') as file:
            file.writelines(cid + '\n' for cid in unique_cids)

    with open(aid_file_path, 'r') as file:
        aids = [line.strip() for line in file if line.strip()]

    count = 0
    with open(cid_file_path, 'a') as file:
        for aid in aids:
            url = f'https://api.bilibili.com/x/player/pagelist?aid={aid}'
            try:
                payload = requests.get(
                    url=url, headers=headers, timeout=timeout
                ).json()
            except (requests.RequestException, ValueError) as e:
                # One unreachable or non-JSON response must not abort the
                # whole run; report it and move on to the next aid.
                print(f"Request error for aid {aid}: {e}")
                continue

            # "data" may be absent or null on error responses; `or []`
            # covers both, where .get('data', []) would hand back None.
            pages = payload.get('data') if isinstance(payload, dict) else None
            for item in pages or []:
                file.write(f"{item['cid']}\n")
                count += 1
                # Progress output.
                print(f"Processed: {count} CIDs")

    # The cid file is closed (and flushed) here, so it is safe to rewrite it.
    remove_duplicates(cid_file_path)
def fetch_danmu(cid_file='cid.txt', output_file='comment.txt',
                date='2024-08-31', request_headers=None, timeout=10):
    """Download danmaku for every cid and append the Chinese text to a file.

    For each non-blank line (cid) in ``cid_file`` the history endpoint
    ``/x/v2/dm/web/history/seg.so`` is requested for ``date``. The response
    body is decoded as UTF-8 and every run of characters in the range
    U+4E00-U+9FA5 is extracted; the runs are written to ``output_file``
    one per line. Non-Chinese content in the response is discarded.

    Called with no arguments it behaves as before: reads 'cid.txt', writes
    'comment.txt', uses the date 2024-08-31 and the module-level ``headers``.

    Args:
        cid_file (str): Text file with one cid per line.
        output_file (str): File the extracted text is appended to.
        date (str): Value of the ``date`` query parameter.
        request_headers (dict | None): HTTP headers; ``None`` selects the
            module-level ``headers``.
        timeout (float): Seconds to wait for each request before giving up.
    """
    print("开始爬取弹幕")
    with open(cid_file, 'r') as file:
        cids = [line.strip() for line in file if line.strip()]

    # Only fall back to the module-level default when nothing was supplied.
    active_headers = headers if request_headers is None else request_headers
    # Compiled once; matches runs of Chinese characters.
    chinese_run = re.compile('[\u4e00-\u9fa5]+')

    for cid in cids:
        url = (
            'https://api.bilibili.com/x/v2/dm/web/history/seg.so'
            f'?type=1&oid={cid}&date={date}'
        )
        try:
            response = requests.get(
                url=url, headers=active_headers, timeout=timeout
            )
        except requests.RequestException as e:
            # A single failed download should not abort the whole crawl.
            print(f"Request error for cid {cid}: {e}")
            continue
        response.encoding = 'utf-8'
        content = '\n'.join(chinese_run.findall(response.text))
        # Append per cid so already-fetched text survives a later failure.
        with open(output_file, mode='a', encoding='utf-8') as f:
            f.write(content + '\n')
# Phrases whose presence marks a line for removal: they are fragments of
# error / access-denied notices rather than danmaku text.
keywords_to_remove = [
    '出错啦', '错误号', '由于触发哔哩哔哩安全风控策略', '该次访问请求被拒绝',
]
# One alternation regex over all phrases; each phrase is escaped so it is
# matched literally.
pattern = re.compile('|'.join(map(re.escape, keywords_to_remove)))
def clean_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` without the filtered lines.

    A line is dropped when the module-level ``pattern`` matches anywhere in
    it; every other line is written unchanged. Both files are UTF-8 and the
    output file is overwritten.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        with open(output_file, 'w', encoding='utf-8') as sink:
            for row in source:
                if pattern.search(row):
                    continue
                sink.write(row)
def analyze_keywords_in_comments(comments_file, keywords_file, output_excel_file):
    """Count keyword occurrences across comment lines and export them to Excel.

    A keyword scores one hit for every comment line that contains it as a
    substring. All counts are written to ``output_excel_file`` with the
    columns 'AI Technology' and 'Count'; the eight highest counts are
    printed to stdout.

    Args:
        comments_file (str): UTF-8 text file, one comment per line.
        keywords_file (str): UTF-8 text file, one keyword per line. Blank
            lines are ignored.
        output_excel_file (str): Path of the Excel file to write.
    """
    tech_counts = Counter()
    with open(comments_file, 'r', encoding='utf-8') as comments, \
            open(keywords_file, 'r', encoding='utf-8') as keyword_lines:
        # Drop blank entries: the empty string is a substring of every
        # comment and would otherwise be counted once per line.
        keywords = [kw for kw in (line.strip() for line in keyword_lines) if kw]
        for comment in comments:
            tech_counts.update(kw for kw in keywords if kw in comment)

    df = pd.DataFrame(list(tech_counts.items()), columns=['AI Technology', 'Count'])
    df.to_excel(output_excel_file, index=False)

    # Show the eight most frequent keywords.
    top_8 = df.sort_values(by='Count', ascending=False).head(8)
    print(top_8)
def generate_wordcloud(text_file, stopwords_file, output_image_file, font_path='msyh.ttc'):
    """Render a word cloud image from a text file.

    The text is segmented with ``jieba.lcut``, tokens found in the stop-word
    list are removed, and the rest is rendered on a 700x700 white canvas.

    Args:
        text_file (str): UTF-8 text file to visualise.
        stopwords_file (str): UTF-8 file with one stop word per line.
        output_image_file (str): Path the image is written to.
        font_path (str): Font file handed to ``WordCloud``.
    """
    # Stop words: the stripped file content split on newlines.
    with open(stopwords_file, encoding='utf-8') as handle:
        stopwords = set(handle.read().strip().split('\n'))

    with open(text_file, encoding='utf-8') as handle:
        corpus = handle.read()

    # Segment, then keep only the tokens that are not stop words.
    kept_tokens = []
    for token in jieba.lcut(corpus):
        if token not in stopwords:
            kept_tokens.append(token)
    joined = ' '.join(kept_tokens)

    cloud_options = {
        'width': 700,
        'height': 700,
        'background_color': 'white',
        'font_path': font_path,
    }
    cloud = wordcloud.WordCloud(**cloud_options)
    cloud.generate(joined)
    cloud.to_file(output_image_file)
# Pre-built search-API request URLs for the keyword "2024巴黎奥运会" (URL-encoded
# in the `keyword` query parameter), one per result page. Pages 2-10 are
# listed (not in order); page 1 is not included.
# NOTE(review): every URL embeds fixed `w_rid` / `wts` values - presumably a
# request signature and timestamp that can expire; confirm and regenerate.
urls = ['https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=10&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=324&web_location=1430654&w_rid=420b5e834d7dd54d76f4fba1b7b1e665&wts=1725152144',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=8&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=252&web_location=1430654&w_rid=7fdf1d4b3f7d534c993f50173d02de3f&wts=1725152135',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=7&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=216&web_location=1430654&w_rid=6749123b8b393589cc7c80c1e93ada58&wts=1725152132',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=6&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=180&web_location=1430654&w_rid=74f00cf5195a9ec7ef3d57e347704770&wts=1725152128',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=5&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=144&web_location=1430654&w_rid=e914e50a0da59031c553d631ac5f1fde&wts=1725152124',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=4&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=108&web_location=1430654&w_rid=c622f59f9e1360765b62f0e0bc858fa1&wts=1725152121',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=3&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=72&web_location=1430654&w_rid=a60e99a470fa19919a071c865dd1583f&wts=1725152115',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=2&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=36&web_location=1430654&w_rid=8fc24d10311ce5e5730a84daadbbb6b3&wts=1725152102',
'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=9&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=288&web_location=1430654&w_rid=a9cbd6c813f6d27561d5f0d583c0ed76&wts=1725153457',
] # Replace with the actual URLs
# HTTP headers sent with every request in this script: a browser cookie
# string and a desktop Edge/Chrome user-agent.
# NOTE(review): the cookie contains hard-coded session values (SESSDATA,
# bili_jct, DedeUserID). Credentials committed to source should be revoked
# and loaded from the environment or a local, untracked file instead.
headers = {
'cookie':'buvid4=686CE350-75FA-4921-C069-8D0E582FF02993159-024082507-y91msXDi8JTSAtvVtdhJkQ%3D%3D; buvid3=313C6A34-4C14-0939-EBE8-332F809D2EF655028infoc; b_nut=1725087454; CURRENT_FNVAL=4048; _uuid=10E7EC991-7B18-9A8B-78AA-C95F55102347103610infoc; rpdid=|(JlklRl)~Y|0J\'u~kl|)~l|l; header_theme_version=CLOSE; enable_web_push=DISABLE; is-2022-channel=1; fingerprint=f90b71618c196fb8806f458403d943fb; buvid_fp_plain=undefined; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU4Njk1NzEsImlhdCI6MTcyNTYxMDMxMSwicGx0IjotMX0.x0CsQ6o6lx4IcK82uHYJjDq_WMedyzoqa081au5YPug; bili_ticket_expires=1725869511; bp_t_offset_1074062089=974427929414991872; buvid_fp=f90b71618c196fb8806f458403d943fb; SESSDATA=e74a05df%2C1741267229%2Ce876a%2A91CjDqLgub8fAVML6ADiSzb56IvMh3z61KnSnawN0g_c1h5emTp3cU9qrpFxgDEzzpawASVkpfc01rblFpaUxDRHViNXpJdGhweEdNY2VDdEJ0N1hvMU92SWdLcG5Dclg5dlZmV29aMWZfX2ZSWHJ5VVN3ZHRkc0ZaLU9COHdmeDR2T0tmSXlvdmt3IIEC; bili_jct=addb604342937a4322aa12322c11bc2c; DedeUserID=3546758143544046; DedeUserID__ckMd5=65316417021aa6ed; sid=7yti0jp9; b_lsid=D810C241D_191CEE2FE76; bsource=search_bing; home_feed_column=5; browser_resolution=1455-699',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}
def main():
    """Run the whole pipeline from search pages to word cloud.

    Steps, in order: collect video aids, resolve them to cids (the cid file
    is de-duplicated as part of that step), download danmaku, filter out
    error-notice lines, export keyword counts to Excel and render the word
    cloud. Progress messages are printed between steps.
    """
    aid_path, cid_path = 'aid.txt', 'cid.txt'
    raw_comments, cleaned_comments = 'comment.txt', 'cleaned_comment.txt'

    # 1. Collect video aids from the search result pages.
    process_urls(urls, headers)
    print("获取视频aid完毕")

    # 2. Resolve aids to cids.
    process_aid_and_cid(aid_path, cid_path, headers)
    print("将aid转换成cid完毕cid去重完成结果已写回文件。")

    # 3. Download the danmaku.
    fetch_danmu()
    print("弹幕爬取完成。")

    # 4. Remove error-notice lines from the raw dump.
    print('开始清洗弹幕')
    clean_file(raw_comments, cleaned_comments)
    print("弹幕清洗完毕")

    # 5. Keyword statistics.
    print('开始数据统计')
    analyze_keywords_in_comments(
        cleaned_comments, 'keywords.txt', 'ai_technologies_count.xlsx'
    )

    # 6. Word cloud image.
    print("开始构建词云图")
    generate_wordcloud(cleaned_comments, 'stopwords.txt', '词云.png')
    print("构建词云图完毕")


if __name__ == "__main__":
    main()