diff --git a/danmu.py b/danmu.py index af373c2..c6cc24c 100644 --- a/danmu.py +++ b/danmu.py @@ -1,120 +1,124 @@ -import re -import requests -import json -from wordcloud import WordCloud -from collections import Counter -import pandas as pd -import jieba - -# 用于获得请求的头,应对B站的反爬虫机制 -headers = { - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0", - "Referer": "https://www.bilibili.com/video/BV1ji421r7PB/?spm_id_from=333.337.search-card.all.click&vd_source=6d043574d81a3f216c41cc281767c59e", - "Cookie": "buvid4=F793982E-B1BE-6615-D504-4A3BF6DC079274572-022081216-zrNi2ZH69Dhc8fOw09IIaA%3D%3D; buvid_fp_plain=undefined; DedeUserID=254629681; DedeUserID__ckMd5=23182279794f6f43; header_theme_version=CLOSE; enable_web_push=DISABLE; rpdid=0zbfVHb84m|bujW4qg1|H3r|3w1RkXNz; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; hit-dyn-v2=1; CURRENT_FNVAL=4048; LIVE_BUVID=AUTO3317219069887705; buvid3=575DAC67-0FE1-B3C3-8A9F-32190F231A1F24035infoc; b_nut=1723377925; _uuid=24521A4B-895E-10E6A-2555-A9EF812A1AC136123infoc; fingerprint=45b7e451fad392b858fc6e95bbb4231d; PVID=1; SESSDATA=7bb24682%2C1741874111%2Cf9490%2A92CjB3NDHhJCIr34LBlg2TTS7vPGQ72Kh3s3DVcaEn0JkQ_hXc7q6MgsK9Wjb-Z6NTf68SVnlnTkJQZmtmRXMxZnc1RDZpYUFnS1BlTmptUklUSkcyQUNxaWg4Smpla1VCZHdkLVc4dW9OWU4zZm1PaVhuMzFtcnNvQ2JMQzFiTFVleC1HbXc0Sm53IIEC; bili_jct=895f9e091815ee492e57345dc62b8757; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY1ODEzMjMsImlhdCI6MTcyNjMyMjA2MywicGx0IjotMX0.47Js9n97GQecalPWdc-MNrDVUOz2-FfY_DlB72DUlZM; bili_ticket_expires=1726581263; CURRENT_QUALITY=32; bp_t_offset_254629681=977402825627664384; buvid_fp=45b7e451fad392b858fc6e95bbb4231d; home_feed_column=5; browser_resolution=1528-704; b_lsid=18C6BCCB_191F83A85C1" -} - -def get_bvlist(): - # 获取BV号 - bvlist = [] - for page in range(1, 11): - # 搜索“2024巴黎奥运会”的URL - url = f'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page={page}&page_size=30&pubtime_begin_s=0&pubtime_end_s=0&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A'' - # 发送请求 - response = requests.get(url, headers=headers) - response.encoding='urf-8' - # 使用正则表达式得到BV号 - tem_bvlist = re.findall('"bvid":"(.*?)"', response.text) - bvlist.extend(list(set(tem_bvlist))) - print("BV号获取完成") - return bvlist - -def get_cidlist(bvlist): - # 传入获取到的BV号,获取对应的cid - cidlist=[] - for bvid in bvlist: - # 由于获取cid的URL - url = f'https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp' - # 发送请求 - res = requests.get(url,headers=headers) - res.encoding = 'urf-8' - # 利用JSON提取cid - json_dict = json.loads(res.text) - cidlist.append(json_dict["data"][0]["cid"]) - print("cid获取完成") - return cidlist - -def get_danmu(cidlist): - # 利用传入的cid,获取对应的弹幕 - count = 1 - for cid in cidlist: - # 弹幕所在的URL - url = f'http://comment.bilibili.com/{cid}.xml' - # 发送请求 - response = requests.get(url, headers=headers) - response.encoding = 'utf-8' - # 利用正则表达式获取弹幕 - content_list = re.findall('(.*?)', response.text) - for danmu in content_list: - with open('弹幕.txt', 'a', encoding='utf-8') as f: - f.write(danmu + '\n') - print(f'----------第{count}个视频弹幕获取完毕!----------') - count +=1 - -def get_ai_related_danmu(): - # 统计与AI技术应用相关的弹幕 - ai_related_danmu = [] - # AI相关的关键词列表 - ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络', - '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'} - # 读取弹幕文件 - with open('弹幕.txt', 'r', encoding='utf-8') as file: - danmu_lines = file.readlines() - # 去除原本的换行符和空格,方便统计 - danmu_cleaned = [line.strip() for line in danmu_lines] - # 统计与AI技术应用相关的弹幕 - for danmu in danmu_cleaned: - if any(keyword in danmu.lower() for keyword in ai_keywords): - # 对含有“ai”关键字但内容并不是AI技术应用相关的弹幕需另外判断 - if 'ai' in danmu.lower() and re.search(r'(?(.*?)', response.text) + for danmu in content_list: + with open('弹幕.txt', 'a', encoding='utf-8') as f: + f.write(danmu + '\n') + print(f'----------第{count}个视频弹幕获取完毕!----------') + count +=1 + +def get_ai_related_danmu(): + # 统计与AI技术应用相关的弹幕 + ai_related_danmu = [] + # AI相关的关键词列表 + ai_keywords = {'ai', 'ai生成', 'ai合成', 'ai配音', 'gpt', '人工智能', '机器学习', '深度学习', '神经网络', + '自然语言处理', '计算机视觉', '语音识别', '人脸识别', '无人驾驶'} + # 读取弹幕文件 + with open('弹幕.txt', 'r', encoding='utf-8') as file: + danmu_lines = file.readlines() + # 去除原本的换行符和空格,方便统计 + danmu_cleaned = [line.strip() for line in danmu_lines] + # 统计与AI技术应用相关的弹幕 + for danmu in danmu_cleaned: + if any(keyword in danmu.lower() for keyword in ai_keywords): + # 对含有“ai”关键字但内容并不是AI技术应用相关的弹幕需另外判断 + if 'ai' in danmu.lower() and re.search(r'(?