"""Collect video comments over HTTP, clean them, count keywords and draw a word cloud.

Pipeline (see ``main``):
    search URLs -> aid.txt -> cid.txt -> comment.txt -> cleaned_comment.txt
    -> ai_technologies_count.xlsx + 词云.png

NOTE(review): cookie names and the filter phrases suggest the target site is
Bilibili, but every request URL in this file has lost its scheme/host part
(``urls`` holds empty strings, the per-aid / per-cid URLs have no prefix).
They must be filled in before the script can fetch anything.
"""
import re
from collections import Counter

import jieba
import numpy as np
import pandas as pd
import requests
import wordcloud

# Seconds to wait on any single HTTP request. ``requests`` has no default
# timeout, so without this a stalled connection blocks the script forever.
REQUEST_TIMEOUT = 10

# Runs of CJK unified ideographs; used to pull comment text out of a raw body.
CHINESE_TEXT_RE = re.compile('[\u4e00-\u9fa5]+')


def extract_ids_from_url(url, head, output_file='aid.txt'):
    """Fetch ``url`` and append every ``data.result[*].id`` to ``output_file``.

    The function is best-effort: any failure is printed and swallowed so that
    one bad URL does not stop the caller's loop.

    Args:
        url (str): URL to request; the response body must be JSON.
        head (dict): HTTP headers sent with the request.
        output_file (str): File the ids are appended to, one per line.
    """
    try:
        response = requests.get(url=url, headers=head, timeout=REQUEST_TIMEOUT)
        # Turn 4xx/5xx answers into RequestException instead of parsing them.
        response.raise_for_status()
        data = response.json()

        if 'data' in data and 'result' in data['data']:
            ids = [item['id'] for item in data['data']['result']]
            # Append mode: ids from successive URLs accumulate in one file.
            with open(output_file, 'a', encoding='utf-8') as file:
                file.writelines(f"{aid}\n" for aid in ids)
            print(f"IDs have been saved to {output_file}")
        else:
            print("Unexpected response format")
    except requests.RequestException as e:
        print(f"Request error: {e}")
    except KeyError as e:
        # An entry in ``result`` had no 'id' field.
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


def process_urls(urls1, headers1, output_file='aid.txt'):
    """Run ``extract_ids_from_url`` on every URL in ``urls1``.

    Args:
        urls1 (list): URLs to query.
        headers1 (dict): HTTP headers sent with each request.
        output_file (str): File the extracted ids are appended to.
    """
    for url in urls1:
        extract_ids_from_url(url, headers1, output_file)


def remove_duplicates(file_path):
    """Rewrite ``file_path`` with duplicate lines removed, keeping first-seen order.

    Blank lines are dropped and surrounding whitespace is stripped.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        entries = [line.strip() for line in file if line.strip()]

    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_entries = dict.fromkeys(entries)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(entry + '\n' for entry in unique_entries)


def process_aid_and_cid(aid_file_path, cid_file_path, headers):
    """Resolve every aid in ``aid_file_path`` to its cids and store them.

    For each aid a JSON document is requested; the ``cid`` of every entry in
    its ``data`` list is appended to ``cid_file_path``. When all aids have
    been processed the cid file is de-duplicated in place.

    A failed request or a non-JSON answer is reported and that aid is
    skipped, so one bad aid does not abort the whole run.

    Args:
        aid_file_path (str): Text file with one aid per line.
        cid_file_path (str): Text file the cids are appended to.
        headers (dict): HTTP headers sent with each request.
    """
    with open(aid_file_path, 'r', encoding='utf-8') as file:
        aids = [line.strip() for line in file if line.strip()]

    count = 0
    with open(cid_file_path, 'a', encoding='utf-8') as file:
        for aid in aids:
            # NOTE(review): the endpoint prefix is missing here - the URL is
            # just the aid. Fill in the real API URL before running.
            url = f'{aid}'
            try:
                response = requests.get(
                    url=url, headers=headers, timeout=REQUEST_TIMEOUT
                )
                payload = response.json()
            except (requests.RequestException, ValueError) as e:
                print(f"Request error for aid {aid}: {e}")
                continue

            # ``or []`` also covers an explicit ``"data": null`` in the answer,
            # which ``.get('data', [])`` would pass through as None.
            for item in payload.get('data') or []:
                cid = item['cid']
                file.write(f"{cid}\n")
                count += 1
                print(f"Processed: {count} CIDs")

    remove_duplicates(cid_file_path)


def fetch_danmu(cid_file='cid.txt', output_file='comment.txt',
                request_headers=None, date='2024-08-31'):
    """Download the comments for every cid and append their Chinese text.

    For each cid one request is made; every run of Chinese characters found
    in the response body is written on its own line to ``output_file``.
    A failed request is reported and that cid is skipped.

    Args:
        cid_file (str): Text file with one cid per line.
        output_file (str): File the extracted text is appended to (UTF-8).
        request_headers (dict | None): HTTP headers; defaults to the
            module-level ``headers``.
        date (str): Value of the ``date`` query parameter.
    """
    print("开始爬取弹幕")
    if request_headers is None:
        request_headers = headers

    with open(cid_file, 'r', encoding='utf-8') as file:
        cids = [line.strip() for line in file if line.strip()]

    # Open the output once instead of re-opening it for every cid.
    with open(output_file, mode='a', encoding='utf-8') as out:
        for cid in cids:
            # NOTE(review): the endpoint prefix is missing here as well.
            url = f'{cid}&date={date}'
            try:
                response = requests.get(
                    url=url, headers=request_headers, timeout=REQUEST_TIMEOUT
                )
            except requests.RequestException as e:
                print(f"Request error for cid {cid}: {e}")
                continue

            # Force UTF-8 so the ideograph regex sees correctly decoded text.
            response.encoding = 'utf-8'
            content = '\n'.join(CHINESE_TEXT_RE.findall(response.text))
            out.write(content + '\n')


# Phrases that mark an error / access-denied page rather than a real comment.
keywords_to_remove = [
    '出错啦',
    '错误号',
    '由于触发哔哩哔哩安全风控策略',
    '该次访问请求被拒绝',
]

# One alternation matching any of the phrases above, escaped literally.
pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords_to_remove))


def clean_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` without the lines matching ``pattern``.

    Args:
        input_file (str): UTF-8 text file to filter.
        output_file (str): Destination file; overwritten if it exists.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(line for line in infile if not pattern.search(line))


def analyze_keywords_in_comments(comments_file, keywords_file, output_excel_file):
    """Count in how many comment lines each keyword occurs and export the result.

    A keyword is counted at most once per comment line (substring match).
    The full table is written to ``output_excel_file`` and the eight most
    frequent keywords are printed.

    Args:
        comments_file (str): UTF-8 text file, one comment per line.
        keywords_file (str): UTF-8 text file, one keyword per line.
        output_excel_file (str): Path of the Excel file to create.
    """
    with open(comments_file, 'r', encoding='utf-8') as file:
        comments = file.readlines()

    with open(keywords_file, 'r', encoding='utf-8') as file:
        # Skip blank lines: '' is a substring of every comment and would be
        # counted once per line. dict.fromkeys drops repeated keywords so
        # they are not double-counted.
        keywords = list(dict.fromkeys(
            line.strip() for line in file if line.strip()
        ))

    tech_counts = Counter(
        keyword
        for comment in comments
        for keyword in keywords
        if keyword in comment
    )

    df = pd.DataFrame(list(tech_counts.items()), columns=['AI Technology', 'Count'])
    df.to_excel(output_excel_file, index=False)

    top_8 = df.sort_values(by='Count', ascending=False).head(8)
    print(top_8)


def _load_stopwords(file_path):
    """Return the lines of the UTF-8 file ``file_path`` as a set of stop words."""
    with open(file_path, encoding='utf-8') as f:
        return set(f.read().splitlines())


def generate_wordcloud(text_file, stopwords_file, output_image_file,
                       mask_image_file=None, font_path='msyh.ttc'):
    """Segment ``text_file`` with jieba and render a word cloud image.

    Args:
        text_file (str): UTF-8 text to visualise.
        stopwords_file (str): UTF-8 file with one stop word per line; these
            tokens are removed before rendering.
        output_image_file (str): Path of the image to write.
        mask_image_file (str | None): Optional image used as the cloud's
            shape mask.
        font_path (str): Font used for rendering; it has to contain CJK
            glyphs for Chinese text to show up.
    """
    stopwords = _load_stopwords(stopwords_file)

    with open(text_file, encoding='utf-8') as f:
        txt = f.read()

    filtered_words = [word for word in jieba.lcut(txt) if word not in stopwords]
    word_string = ' '.join(filtered_words)

    wc_kwargs = {
        'width': 700,
        'height': 700,
        'background_color': 'white',
        'font_path': font_path,
    }

    if mask_image_file:
        # Pillow is imported locally because only this branch needs it; it is
        # a dependency of the ``wordcloud`` package.
        from PIL import Image

        with Image.open(mask_image_file) as img:
            mask_image = np.array(img.convert('L'))  # 8-bit greyscale
        # Binarise: pure black stays 0, every other grey level becomes 255.
        mask = np.where(mask_image == 0, 0, 255).astype(np.uint8)
        wc_kwargs.update({
            'mask': mask,
            'contour_color': 'white',
            'contour_width': 0,
        })

    wc = wordcloud.WordCloud(**wc_kwargs)
    wc.generate(word_string)
    wc.to_file(output_image_file)


# Replace with the actual URLs to query.
urls = [
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
    '',
]

# NOTE(review): the cookie below looks like a real logged-in session
# (SESSDATA / bili_jct). It should be rotated and loaded from the environment
# or a local config file instead of living in source control.
headers = {
    'cookie':'buvid4=686CE350-75FA-4921-C069-8D0E582FF02993159-024082507-y91msXDi8JTSAtvVtdhJkQ%3D%3D; buvid3=313C6A34-4C14-0939-EBE8-332F809D2EF655028infoc; b_nut=1725087454; CURRENT_FNVAL=4048; _uuid=10E7EC991-7B18-9A8B-78AA-C95F55102347103610infoc; rpdid=|(JlklRl)~Y|0J\'u~kl|)~l|l; header_theme_version=CLOSE; enable_web_push=DISABLE; is-2022-channel=1; fingerprint=f90b71618c196fb8806f458403d943fb; buvid_fp_plain=undefined; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU4Njk1NzEsImlhdCI6MTcyNTYxMDMxMSwicGx0IjotMX0.x0CsQ6o6lx4IcK82uHYJjDq_WMedyzoqa081au5YPug; bili_ticket_expires=1725869511; bp_t_offset_1074062089=974427929414991872; buvid_fp=f90b71618c196fb8806f458403d943fb; SESSDATA=e74a05df%2C1741267229%2Ce876a%2A91CjDqLgub8fAVML6ADiSzb56IvMh3z61KnSnawN0g_c1h5emTp3cU9qrpFxgDEzzpawASVkpfc01rblFpaUxDRHViNXpJdGhweEdNY2VDdEJ0N1hvMU92SWdLcG5Dclg5dlZmV29aMWZfX2ZSWHJ5VVN3ZHRkc0ZaLU9COHdmeDR2T0tmSXlvdmt3IIEC; bili_jct=addb604342937a4322aa12322c11bc2c; DedeUserID=3546758143544046; DedeUserID__ckMd5=65316417021aa6ed; sid=7yti0jp9; b_lsid=D810C241D_191CEE2FE76; bsource=search_bing; home_feed_column=5; browser_resolution=1455-699',  # noqa: E501
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36 Edg/'  # noqa: E501
}


def main():
    """Run the whole pipeline: ids -> cids -> comments -> cleaning -> stats -> word cloud."""
    # Collect the video aids.
    process_urls(urls, headers)
    print("获取视频aid完毕")

    # Resolve the aids to cids (de-duplicated in place).
    process_aid_and_cid('aid.txt', 'cid.txt', headers)
    print("将aid转换成cid完毕,cid去重完成,结果已写回文件。")

    # Download the comments.
    fetch_danmu()
    print("弹幕爬取完成。")

    # Drop error-page lines from the raw comments.
    print('开始清洗弹幕')
    clean_file('comment.txt', 'cleaned_comment.txt')
    print("弹幕清洗完毕")

    # Keyword statistics.
    print('开始数据统计')
    analyze_keywords_in_comments('cleaned_comment.txt', 'keywords.txt', 'ai_technologies_count.xlsx')

    # Word cloud.
    print("开始构建词云图")
    generate_wordcloud('cleaned_comment.txt', 'stopwords.txt', '词云.png', 'img.png')
    print("构建词云图完毕")


if __name__ == "__main__":
    main()