"""Scrape Bilibili danmaku (bullet comments) for a search keyword, then
clean them, count keyword occurrences, and render a word cloud.

Pipeline (driven by main()):
    search API pages -> aid.txt -> cid.txt -> comment.txt
    -> cleaned_comment.txt -> ai_technologies_count.xlsx + 词云.png
"""

import re
from collections import Counter

import requests
import pandas as pd
import jieba
import wordcloud


def extract_ids_from_url(url, head, output_file='aid.txt'):
    """Fetch one search-API page and append each result's video id (aid)
    to *output_file*, one id per line.

    Parameters:
        url (str): search API URL to request.
        head (dict): HTTP headers (cookie + user-agent) for the request.
        output_file (str): path collecting the extracted ids, default 'aid.txt'.
    """
    try:
        response = requests.get(url=url, headers=head)
        # Surface non-2xx statuses as exceptions instead of parsing error HTML.
        response.raise_for_status()

        data = response.json()

        # Expected shape: {"data": {"result": [{"id": ...}, ...], ...}, ...}
        if 'data' in data and 'result' in data['data']:
            items = data['data']['result']
            ids = [item['id'] for item in items]

            # Append mode so ids from several pages accumulate in one file.
            with open(output_file, 'a') as file:
                for aid in ids:
                    file.write(f"{aid}\n")

            print(f"IDs have been saved to {output_file}")
        else:
            print("Unexpected response format")

    except requests.RequestException as e:
        print(f"Request error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


def process_urls(urls1, headers1, output_file='aid.txt'):
    """Run extract_ids_from_url for every URL in *urls1*.

    Parameters:
        urls1 (list): search API URLs to process.
        headers1 (dict): HTTP headers used for every request.
        output_file (str): file collecting all extracted ids.
    """
    for url in urls1:
        extract_ids_from_url(url, headers1, output_file)


def process_aid_and_cid(aid_file_path, cid_file_path, headers):
    """Resolve every aid in *aid_file_path* to its page cids via the
    pagelist API, append them to *cid_file_path*, then de-duplicate that
    file in place (first occurrence wins, original order preserved).
    """
    with open(aid_file_path, 'r') as file:
        aids = [line.strip() for line in file if line.strip()]

    count = 0
    with open(cid_file_path, 'a') as file:
        for aid in aids:
            url = f'https://api.bilibili.com/x/player/pagelist?aid={aid}'
            response = requests.get(url=url, headers=headers).json()
            # BUG FIX: the API returns "data": null on errors; `or []`
            # prevents iterating over None.
            for item in response.get('data', []) or []:
                cid = item['cid']
                file.write(f"{cid}\n")
                count += 1
                print(f"Processed: {count} CIDs")

    def remove_duplicates(file_path):
        # dict.fromkeys keeps insertion order, so the dedupe is order-stable.
        with open(file_path, 'r') as file:
            cids = [line.strip() for line in file if line.strip()]
        unique_cids = list(dict.fromkeys(cids))
        with open(file_path, 'w') as file:
            for cid in unique_cids:
                file.write(cid + '\n')

    # Remove duplicate cids (the same cid can appear under several aids).
    remove_duplicates(cid_file_path)


def fetch_danmu():
    """Download the historical danmaku segment for every cid in cid.txt and
    append every CJK text run found in the raw response to comment.txt.

    NOTE(review): the seg.so endpoint returns a protobuf payload; pulling
    runs of [\u4e00-\u9fa5] out of the raw text is a lossy shortcut that
    drops any non-Chinese danmaku — confirm this is acceptable.
    Relies on the module-level `headers` for the authentication cookie.
    """
    print("开始爬取弹幕")
    with open('cid.txt', 'r') as file:
        cids = [line.strip() for line in file if line.strip()]

    for cid in cids:
        url = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid={cid}&date=2024-08-31'
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        # Extract contiguous runs of CJK characters as danmaku text.
        content_list = re.findall('[\u4e00-\u9fa5]+', response.text)
        content = '\n'.join(content_list)
        with open('comment.txt', mode='a', encoding='utf-8') as f:
            f.write(content + '\n')


# Phrases identifying error / rate-limit pages rather than real danmaku.
keywords_to_remove = [
    '出错啦',
    '错误号',
    '由于触发哔哩哔哩安全风控策略',
    '该次访问请求被拒绝'
]

# Compiled once at module load: matches a line containing any phrase above.
pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords_to_remove))


def clean_file(input_file, output_file):
    """Copy *input_file* to *output_file*, dropping every line that matches
    the module-level error-page filter pattern."""
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            if not pattern.search(line):
                outfile.write(line)


def analyze_keywords_in_comments(comments_file, keywords_file, output_excel_file):
    """Count keyword occurrences across comments (one hit per comment line
    per keyword), write the full table to *output_excel_file* (Excel) and
    print the top 8 keywords."""
    with open(comments_file, 'r', encoding='utf-8') as file:
        comments = file.readlines()

    # BUG FIX: skip blank lines — an empty keyword '' is a substring of
    # every comment and would silently corrupt all counts.
    with open(keywords_file, 'r', encoding='utf-8') as file:
        keywords = [line.strip() for line in file if line.strip()]

    # Collect one entry per (comment, keyword) match.
    ai_technologies = []
    for comment in comments:
        for keyword in keywords:
            if keyword in comment:
                ai_technologies.append(keyword)

    tech_counts = Counter(ai_technologies)

    df = pd.DataFrame(tech_counts.items(), columns=['AI Technology', 'Count'])
    df.to_excel(output_excel_file, index=False)

    top_8 = df.sort_values(by='Count', ascending=False).head(8)
    print(top_8)


def generate_wordcloud(text_file, stopwords_file, output_image_file, mask_image_file=None, font_path='msyh.ttc'):
    """Render a word cloud image from *text_file*.

    Parameters:
        text_file (str): UTF-8 text to visualize.
        stopwords_file (str): newline-separated stopwords removed after
            jieba segmentation.
        output_image_file (str): output image path.
        mask_image_file (str | None): optional mask image constraining the
            cloud's shape; words are drawn only in its black regions.
        font_path (str): font file supporting CJK glyphs (default msyh.ttc).
    """
    def load_stopwords(file_path):
        with open(file_path, encoding='utf-8') as f:
            return set(f.read().strip().split('\n'))

    stopwords = load_stopwords(stopwords_file)

    with open(text_file, encoding='utf-8') as f:
        txt = f.read()

    # Segment Chinese text, then drop stopwords.
    words = jieba.lcut(txt)
    filtered_words = [word for word in words if word not in stopwords]
    word_string = ' '.join(filtered_words)

    wc_kwargs = {
        'width': 700,
        'height': 700,
        'background_color': 'white',
        'font_path': font_path,
    }

    if mask_image_file:
        # BUG FIX: np and Image were used here without ever being imported,
        # raising NameError whenever a mask was supplied. Both numpy and
        # Pillow are hard dependencies of the `wordcloud` package, so these
        # lazy imports add no new requirement.
        import numpy as np
        from PIL import Image

        # Grayscale the mask, then binarize: pure black stays 0 (words are
        # drawn there); everything else becomes 255 (masked out by wordcloud,
        # which treats white/255 regions as excluded).
        mask_image = np.array(Image.open(mask_image_file).convert('L'))
        mask = np.where(mask_image == 0, 0, 255).astype(np.uint8)

        wc_kwargs.update({
            'mask': mask,
            'contour_color': 'white',
            'contour_width': 0,
        })

    wc = wordcloud.WordCloud(**wc_kwargs)
    wc.generate(word_string)
    wc.to_file(output_image_file)


# Pre-signed search API URLs (pages 2-10 for keyword "2024巴黎奥运会");
# w_rid/wts are WBI signatures bound to the cookie below and expire.
urls = ['https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=10&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=324&web_location=1430654&w_rid=420b5e834d7dd54d76f4fba1b7b1e665&wts=1725152144',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=8&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=252&web_location=1430654&w_rid=7fdf1d4b3f7d534c993f50173d02de3f&wts=1725152135',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=7&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=216&web_location=1430654&w_rid=6749123b8b393589cc7c80c1e93ada58&wts=1725152132',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=6&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=180&web_location=1430654&w_rid=74f00cf5195a9ec7ef3d57e347704770&wts=1725152128',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=5&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=144&web_location=1430654&w_rid=e914e50a0da59031c553d631ac5f1fde&wts=1725152124',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=4&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=108&web_location=1430654&w_rid=c622f59f9e1360765b62f0e0bc858fa1&wts=1725152121',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=3&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=72&web_location=1430654&w_rid=a60e99a470fa19919a071c865dd1583f&wts=1725152115',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=2&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=36&web_location=1430654&w_rid=8fc24d10311ce5e5730a84daadbbb6b3&wts=1725152102',
        'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=9&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=288&web_location=1430654&w_rid=a9cbd6c813f6d27561d5f0d583c0ed76&wts=1725153457',
        ]  # Replace with actual URLs

# SECURITY NOTE(review): this cookie embeds live session credentials
# (SESSDATA, bili_jct, DedeUserID). They should be loaded from an
# environment variable or untracked config file, never committed to source.
headers = {
    'cookie': 'buvid4=686CE350-75FA-4921-C069-8D0E582FF02993159-024082507-y91msXDi8JTSAtvVtdhJkQ%3D%3D; buvid3=313C6A34-4C14-0939-EBE8-332F809D2EF655028infoc; b_nut=1725087454; CURRENT_FNVAL=4048; _uuid=10E7EC991-7B18-9A8B-78AA-C95F55102347103610infoc; rpdid=|(JlklRl)~Y|0J\'u~kl|)~l|l; header_theme_version=CLOSE; enable_web_push=DISABLE; is-2022-channel=1; fingerprint=f90b71618c196fb8806f458403d943fb; buvid_fp_plain=undefined; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU4Njk1NzEsImlhdCI6MTcyNTYxMDMxMSwicGx0IjotMX0.x0CsQ6o6lx4IcK82uHYJjDq_WMedyzoqa081au5YPug; bili_ticket_expires=1725869511; bp_t_offset_1074062089=974427929414991872; buvid_fp=f90b71618c196fb8806f458403d943fb; SESSDATA=e74a05df%2C1741267229%2Ce876a%2A91CjDqLgub8fAVML6ADiSzb56IvMh3z61KnSnawN0g_c1h5emTp3cU9qrpFxgDEzzpawASVkpfc01rblFpaUxDRHViNXpJdGhweEdNY2VDdEJ0N1hvMU92SWdLcG5Dclg5dlZmV29aMWZfX2ZSWHJ5VVN3ZHRkc0ZaLU9COHdmeDR2T0tmSXlvdmt3IIEC; bili_jct=addb604342937a4322aa12322c11bc2c; DedeUserID=3546758143544046; DedeUserID__ckMd5=65316417021aa6ed; sid=7yti0jp9; b_lsid=D810C241D_191CEE2FE76; bsource=search_bing; home_feed_column=5; browser_resolution=1455-699',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0'
}


def main():
    """Run the full pipeline: aids -> cids -> danmaku -> clean -> stats -> cloud."""
    # 1) Search pages -> video aids.
    process_urls(urls, headers)
    print("获取视频aid完毕")
    # 2) aids -> cids (deduplicated in place).
    process_aid_and_cid('aid.txt', 'cid.txt', headers)
    print("将aid转换成cid完毕,cid去重完成,结果已写回文件。")
    # 3) cids -> raw danmaku text.
    fetch_danmu()
    print("弹幕爬取完成。")
    # 4) Drop error/denial lines.
    print('开始清洗弹幕')
    clean_file('comment.txt', 'cleaned_comment.txt')
    print("弹幕清洗完毕")
    # 5) Keyword statistics -> Excel.
    print('开始数据统计')
    analyze_keywords_in_comments('cleaned_comment.txt', 'keywords.txt', 'ai_technologies_count.xlsx')
    # 6) Word-cloud image, shaped by the img.png mask.
    print("开始构建词云图")
    generate_wordcloud('cleaned_comment.txt', 'stopwords.txt', '词云.png', 'img.png')
    print("构建词云图完毕")


if __name__ == "__main__":
    main()