Primary tasks

11 months ago · dd322771a6
parent 3af668f091
commit dd322771a6
1 changed files with 286 additions and 0 deletions
--- a/comments).py
+++ b/comments).py
@@ -0,0 +1,286 @@
+import os
+import re
+from collections import Counter
+
+import jieba
+import pandas as pd
+import requests
+import wordcloud
+
+def extract_ids_from_url(url, head, output_file='aid.txt', timeout=10):
+    """
+    Request a URL, pull the IDs out of its JSON response and append them to a file.
+
+    The response is expected to look like {"data": {"result": [{"id": ...}, ...]}}.
+    Problems are reported with print() instead of being raised, so a caller that
+    loops over many URLs keeps going after a failure.
+
+    Parameters:
+    url (str): URL to request.
+    head (dict): Headers sent with the HTTP request.
+    output_file (str): File the extracted IDs are appended to, one per line.
+        Defaults to 'aid.txt'.
+    timeout (float): Seconds to wait for the server before giving up. requests
+        applies no timeout unless one is passed, so a stalled connection could
+        otherwise hang the whole run.
+    """
+    try:
+        response = requests.get(url=url, headers=head, timeout=timeout)
+        # Raise requests.HTTPError for 4xx/5xx status codes
+        response.raise_for_status()
+
+        data = response.json()
+
+        # Only a dict-in-dict payload can hold the result list; anything else
+        # (missing key, "data": null, a top-level list) is a format problem
+        payload = data.get('data') if isinstance(data, dict) else None
+        if not isinstance(payload, dict) or 'result' not in payload:
+            print("Unexpected response format")
+            return
+
+        ids = [item['id'] for item in payload['result']]
+
+        # Append mode: IDs written for earlier URLs (and earlier runs) are kept
+        with open(output_file, 'a', encoding='utf-8') as file:
+            file.writelines(f"{aid}\n" for aid in ids)
+
+        print(f"IDs have been saved to {output_file}")
+
+    except requests.RequestException as e:
+        # Connection problems, timeouts and bad HTTP status codes
+        print(f"Request error: {e}")
+    except KeyError as e:
+        # A result entry without an 'id' field
+        print(f"Key error: {e}")
+    except Exception as e:
+        # Last-resort boundary so one bad page does not stop the caller's loop
+        print(f"An error occurred: {e}")
+
+
+def process_urls(urls1, headers1, output_file='aid.txt'):
+    """
+    Hand every URL of a list to extract_ids_from_url, one after another.
+
+    Parameters:
+    urls1 (list): URLs to process.
+    headers1 (dict): Headers sent with each HTTP request.
+    output_file (str): File the extracted IDs are stored in. Defaults to 'aid.txt'.
+    """
+    for page_url in urls1:
+        extract_ids_from_url(url=page_url, head=headers1, output_file=output_file)
+
+
+
+
+def process_aid_and_cid(aid_file_path, cid_file_path, headers, timeout=10):
+    """
+    Look up the cids of every aid listed in a file and store them without duplicates.
+
+    Each non-blank line of aid_file_path is sent to the pagelist endpoint; every
+    'cid' found in the response's 'data' list is appended to cid_file_path.
+    Afterwards the whole cid file (including lines that were already there) is
+    rewritten with duplicates removed, keeping the first occurrence of each cid.
+
+    Parameters:
+    aid_file_path (str): File with one aid per line.
+    cid_file_path (str): File the cids are appended to, one per line.
+    headers (dict): Headers sent with each HTTP request.
+    timeout (float): Seconds to wait for each response before giving up.
+    """
+    with open(aid_file_path, 'r', encoding='utf-8') as file:
+        aids = [line.strip() for line in file if line.strip()]
+
+    count = 0
+    with open(cid_file_path, 'a', encoding='utf-8') as file:
+        for aid in aids:
+            url = f'https://api.bilibili.com/x/player/pagelist?aid={aid}'
+            try:
+                response = requests.get(url=url, headers=headers, timeout=timeout).json()
+            except (requests.RequestException, ValueError) as e:
+                # A failed or non-JSON response only costs this aid, not the whole batch
+                print(f"Request error for aid {aid}: {e}")
+                continue
+
+            # 'data' can be absent or null; `or []` treats both as "no entries"
+            for item in response.get('data') or []:
+                cid = item['cid']
+                file.write(f"{cid}\n")
+                count += 1
+                # Progress output
+                print(f"Processed: {count} CIDs")
+
+    # De-duplicate the cid file in place; dict.fromkeys keeps first-seen order
+    with open(cid_file_path, 'r', encoding='utf-8') as file:
+        cids = [line.strip() for line in file if line.strip()]
+    unique_cids = list(dict.fromkeys(cids))
+    with open(cid_file_path, 'w', encoding='utf-8') as file:
+        file.writelines(cid + '\n' for cid in unique_cids)
+
+
+
+
+
+
+def fetch_danmu(cid_file='cid.txt', output_file='comment.txt', date='2024-08-31',
+                request_headers=None, timeout=10):
+    """
+    Download the bullet comments (danmu) for every cid in a file.
+
+    For each cid the history endpoint is requested for the given date. Every run
+    of characters in the range U+4E00..U+9FA5 found in the response text is
+    written to output_file, one run per line.
+
+    Parameters:
+    cid_file (str): File with one cid per line. Defaults to 'cid.txt'.
+    output_file (str): File the extracted text is appended to. Defaults to
+        'comment.txt'.
+    date (str): Value of the request's `date` query parameter. Defaults to
+        '2024-08-31'.
+    request_headers (dict): Headers sent with each HTTP request. When omitted,
+        the module-level `headers` dict is used.
+    timeout (float): Seconds to wait for each response before giving up.
+    """
+    if request_headers is None:
+        # Same source of headers as before this became a parameter
+        request_headers = headers
+
+    print("开始爬取弹幕")
+    with open(cid_file, 'r', encoding='utf-8') as file:
+        cids = [line.strip() for line in file if line.strip()]
+
+    # Compiled once instead of once per response
+    chinese_run = re.compile('[\u4e00-\u9fa5]+')
+
+    # Opened once for the whole loop rather than once per cid
+    with open(output_file, mode='a', encoding='utf-8') as f:
+        for cid in cids:
+            url = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid={cid}&date={date}'
+            try:
+                response = requests.get(url=url, headers=request_headers, timeout=timeout)
+            except requests.RequestException as e:
+                # Skip this cid but keep downloading the rest
+                print(f"Request error for cid {cid}: {e}")
+                continue
+            response.encoding = 'utf-8'
+            content = '\n'.join(chinese_run.findall(response.text))
+            f.write(content + '\n')
+
+
+
+
+# Keywords or phrases to filter out; a line containing any of them is dropped
+keywords_to_remove = [
+    '出错啦',
+    '错误号',
+    '由于触发哔哩哔哩安全风控策略',
+    '该次访问请求被拒绝',
+]
+
+# Single alternation of the escaped keywords, compiled once for reuse
+pattern = re.compile('|'.join(map(re.escape, keywords_to_remove)))
+
+
+def clean_file(input_file, output_file):
+    """
+    Copy input_file to output_file, leaving out every line that matches `pattern`.
+
+    Parameters:
+    input_file (str): UTF-8 text file to read.
+    output_file (str): UTF-8 text file to write; existing content is overwritten.
+    """
+    with open(input_file, 'r', encoding='utf-8') as src:
+        with open(output_file, 'w', encoding='utf-8') as dst:
+            dst.writelines(line for line in src if pattern.search(line) is None)
+
+
+
+
+
+def analyze_keywords_in_comments(comments_file, keywords_file, output_excel_file, top_n=8):
+    """
+    Count in how many comment lines each keyword occurs and export the result.
+
+    A keyword scores one point for every line of comments_file that contains it
+    as a substring. The full table is written to an Excel file and the top_n
+    rows with the highest counts are printed.
+
+    Parameters:
+    comments_file (str): UTF-8 text file, one comment per line.
+    keywords_file (str): UTF-8 text file, one keyword per line.
+    output_excel_file (str): Path of the Excel file to write.
+    top_n (int): Number of highest-count rows to print. Defaults to 8.
+    """
+    with open(comments_file, 'r', encoding='utf-8') as file:
+        comments = file.readlines()
+
+    with open(keywords_file, 'r', encoding='utf-8') as file:
+        # Blank lines are skipped: '' is a substring of every comment and would
+        # produce a bogus row. dict.fromkeys drops repeated keywords (which
+        # would be counted twice) while keeping file order.
+        keywords = list(dict.fromkeys(line.strip() for line in file if line.strip()))
+
+    # Keys appear in the order in which keywords are first seen in the comments
+    tech_counts = Counter()
+    for comment in comments:
+        tech_counts.update(keyword for keyword in keywords if keyword in comment)
+
+    df = pd.DataFrame(list(tech_counts.items()), columns=['AI Technology', 'Count'])
+
+    df.to_excel(output_excel_file, index=False)
+
+    # Highest counts first
+    top_rows = df.sort_values(by='Count', ascending=False).head(top_n)
+
+    print(top_rows)
+
+
+
+
+
+
+def generate_wordcloud(text_file, stopwords_file, output_image_file, font_path='msyh.ttc'):
+    """
+    Build a word cloud image from a text file.
+
+    The text is segmented with jieba, tokens found in the stopword list are
+    dropped, and the remaining tokens are joined with spaces and passed to a
+    700x700 WordCloud with a white background.
+
+    Parameters:
+    text_file (str): UTF-8 text file to visualise.
+    stopwords_file (str): UTF-8 file with one stopword per line.
+    output_image_file (str): Path the image is saved to.
+    font_path (str): Font file handed to WordCloud. Defaults to 'msyh.ttc'.
+    """
+    # Stopwords: one per line, collected into a set
+    with open(stopwords_file, encoding='utf-8') as handle:
+        stopwords = set(handle.read().strip().split('\n'))
+
+    with open(text_file, encoding='utf-8') as handle:
+        raw_text = handle.read()
+
+    # Segment, drop stopwords and join the survivors into one string
+    word_string = ' '.join(
+        token for token in jieba.lcut(raw_text) if token not in stopwords
+    )
+
+    cloud = wordcloud.WordCloud(
+        width=700,
+        height=700,
+        background_color='white',
+        font_path=font_path,
+    )
+    cloud.generate(word_string)
+
+    # Write the rendered image to disk
+    cloud.to_file(output_image_file)
+
+
+
+
+
+
+# Search-result URLs that main() passes to process_urls(); each one is a single
+# page (page_size=42) of a video search whose URL-encoded keyword decodes to
+# "2024巴黎奥运会".
+# NOTE(review): pages 2-10 are listed and page 1 is absent - confirm that is intended.
+# NOTE(review): every URL carries its own w_rid/wts values, presumably a
+# per-request signature and timestamp - verify they are still accepted before reuse.
+urls = ['https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=10&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=324&web_location=1430654&w_rid=420b5e834d7dd54d76f4fba1b7b1e665&wts=1725152144',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=8&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=252&web_location=1430654&w_rid=7fdf1d4b3f7d534c993f50173d02de3f&wts=1725152135',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=7&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=216&web_location=1430654&w_rid=6749123b8b393589cc7c80c1e93ada58&wts=1725152132',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=6&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=180&web_location=1430654&w_rid=74f00cf5195a9ec7ef3d57e347704770&wts=1725152128',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=5&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=144&web_location=1430654&w_rid=e914e50a0da59031c553d631ac5f1fde&wts=1725152124',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=4&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=108&web_location=1430654&w_rid=c622f59f9e1360765b62f0e0bc858fa1&wts=1725152121',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=3&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=72&web_location=1430654&w_rid=a60e99a470fa19919a071c865dd1583f&wts=1725152115',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=2&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=36&web_location=1430654&w_rid=8fc24d10311ce5e5730a84daadbbb6b3&wts=1725152102',
+    'https://api.bilibili.com/x/web-interface/wbi/search/type?category_id=&search_type=video&ad_resource=5654&__refresh__=true&_extra=&context=&page=9&page_size=42&from_source=&from_spmid=333.337&platform=pc&highlight=1&single_column=0&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&qv_id=2vsSEmfb9hpudunbhxw9KMMxggdECjVp&source_tag=3&gaia_vtoken=&dynamic_offset=288&web_location=1430654&w_rid=a9cbd6c813f6d27561d5f0d583c0ed76&wts=1725153457',
+   ]  # replace with the actual URLs
+# HTTP headers sent with every request.
+# SECURITY: a logged-in session cookie is a credential and must not be kept in
+# source control, so it is read from the BILIBILI_COOKIE environment variable
+# (empty when the variable is unset). Any cookie value that was committed here
+# earlier should be treated as leaked and invalidated.
+headers = {
+    'cookie': os.environ.get('BILIBILI_COOKIE', ''),
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0',
+}
+
+def main():
+    """
+    Run the whole pipeline in order: collect aids, map them to cids, download
+    the bullet comments, clean them, count keywords and draw the word cloud.
+    """
+    raw_comments = 'comment.txt'
+    cleaned_comments = 'cleaned_comment.txt'
+
+    # Step 1: collect the video aids
+    process_urls(urls, headers)
+    print("获取视频aid完毕")
+
+    # Step 2: turn the aids into cids
+    process_aid_and_cid('aid.txt', 'cid.txt', headers)
+    print("将aid转换成cid完毕，cid去重完成，结果已写回文件。")
+
+    # Step 3: download the bullet comments
+    fetch_danmu()
+    print("弹幕爬取完成。")
+
+    # Step 4: filter unwanted lines out of the downloaded text
+    print('开始清洗弹幕')
+    clean_file(raw_comments, cleaned_comments)
+    print("弹幕清洗完毕")
+
+    # Step 5: keyword statistics
+    print('开始数据统计')
+    analyze_keywords_in_comments(cleaned_comments, 'keywords.txt', 'ai_technologies_count.xlsx')
+
+    # Step 6: word cloud image
+    print("开始构建词云图")
+    generate_wordcloud(cleaned_comments, 'stopwords.txt', '词云.png')
+    print("构建词云图完毕")
+
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+