# pachong/组合.py

from audioop import avgpp
import requests
import time
import re, os
import jieba
from wordcloud import WordCloud
from imageio import imread
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
import jieba
import numpy as np
# Fetch the danmaku (bullet comments) for a cid
def spider_page(cid):
    """Download the danmaku XML for one cid and append its comments to a file.

    Requests ``http://comment.bilibili.com/<cid>.xml``, extracts the text of
    every ``<d p="...">...</d>`` element with a regular expression, and appends
    each one as its own line to the file named by the module-level
    ``comment_file_path``. Each saved comment is also echoed to stdout.

    Args:
        cid: Value interpolated into the comment-XML URL.

    Returns:
        None. Nothing is written when the response status is not 200.

    Raises:
        requests.RequestException: If the request fails or exceeds the timeout.
    """
    # Kept on one line: a replacement field split across lines inside a
    # single-quoted f-string is a SyntaxError before Python 3.12.
    url = f'http://comment.bilibili.com/{cid}.xml'
    headers = {
        'referer': 'xxxxx',
        'User-Agent': 'xxxxx',
        'cookie': "xxxxx"
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding
    print(resp.text)
    if resp.status_code != 200:
        print(f"请求失败，状态码: {resp.status_code}")
        return
    # Collect the text of every danmaku element
    content_list = re.findall('<d p=".*?">(.*?)</d>', resp.text)
    # Open the output once instead of re-opening it for every single comment.
    with open(comment_file_path, 'a', encoding='utf-8') as fin:
        for item in content_list:
            fin.write(item + '\n')
            print(item)
    print('-------------弹幕获取完毕！-------------')
# Find the cid
def extract_cid_number(text):
    """Return the digits following the first ``cid:`` in *text*, or None.

    The digits are returned as a string (the first capture group of the
    pattern); None is returned when *text* contains no ``cid:<digits>``.
    """
    found = re.search(r'cid:(\d+)', text)
    return found.group(1) if found else None
# Scrape BV ids from the search API: 10 result pages of 30 entries each.
# NOTE(review): the cookie below embeds session credentials (SESSDATA,
# bili_jct) directly in source; consider loading it from configuration.
# NOTE(review): 'w_rid' and 'wts' are fixed values, presumably a request
# signature and timestamp captured from a browser session -- verify that the
# API still accepts them.
url = ("https://api.bilibili.com/x/web-interface/wbi/search/type")
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
    'cookie':"buvid4=1AD93210-06E5-365A-934E-75A291B6908009721-022061720-eVf9MUhvco5nm9yolfHaMw%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO1416573595037363; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkYuu|l0J'u~|J~Yu|)J; DedeUserID=355329678; DedeUserID__ckMd5=5e429bd71d91fb47; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; _uuid=6FF33841-3A610-8313-10ED8-3AE107622425A15607infoc; buvid3=3FA03EE9-E5EA-3463-0D0B-073E3807CA0620579infoc; b_nut=1719137316; fingerprint=d1e7fdeb59c00ae7dcb9837e214f09e9; hit-dyn-v2=1; CURRENT_QUALITY=112; buvid_fp=d1e7fdeb59c00ae7dcb9837e214f09e9; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MjgwODUsImlhdCI6MTcyNjU2ODgyNSwicGx0IjotMX0.d1aocPfSyHx3EgbchHgcdqtxN94d5619BcEPlXsG5Jw; bili_ticket_expires=1726828025; SESSDATA=8b1a6c86%2C1742204953%2C96da2%2A92CjBnKA1rMd20hx2CampWCl9FHGYIlzAI-IoQMscqxdn8LQwXeAP7HKkOso7RLRlh4CYSVkZQUFBoQ2RzNno5QVRPcEU2LXdrZG9Qd0luRE11STBlbmoyN1l6cl91cTM1Wk9FclU2cDB4NDFnYVNIQ01Fai1qS2dRd0xGLWFIbUNkNXV2elowWGhBIIEC; bili_jct=72a134de4233a184cb823906d856daeb; b_lsid=79C7A3B2_19204B808B2; bsource=search_bing; home_feed_column=5; browser_resolution=1659-836; bp_t_offset_355329678=978470992584114176"
}
# Built once; only 'page' is updated per request inside the loop.
params = {
    'category_id': '',
    'search_type': 'video',
    'ad_resource': 5654,
    '__refresh__': 'true',
    'context': '',
    'page': 1,
    'page_size': 30,
    'pubtime_begin_s': 0,
    'pubtime_end_s': 0,
    'from_source': '',
    'from_spmid': '333.337',
    'platform': 'pc',
    'highlight': 1,
    'single_column': 0,
    'keyword': '2024巴黎奥运会',
    'qv_id': 's5p9ZoGL8W8aU7TP3gWJ1xLzPA6njovt',
    'source_tag': 3,
    'gaia_vtoken': '',
    'dynamic_offset': 24,
    'web_location': 1430654,
    'w_rid': '5475898fdbb1cc8a359f93dd5826e3f9',
    'wts': 1726221162
}
# Append mode: ids from earlier runs are kept in the file.
with open('bvnumbers.txt', 'a', encoding='utf-8') as file:
    for page in range(1, 11):
        params['page'] = page
        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException as e:
            # One failed page should not abort the remaining pages.
            print(f"请求第 {page} 页时发生错误: {e}")
            time.sleep(2)
            continue
        print(response.text)
        if response.status_code != 200:
            print(f"请求失败，状态码: {response.status_code}")
        # \w already includes digits, so it matches the same ids as [\d\w].
        for i in re.finditer(r'"bvid":"(BV\w{10})"', response.text):
            file.write(i.group(1) + '\n')  # one BV id per line
        response.close()
        time.sleep(2)  # pause between search requests
comment_file_path = 'B站弹幕.csv'  # danmaku output file, appended to by spider_page
output_file = 'cids.txt'  # file that stores the cids
api_base_url = "https://api.bilibili.com/x/player/pagelist?bvid="
# NOTE(review): the cookie below embeds session credentials (SESSDATA,
# bili_jct) directly in source; consider loading it from configuration.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)',
    'cookie':"buvid4=1AD93210-06E5-365A-934E-75A291B6908009721-022061720-eVf9MUhvco5nm9yolfHaMw%3D%3D; buvid_fp_plain=undefined; LIVE_BUVID=AUTO1416573595037363; CURRENT_FNVAL=4048; header_theme_version=CLOSE; enable_web_push=DISABLE; PVID=1; rpdid=|(u))kkYuu|l0J'u~|J~Yu|)J; DedeUserID=355329678; DedeUserID__ckMd5=5e429bd71d91fb47; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; _uuid=6FF33841-3A610-8313-10ED8-3AE107622425A15607infoc; buvid3=3FA03EE9-E5EA-3463-0D0B-073E3807CA0620579infoc; b_nut=1719137316; fingerprint=d1e7fdeb59c00ae7dcb9837e214f09e9; hit-dyn-v2=1; CURRENT_QUALITY=112; buvid_fp=d1e7fdeb59c00ae7dcb9837e214f09e9; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY4MjgwODUsImlhdCI6MTcyNjU2ODgyNSwicGx0IjotMX0.d1aocPfSyHx3EgbchHgcdqtxN94d5619BcEPlXsG5Jw; bili_ticket_expires=1726828025; SESSDATA=8b1a6c86%2C1742204953%2C96da2%2A92CjBnKA1rMd20hx2CampWCl9FHGYIlzAI-IoQMscqxdn8LQwXeAP7HKkOso7RLRlh4CYSVkZQUFBoQ2RzNno5QVRPcEU2LXdrZG9Qd0luRE11STBlbmoyN1l6cl91cTM1Wk9FclU2cDB4NDFnYVNIQ01Fai1qS2dRd0xGLWFIbUNkNXV2elowWGhBIIEC; bili_jct=72a134de4233a184cb823906d856daeb; bp_t_offset_355329678=978467990401974272; b_lsid=79C7A3B2_19204B808B2; bsource=search_bing; home_feed_column=5; browser_resolution=1659-836"
}
# The search step of this script writes 'bvnumbers.txt'; nothing here creates
# 'extracted_bv_numbers.txt'. Prefer the latter when it exists (presumably
# produced by a separate clean-up step) and fall back to the scraper output.
bv_list_path = 'extracted_bv_numbers.txt'
if not os.path.exists(bv_list_path):
    bv_list_path = 'bvnumbers.txt'
with open(bv_list_path, 'r', encoding='utf-8') as file:
    # Skip blank lines and drop repeated ids (order preserved) so the same
    # video's danmaku are not downloaded and counted more than once.
    bvnumbers = list(dict.fromkeys(
        line.strip() for line in file.read().splitlines() if line.strip()
    ))
# 'w' truncates any previous cid list; every cid found below is recorded.
with open(output_file, 'w', encoding='utf-8') as file:
    for bvid in bvnumbers:  # fetch the danmaku of every video in turn
        api_url = f"{api_base_url}{bvid}"
        try:
            response = requests.get(api_url, headers=headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                # Only proceed when the payload has the expected shape:
                # a mapping whose 'data' entry is a list of page records.
                pages = data.get('data') if isinstance(data, dict) else None
                if isinstance(pages, list):
                    for page in pages:
                        cid = page.get('cid')
                        if cid is not None:
                            file.write(f"{cid}\n")
                            spider_page(cid)
            else:
                print(f"请求失败，状态码: {response.status_code}")
            time.sleep(1)
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next id.
            print(f"处理BV号 {bvid} 时发生错误: {e}")
        time.sleep(1)  # pause between videos
print("所有BV号已处理完毕。")
# Keywords related to AI technology applied at the Paris 2024 Olympic events.
# Matching is a plain case-sensitive substring test.
ai_olympic_keywords = [
    "巴黎奥运会AI", "AI技术", "智能", "子弹时间", "AI增强", "赛事转播", "辅助训练", "AI回放系统",
    "3d模型", "沉浸式虚拟重建","AI重塑","运动捕捉","特效渲染","AI修复","AI","人工智能"
]
# Maps each keyword-matching danmaku to the number of times it occurs
danmu_counts = Counter()
output_file_path = '包含关键词的弹幕.txt'
input_file_path = 'B站弹幕.csv'
# Single pass over the danmaku file: every line containing at least one
# keyword is both counted and copied to the output file. The output handle
# gets its own name so the module-level 'output_file' path is not rebound.
with open(input_file_path, mode='r', encoding='utf-8') as input_file, \
     open(output_file_path, mode='w', encoding='utf-8') as matched_file:
    for line in input_file:
        danmu = line.strip()  # drop the trailing newline and outer whitespace
        if any(keyword in danmu for keyword in ai_olympic_keywords):
            danmu_counts[danmu] += 1
            matched_file.write(danmu + '\n')  # one danmaku per line
# The 8 most frequent keyword-matching danmaku, as (text, count) pairs
top_8_danmus = danmu_counts.most_common(8)
# Save the texts (without the counts) to a file
with open('top_ai_olympic_danmus.txt', 'w', encoding='utf-8') as output:
    output.writelines(f"{danmu}\n" for danmu, _ in top_8_danmus)
# Read the keyword-matching danmaku; 'with' closes the handle afterwards.
with open('包含关键词的弹幕.txt', encoding='utf-8') as f:
    text = f.read()
# Segment the text into words with jieba
text_list = jieba.lcut(text)
print(text_list)
# Join with spaces so the segmented words stay separate tokens; joining with
# '' would just rebuild the original unsegmented text.
text_str = ' '.join(text_list)
print(text_str)
# Generate the word-cloud image
# NOTE(review): the font path is Windows-specific; the raw string avoids
# invalid escape sequences such as '\W' while keeping the same value.
wc = WordCloud(
    width=2000,
    height=1000,
    background_color='white',
    font_path=r'C:\Windows\Fonts\SIMLI.TTF',
)
if text_str.strip():
    wc.generate(text_str)
    wc.to_file('ciyun.png')
else:
    # No matching danmaku means there is nothing to draw.
    print("没有包含关键词的弹幕，跳过词云生成。")
# Print the result
print("出现次数最多的前8条关于2024巴黎奥运会赛事应用AI技术的弹幕:")
for danmu, count in top_8_danmus:
    print(f"'{danmu}' 出现了 {count} 次")