"""Collect Bilibili danmaku (bullet comments) for videos found by a keyword search.

Workflow: page through the search results to gather BV ids, resolve each BV id
to its cid(s) through the player page-list API, download the danmaku XML of
every cid, and write one comment per line to ``danmu.txt``.

Authentication: set the ``BILIBILI_COOKIE`` environment variable to a browser
cookie string to send it with every request. Session cookies are credentials
and must not be committed to source control.
"""
import html
import os
import re
import time

import requests

# Seconds to sleep before each search-page request so the crawler is not blocked.
_SEARCH_DELAY_SECONDS = 2
# Upper bound for one HTTP request; without it a stalled connection hangs forever.
_REQUEST_TIMEOUT_SECONDS = 10
# Consecutive failures tolerated on one search page before paging gives up.
_MAX_PAGE_RETRIES = 3

_SEARCH_URL = "https://search.bilibili.com/all?vt=45910958&keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.788&search_source=3&page={page}"
_PAGELIST_URL = "https://api.bilibili.com/x/player/pagelist?bvid={bvid}"
_DANMAKU_URL = "https://comment.bilibili.com/{cid}.xml"

# A BV id is "BV" followed by ten alphanumerics; the word boundaries keep the
# pattern from matching unrelated identifiers embedded in the page source.
_BV_ID_RE = re.compile(r'\bBV[0-9A-Za-z]{10}\b')
_CID_RE = re.compile(r'"cid"\s*:\s*(\d+)')
# Each danmaku is serialized as <d p="...">text</d> in the XML payload.
_DANMAKU_RE = re.compile(r'<d\s[^>]*>(.*?)</d>')

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "Referer": "https://search.bilibili.com/all?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=3",
}

# The cookie is read from the environment instead of being hard-coded, because
# it carries session credentials (SESSDATA, bili_jct).
_cookie = os.environ.get("BILIBILI_COOKIE")
if _cookie:
    headers["Cookie"] = _cookie


def has_duplicates(lst):
    """Return True when *lst* contains at least one repeated (hashable) element."""
    return len(lst) != len(set(lst))


def _fetch(url):
    """GET *url* with the shared headers; return the response, or None on a network error."""
    try:
        resp = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"请求 {url} 时发生网络错误: {exc}")
        return None
    resp.encoding = "utf-8"
    return resp


def _collect_bv_ids(max_videos):
    """Page through the search results and return up to *max_videos* unique BV ids, in discovery order."""
    bv_ids = {}  # dict used as an insertion-ordered set
    page = 1
    failures = 0
    while len(bv_ids) < max_videos:
        # Throttle so the search endpoint does not block the crawler.
        time.sleep(_SEARCH_DELAY_SECONDS)
        resp = _fetch(_SEARCH_URL.format(page=page))

        if resp is None or resp.status_code != 200:
            if resp is not None:
                print(f"第{page}页视频请求失败,状态码: {resp.status_code}")
            failures += 1
            # Retry the same page a bounded number of times instead of forever.
            if failures >= _MAX_PAGE_RETRIES:
                break
            continue
        failures = 0

        fresh = [bv for bv in dict.fromkeys(_BV_ID_RE.findall(resp.text)) if bv not in bv_ids]
        if not fresh:
            # No new ids on this page: the results are exhausted, so stop
            # rather than requesting further pages indefinitely.
            break
        for bv in fresh[:max_videos - len(bv_ids)]:
            bv_ids[bv] = None
        page += 1
    return list(bv_ids)


def _collect_cids(bv_ids, max_videos):
    """Resolve each BV id to its cid(s) and return up to *max_videos* unique cids, in discovery order."""
    cids = {}  # dict used as an insertion-ordered set
    for bvid in bv_ids:
        if len(cids) >= max_videos:
            break  # quota reached; skip the remaining API calls
        resp = _fetch(_PAGELIST_URL.format(bvid=bvid))
        if resp is None:
            continue
        if resp.status_code != 200:
            print(f"BV号为{bvid}的视频获取cid失败,状态码: {resp.status_code}")
            continue
        # A multi-part video exposes one cid per part, so every match is kept.
        for cid in _CID_RE.findall(resp.text):
            if len(cids) >= max_videos:
                break
            cids[cid] = None
    return list(cids)


def get_videos_cid(max_videos=300):
    """Return up to *max_videos* unique cids (as digit strings) for the searched videos.

    At most *max_videos* BV ids are gathered from the search pages, then each
    is resolved to its cid(s). Paging stops early when a page keeps failing or
    when the search yields no new ids. A summary is printed before returning.
    """
    bv_list = _collect_bv_ids(max_videos)
    c_list = _collect_cids(bv_list, max_videos)

    print(f"最终cid列表: {c_list}")
    print(f"最终cid数量: {len(c_list)}")
    if has_duplicates(c_list):
        print("列表中有重复cid!")
    else:
        print("列表中没有重复cid!")
    return c_list


def get_danmu(c_list):
    """Download the danmaku of every cid in *c_list* and return all comment texts.

    Videos whose request fails are reported and skipped. XML entities in the
    comment text (``&amp;``, ``&lt;``, ...) are decoded to plain characters.
    """
    danmu_list = []
    for cid in c_list:
        resp = _fetch(_DANMAKU_URL.format(cid=cid))
        if resp is None:
            continue
        if resp.status_code != 200:
            print(f"cid为{cid}的视频获取弹幕失败,状态码: {resp.status_code}")
            continue
        danmu_list.extend(html.unescape(text) for text in _DANMAKU_RE.findall(resp.text))
        print(f"cid为{cid}的视频获取弹幕成功!")
    return danmu_list


def main():
    """Scrape the danmaku and write them to ``danmu.txt``, one comment per line."""
    c_list = get_videos_cid()
    danmu_list = get_danmu(c_list)

    with open('danmu.txt', 'w', encoding="utf-8") as f:
        f.writelines(f"{dm}\n" for dm in danmu_list)


# Guarded so that importing this module does not start a network crawl.
if __name__ == "__main__":
    main()