获取相关视频的弹幕信息

11 months ago · b1d28ce221
parent 60242b72dd
commit b1d28ce221
1 changed files with 93 additions and 0 deletions
--- a/barrage/getBarrage.py
+++ b/barrage/getBarrage.py
@ -0,0 +1,93 @@
+'''
+获取相关视频弹幕信息
+由于B站反爬虫机制，对每一次数据爬取都手动规定了一定的随机延时，防止被ban
+
+searchWord是搜索关键词
+savePath将弹幕信息保存至xlsx文件中
+'''
+
+import re
+import time
+import random
+import requests
+import itertools
+from openpyxl import Workbook
+
+# 获取当前页码的视频链接地址
+def GetAllSearchVideoUrl(url, headers, timeout=10):
+    """Download one search-result page and return the video links found in it.
+
+    Args:
+        url: Address of the search-result page to fetch.
+        headers: HTTP request headers forwarded to ``requests.get``.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout ``requests`` waits indefinitely, so a single stalled
+            connection would hang the whole crawl.
+
+    Returns:
+        list[str]: The ``href`` value captured from every anchor matching the
+        pattern below, in document order. Empty when nothing matches.
+
+    Raises:
+        requests.exceptions.RequestException: If the request fails or times
+            out.
+    """
+    response = requests.get(url, headers=headers, timeout=timeout)
+    # Decode the body as UTF-8 regardless of the charset requests guessed.
+    response.encoding = 'utf-8'
+    # NOTE(review): the pattern hard-codes the ``data-v-4caf9c8c`` attribute,
+    # so it presumably stops matching when the site markup changes -- check
+    # against a live page if this starts returning empty lists.
+    pattern = r'<a href="(.*?)" .*? target="_blank" data-v-4caf9c8c><div class=".*?" data-v-4caf9c8c>'
+    return re.findall(pattern, response.text)
+
+# 获取当前视频的弹幕接口cid地址
+def GetVideoCid(url, headers, timeout=10):
+    """Fetch a video page and return the cid embedded in its HTML.
+
+    Args:
+        url: Address of the video page.
+        headers: HTTP request headers forwarded to ``requests.get``.
+        timeout: Seconds to wait for the server before giving up, so a
+            stalled connection cannot hang the crawl indefinitely.
+
+    Returns:
+        str: The digits that follow ``"cid":`` inside the page fragment
+        located between the ``"dynamic":`` and ``"dimension":`` keys.
+
+    Raises:
+        ValueError: If the page has no such fragment, or the fragment holds
+            no numeric cid (previously a bare IndexError/AttributeError).
+        requests.exceptions.RequestException: If the request fails or times
+            out.
+    """
+    response = requests.get(url, headers=headers, timeout=timeout)
+    response.encoding = 'utf-8'
+    html_data = response.text
+
+    # The cid is searched for only inside the first fragment that sits
+    # between a "dynamic" key and the following "dimension" key.
+    fragment = re.search(r'"dynamic":.*?,(.*?),"dimension":', html_data)
+    if fragment is None:
+        raise ValueError(f'no cid fragment found in page: {url}')
+
+    # Capture the digits only: slicing to the end of the fragment would drag
+    # along any fields that happen to follow the cid.
+    cid = re.search(r'"cid":\s*(\d+)', fragment.group(1))
+    if cid is None:
+        raise ValueError(f'no cid value found in page: {url}')
+    return cid.group(1)
+
+# 获取当前cid地址下的视频弹幕数据
+def GetVideoBarrage(url, headers, timeout=10):
+    """Fetch the comment document at ``url`` and return its comment texts.
+
+    Args:
+        url: Address of the comment-list endpoint for one cid.
+        headers: HTTP request headers forwarded to ``requests.get``.
+        timeout: Seconds to wait for the server before giving up, so a
+            stalled connection cannot hang the crawl indefinitely.
+
+    Returns:
+        list[str]: The text inside every ``<d p="...">...</d>`` element, in
+        document order, with character references decoded. Empty when the
+        response contains no such element.
+
+    Raises:
+        requests.exceptions.RequestException: If the request fails or times
+            out.
+    """
+    import html  # stdlib; imported locally so the block is self-contained
+
+    response = requests.get(url, headers=headers, timeout=timeout)
+    response.encoding = 'utf-8'
+    raw_texts = re.findall(r'<d p=".*?">(.*?)</d>', response.text)
+    # The texts are cut out of markup with a regex, so references such as
+    # ``&amp;`` or ``&lt;`` are still escaped; decode them so the stored text
+    # reads the way it was typed.
+    return [html.unescape(text) for text in raw_texts]
+
+
+
+def main(searchWord='2024巴黎奥运会', savePath='./docs/barrage.xlsx',
+         max_pages=15, max_videos=300):
+    """Crawl search results for a keyword and save each video's comments.
+
+    The crawl runs in three passes: collect video links from the search
+    pages, resolve each link to its cid, then download the comments for each
+    cid. Every request is followed by a random 1-2 second pause (the module
+    docstring gives the reason: avoiding the site's anti-crawler ban).
+
+    Args:
+        searchWord: Keyword inserted into the search URL.
+        savePath: Path of the xlsx workbook to write. Column ``n`` holds the
+            comments of the n-th video under a header cell in row 1.
+        max_pages: Number of search-result pages to scan.
+        max_videos: Upper bound on the number of videos to process. Fewer
+            are processed when the search yields fewer unique links.
+    """
+    import os  # only needed here, to create the output directory
+
+    headers = {
+        'Cookie': '',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36'
+        }
+
+    # Create the target directory up front so a bad path fails before the
+    # long crawl instead of after it.
+    out_dir = os.path.dirname(savePath)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+
+    # Pass 1: gather the video links from every search-result page.
+    video_links = []
+    for page in range(1, max_pages + 1):
+        # NOTE(review): ``o`` is presumably the result offset expected by the
+        # search page -- confirm that page 1 should send 30 rather than 0.
+        o = page * 30
+        url = f'https://search.bilibili.com/all?keyword={searchWord}&from_source=webtop_search&spm_id_from=333.934&search_source=5&page={page}&o={o}'
+        video_links.extend(GetAllSearchVideoUrl(url, headers))
+        print(f'现在正在获取第{page}页的视频数据')
+        time.sleep(1+random.random())
+
+    # Drop duplicates while keeping search order, then cap the list. Slicing
+    # (rather than indexing a fixed range) copes with a short result list.
+    video_links = list(dict.fromkeys(video_links))[:max_videos]
+
+    # Pass 2: resolve every video page to its cid.
+    cid_list = []
+    for i, link in enumerate(video_links):
+        # The collected hrefs are used as scheme-less URLs, hence the prefix.
+        cid = GetVideoCid('https:' + link, headers)
+        cid_list.append(cid)
+        print(f'现在正在获取第{i+1}个视频弹幕接口cid信息')
+        time.sleep(1+random.random())
+
+    # Pass 3: download the comments and lay them out one video per column.
+    wb = Workbook()
+    ws = wb.active
+    for i, cid in enumerate(cid_list):
+        url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + cid
+        barrage_list = GetVideoBarrage(url, headers)
+        print(f'现在正在获取第{i+1}个视频的弹幕信息')
+        time.sleep(1+random.random())
+        ws.cell(1, i+1, f'视频{i+1}')
+        for j, barrage in enumerate(barrage_list):
+            # Row 1 is the header, so the comments start at row 2.
+            ws.cell(j+2, i+1, barrage)
+
+    wb.save(savePath)
+
+    print(cid_list)
+
+# Run the crawl only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()