"""Scrape danmaku (bullet comments) for Bilibili search results.

Recovered from the patch "ADD file via upload" (commit 1bc34bf), which
created ``爬取前300条视频弹幕.py``.

Pipeline:
    1. Build the search-result page URLs for the keyword "2024巴黎奥运会".
    2. Extract the video ids (bvid) from each page and resolve each one to
       a cid via the player ``pagelist`` endpoint, up to 300 videos.
    3. Download the danmaku XML of every cid and collect the comment texts.
    4. Write one comment per line to ``弹幕.txt``.
"""
import html
import json
import re
from concurrent.futures import ThreadPoolExecutor
from itertools import islice

SEARCH_URL = (
    "https://search.bilibili.com/all"
    "?keyword=2024%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A"
    "&from_source=webtop_search&spm_id_from=333.1007&search_source=5"
)
# The `o` query parameter grows by this amount for every further page.
PAGE_OFFSET_STEP = 36
MAX_VIDEOS = 300        # stop after this many cids have been resolved
MAX_WORKERS = 5         # size of the download thread pool
REQUEST_TIMEOUT = 10    # seconds; without it a stalled socket blocks forever
OUTPUT_PATH = '弹幕.txt'

header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"}

_BVID_RE = re.compile(r'"//www\.bilibili\.com/video/(.*?)/"')
# Danmaku XML stores each comment as <d p="attributes">text</d>.
_DANMU_RE = re.compile(r'<d p="[^"]*">(.*?)</d>')


def get_page_url(n):
    """Return the URLs of the first *n* search-result pages.

    The first page is the bare search URL; every later page appends the
    1-based ``page`` number and the matching ``o`` offset.
    """
    page_url_list = [SEARCH_URL] if n > 0 else []
    page_url_list.extend(
        f"{SEARCH_URL}&page={i + 1}&o={i * PAGE_OFFSET_STEP}"
        for i in range(1, n)
    )
    return page_url_list


def fetch_page(url):
    """GET *url* and return the body decoded as UTF-8.

    Best effort: on any request failure (connection error, timeout, HTTP
    error status) the error is printed and ``""`` is returned, so callers
    simply find nothing to parse.
    """
    # Imported here so the pure parsing helpers of this module remain
    # importable on machines without the third-party dependency.
    import requests

    try:
        response = requests.get(url=url, headers=header, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"请求失败: {e}")
        return ""
    response.encoding = 'utf-8'
    return response.text


def extract_bvids(page_text):
    """Return the distinct video ids linked from a search page, in page order."""
    return list(dict.fromkeys(_BVID_RE.findall(page_text)))


def parse_cid(pagelist_text):
    """Return the cid of the first part in a ``pagelist`` JSON reply.

    Returns the cid as a string, or ``None`` when the reply is empty, is
    not valid JSON, or carries no page entries.
    """
    try:
        return str(json.loads(pagelist_text)["data"][0]["cid"])
    except (ValueError, KeyError, IndexError, TypeError):
        return None


def parse_danmu(xml_text):
    """Return the comment texts of a danmaku XML document.

    XML entities such as ``&amp;`` are decoded so the texts read as the
    viewers typed them.
    """
    return [html.unescape(text) for text in _DANMU_RE.findall(xml_text)]


def get_cid(page_url_list, limit=MAX_VIDEOS):
    """Resolve the videos listed on the given search pages to cids.

    Args:
        page_url_list: search-result page URLs, in the order they should
            be consumed.
        limit: maximum number of cids to return.

    Returns:
        A list of at most *limit* cid strings, following page order.
        Videos whose lookup fails are skipped.
    """
    cid_list = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # map() yields results in submission order, so "the first N videos"
        # does not depend on which download happens to finish first.
        pages = executor.map(fetch_page, page_url_list)
        # De-duplicate across pages: the same video can be listed twice.
        bvids = dict.fromkeys(
            bvid for page in pages for bvid in extract_bvids(page)
        )
        pending = iter(bvids)
        while len(cid_list) < limit:
            # Never request more lookups than cids are still missing, so
            # the limit cannot be overshot; failed lookups are made up for
            # by the next batch.
            batch = list(islice(pending, limit - len(cid_list)))
            if not batch:
                break
            urls = [
                f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
                for bvid in batch
            ]
            for reply in executor.map(fetch_page, urls):
                cid = parse_cid(reply)
                if cid is not None:
                    cid_list.append(cid)
                    print(f"已获取到 {len(cid_list)} 个cid")
    return cid_list


def get_danmu(cid_list):
    """Download and return the danmaku texts of every cid in *cid_list*.

    Downloads run concurrently; results are concatenated in the order of
    *cid_list*. A cid whose download fails contributes nothing.
    """
    urls = [f"https://comment.bilibili.com/{cid}.xml" for cid in cid_list]
    danmu_list = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for xml_text in executor.map(fetch_page, urls):
            danmu_list.extend(parse_danmu(xml_text))
            print(f"已获取到 {len(danmu_list)} 条弹幕")
    return danmu_list


def main():
    """Run the whole scrape and save the comments to ``弹幕.txt``."""
    cid_list = get_cid(get_page_url(10))
    print("开始获取弹幕数据...")
    danmu_list = get_danmu(cid_list)
    print("弹幕数据爬取完成。")
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.writelines(danmu + '\n' for danmu in danmu_list)

    print("弹幕已保存到 '弹幕.txt'")


if __name__ == "__main__":
    main()