From 0ea2669f26d9475dfb30975765d37e84588f0cb8 Mon Sep 17 00:00:00 2001 From: pioc37juv <1245880206@qq.com> Date: Wed, 18 Sep 2024 19:50:50 +0800 Subject: [PATCH] ADD file via upload --- (1)bilibili弹幕.py | 186 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 (1)bilibili弹幕.py diff --git a/(1)bilibili弹幕.py b/(1)bilibili弹幕.py new file mode 100644 index 0000000..c664e77 --- /dev/null +++ b/(1)bilibili弹幕.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- +# 作者:Halcyon(王思平102201544) + +import requests +import re +from bs4 import BeautifulSoup +import operator +import traceback +import os +import pandas as pd +from lxml import etree +from time import sleep + +headers = { + # UA用户代理(用户头)||Cookie即网站存储在浏览器的密钥||referer表示你是从哪个网页来到这个网页的,可避开部分网站的反爬 + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0", + "Cookie": "buvid3=FC84FF24-F24C-8F6A-E270-A4E24DE7B0C526016infoc; b_nut=1698057526; i-wanna-go-back=-1; b_ut=7; _uuid=B8DE32105-3BF9-6647-BECB-E8C21CC6514325921infoc; DedeUserID=25050170; DedeUserID__ckMd5=c0021d4e8b0bf000; rpdid=|(u)YllY\~l\~|0J'uYm)|l\~\~u\~; buvid_fp_plain=undefined; LIVE_BUVID=AUTO1716983199366540; hit-dyn-v2=1; is-2022-channel=1; buvid4=A93706A8-E74B-3BA6-F690-5C6B9D598B0526885-023102318-ve0wVapqFYOkA%2BhrXJP7jQ%3D%3D; dy_spec_agreed=1; FEED_LIVE_VERSION=V8; enable_web_push=DISABLE; header_theme_version=CLOSE; theme_style=light; CURRENT_FNVAL=4048; PVID=3; kfcSource=cps_comments_5796501; msource=cps_comments_5796501; deviceFingerprint=df356baaacd70e52753b41f8d9c0d620; share_source_origin=COPY; fingerprint=7b3f4d837eec3b3f350f986969e4ab87; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTY4MDAsImlhdCI6MTcyNjQ5NzU0MCwicGx0IjotMX0.OL6GRuRoVoxpFDP_ZQf8D7DpV2G5jh--dzz7f29eqvs; bili_ticket_expires=1726756740; CURRENT_QUALITY=112; SESSDATA=f7aed23a%2C1742058181%2Cdd898%2A92CjALxu4cFuV7SGqcgyHI8Ys2NK_ZyALU5ig08Zm2NaHT44pIudmp4ZCgBU5VgNrKn9ASVnJ3TG9ra1JUblBEcHhtUDlpSjNrTmhUaXRhVV9BNUxtSXdDeGx4MnhTSDZRRWFlaWZ5R0Jmd2hDNW5QX0JiSms2UndUWHM1YW5pdGZTWVhHOF9pbFZnIIEC; bili_jct=533d358e51330aaf948908155179fc37; sid=6c7mrv3l; bsource=search_baidu; bmg_af_switch=1; bmg_src_def_domain=i2.hdslb.com; buvid_fp=7b3f4d837eec3b3f350f986969e4ab87; b_lsid=367A31104_19200B68A7A; browser_resolution=1330-84; home_feed_column=4", + 'Referer': 'https://www.bilibili.com/' +} +timeout = 5 + + +def getHTML(url): # 根据url读取页面html内容 + try: + response = requests.get(url=url, headers=headers, timeout=timeout) # 发送一个get请求到指定的url + response.encoding = response.apparent_encoding # 将响应的编码设置为自动检测到的编码,以便后续处理时不会出现乱码 + return response.text # 返回响应的文本内容(即网页的 HTML 源代码) + except Exception as e: + print(f"request url : {url} error: {e}") # f"...":这个f表示这是一个格式化字符串,其中的任何用{}括起来的部分会被替换为相应变量的值 + print(traceback.format_exc()) # 返回最近的异常的traceback(调用栈) + return None + + +def parsePage_search(content): # 分析得到的html页面文本内容 + try: + print("parsing...") + # 使用正则表达式提取BV号 + bv_list = re.findall('"bvid":"(.*?)"', content) + print(f"爬取到 BV 号成功,数量: {len(bv_list)}") + return bv_list # 返回爬取到的 BV 号列表 + except Exception as e: + print("parse error:", e) + return [] + + +def save_to_csv(bvids, filename): + # 创建 DataFrame 并保存为 CSV + df = pd.DataFrame(bvids, columns=["bvid"]) + df.to_csv(filename, index=False, encoding='utf-8-sig') + print(f"已将 {len(bvids)} 个 bvid 保存到 {filename}") + + +def parsePage(page): # 分析得到的html页面文本内容 + try: + print("parsing...") + html_ = etree.HTML(page) # 将传入的page字符串转换为一个可以使用XPath查询的HTML结构 + meta_title = html_.xpath('//meta[@name="title"]/@content')[0] # 使用 XPath 查询获取 标签中 name 属性为 title 的内容 + if meta_title == '视频去哪了呢?_哔哩哔哩_bilibili': + print(f'视频 404 not found') + return [], '视频 404 not found' + + syntax = [':', '='] # 创建一个列表 syntax,包含可能用于分割 CID 的字符(冒号和等号) + flag = 0 + keys = re.findall(r'"cid":[\d]*', page) # 在页面内容中查找 "cid":[数字] 格式的字符串,返回所有匹配的项。 + if not keys: + keys = re.findall(r'cid=[\d]*', page) # 如果起那么位于找到则使用另一种正则表达式 cid=[数字] 查找 CID + flag = 1 + + comments, title = {}, None # 初始化 + + if len(keys) < 2: + print("没有找到有效的 cid") + return comments, title + + keys = [keys[1]] # 只取第二个 cid + + for index, item in enumerate(keys): # 使用 enumerate 遍历 keys 列表,同时获取索引 index 和元素 item + key = item.split(syntax[flag])[1] # 使用 split 方法根据当前的分隔符(冒号或等号)分割字符串,获取 CID 值。 + print(f'{index + 1}/{len(keys)}: {key}') + comment_url = f'https://comment.bilibili.com/{key}.xml' # 弹幕地址 + comment_text = getHTML(comment_url) + bs4 = BeautifulSoup(comment_text, "xml") # 使用 xml 解析器 + + if not title: + # 使用 BeautifulSoup 解析原始页面内容,并提取