"""Bilibili danmu pipeline for the "巴黎奥运会" (Paris Olympics) search query.

Steps:
1. Page through Bilibili's search API and collect up to 300 video BV ids.
2. Resolve each BV id to a cid via the pagelist API.
3. Download every video's danmu (bullet comments) into ``all_danmu.txt``.
4. Tally AI-related danmu and write the top 8 to an Excel workbook.
5. Segment the full text with jieba and render a word-cloud PNG.

Reconstructed from a git patch; all endpoints are Bilibili's public web
APIs and may change without notice.
"""
import requests
from bs4 import BeautifulSoup
import time
import random
import openpyxl
from collections import Counter
import jieba
import wordcloud


# Browser-like headers shared by the pagelist requests.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4083.0 Safari/537.36 Edg/82.0.458.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'en-US,en;q=0.9'
}


def get_video_ids(api_urls):
    """Collect up to 300 BV ids from the paged search API.

    :param api_urls: search URL ending in ``page=``; the page number
        (1..21) is appended for each request.
    :return: list of BV id strings. Each id is also appended to ``bv.txt``.
    """
    # NOTE(review): hard-coded session cookie checked into source — it will
    # expire and should be supplied via the environment instead.
    search_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'cookie': 'BIDUPSID=B217BCFBC37D845BE7576B36283B7200; PSTM=1694936551; BAIDUID_BFESS=B217BCFBC37D845B41B0963C72424B9E:FG=1; ZFY=8a5uERFpWfnCU59:Bf7xfjug61O89yJEG1n4GF:BPaY1c:C; BDRCVFR[bPTzwF-RsLY]=mk3SLVN4HKm; H_PS_PSSID=60724_60360_60799; BD_HOME=1; BD_UPN=12314753; BA_HECTOR=058ka00hahalak0gak21a1akapkf6v1jelat81u'
    }
    video_ids = []
    for page in range(1, 22):
        response = requests.get(api_urls + str(page), headers=search_headers)
        response.encoding = 'utf-8'
        try:
            results = response.json()['data']['result']
        except (KeyError, TypeError, ValueError):
            # Page with no usable payload (API throttling / schema change);
            # skip it instead of swallowing every possible exception.
            continue
        for item in results:
            bvid = item.get('bvid')
            if not bvid:
                continue
            video_ids.append(bvid)
            with open("bv.txt", mode='a', encoding='utf-8') as f:
                f.write(bvid + '\n')
            if len(video_ids) >= 300:
                # Early return replaces the original nested double-break.
                return video_ids
    return video_ids


def get_cid_from_bv(bv_number, p_number=0):
    """Resolve a BV id to the cid of part ``p_number`` (default: first part).

    :return: the cid on success, ``None`` on any failure (non-200 status,
        part index out of range, network error). Failures are logged.
    """
    try:
        url = 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'
        response = requests.get(url.format(bv_number), headers=headers)
        if response.status_code != 200:
            print(f'Error: Failed to retrieve CID for BV code {bv_number}. Status code: {response.status_code}')
            return None
        data = response.json()['data']
        if p_number >= len(data):
            print(f'Error: Part number out of range for BV code {bv_number}.')
            return None
        return data[p_number]['cid']
    except Exception as e:
        # Boundary catch: log and return None so one bad BV id does not
        # abort the whole batch.
        print(f'Error: {str(e)} for BV code {bv_number}.')
        return None


def get_cids_from_bv_list(video_ids):
    """Map each BV id to its cid; failed lookups yield ``None`` entries."""
    return [get_cid_from_bv(bv_code) for bv_code in video_ids]


def get_danmu(cids):
    """Fetch all danmu texts for ONE cid.

    The parameter is named ``cids`` for backward compatibility, but it is a
    single cid — the danmu XML endpoint takes one ``oid`` per request.

    :return: list of danmu strings (``<d>`` element texts).
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cids}'
    danmu_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=danmu_headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return [danmu.text for danmu in soup.find_all('d')]


def _scrape_all_danmu(cids, out_path='all_danmu.txt'):
    """Download danmu for every resolved cid into ``out_path``."""
    with open(out_path, 'w', encoding='utf-8') as f:
        for video_id in cids:
            if video_id is None:
                # cid resolution failed earlier; requesting oid=None is useless.
                continue
            danmu_list = get_danmu(video_id)
            print(f'视频 {video_id} 的弹幕数量: {len(danmu_list)}')
            f.write(f'== 开始保存视频 {video_id} 的弹幕 ==\n')
            f.write('\n'.join(danmu_list) + '\n')
            # Random 1-3 s pause between requests to avoid being banned.
            time.sleep(random.uniform(1, 3))
    print('所有视频的弹幕已保存到 all_danmu.txt 文件中')


def _export_ai_stats(danmu_path='all_danmu.txt', excel_filename='AI相关弹幕统计.xlsx'):
    """Count AI-related danmu lines and write the top 8 to an Excel sheet."""
    with open(danmu_path, 'r', encoding='utf-8') as f:
        danmus = f.readlines()
    # Keep only danmu mentioning AI, then rank identical lines by frequency.
    ai_keywords = ['AI', '人工智能']
    ai_related_danmus = [danmu for danmu in danmus
                         if any(keyword in danmu for keyword in ai_keywords)]
    top_danmus = Counter(ai_related_danmus).most_common(8)

    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = 'AI相关弹幕统计'
    sheet['A1'] = '弹幕内容'
    sheet['B1'] = '出现次数'
    # Rows 2..9: the top-8 danmu and their counts.
    for idx, (danmu, count) in enumerate(top_danmus, start=2):
        sheet[f'A{idx}'] = danmu.strip()  # drop the trailing newline
        sheet[f'B{idx}'] = count
    wb.save(excel_filename)
    print(f'AI相关弹幕统计已保存到 {excel_filename}')


def _render_wordcloud(danmu_path='all_danmu.txt', image_path='wordcloud.png'):
    """Segment the danmu text with jieba and render a word-cloud PNG."""
    with open(danmu_path, 'r', encoding='utf-8') as f:
        text = f.read()
    text_str = ' '.join(jieba.lcut(text))
    # NOTE(review): unusual font path — msyh.ttc normally sits directly under
    # C:/Windows/Fonts; confirm this resolves on the target machine.
    wc = wordcloud.WordCloud(font_path='C:/Windows/Fonts/微软雅黑/msyh.ttc',
                             width=800, height=600,
                             background_color='white',
                             stopwords={'哈哈', '哈哈哈', '的', '是', '了', '我', '和', '这', '也', '你', '啊', '吧', '就是', '这个', '吗', '他', '不是', '真的', '都', '在', '现在', '感觉', '看', '有', '不'},
                             contour_width=1, contour_color='blue')
    wc.generate(text_str)
    wc.to_file(image_path)


def main():
    """Run the full pipeline: search -> cids -> danmu -> Excel -> word cloud."""
    api_urls = 'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page='
    video_ids = get_video_ids(api_urls)
    cids = get_cids_from_bv_list(video_ids)
    print(f'The corresponding CIDs for the provided BV codes are: {cids}')
    _scrape_all_danmu(cids)
    _export_ai_stats()
    _render_wordcloud()


if __name__ == '__main__':
    main()