"""Scrape Bilibili danmu (bullet comments) for '巴黎奥运会' search results,
then produce an AI-keyword frequency report (Excel) and a word cloud.

Pipeline:
  1. get_video_ids     -- search API -> up to 300 bvids (also saved to bv.txt)
  2. get_cids_from_bv_list / get_cid_from_bv -- bvid -> cid via pagelist API
  3. get_danmu         -- cid -> danmu list via the XML danmu endpoint
  4. module-level script code: save all danmu, count AI-related lines into
     an .xlsx, and render a word cloud image.

NOTE(review): the hard-coded cookie below is account/session data and will
expire; the script still works for anonymous endpoints without it -- confirm.
"""

import requests
from bs4 import BeautifulSoup
import time
import random
import openpyxl
from collections import Counter
import jieba
import wordcloud


# Browser-like headers shared by the pagelist requests below.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/82.0.4083.0 Safari/537.36 Edg/82.0.458.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Accept-Language': 'en-US,en;q=0.9'
}


def get_video_ids(api_urls, max_videos=300, max_pages=21):
    """Collect hot-video bvids from the Bilibili search API.

    Args:
        api_urls: search-API URL prefix; the page number is appended to it.
        max_videos: stop after this many bvids (default 300, as before).
        max_pages: number of result pages to walk (default 21, as before).

    Returns:
        List of bvid strings. Each bvid is also appended to ``bv.txt``,
        one per line.
    """
    video_ids = []
    search_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
        'cookie': 'BIDUPSID=B217BCFBC37D845BE7576B36283B7200; PSTM=1694936551; BAIDUID_BFESS=B217BCFBC37D845B41B0963C72424B9E:FG=1; ZFY=8a5uERFpWfnCU59:Bf7xfjug61O89yJEG1n4GF:BPaY1c:C; BDRCVFR[bPTzwF-RsLY]=mk3SLVN4HKm; H_PS_PSSID=60724_60360_60799; BD_HOME=1; BD_UPN=12314753; BA_HECTOR=058ka00hahalak0gak21a1akapkf6v1jelat81u'
    }
    # Open the output file once, instead of re-opening it per bvid.
    with open("bv.txt", mode='a', encoding='utf-8') as f:
        for page in range(1, max_pages + 1):
            api_url = api_urls + str(page)
            response = requests.get(api_url, headers=search_headers)
            response.encoding = 'utf-8'
            # Narrow handling: skip pages whose JSON is missing or malformed
            # (the site intersperses non-video result entries / checkpoints).
            try:
                datas = response.json()['data']['result']
            except (ValueError, KeyError, TypeError):
                continue
            for data in datas:
                try:
                    bvid = data['bvid']
                except (KeyError, TypeError):
                    continue  # entry without a bvid (ad/checkpoint row)
                video_ids.append(bvid)
                f.write(bvid + '\n')
                if len(video_ids) >= max_videos:
                    return video_ids
    return video_ids


# Search-API URL prefix (page number is appended by get_video_ids).
api_urls = 'https://api.bilibili.com/x/web-interface/search/type?search_type=video&keyword=巴黎奥运会&page='

video_ids = get_video_ids(api_urls)


def get_cid_from_bv(bv_number, p_number=0):
    """Resolve a single bvid to the cid of part *p_number* (default part 0).

    Returns the cid on success, or None (with a diagnostic print) on any
    failure: bad HTTP status, part index out of range, or request error.
    """
    try:
        url = 'https://api.bilibili.com/x/player/pagelist?bvid={}&jsonp=jsonp'
        response = requests.get(url.format(bv_number), headers=headers)

        if response.status_code == 200:
            data = response.json()['data']
            if p_number < len(data):
                return data[p_number]['cid']
            print(f'Error: Part number out of range for BV code {bv_number}.')
            return None
        print(f'Error: Failed to retrieve CID for BV code {bv_number}. Status code: {response.status_code}')
        return None
    except Exception as e:
        # Boundary handler: report and move on so one bad video does not
        # abort the whole batch.
        print(f'Error: {str(e)} for BV code {bv_number}.')
        return None


def get_cids_from_bv_list(video_ids):
    """Map each bvid to its cid; failed lookups yield None placeholders
    so the result stays index-aligned with the input list."""
    return [get_cid_from_bv(bv_code) for bv_code in video_ids]


cids = get_cids_from_bv_list(video_ids)

print(f'The corresponding CIDs for the provided BV codes are: {cids}')


def get_danmu(cid):
    """Fetch the danmu list for one video cid.

    Parses the XML danmu endpoint; each ``<d>`` element's text is one
    danmu line. Returns a list of strings.
    """
    url = f'https://api.bilibili.com/x/v1/dm/list.so?oid={cid}'
    dm_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    response = requests.get(url, headers=dm_headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    return [danmu.text for danmu in soup.find_all('d')]


# Crawl danmu for every resolved cid and save them all to one file.
with open('all_danmu.txt', 'w', encoding='utf-8') as f:
    for video_id in cids:
        if video_id is None:
            continue  # cid lookup failed earlier; skip instead of requesting oid=None
        danmu_list = get_danmu(video_id)
        print(f'视频 {video_id} 的弹幕数量: {len(danmu_list)}')

        # Write this video's danmu, preceded by a per-video marker line.
        f.write(f'== 开始保存视频 {video_id} 的弹幕 ==\n')
        f.write('\n'.join(danmu_list) + '\n')

        # Random 1-3 s sleep to avoid being rate-limited/banned.
        time.sleep(random.uniform(1, 3))

print('所有视频的弹幕已保存到 all_danmu.txt 文件中')


# ---- AI-related danmu statistics -> Excel ----

# 1. Read the saved danmu file.
filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    danmus = f.readlines()

# 2. Keep only danmu mentioning AI-related keywords.
ai_keywords = ['AI', '人工智能']
ai_related_danmus = [danmu for danmu in danmus
                     if any(keyword in danmu for keyword in ai_keywords)]

# 3. Count identical danmu lines.
danmu_counter = Counter(ai_related_danmus)

# 4. Top 8 most frequent.
top_danmus = danmu_counter.most_common(8)

# 5. Write the statistics to an Excel workbook.
excel_filename = 'AI相关弹幕统计.xlsx'
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'AI相关弹幕统计'

sheet['A1'] = '弹幕内容'
sheet['B1'] = '出现次数'

# Rows 2..9: the top-8 danmu and their counts.
for idx, (danmu, count) in enumerate(top_danmus, start=2):
    sheet[f'A{idx}'] = danmu.strip()  # drop the trailing newline
    sheet[f'B{idx}'] = count

wb.save(excel_filename)
print(f'AI相关弹幕统计已保存到 {excel_filename}')


# ---- Word cloud over the whole danmu corpus ----

filename = 'all_danmu.txt'
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

# jieba segmentation: the word cloud needs space-separated Chinese tokens.
word_list = jieba.lcut(text)
text_str = ' '.join(word_list)

wc = wordcloud.WordCloud(font_path='C:/Windows/Fonts/微软雅黑/msyh.ttc',  # CJK-capable font
                         width=800, height=600,
                         background_color='white',
                         stopwords={'哈哈', '哈哈哈', '的', '是', '了', '我', '和', '这', '也', '你', '啊', '吧', '就是', '这个', '吗', '他', '不是', '真的', '都', '在', '现在', '感觉', '看', '有', '不'},
                         contour_width=1, contour_color='blue')
wc.generate(text_str)

# Save the rendered cloud next to the script.
wc.to_file('wordcloud.png')