"""Scrape Bilibili danmaku for Paris-Olympics videos and report AI mentions.

Pipeline:

1. Walk the Bilibili search result pages and collect video links.
2. Rewrite every video (and every part of a multi-part video) to its
   ``ibilibili.com`` page, which is then searched for the danmaku API link.
3. Download the danmaku XML and append the comment texts to ``弹幕.txt``.
4. Keep the comments that mention AI, count them, export the counts to
   Excel and render a word cloud.
"""
import logging
import os
import re
from collections import Counter

# NOTE: third-party packages (requests, bs4, pandas, openpyxl, wordcloud,
# imageio) are imported inside the functions that need them, so the pure text
# helpers below stay importable without the scraping/plotting stack.

logger = logging.getLogger(__name__)

# NOTE(review): this is a hard-coded, account-bound session cookie. It is kept
# as the default for backward compatibility, but it should live in local
# configuration; set the BILIBILI_COOKIE environment variable to override it.
_DEFAULT_COOKIE = (
    "CURRENT_FNVAL=4048; "
    "buvid_fp_plain=undefined; "
    "buvid4=04DF7AEF-34D9-CC62-690A-D369B35D458509591-023061415-%2FxwqHe8zHTWav6Q4ZiB1Ag%3D%3D; "
    "enable_web_push=DISABLE; "
    "FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; "
    "PVID=1; "
    "buvid3=D5B12366-476E-6163-1D79-774D300DF97306537infoc; "
    "b_nut=1718270506; "
    "_uuid=243B710F9-1010E3-9654-E867-4A8D8BB10AB1307743infoc; "
    "header_theme_version=CLOSE; "
    "rpdid=0zbfAHMKHr|S8rGMSwG|1uI|3w1Sum1G; "
    "fingerprint=042b265e3c7da3104d09a0692278e922; "
    "CURRENT_QUALITY=80; "
    "home_feed_column=5; "
    "browser_resolution=1659-836; "
    "bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5NDEwOTEsImlhdCI6MTcyNTY4MTgzMSwicGx0IjotMX0.j7rN8z5QOwH-7R7gPvyBxJzDLqymAWFfZeFF-QAXoTQ; "
    "bili_ticket_expires=1725941031; "
    "bp_t_offset_482950113=974463371485118464; "
    "buvid_fp=042b265e3c7da3104d09a0692278e922; "
    "b_lsid=DDE103767_191D4FCA152"
)

# Browser-like request headers sent with every HTTP request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "cookie": os.environ.get("BILIBILI_COOKIE", _DEFAULT_COOKIE),
}

REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the run

# 7 full pages x 42 videos = 294, plus the first 6 videos of page 8 = 300.
LAST_PAGE = 8
LAST_PAGE_VIDEO_LIMIT = 6

DANMAKU_FILE = '弹幕.txt'
EXCEL_FILE = '弹幕统计.xlsx'
MASK_IMAGE = 'test2.png'
WORD_CLOUD_FILE = '词云.png'

# Patterns are compiled once at import time instead of on every call.
_AI_PATTERN = re.compile(
    r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))',
    re.IGNORECASE,
)
_API_URL_PATTERN = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?')
_VIDEO_URL_PATTERN = re.compile(r'https://www\.bilibili\.com/video')
_PAGE_COUNTER_PATTERN = re.compile(r'(\d+)\s*/\s*(\d+)')
# NOTE(review): reconstructed pattern. The original literal had lost its markup
# and matched only empty strings. The danmaku XML is expected to hold one
# <d p="...">text</d> element per comment -- confirm against a live response.
_DANMAKU_PATTERN = re.compile(r'<d p=".*?">(.*?)</d>')


def contains_ai_or_artificial_intelligence(text):
    """Return a match object if *text* mentions AI, otherwise ``None``.

    A mention is the standalone word "ai" (case-insensitive), the phrase
    "人工智能", or "ai" delimited by CJK characters, whitespace or the
    start/end of the text.
    """
    return _AI_PATTERN.search(text)


def normalize_link(link):
    """Return *link* as an absolute URL.

    Protocol-relative links (``//host/path``) get an ``https:`` prefix; any
    other link is returned unchanged, so already-absolute URLs are not mangled.
    """
    if link.startswith('//'):
        return f'https:{link}'
    return link


def parse_page_total(text):
    """Extract the total part count ``y`` from a counter such as ``(x/y)``.

    Returns ``None`` when *text* holds no ``x/y`` counter or ``y`` is zero.
    """
    found = _PAGE_COUNTER_PATTERN.search(text)
    if found is None:
        return None
    total = int(found.group(2))
    return total if total >= 1 else None


def build_port_urls(url, page_total):
    """Return the set of ``ibilibili.com`` URLs for a video.

    With ``page_total`` of ``None`` the video is treated as single-part and one
    URL is returned; otherwise one ``?p=N`` URL is produced per part.
    """
    mirror = url.replace("bilibili.com", "ibilibili.com")
    if page_total is None:
        return {mirror}
    return {f"{mirror}?p={part}" for part in range(1, page_total + 1)}


def extract_danmaku(xml_text):
    """Return the comment texts found in a danmaku XML document."""
    return _DANMAKU_PATTERN.findall(xml_text)


def filter_ai_barrages(lines):
    """Return the stripped lines that mention AI, preserving order."""
    # The test runs on the raw line (as before) so a trailing newline still
    # counts as the whitespace delimiter the pattern allows after "ai".
    return [
        line.strip()
        for line in lines
        if contains_ai_or_artificial_intelligence(line)
    ]


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    import requests

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def seek_api_urls(html_data):
    """Return the danmaku API links found in the ``<a href>`` tags of a page."""
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_data, 'html.parser')
    # A set removes duplicates; sorting makes the result order deterministic.
    links = {anchor['href'] for anchor in soup.find_all('a', href=True)}
    return sorted(link for link in links if _API_URL_PATTERN.match(link))


def get_api_urls(url):
    """Return the danmaku API links exposed by the page at *url*.

    An empty list is returned when the request does not answer with HTTP 200.
    """
    import requests

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        return []
    return seek_api_urls(response.text)


def get_urls(page):
    """Return the video URLs listed on search result page *page*.

    Every matching link is returned, except on ``LAST_PAGE`` where at most
    ``LAST_PAGE_VIDEO_LIMIT`` links are kept to cap the total at 300 videos.
    A list is always returned, even when fewer links than the cap are found.
    """
    from bs4 import BeautifulSoup

    url = f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"
    soup = BeautifulSoup(get_html(url), 'html.parser')
    links = {
        normalize_link(anchor['href'])
        for anchor in soup.find_all('a', href=True)
    }
    video_urls = sorted(link for link in links if _VIDEO_URL_PATTERN.match(link))
    if page == LAST_PAGE:
        return video_urls[:LAST_PAGE_VIDEO_LIMIT]
    return video_urls


def vedio_transform_port(url):
    """Return the set of ``ibilibili.com`` URLs for the video at *url*.

    The video page is inspected for a ``span.cur-page`` element inside a
    ``div.head-left`` block; its text is expected to look like ``(1/12)``,
    where the second number is the part count. When no such counter is found
    the video is treated as single-part.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(get_html(url), "html.parser")
    counter = soup.select_one("div.head-left span.cur-page")
    page_total = None
    if counter is not None:
        page_total = parse_page_total(counter.get_text())
    return build_port_urls(url, page_total)


def scrape_danmaku(pages=None, output_path=DANMAKU_FILE):
    """Download danmaku for every video on *pages* and append them to a file.

    *pages* defaults to search pages 1 through ``LAST_PAGE``. Requests that
    fail are logged and skipped so one bad video does not abort the whole run.
    """
    import requests

    if pages is None:
        pages = range(1, LAST_PAGE + 1)

    for page in pages:
        try:
            video_urls = get_urls(page)
        except requests.RequestException as exc:
            logger.warning("search page %s failed: %s", page, exc)
            continue

        port_urls = set()
        for video_url in video_urls:
            try:
                port_urls |= vedio_transform_port(video_url)
            except requests.RequestException as exc:
                logger.warning("video page %s failed: %s", video_url, exc)

        for port_url in sorted(port_urls):
            try:
                api_urls = get_api_urls(port_url)
                if not api_urls:
                    continue
                comments = extract_danmaku(get_html(api_urls[0]))
            except requests.RequestException as exc:
                logger.warning("danmaku for %s failed: %s", port_url, exc)
                continue
            if not comments:
                continue
            with open(output_path, mode='a', encoding='utf-8') as f:
                # The trailing newline keeps the last comment of this video
                # from merging with the first comment of the next one.
                f.write('\n'.join(comments) + '\n')


def load_ai_barrages(path=DANMAKU_FILE):
    """Read the danmaku file and return the comments that mention AI."""
    with open(path, 'r', encoding='utf-8') as file:
        return filter_ai_barrages(file)


def export_excel(counter, excel_path=EXCEL_FILE):
    """Write the comment counts to an Excel sheet, most frequent first."""
    import pandas as pd
    from openpyxl import load_workbook

    df = pd.DataFrame(counter.most_common(), columns=['弹幕', '出现次数'])
    df.to_excel(excel_path, index=False, engine='openpyxl')

    # Reopen the workbook to widen the comment column for readability.
    wb = load_workbook(excel_path)
    ws = wb.active
    ws.column_dimensions['A'].width = 60
    wb.save(excel_path)


def draw_word_cloud(ai_list, mask_path=MASK_IMAGE, output_path=WORD_CLOUD_FILE):
    """Render a word cloud of the AI-related comments to *output_path*.

    Nothing is drawn when *ai_list* is empty.
    """
    if not ai_list:
        logger.warning("no AI-related danmaku found; skipping word cloud")
        return

    import imageio
    import wordcloud

    img = imageio.imread(mask_path)
    wc = wordcloud.WordCloud(
        width=500,
        height=500,
        mask=img,
        background_color='white',
        font_path='msyh.ttc',
    )
    wc.generate('\n'.join(ai_list))
    wc.to_file(output_path)


def main():
    """Run the full scrape, filter, report pipeline."""
    scrape_danmaku()

    ai_list = load_ai_barrages()
    counter = Counter(ai_list)

    # Print the eight most frequent AI-related comments.
    for barrage, count in counter.most_common(8):
        print(f'弹幕: {barrage} 出现次数: {count}')

    export_excel(counter)
    draw_word_cloud(ai_list)


if __name__ == "__main__":
    main()