"""Bilibili danmaku (bullet-comment) scraper for "Paris Olympics" search results.

Collects video-page links from the Bilibili search pages, resolves them to
danmaku API endpoints, and (in code past this chunk) analyses the comments.

NOTE(review): this file was recovered from a collapsed git diff; it has been
reformatted to conventional Python layout. Behavior is unchanged except for
the documented bug fix in ``get_urls``.
"""

import re

import requests
from bs4 import BeautifulSoup
from collections import Counter
from openpyxl import load_workbook
import pandas as pd
import jieba
import wordcloud
import imageio

# Browser-like headers (User-Agent plus a session cookie) so Bilibili serves
# normal HTML instead of a bot-detection / login-wall response.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "cookie": "CURRENT_FNVAL=4048; buvid_fp_plain=undefined; buvid4=04DF7AEF-34D9-CC62-690A-D369B35D458509591-023061415-%2FxwqHe8zHTWav6Q4ZiB1Ag%3D%3D; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; buvid3=D5B12366-476E-6163-1D79-774D300DF97306537infoc; b_nut=1718270506; _uuid=243B710F9-1010E3-9654-E867-4A8D8BB10AB1307743infoc; header_theme_version=CLOSE; rpdid=0zbfAHMKHr|S8rGMSwG|1uI|3w1Sum1G; fingerprint=042b265e3c7da3104d09a0692278e922; CURRENT_QUALITY=80; home_feed_column=5; browser_resolution=1659-836; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5NDEwOTEsImlhdCI6MTcyNTY4MTgzMSwicGx0IjotMX0.j7rN8z5QOwH-7R7gPvyBxJzDLqymAWFfZeFF-QAXoTQ; bili_ticket_expires=1725941031; bp_t_offset_482950113=974463371485118464; buvid_fp=042b265e3c7da3104d09a0692278e922; b_lsid=DDE103767_191D4FCA152"
}

# Matches a standalone "ai" (word-bounded, or flanked by CJK characters,
# whitespace, or string edges) or the Chinese phrase for "artificial
# intelligence". Hoisted to module level so it is compiled once instead of
# on every call (the original recompiled it per invocation).
_AI_PATTERN = re.compile(
    r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))',
    re.IGNORECASE,
)

# Danmaku API endpoints look like https://api.bilibili.com/x/v1/dm/list.so?...
_DM_API_PATTERN = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?')

# Video-watch pages live under https://www.bilibili.com/video...
_VIDEO_URL_PATTERN = re.compile(r'https://www\.bilibili\.com/video')


def contains_ai_or_artificial_intelligence(text):
    """Return a truthy ``re.Match`` if *text* mentions AI, else ``None``.

    A mention is either the English token "ai" (case-insensitive, word-bounded
    or adjacent to CJK text/whitespace) or the Chinese term 人工智能.
    """
    return _AI_PATTERN.search(text)


def get_html(url):
    """Fetch *url* with the browser-mimicking headers and return its body.

    The response is force-decoded as UTF-8 regardless of the declared charset.
    No status check is performed; callers that need one use ``get_api_urls``.
    """
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    return response.text


def seek_api_urls(html_data):
    """Extract danmaku-API links from *html_data*.

    Scans every ``<a href=...>`` tag and returns the (deduplicated) hrefs that
    point at the Bilibili danmaku list endpoint.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    # A set deduplicates repeated links before filtering.
    hrefs = {a_tag['href'] for a_tag in soup.find_all('a', href=True)}
    return [link for link in hrefs if _DM_API_PATTERN.match(link)]


def get_api_urls(url):
    """Fetch *url* and return the danmaku-API links found on the page.

    Returns an empty list when the request does not succeed (non-200 status).
    """
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return []
    return seek_api_urls(response.text)


def get_urls(page):
    """Return video-page URLs from result page *page* of the search.

    Pages 1-7 contribute every matching link (7 x 42 = 294 videos); page 8
    contributes at most 6, capping the total at 300.

    BUG FIX: the original page-8 branch only returned once six matches had
    accumulated, so a final page with fewer than six matching links fell off
    the end of the loop and implicitly returned ``None``. All found links are
    now returned (truncated to six on page 8).
    """
    # Search-results page for the keyword 巴黎奥运会 (Paris Olympics).
    url = f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page}"
    soup = BeautifulSoup(get_html(url), 'html.parser')
    # Search pages emit protocol-relative hrefs ("//www.bilibili.com/...");
    # prefix the scheme, deduplicate, then keep only video-page links.
    candidates = {f'https:{a_tag["href"]}' for a_tag in soup.find_all('a', href=True)}
    video_urls = [link for link in candidates if _VIDEO_URL_PATTERN.match(link)]
    if page != 8:
        return video_urls
    # Last page: cap at 6 links (iteration order of the set was already
    # arbitrary in the original, so slicing is equivalent to its counter loop).
    return video_urls[:6]


# Resolve a video page to its danmaku interface link(s).
# NOTE(review): this function is truncated in the reviewed chunk; the visible
# prefix is reproduced verbatim without guessing at the missing remainder.
def vedio_transform_port(url):
    html_data = get_html(url)
    soup = BeautifulSoup(html_data, "html.parser")
    page_num = []    # stores the total number of parts (分p)
    span_tag = None  # flag used to detect whether the video is multi-part

    # The multi-part video section of the page source looks like:
    #