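"""Collect Bilibili danmaku (bullet comments) from "巴黎奥运会" (Paris Olympics)
search results, keep the comments that mention AI / 人工智能, count them, export
the counts to Excel, and render a word cloud from the result.

Third-party packages used: requests, beautifulsoup4, lxml, pandas, openpyxl
(needed by DataFrame.to_excel), wordcloud, matplotlib.
"""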
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "cookie": "CURRENT_FNVAL=4048; buvid_fp_plain=undefined; buvid4=04DF7AEF-34D9-CC62-690A-D369B35D458509591-023061415-%2FxwqHe8zHTWav6Q4ZiB1Ag%3D%3D; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; buvid3=D5B12366-476E-6163-1D79-774D300DF97306537infoc; b_nut=1718270506; _uuid=243B710F9-1010E3-9654-E867-4A8D8BB10AB1307743infoc; header_theme_version=CLOSE; rpdid=0zbfAHMKHr|S8rGMSwG|1uI|3w1Sum1G; fingerprint=042b265e3c7da3104d09a0692278e922; CURRENT_QUALITY=80; home_feed_column=5; browser_resolution=1659-836; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5NDEwOTEsImlhdCI6MTcyNTY4MTgzMSwicGx0IjotMX0.j7rN8z5QOwH-7R7gPvyBxJzDLqymAWFfZeFF-QAXoTQ; bili_ticket_expires=1725941031; bp_t_offset_482950113=974463371485118464; buvid_fp=042b265e3c7da3104d09a0692278e922; b_lsid=DDE103767_191D4FCA152"
}
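# Note: the cookie above is copied from a logged-in browser session and will
# expire; if requests start failing, replace it with a fresh cookie of your own.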

vedio_sum = 0  # running total of video URLs collected across all search pages


def contains_ai_or_artificial_intelligence(text):
    # Match a standalone "ai" (also when wrapped by Chinese characters or whitespace) or the phrase "人工智能"
    ai_pattern = re.compile(r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))', re.IGNORECASE)
    return re.search(ai_pattern, text)


def filter_strings(strings_list):
    # Keep only the strings that mention AI
    return [string for string in strings_list if contains_ai_or_artificial_intelligence(string)]


def top_n_strings(strings_list, n):
    filtered_strings = filter_strings(strings_list)
    # Count how many times each string appears
    counter = Counter(filtered_strings)
    # Take the n most frequent strings
    most_common = counter.most_common(n)
    return most_common


def dm_urls(url):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        urls = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            urls.add(href)
        pattern = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?oid=\d+')
        # Keep only the URLs that match the danmaku API format
        specific_urls = [url1 for url1 in urls if pattern.match(url1)]
        return specific_urls
    except Exception:
        return []  # return an empty list as the default value


def extract_urls(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = set()
    urlbilibili = "https://"
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Convert relative URLs to absolute URLs
        full_url = urljoin(urlbilibili, href)
        urls.add(full_url)
    pattern = re.compile(r'^https://www\.bilibili\.com/video')
    # Keep only the URLs that point to video pages
    specific_urls = [url1 for url1 in urls if pattern.match(url1)]
    if len(specific_urls) + vedio_sum > 300:
        specific_urls = specific_urls[:6]  # keep only the first six elements
    return specific_urls


def vedio_transform_port(url):
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    page_num = 0     # total number of parts of a multi-part video
    span_tag = None  # flag used to tell whether the video has multiple parts
    div_tags = soup.findAll("div", attrs={"class": "head-left"})  # find the divs with class=head-left
    for tag in div_tags:
        span_tag = tag.findAll("span", attrs={"class": "cur-page"})  # then find the spans with class=cur-page inside them
    if not span_tag:  # nothing found means a single-part video
        return 0
    else:
        for tag in span_tag:
            pattern = re.compile(r'\((\d+)/(\d+)\)')
            match = pattern.search(tag.text)  # extract the text content via tag.text
            if match:  # make sure a match was found
                page_num = int(match.group(2))  # get the total number of parts
        return page_num
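

# Main flow: walk the first 8 pages of Bilibili search results for the query,
# expand multi-part videos into one URL per part, switch each URL to the
# ibilibili.com mirror (where dm_urls() then looks for a link to the danmaku
# XML API), and collect every danmaku comment into comment_list.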
comment_list = []
for page_num in range(1, 9):
    urls = extract_urls(f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page_num}")
    mod_urls = set()
    vedio_sum += len(urls)
    for ori_url in urls:
        clean_url = re.sub(r'\?.*$', '', ori_url)  # drop the query string
        p_sum = vedio_transform_port(ori_url)
        if p_sum == 0:
            # Single-part video: just switch to the ibilibili.com mirror
            modified_url = clean_url.replace("bilibili.com", "ibilibili.com")
            mod_urls.add(modified_url)
        else:
            # Multi-part video: build one mirror URL per part
            for p_num in range(1, p_sum + 1):
                ori2_url = f"{clean_url}?p={p_num}"
                modified_url = ori2_url.replace("bilibili.com", "ibilibili.com")
                mod_urls.add(modified_url)
    for url in mod_urls:
        dmurls = dm_urls(url)
        if dmurls:  # check that the list is not empty
            dmurl = dmurls[0]
            response = requests.get(dmurl, headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            soup = BeautifulSoup(html, "lxml-xml")
            # Extract all danmaku comments (each <d> element is one comment)
            comments = soup.find_all('d')
            for comment in comments:
                comment_text = comment.get_text()
                comment_list.append(comment_text)

result = top_n_strings(comment_list, len(comment_list))
# Build a DataFrame of (comment, count) pairs
df = pd.DataFrame(result, columns=['Element', 'Count'])
# Write it to an Excel file
df.to_excel('output.xlsx', index=False)
result8 = top_n_strings(comment_list, 8)
for item, count in result8:
    print(f'{item}: {count}')

df = pd.read_excel('output.xlsx')
# Build the word-cloud input by repeating each comment according to its count
text = ' '.join(df.apply(lambda row: f"{row['Element']} " * row['Count'], axis=1))
# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', font_path=r'C:\Windows\Fonts\FZSTK.TTF').generate(text)
wordcloud.to_file('wordcloud5.png')
# Show the word-cloud image
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()