parent
30bd529259
commit
47b24d27e6
@@ -0,0 +1,136 @@
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import Counter
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

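# Request headers used for every Bilibili call. The cookie is a personal
# session credential captured from a logged-in browser (e.g. via DevTools);
# it is assumed to expire, so refresh it before re-running the script.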
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
    "cookie": "CURRENT_FNVAL=4048; buvid_fp_plain=undefined; buvid4=04DF7AEF-34D9-CC62-690A-D369B35D458509591-023061415-%2FxwqHe8zHTWav6Q4ZiB1Ag%3D%3D; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; PVID=1; buvid3=D5B12366-476E-6163-1D79-774D300DF97306537infoc; b_nut=1718270506; _uuid=243B710F9-1010E3-9654-E867-4A8D8BB10AB1307743infoc; header_theme_version=CLOSE; rpdid=0zbfAHMKHr|S8rGMSwG|1uI|3w1Sum1G; fingerprint=042b265e3c7da3104d09a0692278e922; CURRENT_QUALITY=80; home_feed_column=5; browser_resolution=1659-836; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjU5NDEwOTEsImlhdCI6MTcyNTY4MTgzMSwicGx0IjotMX0.j7rN8z5QOwH-7R7gPvyBxJzDLqymAWFfZeFF-QAXoTQ; bili_ticket_expires=1725941031; bp_t_offset_482950113=974463371485118464; buvid_fp=042b265e3c7da3104d09a0692278e922; b_lsid=DDE103767_191D4FCA152"
}
video_sum = 0  # running total of video URLs collected across search pages

def contains_ai_or_artificial_intelligence(text):
    # Match "AI" as a standalone word, the Chinese term "人工智能",
    # or "ai" adjacent to CJK characters / whitespace.
    ai_pattern = re.compile(
        r'(\bai\b|人工智能|([\u4e00-\u9fff]|\s|^)ai([\u4e00-\u9fff]|\s|$))',
        re.IGNORECASE)
    return ai_pattern.search(text)

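# Sanity check (illustrative, not from the original script): "AI 绘画" and
# "这个AI好用" both match, while embedded substrings like "paint" do not.
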
def filter_strings(strings_list):
    # Keep only the comments that mention AI.
    return [s for s in strings_list if contains_ai_or_artificial_intelligence(s)]

def top_n_strings(strings_list, n):
    filtered_strings = filter_strings(strings_list)
    # Count how many times each string appears
    counter = Counter(filtered_strings)
    # Take the n most frequent strings
    most_common = counter.most_common(n)
    return most_common

def dm_urls(url):
    # Collect links to the danmaku (bullet-comment) XML endpoint from a video page.
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        urls = set()
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            urls.add(href)
        pattern = re.compile(r'https://api\.bilibili\.com/x/v1/dm/list\.so\?oid=\d+')
        # Keep only URLs that match the danmaku API format
        specific_urls = [url1 for url1 in urls if pattern.match(url1)]
        return specific_urls
    except Exception:
        # Fall back to an empty list on any request/parsing failure
        return []

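# Note: list.so?oid=<id> is Bilibili's classic danmaku endpoint; it returns
# XML with one <d> element per comment, which is how the response is parsed
# in the main loop below. The exact response shape is inferred, not verified.
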
def extract_urls(url):
    # Pull all video links from one page of Bilibili search results.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    urls = set()
    urlbilibili = "https://"
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Resolve protocol-relative links ("//www.bilibili.com/...") to absolute URLs
        full_url = urljoin(urlbilibili, href)
        urls.add(full_url)
    pattern = re.compile(r'^https://www\.bilibili\.com/video')
    # Keep only URLs that point at video pages
    specific_urls = [url1 for url1 in urls if pattern.match(url1)]
    if len(specific_urls) + video_sum > 300:
        specific_urls = specific_urls[:6]  # cap the crawl at roughly 300 videos
    return specific_urls

def video_page_count(url):
    # Return the number of parts (分P) of a video, or 0 for a single-part video.
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    page_num = 0     # total number of parts
    span_tag = None  # flag: stays None/empty when the video has no part selector
    div_tags = soup.find_all("div", attrs={"class": "head-left"})
    for tag in div_tags:
        # The part selector lives in a span like "(1/12)" with class=cur-page
        span_tag = tag.find_all("span", attrs={"class": "cur-page"})
    if not span_tag:
        # No selector found: single video
        return 0
    else:
        for tag in span_tag:
            pattern = re.compile(r'\((\d+)/(\d+)\)')
            match = pattern.search(tag.text)  # extract the "(current/total)" text
            if match:
                page_num = int(match.group(2))  # total part count
        return page_num

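# Top-level pipeline: crawl eight pages of search results, expand multi-part
# videos, rewrite links to the ibilibili mirror, scrape each video's danmaku
# XML, then count AI-related comments and render a word cloud.
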
danmaku_list = []  # every harvested danmaku comment string

for page_num in range(1, 9):
    # The keyword parameter is the URL-encoded "巴黎奥运会" (Paris Olympics)
    urls = extract_urls(f"https://search.bilibili.com/video?keyword=%E5%B7%B4%E9%BB%8E%E5%A5%A5%E8%BF%90%E4%BC%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=2&page={page_num}")
    mod_urls = set()
    video_sum += len(urls)
    for ori_url in urls:
        clean_url = re.sub(r'\?.*$', '', ori_url)  # strip the query string
        p_sum = video_page_count(ori_url)
        if p_sum == 0:
            # Single-part video: rewrite the domain to the ibilibili mirror
            modified_url = clean_url.replace("bilibili.com", "ibilibili.com")
            mod_urls.add(modified_url)
        else:
            # Multi-part video: add one URL per part
            for p_num in range(1, p_sum + 1):
                ori2_url = f"{clean_url}?p={p_num}"
                modified_url = ori2_url.replace("bilibili.com", "ibilibili.com")
                mod_urls.add(modified_url)

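    # The ibilibili.com mirror pages expose a direct <a> link to the danmaku
    # XML endpoint, which is presumably why the domain was rewritten above.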
    for url in mod_urls:
        dmurls = dm_urls(url)
        if dmurls:  # skip videos where no danmaku endpoint was found
            dmurl = dmurls[0]
            response = requests.get(dmurl, headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            soup = BeautifulSoup(html, "lxml-xml")
            # Each <d> element in the XML holds one danmaku comment
            comments = soup.find_all('d')
            for comment in comments:
                comment_text = comment.get_text()
                danmaku_list.append(comment_text)

result = top_n_strings(danmaku_list, len(danmaku_list))
# Build a DataFrame of (comment, count) pairs
df = pd.DataFrame(result, columns=['Element', 'Count'])
# Write to an Excel file
df.to_excel('output.xlsx', index=False)

# Print the eight most frequent AI-related comments
result8 = top_n_strings(danmaku_list, 8)
for item, count in result8:
    print(f'{item}: {count}')

df = pd.read_excel('output.xlsx')
# Repeat each comment by its count so WordCloud weights it by frequency
text = ' '.join(df.apply(lambda row: f"{row['Element']} " * row['Count'], axis=1))
# Generate the word cloud (a CJK-capable font is required for Chinese text)
wordcloud = WordCloud(width=800, height=400, background_color='white',
                      font_path=r'C:\Windows\Fonts\FZSTK.TTF').generate(text)
wordcloud.to_file('wordcloud5.png')
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
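
# Assumed third-party dependencies: requests, beautifulsoup4, lxml, pandas,
# openpyxl (pandas' .xlsx engine), wordcloud, matplotlib.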