Compare commits

..

3 Commits
main ... 图片

Author SHA1 Message Date
ptkbf2lr5 4b6d92d686 Update README.md
11 months ago
ptkbf2lr5 2f607162dd 词云图2
11 months ago
ptkbf2lr5 3c5cc6e3b7 词云图1
11 months ago

Binary file not shown.

After

Width:  |  Height:  |  Size: 103 KiB

@ -1,9 +1,3 @@
# spider
项目在 VS Code 上全部完成后才一次性上传,中途有过修改。
注意:仓库分为"图片"和"main"两个分支,如果点开后看不到所有文件,可以在左上角的分支处选择"图片"。
- spider_main.py:弹幕爬虫代码
- cloudimage.py:词云图代码
- select_content.py:筛选AI弹幕代码
- ballgame.py:附加题代码

其中生成的弹幕文件 **all_content.txt** 超过 5MB,无法上传。

@ -1,20 +0,0 @@
import pandas as pd
from collections import Counter
# Load every danmu (bullet comment), one per line.  The trailing newline is
# stripped so that identical comments are counted together even when the last
# line of the file has no terminator, and so the exported cells stay clean.
with open('all_content.txt', mode='r', encoding='utf-8') as f:
    data_list = [line.rstrip('\n') for line in f]

# Keywords for the six ball sports of interest.
keywords = ['乒乓球', '羽毛球', '排球', '篮球', '足球', '网球']

# Keep only the danmu that mention at least one of the keywords.
selectdanmu = [danmu for danmu in data_list
               if any(keyword in danmu for keyword in keywords)]

# Count how often each distinct danmu occurs and keep the 20 most frequent.
num = Counter(selectdanmu)
top_common = num.most_common(20)

# Show the top 20 danmu with their counts.
print(top_common)
t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])

# Export the table to an Excel file.
excel_path = 'top_ball_danmu.xlsx'
t.to_excel(excel_path, index=False)

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

@ -9,7 +9,7 @@ text1=jieba.cut(text)
# 以空格作为分隔符,将分词后的所有字符串合并成一个新的字符串
text = ' '.join(text1)
# 根据分词结果产生词云
wc = WordCloud(font_path = "C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc",width=618, height=500, mode="RGBA", background_color=None).generate(text)
wc = WordCloud(font_path = "C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc",width=500, height=400, mode="RGBA", background_color=None).generate(text)
# 以图片的形式显示词云
plt.imshow(wc, interpolation="bilinear")
# 不显示图像坐标系

Binary file not shown.

@ -1,21 +0,0 @@
import pandas as pd
from collections import Counter
# Load every danmu (bullet comment), one per line.  The trailing newline is
# stripped so that identical comments are counted together even when the last
# line of the file has no terminator, and so the exported cells stay clean.
with open('all_content.txt', mode='r', encoding='utf-8') as f:
    data_list = [line.rstrip('\n') for line in f]

# Keywords related to AI technology and its applications.  Matching is a
# case-sensitive substring test, which is why both 'AI...' and 'ai...'
# spellings are listed.  (A duplicated '自然语言处理' entry was removed; it had
# no effect on the result.)
ai_keywords = [
    'AI', '人工智能', 'ai音效', 'ai视频', 'ai技术', '机器学习', '深度学习',
    '自然语言处理', 'ai训练', '大模型', '云计算', '神经网络', '自动驾驶',
    'ai设计', 'ai图', 'AI软件',
]

# Keep only the danmu that mention at least one of the keywords.
selectdanmu = [danmu for danmu in data_list
               if any(keyword in danmu for keyword in ai_keywords)]

# Count how often each distinct danmu occurs and keep the 8 most frequent.
num = Counter(selectdanmu)
top_common = num.most_common(8)

# Show the top 8 danmu with their counts.
print(top_common)
t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])

# Export the table to an Excel file.
excel_path = 'top8_ai_danmu.xlsx'
t.to_excel(excel_path, index=False)

@ -1,68 +1,69 @@
import requests
import json
import re
import wordcloud
from bs4 import BeautifulSoup
# from video_bid import videobid
# 基本模块
# Module-level accumulators filled in by the functions in this file:
# list1 collects video BV ids, list2 collects cid numbers, and anslist
# collects the text of every danmu (bullet comment).
list1 = []
list2 = []
anslist = []
# Request headers that make the requests look like they come from a browser.
# NOTE(review): the "cookie" value below appears to embed account session
# credentials (SESSDATA, bili_jct) committed in source — consider loading it
# from an environment variable or an untracked config file instead.
headers = {
# cookie: user/session information; user-agent: browser identity;
# referer: sent to get past the site's anti-crawler check.
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie":"_uuid=C910E215C-7103E-4E54-9515-10FF7599AAECF78232infoc; buvid_fp=e7bba7e4bae311a280afa29479dcd19d; buvid3=A7EFF3D3-823F-E623-0EC8-4DE13D84710980740infoc; b_nut=1709977881; buvid4=B0B9DD4D-D469-44F5-6A8F-B2BCC72B405380740-024030909-YSaors5wcVdXxPAyYUcK7w%3D%3D; CURRENT_FNVAL=4048; rpdid=0zbfAGEiQM|8kXSwyAg|26E|3w1Swd2X; DedeUserID=440181187; DedeUserID__ckMd5=7aea6a85d9bac605; CURRENT_QUALITY=80; header_theme_version=CLOSE; enable_web_push=DISABLE; SESSDATA=218ebe14%2C1742048643%2C74a54%2A92CjCwT-WOtbM-xsZKbPnRuVAbDVtv--QmJvHh6khSys1se0CpcdYi5_2hO4THAprUtX0SVjMxenRnVjNiQ0JDd1ZtLVU1OWs3OUdOczNYWlZ6aWU2dkg2RHpWb2hZc00xOWJjWm9MVjdGNUcwLVY1ckYzOC1BUHViR0dvajZoeWpQb1M3ek9DUC13IIEC; bili_jct=51295d2894045195c66fdadf6b1b7a91; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTU4OTQsImlhdCI6MTcyNjQ5NjYzNCwicGx0IjotMX0.Dr-RHjAMQ0qLpe3McjbIChH_d_S3UYkbXlb2doqMdWw; bili_ticket_expires=1726755834; home_feed_column=5; browser_resolution=1536-695; bp_t_offset_440181187=978115949112590336; sid=6xn7yn2o; b_lsid=BA647F49_1920109E5DD",
"referer":"https://www.bilibili.com/"
}
def video_bid(num1, keyword="2024巴黎奥运会"):
    """Collect the BV ids found on one page of Bilibili search results.

    Requests the search page for ``keyword`` and appends every ``bvid`` value
    found in the response text to the module-level ``list1``.

    Args:
        num1: Page number of the search results to request.
        keyword: Search term. Defaults to the term this script was written
            for, so existing ``video_bid(page)`` calls behave as before.
    """
    # The keyword is passed in by name rather than written as a quoted literal
    # inside the braces: reusing the enclosing quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    page_url = f"https://search.bilibili.com/all?keyword={keyword}&page={num1}"
    response = requests.get(url=page_url, headers=headers)  # send the request
    response.encoding = response.apparent_encoding
    # Pull every BV id out of the page text with a regular expression.
    data_list = re.findall(r'"bvid":"(.*?)"', response.text)
    # list1 is only mutated, never rebound, so no ``global`` statement is needed.
    list1.extend(data_list)
def video_cid(num2):
    """Look up the cid of a video and append it to the module-level ``list2``.

    Queries the player page-list API for the given BV id and reads the ``cid``
    of the first entry under ``data``. Videos whose response carries no such
    entry are reported and skipped, so one bad video does not abort the crawl.

    Args:
        num2: BV id of the video.
    """
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={num2}&jsonp=jsonp'
    response = requests.get(url=cid_url, headers=headers)  # send the request
    response.encoding = response.apparent_encoding
    content = json.loads(response.text)
    try:
        cid = content["data"][0]["cid"]
    except (KeyError, IndexError, TypeError):
        # "data" is missing, empty, or null, or its first entry has no "cid".
        print(f"skip {num2}: no cid in pagelist response")
        return
    # Accept either a single cid or a list of them.
    if isinstance(cid, list):
        list2.extend(cid)
    else:
        list2.append(cid)
def video_bullet(num3):
    """Download the danmu XML for one cid and store the text of each entry.

    The text of every ``<d>`` element in the response is appended to the
    module-level ``anslist``.

    Args:
        num3: cid number identifying the danmu file to download.
    """
    response = requests.get(
        url=f'https://comment.bilibili.com/{num3}.xml',  # danmu URL
        headers=headers,
    )
    response.encoding = response.apparent_encoding
    # Parse the XML response; each <d> element holds one danmu.
    document = BeautifulSoup(response.text, "xml")
    anslist.extend(node.text for node in document.find_all("d"))
if __name__ == '__main__':
    # Collect BV ids from search-result pages 1 through 10 (the original note
    # says each page lists 30 videos).  range() is required here: iterating
    # the tuple (1, 11) would visit only pages 1 and 11.
    for page in range(1, 11):
        video_bid(page)
    # Resolve a cid for every BV id.
    for bv in list1:
        video_cid(bv)
    # Download the danmu for every cid.
    for cid in list2:
        video_bullet(cid)
    # Write all danmu to a local file, one per line.
    blscn = '\n'.join(anslist)
    with open('所有弹幕.txt', mode='a', encoding='utf-8') as f:
        f.write(blscn)
import requests
import json
import re
import wordcloud
from bs4 import BeautifulSoup
from video_bid import videobid
# 基本模块
# Module-level accumulators filled in by the functions in this file:
# list1 collects video BV ids, list2 collects cid numbers, and anslist
# collects the text of every danmu (bullet comment).
list1 = []
list2 = []
anslist = []
# Request headers that make the requests look like they come from a browser.
# NOTE(review): the "cookie" value below appears to embed account session
# credentials (SESSDATA, bili_jct) committed in source — consider loading it
# from an environment variable or an untracked config file instead.
headers = {
# cookie: user/session information; user-agent: browser identity;
# referer: sent to get past the site's anti-crawler check.
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0",
"cookie":"_uuid=C910E215C-7103E-4E54-9515-10FF7599AAECF78232infoc; buvid_fp=e7bba7e4bae311a280afa29479dcd19d; buvid3=A7EFF3D3-823F-E623-0EC8-4DE13D84710980740infoc; b_nut=1709977881; buvid4=B0B9DD4D-D469-44F5-6A8F-B2BCC72B405380740-024030909-YSaors5wcVdXxPAyYUcK7w%3D%3D; CURRENT_FNVAL=4048; rpdid=0zbfAGEiQM|8kXSwyAg|26E|3w1Swd2X; DedeUserID=440181187; DedeUserID__ckMd5=7aea6a85d9bac605; CURRENT_QUALITY=80; header_theme_version=CLOSE; enable_web_push=DISABLE; SESSDATA=218ebe14%2C1742048643%2C74a54%2A92CjCwT-WOtbM-xsZKbPnRuVAbDVtv--QmJvHh6khSys1se0CpcdYi5_2hO4THAprUtX0SVjMxenRnVjNiQ0JDd1ZtLVU1OWs3OUdOczNYWlZ6aWU2dkg2RHpWb2hZc00xOWJjWm9MVjdGNUcwLVY1ckYzOC1BUHViR0dvajZoeWpQb1M3ek9DUC13IIEC; bili_jct=51295d2894045195c66fdadf6b1b7a91; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjY3NTU4OTQsImlhdCI6MTcyNjQ5NjYzNCwicGx0IjotMX0.Dr-RHjAMQ0qLpe3McjbIChH_d_S3UYkbXlb2doqMdWw; bili_ticket_expires=1726755834; home_feed_column=5; browser_resolution=1536-695; bp_t_offset_440181187=978115949112590336; sid=6xn7yn2o; b_lsid=BA647F49_1920109E5DD",
"referer":"https://www.bilibili.com/"
}
def video_bid(num1, keyword="2024巴黎奥运会"):
    """Collect the BV ids found on one page of Bilibili search results.

    Requests the search page for ``keyword`` and appends every ``bvid`` value
    found in the response text to the module-level ``list1``.

    Args:
        num1: Page number of the search results to request.
        keyword: Search term. Defaults to the term this script was written
            for, so existing ``video_bid(page)`` calls behave as before.
    """
    # The keyword is passed in by name rather than written as a quoted literal
    # inside the braces: reusing the enclosing quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    page_url = f"https://search.bilibili.com/all?keyword={keyword}&page={num1}"
    response = requests.get(url=page_url, headers=headers)  # send the request
    response.encoding = response.apparent_encoding
    # Pull every BV id out of the page text with a regular expression.
    data_list = re.findall(r'"bvid":"(.*?)"', response.text)
    # list1 is only mutated, never rebound, so no ``global`` statement is needed.
    list1.extend(data_list)
def video_cid(num2):
    """Look up the cid of a video and append it to the module-level ``list2``.

    Queries the player page-list API for the given BV id and reads the ``cid``
    of the first entry under ``data``. Videos whose response carries no such
    entry are reported and skipped, so one bad video does not abort the crawl.

    Args:
        num2: BV id of the video.
    """
    cid_url = f'https://api.bilibili.com/x/player/pagelist?bvid={num2}&jsonp=jsonp'
    response = requests.get(url=cid_url, headers=headers)  # send the request
    response.encoding = response.apparent_encoding
    content = json.loads(response.text)
    try:
        cid = content["data"][0]["cid"]
    except (KeyError, IndexError, TypeError):
        # "data" is missing, empty, or null, or its first entry has no "cid".
        print(f"skip {num2}: no cid in pagelist response")
        return
    # Accept either a single cid or a list of them.
    if isinstance(cid, list):
        list2.extend(cid)
    else:
        list2.append(cid)
def video_bullet(num3):
    """Download the danmu XML for one cid and store the text of each entry.

    The text of every ``<d>`` element in the response is appended to the
    module-level ``anslist``.

    Args:
        num3: cid number identifying the danmu file to download.
    """
    response = requests.get(
        url=f'https://comment.bilibili.com/{num3}.xml',  # danmu URL
        headers=headers,
    )
    response.encoding = response.apparent_encoding
    # Parse the XML response; each <d> element holds one danmu.
    document = BeautifulSoup(response.text, "xml")
    anslist.extend(node.text for node in document.find_all("d"))
if __name__ == '__main__':
    # Collect BV ids from search-result pages 1 through 10 (the original note
    # says each page lists 30 videos).  range() is required here: iterating
    # the tuple (1, 11) would visit only pages 1 and 11.
    for page in range(1, 11):
        video_bid(page)
    # Resolve a cid for every BV id.
    for bv in list1:
        video_cid(bv)
    # Download the danmu for every cid.
    for cid in list2:
        video_bullet(cid)
    # Write all danmu to a local file, one per line.
    blscn = '\n'.join(anslist)
    with open('所有弹幕.txt', mode='a', encoding='utf-8') as f:
        f.write(blscn)
    print("endprint")  # marker showing the script ran to completion

Binary file not shown.

Binary file not shown.
Loading…
Cancel
Save