Compare commits

...

16 Commits
图片 ... main

@ -1,2 +1,9 @@
# spider
中途有修改
注意:仓库分为"图片"和"main"两个分支。如果点开后看不到所有文件,可以在左上角的分支选择处切换到"图片"分支。
- spider_main.py:弹幕爬虫代码
- cloudimage.py:词云图代码
- select_content.py:筛选AI弹幕代码
- ballgame.py:附加题代码

其中弹幕生成文件 **all_content.txt** 超过5MB,无法上传。

@ -0,0 +1,20 @@
import pandas as pd
from collections import Counter
# File holding every crawled danmu, one per line.
INPUT_PATH = 'all_content.txt'
# Destination workbook for the ranking.
EXCEL_PATH = 'top_ball_danmu.xlsx'
# Number of most frequent danmu to keep.
TOP_N = 20
# Keywords for the six ball sports.
KEYWORDS = ['乒乓球', '羽毛球', '排球', '篮球', '足球', '网球']


def select_danmu(lines, keywords):
    """Return the danmu that contain at least one of ``keywords``.

    Args:
        lines: Iterable of danmu strings, with or without a trailing
            line break (e.g. an open text file or ``readlines()`` output).
        keywords: Substrings to look for.

    Returns:
        The matching danmu in input order, with trailing line breaks
        removed so identical danmu compare equal regardless of whether
        they were the last line of the file.
    """
    cleaned = (line.rstrip('\r\n') for line in lines)
    return [danmu for danmu in cleaned
            if any(keyword in danmu for keyword in keywords)]


def count_top(danmu_list, top_n):
    """Return the ``top_n`` most frequent danmu as ``(text, count)`` pairs."""
    return Counter(danmu_list).most_common(top_n)


def main():
    """Rank ball-sport danmu by frequency, print them and export to Excel."""
    # Read all danmu and keep only those mentioning a ball sport.
    with open(INPUT_PATH, mode='r', encoding='utf-8') as f:
        selected = select_danmu(f, KEYWORDS)
    # Count occurrences and keep the most frequent ones.
    top_common = count_top(selected, TOP_N)
    # Show the top 20 danmu.
    print(top_common)
    t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])
    # Export to an Excel file.
    t.to_excel(EXCEL_PATH, index=False)


if __name__ == '__main__':
    main()

@ -9,7 +9,7 @@ text1=jieba.cut(text)
# Join the segmented tokens with spaces into a single string
text = ' '.join(text1)
# Build the word cloud from the segmented text.
# The font path is a raw string so the backslashes in the Windows path are
# never interpreted as (invalid) escape sequences.
# NOTE(review): the first assignment below is immediately overwritten by the
# second one — it looks like the pre-change side of a diff; confirm and remove.
wc = WordCloud(font_path=r"C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc", width=500, height=400, mode="RGBA", background_color=None).generate(text)
wc = WordCloud(font_path=r"C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc", width=618, height=500, mode="RGBA", background_color=None).generate(text)
# Show the word cloud as an image
plt.imshow(wc, interpolation="bilinear")
# 不显示图像坐标系

Binary file not shown.

@ -0,0 +1,21 @@
import pandas as pd
from collections import Counter
# File holding every crawled danmu, one per line.
INPUT_PATH = 'all_content.txt'
# Destination workbook for the ranking.
EXCEL_PATH = 'top8_ai_danmu.xlsx'
# Number of most frequent danmu to keep.
TOP_N = 8
# Keywords related to AI technology and its applications.
# Matching is a case-sensitive substring test.
AI_KEYWORDS = [
    'AI', '人工智能', 'ai音效', 'ai视频', 'ai技术', '机器学习', '深度学习',
    '自然语言处理', 'ai训练', '大模型', '云计算', '神经网络', '自动驾驶',
    'ai设计', 'ai图', 'AI软件',
]


def select_danmu(lines, keywords):
    """Return the danmu that contain at least one of ``keywords``.

    Args:
        lines: Iterable of danmu strings, with or without a trailing
            line break (e.g. an open text file or ``readlines()`` output).
        keywords: Substrings to look for.

    Returns:
        The matching danmu in input order, with trailing line breaks
        removed so identical danmu compare equal regardless of whether
        they were the last line of the file.
    """
    cleaned = (line.rstrip('\r\n') for line in lines)
    return [danmu for danmu in cleaned
            if any(keyword in danmu for keyword in keywords)]


def count_top(danmu_list, top_n):
    """Return the ``top_n`` most frequent danmu as ``(text, count)`` pairs."""
    return Counter(danmu_list).most_common(top_n)


def main():
    """Rank AI-related danmu by frequency, print them and export to Excel."""
    # Read all danmu and keep only those mentioning an AI keyword.
    with open(INPUT_PATH, mode='r', encoding='utf-8') as f:
        selected = select_danmu(f, AI_KEYWORDS)
    # Count occurrences and keep the most frequent ones.
    top_common = count_top(selected, TOP_N)
    # Show the top eight danmu.
    print(top_common)
    t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])
    # Export to an Excel file.
    t.to_excel(EXCEL_PATH, index=False)


if __name__ == '__main__':
    main()

@ -3,7 +3,7 @@ import json
import re
import wordcloud
from bs4 import BeautifulSoup
from video_bid import videobid
# from video_bid import videobid
# 基本模块
# 所需视频bid号列表、弹幕内容、cid号列表
list1 = []
@ -34,7 +34,6 @@ def video_cid(num2):
response.encoding = response.apparent_encoding
content = json.loads(response.text)# 获取cid号
datalist = []
# if "data" in content and "cid" in content["data"][0]:
cid = content["data"][0]["cid"]
if isinstance(cid, list):
datalist.extend(cid)
@ -56,10 +55,10 @@ if __name__ == '__main__':
# 30 videos per page: crawl the bid numbers of pages 1 through 10.
# range(1, 11) is required here — iterating the tuple (1, 11) would only
# visit page 1 and page 11.
for page in range(1, 11):
    video_bid(page)
# For every bid number, crawl its cid number
for bv in list1:
    video_cid(bv)
# For every cid number, crawl the danmu content
for cid in list2:
    video_bullet(cid)
# 将弹幕内容导出到本地文件

Binary file not shown.

Binary file not shown.
Loading…
Cancel
Save