Update spider_main.py

Update select_content.py
8 changed files with 116 additions and 69 deletions
--- a/README.md
+++ b/README.md
@ -1,2 +1,9 @@
 # spider
+中途有修改
+注：仓库分为"图片"和"main"两个分支；如果点开后看不到全部文件，请在左上角的分支选择处切换到"图片"分支。

+spider_main.py-弹幕爬虫代码
+cloudimage.py-词云图代码
+select_content.py-筛选AI弹幕代码
+ballgame.py-附加题代码
+其中弹幕生成文件**all_content.txt**超过5MB无法上传
--- a/ballgame.py
+++ b/ballgame.py
@ -0,0 +1,20 @@
+import pandas as pd
+from collections import Counter
+
+# Read every danmaku (bullet comment) line from all_content.txt; readlines() keeps each line's trailing newline
+with open('all_content.txt', mode='r', encoding='utf-8') as f:
+    data_list = f.readlines()
+# Keywords for six ball sports
+keywords = ['乒乓球','羽毛球','排球','篮球','足球','网球']
+# Keep only the lines that contain at least one keyword (plain substring match)
+selectdanmu = [danmu for danmu in data_list if any(keyword in danmu for keyword in keywords)]
+
+# Count how often each identical line occurs and take the 20 most frequent
+num = Counter(selectdanmu)
+top_common = num.most_common(20)
+# Print the top 20 as (text, count) pairs
+print(top_common)
+t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])
+# Export the table to an Excel file (no index column)
+excel_path = 'top_ball_danmu.xlsx'  
+t.to_excel(excel_path, index=False)
--- a/cloudimage2.py
+++ b/cloudimage2.py
@ -9,7 +9,7 @@ text1=jieba.cut(text)
 # 以空格作为分隔符，将分词后的所有字符串合并成一个新的字符串
 text = ' '.join(text1)
 # 根据分词结果产生词云
-wc = WordCloud(font_path = "C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc",width=500, height=400, mode="RGBA", background_color=None).generate(text)
+wc = WordCloud(font_path = "C:\Windows\Fonts\Microsoft YaHei UI\msyh.ttc",width=618, height=500, mode="RGBA", background_color=None).generate(text)
 # 以图片的形式显示词云
 plt.imshow(wc, interpolation="bilinear")
 # 不显示图像坐标系
--- a/requirements.txt
+++ b/requirements.txt
--- a/select_content.py
+++ b/select_content.py
@ -0,0 +1,21 @@
+import pandas as pd
+from collections import Counter
+
+# Read every danmaku (bullet comment) line from all_content.txt; readlines() keeps each line's trailing newline
+with open('all_content.txt', mode='r', encoding='utf-8') as f:
+    data_list = f.readlines()
+# Keywords related to applications of AI technology (matched as case-sensitive substrings)
+ai_keywords = ['AI','人工智能','ai音效','ai视频','ai技术','机器学习', '深度学习', '自然语言处理','ai训练',
+            '大模型','自然语言处理','云计算','神经网络', '自动驾驶','ai设计','ai图','AI软件',]
+# Keep only the lines that contain at least one AI keyword
+selectdanmu = [danmu for danmu in data_list if any(keyword in danmu for keyword in ai_keywords)]
+
+# Count how often each identical line occurs and take the 8 most frequent
+num = Counter(selectdanmu)
+top_common = num.most_common(8)
+# Print the top eight as (text, count) pairs
+print(top_common)
+t = pd.DataFrame(top_common, columns=['弹幕内容', '数量'])
+# Export the table to an Excel file (no index column)
+excel_path = 'top8_ai_danmu.xlsx'  
+t.to_excel(excel_path, index=False)
--- a/spider_main.py
+++ b/spider_main.py
@ -3,7 +3,7 @@ import json
 import re
 import wordcloud
 from bs4 import BeautifulSoup
-from video_bid import videobid
+# from video_bid import videobid
 # 基本模块
 # 所需视频bid号列表、弹幕内容、cid号列表
 list1 = []
@ -34,7 +34,6 @@ def video_cid(num2):
    response.encoding = response.apparent_encoding
    content = json.loads(response.text)# 获取cid号
    datalist = []
-#    if "data" in content and "cid" in content["data"][0]:
    cid = content["data"][0]["cid"]
    if isinstance(cid, list):
        datalist.extend(cid)
@ -56,10 +55,10 @@ if __name__ == '__main__':
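    # 30 videos per page; meant to crawl bid numbers for pages 1-10. NOTE(review): (1,11) is a tuple, so this loop only visits pages 1 and 11 -- range(1, 11) is probably intended.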
    for page in (1,11):
        video_bid(page)
-    # 对于每个bid号爬取cid号
+    # 对于每个bid号 爬取cid号
    for bv in list1:
        video_cid(bv)
-    # 对于每个CID号，爬取弹幕内容
+    # 对于每个cid号 爬取弹幕内容
    for cid in list2:
        video_bullet(cid)
    # 将弹幕内容导出到本地文件
--- a/top8_ai_danmu.xlsx
+++ b/top8_ai_danmu.xlsx
--- a/top_ball_danmu.xlsx
+++ b/top_ball_danmu.xlsx
Author	SHA1	Message	Date
ptkbf2lr5	1973f58287	Update spider_main.py	5 months ago
ptkbf2lr5	be62c0356d	Update select_content.py	5 months ago
ptkbf2lr5	99a7f3587d	Update spider_main.py	5 months ago
ptkbf2lr5	cce5370169	Update cloudimage.py	5 months ago
ptkbf2lr5	ff7b6e8702	Delete 'cloudimage.py'	5 months ago
ptkbf2lr5	adee86021e	Update spider_main.py	5 months ago
ptkbf2lr5	235dedf111	Update select_content.py	5 months ago
ptkbf2lr5	f80d80b091	Update README.md	5 months ago
ptkbf2lr5	6d30d34b44	ADD file via upload	5 months ago
ptkbf2lr5	4db4b37a16	对球类喜好分析结果	5 months ago
ptkbf2lr5	872a9d2fde	对人们对球类项目的喜好程度分析	5 months ago
ptkbf2lr5	712ab4fb0f	ADD file via upload	5 months ago
ptkbf2lr5	da23d1b336	数量前八的弹幕excel文件	5 months ago
ptkbf2lr5	422e1fe9db	筛选AI有关弹幕	5 months ago
ptkbf2lr5	454eeb27bd	Update README.md	5 months ago
ptkbf2lr5	d7b1403538	Update README.md	5 months ago