@@ -0,0 +1,21 @@
import pandas as pd


def top_8_ai_barrage(file_path):
    # Read the CSV file of collected barrages
    all_barrage = pd.read_csv(file_path, encoding='utf-8')

    # Keep barrages that mention AI-related keywords; the trailing space in
    # "AI " avoids matching those letters inside longer English tokens
    ai_barrage = all_barrage[all_barrage['barrage'].str.contains('AI |人工智能|科技|智能', case=False, na=False)]

    # Count how many times each distinct barrage appears
    counter = ai_barrage['barrage'].value_counts()

    # Take the 8 most frequent barrages
    top_8 = counter.head(8).reset_index()
    top_8.columns = ['弹幕', '出现次数']
    return top_8


if __name__ == '__main__':
    print(top_8_ai_barrage('barrage.csv'))
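The keyword pattern and the top-8 cutoff are hard-coded above. A minimal sketch of a parameterized variant (the helper name top_n_barrage is hypothetical, not part of this repo):

import pandas as pd

def top_n_barrage(file_path, pattern, n=8):
    # Same logic as top_8_ai_barrage, with the regex and cutoff as arguments
    all_barrage = pd.read_csv(file_path, encoding='utf-8')
    matched = all_barrage[all_barrage['barrage'].str.contains(pattern, case=False, na=False)]
    top_n = matched['barrage'].value_counts().head(n).reset_index()
    top_n.columns = ['弹幕', '出现次数']
    return top_n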
@@ -0,0 +1,44 @@
import pandas as pd
import jieba
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Tokenize barrage text with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)


def cluster_analysis():
    df = pd.read_csv('barrage.csv')
    # NLTK's Chinese stopword list, extended with a few high-frequency tokens
    stopwords_list = list(stopwords.words('chinese'))
    stopwords_list.extend(['都', '不', '好', '5'])

    # Extract features with TF-IDF
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])

    # Cluster with KMeans
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)

    # Attach the cluster labels to the original data
    df['cluster'] = km.labels_

    # Print the first few barrages of each cluster
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")

    # Save the clustering result to a CSV file
    df.to_csv('barrage_clustered.csv', index=False)


if __name__ == '__main__':
    cluster_analysis()
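Raw example barrages can be hard to skim per cluster. A sketch that prints the highest-weighted TF-IDF terms of each centroid instead (print_top_terms is a hypothetical helper; it assumes scikit-learn >= 1.0 for get_feature_names_out):

def print_top_terms(km, vectorizer, n_terms=8):
    # cluster_centers_ rows are TF-IDF centroids; argsort descending per row
    terms = vectorizer.get_feature_names_out()
    order = km.cluster_centers_.argsort()[:, ::-1]
    for i, row in enumerate(order):
        print(f"cluster {i}: " + " ".join(terms[j] for j in row[:n_terms]))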
@@ -0,0 +1,61 @@
import os
import time
import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search for "2024巴黎奥运会" and return the bvid of the pos-th video
    # on the given results page (the API pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    # result[11] is assumed to be the "video" block of the mixed search results
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Download the barrage XML for a cid and append its text to barrage.csv
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regular expression that captures the text of each <d>...</d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the barrage text from the response
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('barrage.csv'):
        df.to_csv('barrage.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('barrage.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


if __name__ == '__main__':
    for i in range(15):
        for j in range(20):
            get_barrage(get_cid(get_bvid(i, j)))
            time.sleep(1)
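get_bvid assumes the "video" block always sits at result[11] of the mixed search response, which breaks whenever the result ordering changes. A defensive lookup, assuming each block carries a result_type field as observed in this endpoint's responses (find_video_bvid is a hypothetical helper):

def find_video_bvid(json_dict, pos):
    # Scan the mixed result blocks instead of relying on a fixed index
    for block in json_dict["data"]["result"]:
        if block.get("result_type") == "video":
            return block["data"][pos]["bvid"]
    raise KeyError("no video block in search response")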
@@ -0,0 +1,68 @@
import os
from concurrent.futures import ThreadPoolExecutor

import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search for "2024巴黎奥运会" and return the bvid of the pos-th video
    # on the given results page (the API pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    # result[11] is assumed to be the "video" block of the mixed search results
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Same extraction as the sequential crawler, but appending to 2.csv.
    # Note: worker threads append to this file without synchronization,
    # so rows from different videos may interleave.
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regular expression that captures the text of each <d>...</d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the barrage text from the response
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('2.csv'):
        df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


def process_page(page, pos):
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    get_barrage(cid)


if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(15):
            for j in range(20):
                executor.submit(process_page, i, j)
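Because all ten workers append to 2.csv through get_barrage, the writes can interleave. A sketch that serializes just the file write with a lock (process_page_safe is a hypothetical wrapper around the functions defined above):

import threading

csv_lock = threading.Lock()

def process_page_safe(page, pos):
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    with csv_lock:
        # Only the CSV append needs to be exclusive
        get_barrage(cid)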
@@ -0,0 +1,20 @@
import pandas as pd
from snownlp import SnowNLP


def sentiment_analysis(text):
    # SnowNLP returns a positivity score in [0, 1]
    s = SnowNLP(text)
    return s.sentiments


def do_sentiment_analysis(filename):
    df = pd.read_csv(filename)
    # Run sentiment analysis on every barrage
    df['sentiment'] = df['barrage'].apply(sentiment_analysis)
    print(df.head())
    # Save the sentiment scores to a CSV file
    df.to_csv('barrage_sentiment.csv', index=False)


if __name__ == '__main__':
    do_sentiment_analysis('barrage.csv')
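SnowNLP's sentiments value is a positivity probability, so the saved scores can be bucketed directly. A follow-up sketch using the conventional 0.5 threshold (the threshold is an assumption, not tuned for this corpus):

import pandas as pd

df = pd.read_csv('barrage_sentiment.csv')
# Label each barrage and print the positive/negative share
df['label'] = df['sentiment'].apply(lambda s: 'positive' if s >= 0.5 else 'negative')
print(df['label'].value_counts(normalize=True))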
@@ -0,0 +1,36 @@
import jieba
import nltk
import numpy as np
import PIL.Image as image
from nltk.corpus import stopwords
from wordcloud import WordCloud


def get_wordcloud(file_name):
    # Make sure the NLTK stopword corpus is available
    nltk.download('stopwords')
    with open(file_name, encoding='utf-8') as f:
        txt = f.read()
    # Segment the Chinese text and join with spaces for WordCloud
    txt_list = jieba.lcut(txt)
    string = ' '.join(txt_list)
    # Use 2.png as the shape mask of the cloud
    mask_image = "2.png"
    mask = np.array(image.open(mask_image))
    # NLTK's Chinese stopword list, extended with a few high-frequency tokens
    stopwords_list = set(stopwords.words('chinese'))
    stopwords_list.update(['都', '不', '好', '哈哈哈', '说', '还', '很', '没'])
    w = WordCloud(
        mask=mask,
        width=mask.shape[1],
        height=mask.shape[0],
        background_color='white',
        font_path='C:/Windows/Fonts/STLITI.TTF',  # a font that can render Chinese
        stopwords=stopwords_list,
    )

    w.generate(string)
    # Save the word cloud image
    w.to_file('wordcloud.jpg')


if __name__ == '__main__':
    get_wordcloud('barrage.csv')
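get_wordcloud reads barrage.csv as raw text, so the CSV header and any quoting also flow into jieba. A sketch that segments only the barrage column instead (assumes the barrage.csv layout produced by the crawler above):

import jieba
import pandas as pd

df = pd.read_csv('barrage.csv')
# Join the column values, then segment the combined text
string = ' '.join(jieba.lcut(' '.join(df['barrage'].astype(str))))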