parent 1923632812
commit 5809c0a5f6
(new image file added: 212 KiB)
@@ -0,0 +1,21 @@
import pandas as pd


def top_8_ai_barrage(file_path):
    # Read the CSV file
    all_barrage = pd.read_csv(file_path, encoding='utf-8')

    # Keep only barrages that mention "AI", "人工智能" (artificial intelligence),
    # "科技" (technology), or "智能" (smart/intelligent)
    ai_barrage = all_barrage[all_barrage['barrage'].str.contains('AI|人工智能|科技|智能', case=False, na=False)]

    # Count how many times each distinct barrage appears
    counter = ai_barrage['barrage'].value_counts()

    # Take the 8 most frequent entries
    top_8 = counter.head(8).reset_index()
    top_8.columns = ['弹幕', '出现次数']  # barrage text, occurrence count
    return top_8


if __name__ == '__main__':
    print(top_8_ai_barrage('barrage.csv'))
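For reference, a minimal sketch of the shape top_8_ai_barrage returns, run against a hypothetical in-memory sample rather than barrage.csv (assuming the real file has the same single 'barrage' column):

import pandas as pd

# Fabricated sample rows, not real scraped data
sample = pd.DataFrame({'barrage': ['AI太强了', 'AI太强了', '人工智能加油', '精彩']})
matched = sample[sample['barrage'].str.contains('AI|人工智能|科技|智能', case=False, na=False)]
top = matched['barrage'].value_counts().head(8).reset_index()
top.columns = ['弹幕', '出现次数']
print(top)  # 'AI太强了' appears with count 2, '人工智能加油' with count 1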
(3 file diffs suppressed because they are too large to display)
@@ -0,0 +1,44 @@
import pandas as pd
import jieba
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Tokenize barrage text with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)


def cluster_analysis():
    df = pd.read_csv('barrage.csv')
    # NLTK's Chinese stopword list, extended with a few corpus-specific tokens
    stopwords_list = list(stopwords.words('chinese'))
    stopwords_list.extend(['都', '不', '好', '5'])

    # Extract TF-IDF features
    vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer, stop_words=stopwords_list)
    tfidf_matrix = vectorizer.fit_transform(df['barrage'])

    # Cluster the barrages with KMeans
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)

    # Attach the cluster labels to the original data
    df['cluster'] = km.labels_

    # Print the first few barrages of each cluster
    for i in range(num_clusters):
        print(f"cluster {i}:")
        print(df[df['cluster'] == i]['barrage'].head(10))
        print("\n")

    # Save the clustering result to a CSV file
    df.to_csv('barrage_clustered.csv', index=False)


if __name__ == '__main__':
    cluster_analysis()
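One way to see what each cluster is about (a hedged sketch, not part of the committed script) is to rank each KMeans centroid's TF-IDF weights and print its strongest terms; this assumes the vectorizer and km objects built inside cluster_analysis above are passed in:

def print_cluster_terms(vectorizer, km, num_clusters, top_n=5):
    # Highest-weighted TF-IDF terms per centroid, strongest first
    terms = vectorizer.get_feature_names_out()
    order = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        print(f"cluster {i}: {' / '.join(terms[idx] for idx in order[i, :top_n])}")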
@@ -0,0 +1,61 @@
import os
import time
import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search bilibili for "2024巴黎奥运会" (2024 Paris Olympics) and return the
    # bvid of the video at position pos on result page page (pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    print(response)
    json_dict = json.loads(response)
    # Take the video list nested at result[11] of the mixed search response
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Download the barrage (danmaku) XML for a cid and append it to barrage.csv
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regex pattern: each barrage sits inside a <d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the element text from the page data
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('barrage.csv'):
        df.to_csv('barrage.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('barrage.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


if __name__ == '__main__':
    # Walk 15 search pages with 20 videos each, pausing between requests
    for i in range(15):
        for j in range(20):
            get_barrage(get_cid(get_bvid(i, j)))
            time.sleep(1)
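The regex in get_barrage relies on the list.so response wrapping each barrage in a <d> element. A small self-contained check against a fabricated fragment of that shape (the p attribute values here are made up, not real timing metadata):

import re

sample_xml = '<i><d p="12.3,1,25,16777215">加油</d><d p="45.6,1,25,16777215">太精彩了</d></i>'
print(re.findall('<d.*?>(.*?)</d>', sample_xml))  # ['加油', '太精彩了']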
@@ -0,0 +1,68 @@
import os
import time
from concurrent.futures import ThreadPoolExecutor

import requests
import re
import json
import pandas as pd


os.environ['NO_PROXY'] = 'bilibili.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
COOKIE = "buvid3=5CD968B6-5E6F-AA6C-D8BB-422744C1DB0054109infoc; b_nut=1673696354; _uuid=1092729F9-10FE3-DB8C-64A2-F27261E3B43153165infoc; buvid4=82DFB562-AF5C-F20C-AC96-E71C089E97E355884-023011419-hBTxrxbr8pWyUVDiIX7ZVw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)luk|llRR0J'uY~RkuJ|Ju; buvid_fp_plain=undefined; i-wanna-go-back=-1; nostalgia_conf=-1; b_ut=5; header_theme_version=CLOSE; LIVE_BUVID=AUTO6216768194613696; home_feed_column=4; CURRENT_PID=8e025d90-cb08-11ed-8e36-15f3cf3099af; CURRENT_QUALITY=80; browser_resolution=1392-786; FEED_LIVE_VERSION=V_SEO_FIRST_CARD; fingerprint=efbe80e589d57838b8ff20cb5df98e9d; buvid_fp=a1198a46c9f71d42436ace10a9ab7448; bili_jct=416c2ad96c39091affdb9092e9a593d9; DedeUserID=414937912; DedeUserID__ckMd5=a09ace8891a7091a; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2OTQwODgxODUsImlhdCI6MTY5MzgyODk4NSwicGx0IjotMX0.pEBW75b3VX6p2lqx7jPuUpPvrAHz0QIoHtagLMp3_iU; bili_ticket_expires=1694088185; bp_video_offset_1480857975=837478778313637938; bp_video_offset_414937912=838225299482083363; sid=6udo3o89; PVID=3; b_lsid=B11E5210A_18A6ABF8D38"


def get_bvid(page, pos):
    # Search bilibili for "2024巴黎奥运会" and return the bvid of the video at
    # position pos on result page page (pages are 1-based)
    url = 'https://api.bilibili.com/x/web-interface/search/all/v2?page=' + str(page + 1) + '&keyword=2024巴黎奥运会'
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    print(response)
    json_dict = json.loads(response)
    # Take the video list nested at result[11] of the mixed search response
    return json_dict["data"]["result"][11]["data"][pos]['bvid']


def get_cid(bvid):
    # Resolve a video's bvid to the cid of its first part
    url = f"https://api.bilibili.com/x/player/pagelist?bvid={bvid}&jsonp=jsonp"
    headers = {
        'User-Agent': USER_AGENT,
        'cookie': COOKIE,
    }
    response = requests.get(url=url, headers=headers, verify=False).text
    json_dict = json.loads(response)
    cid = json_dict['data'][0]['cid']
    print(cid)
    return cid


def get_barrage(cid):
    # Download the barrage XML for a cid and append it to 2.csv; note that
    # nothing here synchronizes concurrent writers (see the sketch after this file)
    url = 'https://api.bilibili.com/x/v1/dm/list.so?oid=' + str(cid)
    headers = {
        "User-Agent": USER_AGENT
    }
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    # Regex pattern: each barrage sits inside a <d> element
    res = re.compile('<d.*?>(.*?)</d>')
    # Extract the element text from the page data
    barrage = re.findall(res, html_doc)
    df = pd.DataFrame(barrage, columns=['barrage'])
    if not os.path.isfile('2.csv'):
        df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')


def process_page(page, pos):
    # Fetch one video's barrages end to end: bvid -> cid -> danmaku
    bvid = get_bvid(page, pos)
    cid = get_cid(bvid)
    get_barrage(cid)


if __name__ == '__main__':
    # Fan the 15 x 20 video grid out over a pool of 10 worker threads
    with ThreadPoolExecutor(max_workers=10) as executor:
        for i in range(15):
            for j in range(20):
                executor.submit(process_page, i, j)
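With up to 10 workers, several threads can reach the isfile check and the CSV append at the same time, so rows from different videos may interleave mid-write. A sketch of one way to serialize the writes with a module-level lock (save_barrage is a hypothetical helper that would replace the tail of get_barrage):

import os
import threading

csv_lock = threading.Lock()

def save_barrage(df):
    # Only one thread at a time may test for and write the CSV
    with csv_lock:
        if not os.path.isfile('2.csv'):
            df.to_csv('2.csv', mode='w', index=False, encoding='utf-8-sig')
        else:
            df.to_csv('2.csv', mode='a', index=False, header=False, encoding='utf-8-sig')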
@@ -0,0 +1,20 @@
import pandas as pd
from snownlp import SnowNLP


def sentiment_analysis(text):
    # SnowNLP scores sentiment in [0, 1]; values near 1 read as positive
    s = SnowNLP(text)
    return s.sentiments


def do_sentiment_analysis(filename):
    df = pd.read_csv(filename)
    # Score the sentiment of every barrage
    df['sentiment'] = df['barrage'].apply(sentiment_analysis)
    print(df.head())
    # Save the sentiment scores to a CSV file
    df.to_csv('barrage_sentiment.csv', index=False)


if __name__ == '__main__':
    do_sentiment_analysis('barrage.csv')
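The continuous scores are often bucketed into labels downstream; a minimal sketch, assuming the usual reading of SnowNLP's output (near 1 positive, near 0 negative) and arbitrary 0.4/0.6 cutoffs that are not part of the committed script:

import pandas as pd

df = pd.read_csv('barrage_sentiment.csv')
# Three bins over [0, 1]; include_lowest keeps exact-zero scores in 'negative'
df['label'] = pd.cut(df['sentiment'], bins=[0, 0.4, 0.6, 1],
                     labels=['negative', 'neutral', 'positive'], include_lowest=True)
print(df['label'].value_counts())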
@@ -0,0 +1,36 @@
import jieba
import nltk
import numpy as np
from PIL import Image
from nltk.corpus import stopwords
from wordcloud import WordCloud


def get_wordcloud(file_name):
    nltk.download('stopwords')
    with open(file_name, encoding='utf-8') as f:
        txt = f.read()
    # Segment the Chinese text with jieba and space-join it so WordCloud
    # can split it into words
    txt_list = jieba.lcut(txt)
    string = ' '.join(txt_list)
    # Use an image as the shape mask for the cloud
    mask_image = "2.png"
    mask = np.array(Image.open(mask_image))
    # NLTK's Chinese stopword list, extended with a few corpus-specific tokens
    stopwords_list = set(stopwords.words('chinese'))
    stopwords_list.update(['都', '不', '好', '哈哈哈', '说', '还', '很', '没'])
    w = WordCloud(
        mask=mask,
        width=mask.shape[1],
        height=mask.shape[0],
        background_color='white',
        font_path='C:/Windows/Fonts/STLITI.TTF',  # a font with Chinese glyphs
        stopwords=stopwords_list,
    )

    w.generate(string)
    # Write the word-cloud image to disk
    w.to_file('wordcloud.jpg')


if __name__ == '__main__':
    get_wordcloud('barrage.csv')
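An alternative sketch (not the committed approach): count jieba tokens directly and hand the frequencies to WordCloud.generate_from_frequencies, which skips the space-joining step; stopword filtering then has to happen in the counter, since generate_from_frequencies does not apply the stopwords argument:

from collections import Counter

import jieba
from wordcloud import WordCloud

def wordcloud_from_frequencies(text, stopwords_list, font_path):
    # Drop whitespace tokens and stopwords before counting
    freqs = Counter(tok for tok in jieba.lcut(text)
                    if tok.strip() and tok not in stopwords_list)
    w = WordCloud(font_path=font_path, background_color='white')
    return w.generate_from_frequencies(freqs)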