|
|
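'''
Data processing and export: extract the AI-related keywords from the danmaku (bullet comments).

sep: separator placed between danmaku once they are converted to a string
isSaveText: flag indicating whether the danmaku data is also saved as text
xlsxPath: path of the danmaku data
saveToText: path the text data is saved to
saveAiPath: output path for the AI-related danmaku
keywords: the AI keywords
'''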
|
|
|
|
|
|
import re
|
|
|
import pandas as pd
|
|
|
from collections import Counter
|
|
|
from openpyxl import Workbook
|
|
|
|
|
|
# 读取弹幕文件
|
|
|
def ReadXlsx(filePath=''):
    """Load the first worksheet of an Excel workbook into a DataFrame.

    Args:
        filePath: Path of the workbook to read.

    Returns:
        The first sheet as a DataFrame, with every column that contains
        only missing values removed.
    """
    df = pd.read_excel(filePath, sheet_name=0)
    # dropna returns a new frame rather than modifying df, so its result
    # has to be kept; otherwise the all-empty columns are never removed.
    return df.dropna(axis=1, how='all')
|
|
|
|
|
|
# 将dataframe类型转为string类型
|
|
|
def ChangeDfToString(df, sep=',', isSave=False, filePath=''):
    """Flatten a DataFrame into a single separator-delimited string.

    The frame is rendered without index or header, missing values are
    rendered as empty text, newlines are turned into spaces, and every run
    of spaces is then replaced by one ``sep``. Cell text that itself
    contains spaces is therefore split at those spaces as well.

    Args:
        df: DataFrame to flatten.
        sep: Text inserted in place of each run of spaces. It is inserted
            literally, so the result can be split again with
            ``result.split(sep)``.
        isSave: When true, the result is also written to ``filePath``.
        filePath: Destination text file (written as UTF-8) used when
            ``isSave`` is true.

    Returns:
        The flattened string.
    """
    rendered = df.to_string(index=False, header=False, na_rep='')
    single_line = rendered.replace('\n', ' ')
    # A callable replacement keeps sep literal. Passing sep directly would
    # make re.sub interpret backslashes in it as template escapes (a lone
    # backslash raises re.error), so the output would not match sep.
    flattened = re.sub(' +', lambda _match: sep, single_line)
    if isSave:
        with open(filePath, mode="w", encoding='utf-8') as file:
            file.write(flattened)
    return flattened
|
|
|
|
|
|
# 根据关键词进行检索
|
|
|
def GetKeyFromList(keyWords, origin_list):
    """Count the entries that mention any keyword, most frequent first.

    Args:
        keyWords: Keywords to look for; an entry matches when at least one
            keyword is contained in it.
        origin_list: Entries to search.

    Returns:
        A list of ``(entry, count)`` tuples ordered by descending count.
    """
    tally = Counter()
    for entry in origin_list:
        for word in keyWords:
            if word in entry:
                tally[entry] += 1
                # One matching keyword is enough; count the entry once.
                break
    return tally.most_common()
|
|
|
|
|
|
def main():
    """Export the most frequent keyword-matching danmaku to an Excel file.

    Reads the danmaku workbook, flattens it to separator-delimited text
    (also saved to disk), counts the entries containing any keyword, writes
    the most frequent ones to a new workbook, and prints the full ranking.
    """
    sep = ','                                # separator between danmaku in the text form
    isSaveText = True                        # whether to also save the flattened text
    xlsxPath = './docs/barrage.xlsx'         # input danmaku workbook
    saveToText = './docs/allBarrage.txt'     # output path of the flattened text
    saveAiPath = './docs/aiBarrage.xlsx'     # output workbook for matching danmaku
    keywords = ['AI', '智能']
    top_n = 8                                # number of ranked rows written to the workbook

    df = ReadXlsx(xlsxPath)
    text = ChangeDfToString(df, sep, isSaveText, saveToText)
    s_list = text.split(sep)
    sorted_list = GetKeyFromList(keywords, s_list)

    wb = Workbook()
    ws = wb.active
    ws.cell(1, 1, '弹幕')
    ws.cell(1, 2, '次数')
    # Slice instead of indexing a fixed range: with fewer than top_n distinct
    # matches, sorted_list[i] would raise IndexError before the file is saved.
    for row, (barrage, count) in enumerate(sorted_list[:top_n], start=2):
        ws.cell(row, 1, barrage)
        ws.cell(row, 2, count)
    wb.save(saveAiPath)

    print(sorted_list)
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|