You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

65 lines
1.9 KiB

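'''
Danmaku (bullet-comment) processing and export: extract the comments that
mention AI-related keywords.

Configuration names used in main():
sep        -- separator placed between comments once they are flattened into one string
isSaveText -- flag indicating whether the flattened comments are also saved as a text file
xlsxPath   -- path of the danmaku spreadsheet to read
saveToText -- path of the text file the flattened comments are saved to
saveAiPath -- path of the output workbook holding the AI-related comments
keywords   -- the AI-related keywords to search for
'''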
import re
import pandas as pd
from collections import Counter
from openpyxl import Workbook
3 months ago
# Read the danmaku (bullet-comment) spreadsheet
def ReadXlsx(filePath=''):
    """Load the first worksheet of an Excel workbook into a DataFrame.

    Columns in which every cell is empty (NaN) are removed from the result.

    Args:
        filePath: Path of the workbook to read.

    Returns:
        A pandas DataFrame holding the first sheet, without all-NaN columns.
    """
    df = pd.read_excel(filePath, sheet_name=0)
    # dropna() returns a new frame instead of modifying df in place, so its
    # result must be the value that is returned.
    return df.dropna(axis=1, how='all')
3 months ago
# Convert a DataFrame into one separator-delimited string
def ChangeDfToString(df, sep=',', isSave=False, filePath=''):
    """Flatten every cell of ``df`` into a single ``sep``-delimited string.

    The frame is rendered without index or header, line breaks are treated
    as spaces, and each run of spaces becomes one separator. Cell text that
    itself contains spaces is therefore split into several tokens.

    Args:
        df: DataFrame to flatten; NaN cells are rendered as empty text.
        sep: Separator inserted between tokens. It is used literally.
        isSave: When true, the resulting string is also written to filePath.
        filePath: UTF-8 text file to write when isSave is true.

    Returns:
        The flattened string, without a leading or trailing separator.
    """
    if df.empty:
        # For a frame without cells to_string() renders an "Empty DataFrame"
        # placeholder, which must not be mistaken for data.
        flattened = ''
    else:
        rendered = df.to_string(index=False, header=False, na_rep='')
        # Split on spaces and drop the empty pieces: this collapses runs of
        # spaces and discards the column padding at both ends. Joining with
        # sep keeps the separator literal (no regex-template escapes).
        tokens = [tok for tok in rendered.replace('\n', ' ').split(' ') if tok]
        flattened = sep.join(tokens)
    if isSave:
        with open(filePath, mode='w', encoding='utf-8') as file:
            file.write(flattened)
    return flattened
3 months ago
# Search the entries by keyword
def GetKeyFromList(keyWords, origin_list):
    """Count the entries that contain at least one keyword.

    Args:
        keyWords: Keywords to look for; matching is a plain substring test.
        origin_list: Entries to search.

    Returns:
        A list of ``(entry, occurrences)`` tuples for the matching entries,
        ordered from most to least frequent. Entries with equal counts keep
        the order in which they first appeared.
    """
    def _mentions_keyword(entry):
        for word in keyWords:
            if word in entry:
                return True
        return False

    tally = Counter(filter(_mentions_keyword, origin_list))
    # most_common() without an argument sorts all items by count, descending.
    return tally.most_common()
def main():
    """Extract the most frequent keyword-matching comments into a workbook.

    Reads the danmaku spreadsheet, flattens it into a separator-delimited
    string (also saved as text), counts the entries containing a keyword,
    writes up to the eight most frequent ones to an Excel workbook and
    prints the complete ranking.
    """
    sep = ','
    isSaveText = True
    xlsxPath = './docs/barrage.xlsx'
    saveToText = './docs/allBarrage.txt'
    saveAiPath = './docs/aiBarrage.xlsx'
    keywords = ['AI', '智能']
    top_n = 8  # number of ranking rows written to the workbook

    df = ReadXlsx(xlsxPath)
    flattened = ChangeDfToString(df, sep, isSaveText, saveToText)
    sorted_list = GetKeyFromList(keywords, flattened.split(sep))

    wb = Workbook()
    ws = wb.active
    # Header row: comment text and its number of occurrences.
    ws.cell(1, 1, '弹幕')
    ws.cell(1, 2, '次数')
    # Slice rather than index a fixed range so that fewer than top_n
    # matches do not raise IndexError.
    for row, (text, count) in enumerate(sorted_list[:top_n], start=2):
        ws.cell(row, 1, text)
        ws.cell(row, 2, count)
    wb.save(saveAiPath)
    print(sorted_list)


# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()