You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

62 lines
1.8 KiB

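'''
Process and export bullet-comment (danmaku) data to extract AI-related keywords.

Settings:
    sep         separator placed between comments once the data is flattened into a string
    isSaveText  flag indicating whether the comment data is also saved as text
    xlsxPath    path of the comment data (Excel workbook)
    saveToText  path the flattened text data is saved to
    saveAiPath  path the AI-related comments are written to
    keywords    the AI keywords
'''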
import re
import pandas as pd
from collections import Counter
from openpyxl import Workbook
def ReadXlsx(filePath=''):
    """Load the first sheet of an Excel workbook, without its empty columns.

    Args:
        filePath: Path of the workbook; passed straight to
            ``pandas.read_excel``.

    Returns:
        A DataFrame holding the first sheet (``sheet_name=0``) with every
        column that contains only missing values removed.
    """
    df = pd.read_excel(filePath, sheet_name=0)
    # dropna() returns a new frame instead of modifying df in place, so its
    # result must be kept; discarding it leaves the empty columns in.
    return df.dropna(axis=1, how='all')
def ChangeDfToString(df, sep=',', isSave=False, filePath=''):
    """Flatten a DataFrame into a single ``sep``-delimited string.

    The frame is rendered as text without index or header (missing values
    become empty strings), line breaks are turned into spaces, and every run
    of spaces is then replaced by ``sep``.

    Args:
        df: DataFrame to flatten.
        sep: Replacement for each run of spaces.
        isSave: When true, the result is also written to ``filePath``.
        filePath: UTF-8 text file to write when ``isSave`` is true.

    Returns:
        The flattened string.
    """
    rendered = df.to_string(index=False, header=False, na_rep='')
    flattened = re.sub(' +', sep, rendered.replace('\n', ' '))
    if not isSave:
        return flattened
    with open(filePath, mode="w", encoding='utf-8') as out:
        out.write(flattened)
    return flattened
def GetKeyFromList(keyWords, origin_list):
    """Count the entries that mention a keyword, most frequent first.

    Args:
        keyWords: Keywords to look for; an entry matches when at least one
            keyword occurs in it (``keyword in entry``).
        origin_list: Entries to scan.

    Returns:
        A list of ``(entry, count)`` tuples for the matching entries, in
        descending order of count.
    """
    def mentions_keyword(entry):
        for word in keyWords:
            if word in entry:
                return True
        return False

    tally = Counter(filter(mentions_keyword, origin_list))
    # With no argument, most_common() returns every (entry, count) pair
    # sorted by count in descending order.
    return tally.most_common()
def main():
    """Extract the AI-related comments from the workbook and save the top ones.

    Reads ``./docs/barrage.xlsx``, flattens it into a ``sep``-delimited
    string (also saved to ``./docs/allBarrage.txt``), counts the entries that
    contain one of ``keywords``, writes at most the eight most frequent ones
    to ``./docs/aiBarrage.xlsx`` under a header row, and prints the complete
    ranking.
    """
    sep = ','
    isSaveText = True
    xlsxPath = './docs/barrage.xlsx'
    saveToText = './docs/allBarrage.txt'
    saveAiPath = './docs/aiBarrage.xlsx'
    keywords = ['AI', '智能']
    top_n = 8  # number of ranked comments written to the output workbook

    df = ReadXlsx(xlsxPath)
    flat_text = ChangeDfToString(df, sep, isSaveText, saveToText)
    sorted_list = GetKeyFromList(keywords, flat_text.split(sep))

    wb = Workbook()
    ws = wb.active
    ws.cell(1, 1, '弹幕')
    ws.cell(1, 2, '次数')
    # Iterate over a slice rather than a fixed range: with fewer than top_n
    # matching comments, indexing sorted_list[i] would raise IndexError
    # before the workbook is saved.
    for row, (comment, count) in enumerate(sorted_list[:top_n], start=2):
        ws.cell(row, 1, comment)
        ws.cell(row, 2, count)
    wb.save(saveAiPath)
    print(sorted_list)
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()