"""Barrage (danmaku) data processing and export, with AI-keyword extraction.

Configuration values used by ``main``:

* ``sep``        - separator placed between comments once the sheet has
                   been flattened into a single string.
* ``isSaveText`` - whether the flattened comments are also written to a
                   text file.
* ``xlsxPath``   - path of the workbook holding the barrage data.
* ``saveToText`` - path of the text dump of all comments.
* ``saveAiPath`` - path of the workbook receiving the AI-related comments.
* ``keywords``   - substrings that mark a comment as AI-related.
"""
import re
import pandas as pd
from collections import Counter
from openpyxl import Workbook


def ReadXlsx(filePath=''):
    """Load the first sheet of an Excel workbook, without empty columns.

    Args:
        filePath: Path of the ``.xlsx`` file to read.

    Returns:
        A ``pandas.DataFrame`` of the first sheet with every column that
        contains only missing values removed.
    """
    df = pd.read_excel(filePath, sheet_name=0)
    # ``dropna`` returns a new frame; its result has to be kept, otherwise
    # the call has no effect at all.
    return df.dropna(axis=1, how='all')


def ChangeDfToString(df, sep=',', isSave=False, filePath=''):
    """Flatten a DataFrame into one ``sep``-separated string.

    The frame is rendered as text (no index, no header, missing values as
    empty strings), line breaks become spaces and every run of spaces is
    replaced by ``sep``.

    Args:
        df: DataFrame to flatten.
        sep: Separator inserted between cells; it is used literally.
        isSave: When true, the resulting string is also written to
            ``filePath`` (UTF-8).
        filePath: Destination of the text dump; only used when ``isSave``
            is true.

    Returns:
        The flattened string. An empty DataFrame yields ``''``.
    """
    if df.empty:
        # ``to_string`` renders an "Empty DataFrame ..." placeholder for an
        # empty frame; that placeholder must not be treated as data.
        flattened = ''
    else:
        table_text = df.to_string(index=False, header=False, na_rep='')
        one_line = table_text.replace('\n', ' ')
        # A callable replacement keeps ``sep`` literal: passed as a plain
        # string it would be parsed as a regex template, so a separator
        # containing a backslash would raise or be misinterpreted.
        flattened = re.sub(' +', lambda _match: sep, one_line)

    if isSave:
        with open(filePath, mode="w", encoding='utf-8') as file:
            file.write(flattened)
    return flattened


def GetKeyFromList(keyWords, origin_list):
    """Count the entries that contain at least one keyword.

    Args:
        keyWords: Iterable of substrings to look for.
        origin_list: Iterable of strings to filter.

    Returns:
        A list of ``(entry, count)`` tuples ordered by descending count;
        entries with equal counts keep their first-seen order.
    """
    matching = [
        item for item in origin_list
        if any(keyword in item for keyword in keyWords)
    ]
    # ``most_common()`` sorts by count, descending, and is stable for ties.
    return Counter(matching).most_common()


def main():
    """Read the barrage sheet and export the most frequent AI comments."""
    sep = ','
    isSaveText = True
    xlsxPath = './docs/barrage.xlsx'
    saveToText = './docs/allBarrage.txt'
    saveAiPath = './docs/aiBarrage.xlsx'
    keywords = ['AI', '智能']
    top_n = 8  # number of ranked comments written to the output workbook

    df = ReadXlsx(xlsxPath)
    all_text = ChangeDfToString(df, sep, isSaveText, saveToText)
    s_list = all_text.split(sep)
    sorted_list = GetKeyFromList(keywords, s_list)

    wb = Workbook()
    ws = wb.active
    ws.cell(1, 1, '弹幕')
    ws.cell(1, 2, '次数')
    # Slicing caps the output at ``top_n`` rows and stays safe when fewer
    # comments matched (indexing a fixed range would raise IndexError).
    for row, (barrage, count) in enumerate(sorted_list[:top_n], start=2):
        ws.cell(row, 1, barrage)
        ws.cell(row, 2, count)
    wb.save(saveAiPath)
    print(sorted_list)


if __name__ == '__main__':
    main()