Update task2.py

1 year ago · db810988e3
parent c3f128c9eb
commit db810988e3
1 changed file with 66 additions and 48 deletions
--- a/task2.py
+++ b/task2.py
@@ -1,48 +1,66 @@
-import os
+import os
-import re
+import re
-import os
+import os
-import re
+import re
-from openpyxl import Workbook
+from openpyxl import Workbook
-
+
-# 定义文件路径
+
-test_path = 'd:\\学习\\软件工程\\swork\\res\\total300.txt'
+def writeintxt (words,outpath):
-keywords = ['VR', 'ai', '元宇宙', '超高清', '3D', 'gpt', '建模', '大模型', 'nlp', 'cv', 'openai', '智慧体育', '计算机', '虚拟','ai音效','ai视频','AI修复']
+    with open(outpath, 'w', encoding='utf-8') as output_file:
-
+        for line in words:
-# 初始化一个字典来存储每个关键词的计数
+            output_file.write(line)
-keyword_count = {keyword: 0 for keyword in keywords}
+
-
+
-# 检查文件是否存在
+def findfrecuency(txt_path,keywords,outpath):
-if os.path.exists(test_path):
+    if os.path.exists(txt_path):
-    # 读取原始文本文件
+        # 读取原始文本文件,按行读入lines
-    with open(test_path, 'r', encoding='utf-8') as file:
+        resultword = []
-        lines = file.readlines()
+        with open(txt_path, 'r', encoding='utf-8') as file:
-
+            lines = file.readlines()
-    # 检查每一行是否包含关键词，并统计数量
+        for line in lines:
-    for line in lines:
+            for keyword in keywords:
-        for keyword in keywords:
+                # 对于英文关键词，确保前后不是英文字符
-            # 对于英文关键词，确保前后不是英文字符
+                if re.match(r'^[a-zA-Z]+$', keyword):
-            if re.match(r'^[a-zA-Z]+$', keyword):
+                    pattern = r'(?<![a-zA-Z])' + re.escape(keyword) + r'(?![a-zA-Z])'
-                pattern = r'(?<![a-zA-Z])' + re.escape(keyword) + r'(?![a-zA-Z])'
+                    if re.search(pattern, line, re.IGNORECASE):
-                if re.search(pattern, line, re.IGNORECASE):
+                        keyword_count[keyword] += 1
-                    keyword_count[keyword] += 1
+                        resultword.append(line)
-            # 对于中文关键词，直接寻找
+                # 对于中文关键词，直接寻找
-            elif keyword in line:
+                elif keyword in line:
-                keyword_count[keyword] += 1
+                    keyword_count[keyword] += 1
-
+                    resultword.append(line)
-    # 创建一个新的 Excel 工作簿
+        resultword1 = list(set(resultword))
-    wb = Workbook()
+        writeintxt(resultword1,outpath) #去重
-    ws = wb.active
+        return keyword_count#返回字典
-    ws.title = "AI应用弹幕统计"
+    else:
-    savepath = "d:\\学习\\软件工程\\swork\\res\\AI应用弹幕统计结果.xlsx"
+        print("文件不存在\n")
-    # 添加表头
+        return {}
-    ws.append(['AI应用', '出现数量'])
+    
-
+def writeinexcel (keyword_count,savepath):
-    # 将关键词和计数写入 Excel 工作表
+    if keyword_count == {}:
-    for keyword, count in sorted(keyword_count.items(), key=lambda item: item[1], reverse=True):
+        print("没有筛选到关键词\n")
-        if count > 0:
+    else:
-            ws.append([keyword, count])
+        # 创建一个新的 Excel 工作簿
-
+        wb = Workbook()
-    # 保存 Excel 文件
+        ws = wb.active
-    wb.save(savepath)
+        ws.title = "AI应用弹幕统计"
-else:
+        ws.append(['AI应用', '出现数量'])
-    print("文件不存在\n")
+        # 将关键词和计数写入 Excel 工作表
        for keyword, count in sorted(keyword_count.items(), key=lambda item: item[1], reverse=True):
            if count > 0:
                ws.append([keyword, count])
        # 保存 Excel 文件
        wb.save(savepath)
 if __name__ =='__main__':
    # Script entry point: count keyword occurrences in the danmaku text file,
    # save the matching lines to a txt file and the per-keyword counts to Excel.
    # Define the input file path
    txt_path = 'd:\\学习\\软件工程\\swork\\res\\total300_3.txt'# full danmaku (bullet-comment) data
    keywords = ['VR', 'ai', '元宇宙', '超高清', '3D', 'gpt', '建模', '大模型', 'nlp', 'cv', 'openai', '智慧体育', '计算机', '虚拟','ai音效','ai视频','AI修复']
    # Initialise a dict that stores the count for each keyword.
    # NOTE: findfrecuency does not take this dict as an argument; it increments
    # the module-level name `keyword_count` directly, so this assignment must
    # run before the call below.
    keyword_count = {keyword: 0 for keyword in keywords}
    # Define the output locations
    savepath = "d:\\学习\\软件工程\\swork\\res\\AI_dm_rank4.xlsx"# Excel file holding the ranking and counts
    outpath = "d:\\学习\\软件工程\\swork\\res\\select_ai4.txt"# txt file holding the comments that mention AI technology
    keyword_count = findfrecuency(txt_path,keywords,outpath) # get the ranking dict (empty dict if the input file is missing)
    writeinexcel (keyword_count,savepath) # write the ranking to Excel