From 8a07ae41013d3276db79ad7ad4feef2d9b827e6e Mon Sep 17 00:00:00 2001
From: mamingyi <80972090@qq.com>
Date: Thu, 9 Oct 2025 11:38:00 +0800
Subject: [PATCH 1/2] Implement Utils helper functions and add a smoke-test script

---
 src/utils/helper_functions.py      | 47 ++++++++++++++++++++++++++----
 src/utils/test_helper_functions.py | 29 ++++++++++++++++++
 2 files changed, 70 insertions(+), 6 deletions(-)
 create mode 100644 src/utils/test_helper_functions.py

diff --git a/src/utils/helper_functions.py b/src/utils/helper_functions.py
index af3cb2f..db80af9 100644
--- a/src/utils/helper_functions.py
+++ b/src/utils/helper_functions.py
@@ -11,8 +11,21 @@ class Utils:
         - 尝试多种编码格式
         - 返回最可能的编码
         """
-        # TODO: 实现编码检测逻辑
-        pass
+        import chardet
+
+        # Sample only the first 1024 bytes so detection stays cheap.
+        with open(file_path, 'rb') as f:
+            raw_data = f.read(1024)
+
+        encoding = chardet.detect(raw_data)['encoding']
+
+        # Fall back to UTF-8 when chardet has no answer, and also when it
+        # reports ASCII: a pure-ASCII sample cannot rule out non-ASCII bytes
+        # later in the file, and UTF-8 decodes all ASCII text identically.
+        if encoding is None or encoding.lower() == 'ascii':
+            encoding = 'utf-8'
+
+        return encoding
     
     @staticmethod
     def format_file_size(size_bytes: int) -> str:
@@ -20,9 +33,25 @@ class Utils:
         格式化文件大小
         - 将字节数转换为可读格式
         - 返回格式化字符串
+        参数:
+            size_bytes (int): 需要格式化的文件大小，单位为字节
+        返回:
+            str: 格式化后的文件大小字符串，如 "1.5 MB"
         """
-        # TODO: 实现文件大小格式化逻辑
-        pass
+        if size_bytes == 0:
+            return "0 B"
+
+        size_names = ["B", "KB", "MB", "GB", "TB"]
+        # Scale a local copy instead of rebinding the caller-facing parameter.
+        value = float(size_bytes)
+        unit_index = 0
+        # Compare the magnitude so negative sizes (e.g. deltas) are scaled
+        # too; stop at the largest unit.
+        while abs(value) >= 1024.0 and unit_index < len(size_names) - 1:
+            value /= 1024.0
+            unit_index += 1
+
+        return f"{value:.1f} {size_names[unit_index]}"
     
     @staticmethod
     def calculate_file_hash(file_path: str) -> str:
@@ -31,5 +60,11 @@ class Utils:
         - 使用SHA256算法
         - 返回哈希字符串
         """
-        # TODO: 实现文件哈希计算逻辑
-        pass
\ No newline at end of file
+        digest = hashlib.sha256()
+        # Stream fixed-size chunks so large files never sit fully in memory.
+        with open(file_path, "rb") as stream:
+            block = stream.read(4096)
+            while block:
+                digest.update(block)
+                block = stream.read(4096)
+        return digest.hexdigest()
\ No newline at end of file
diff --git a/src/utils/test_helper_functions.py b/src/utils/test_helper_functions.py
new file mode 100644
index 0000000..e73ec07
--- /dev/null
+++ b/src/utils/test_helper_functions.py
@@ -0,0 +1,29 @@
+# test_helper_functions.py
+import os
+from helper_functions import Utils
+
+# 创建一个测试文件
+test_content = "这是一个测试文件，用于验证工具函数。\nThis is a test file to verify utility functions."
+test_file_path = "test_file.txt"
+
+# 写入测试文件
+with open(test_file_path, "w", encoding="utf-8") as f:
+    f.write(test_content)
+
+# 测试文件大小格式化
+file_size = os.path.getsize(test_file_path)
+formatted_size = Utils.format_file_size(file_size)
+print(f"文件大小: {file_size} 字节")
+print(f"格式化大小: {formatted_size}")
+
+# 测试文件编码检测
+detected_encoding = Utils.detect_encoding(test_file_path)
+print(f"检测到的编码: {detected_encoding}")
+
+# 测试文件哈希计算
+file_hash = Utils.calculate_file_hash(test_file_path)
+print(f"文件哈希值: {file_hash}")
+
+# 清理测试文件
+os.remove(test_file_path)
+print("测试完成，临时文件已清理。")
\ No newline at end of file
-- 
2.34.1


From 8a56312ba9ed63d7c12cee2c2087c8c56436556d Mon Sep 17 00:00:00 2001
From: mamingyi <80972090@qq.com>
Date: Sat, 11 Oct 2025 18:22:36 +0800
Subject: [PATCH 2/2] Implement FileParser and remove the helper smoke-test script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt                   |   2 +-
 src/file_parser.py                 | 156 +++++++++++++++++++++--------
 src/utils/test_helper_functions.py |  29 ------
 3 files changed, 118 insertions(+), 69 deletions(-)
 delete mode 100644 src/utils/test_helper_functions.py

diff --git a/requirements.txt b/requirements.txt
index 96c661b..17e7112 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,4 @@ PyQt5>=5.15.0
 requests>=2.25.1
 beautifulsoup4>=4.11.0
 pillow>=9.0.0
-chardet>=4.0.0
\ No newline at end of file
+chardet>=4.0.0
diff --git a/src/file_parser.py b/src/file_parser.py
index 84b3c04..fc44729 100644
--- a/src/file_parser.py
+++ b/src/file_parser.py
@@ -4,54 +4,132 @@ from typing import Union
 class FileParser:
     @staticmethod
     def parse_file(file_path: str) -> str:
-        """
-        主解析函数，根据文件扩展名路由到具体解析器
-        - 调用validate_file_path(file_path)验证路径
-        - 根据扩展名调用对应解析函数
-        - 统一异常处理
-        """
-        # TODO: 实现主解析函数逻辑
-        pass
+        """Parse a file by routing it to the parser for its extension.
+
+        The path is validated first, then the lower-cased extension selects
+        parse_txt, parse_docx or parse_pdf. Raises ValueError for an invalid
+        path, and Exception (chained to the root cause) for any failure while
+        parsing, including an unsupported extension.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        ext = os.path.splitext(file_path)[1].lower()
+        try:
+            if ext == '.txt':
+                return FileParser.parse_txt(file_path)
+            elif ext == '.docx':
+                return FileParser.parse_docx(file_path)
+            elif ext == '.pdf':
+                return FileParser.parse_pdf(file_path)
+            else:
+                raise ValueError(f"Unsupported file format: {ext}")
+        except Exception as e:
+            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e
     
     @staticmethod
     def parse_txt(file_path: str) -> str:
-        """
-        解析纯文本文件
-        - 自动检测编码（utf-8, gbk等）
-        - 处理不同换行符
-        - 返回纯文本内容
-        """
-        # TODO: 实现实现txt文件解析逻辑
-        pass
+        """Return the text of a plain-text file with LF-only line endings.
+
+        The encoding comes from Utils.detect_encoding when that module can be
+        imported, otherwise from chardet run on the first 1024 bytes. Bytes
+        that cannot be decoded are dropped (errors='ignore').
+        Raises ValueError if validate_file_path rejects the path.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+        try:
+            from utils.helper_functions import Utils
+        except ImportError:
+            import chardet
+            with open(file_path, 'rb') as f:
+                raw_data = f.read(1024)
+            encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
+        else:
+            encoding = Utils.detect_encoding(file_path)
+        # A pure-ASCII sample says nothing about the rest of the file; UTF-8
+        # is a superset, so use it rather than dropping later non-ASCII text.
+        if not encoding or encoding.lower() == 'ascii':
+            encoding = 'utf-8'
+        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
+            content = f.read()
+        return content.replace('\r\n', '\n').replace('\r', '\n')
     
     @staticmethod
     def parse_docx(file_path: str) -> str:
-        """
-        解析Word文档
-        - 提取所有段落文本
-        - 保留基本格式（换行）
-        - 忽略图片、表格等非文本元素
-        """
-        # TODO: 实现docx文件解析逻辑
-        pass
+        """Extract the paragraph text of a .docx file, one paragraph per line.
+
+        Only doc.paragraphs is read; any text python-docx does not expose
+        there is not returned.
+
+        Raises ValueError if validate_file_path rejects the path, ImportError
+        if python-docx is not installed, and Exception (chained to the root
+        cause) if the document cannot be opened or read.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        # Imported here so python-docx is only needed when .docx is parsed.
+        try:
+            from docx import Document
+        except ImportError:
+            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'")
+
+        try:
+            doc = Document(file_path)
+            texts = (paragraph.text for paragraph in doc.paragraphs)
+            # One line per paragraph keeps the document's line structure.
+            return '\n'.join(texts)
+        except Exception as e:
+            # Chain explicitly so the root cause stays attached.
+            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e
     
     @staticmethod
     def parse_pdf(file_path: str) -> str:
-        """
-        解析PDF文档
-        - 提取文本内容
-        - 保留基本格式（换行）
-        """
-        # TODO: 实现PDF文件解析逻辑
-        pass
+        """Extract the text of every page of a PDF, each followed by a newline.
+
+        PyPDF2 is imported inside the method, so it is only needed when a
+        .pdf file is actually parsed.
+
+        Raises ValueError if validate_file_path rejects the path, ImportError
+        if PyPDF2 is not installed, and Exception (chained to the root cause)
+        if the PDF cannot be opened or its text cannot be extracted.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        try:
+            import PyPDF2
+        except ImportError:
+            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'")
+
+        try:
+            # Extract the text while the file handle is still open; join the
+            # per-page pieces once instead of growing a string in a loop.
+            with open(file_path, 'rb') as file:
+                pages = PyPDF2.PdfReader(file).pages
+                return "".join(page.extract_text() + "\n" for page in pages)
+        except Exception as e:
+            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e
     
     @staticmethod
     def validate_file_path(file_path: str) -> bool:
-        """
-        验证文件路径的有效性
-        - 检查文件是否存在
-        - 检查文件是否可读
-        - 检查文件大小是否合理
-        """
-        # TODO: 实现文件路径验证逻辑
-        pass
\ No newline at end of file
+        """Report whether file_path is an existing, readable file of at most 10 MB.
+
+        Checks are evaluated in order and stop at the first failure: the path
+        exists, it is a file (not a directory), os.access reports it readable,
+        and its size is no larger than 10 * 1024 * 1024 bytes.
+
+        Returns:
+            True if every check passes, otherwise False.
+        """
+        size_limit = 10 * 1024 * 1024  # 10MB
+        is_readable_file = (
+            os.path.exists(file_path)
+            and os.path.isfile(file_path)
+            and os.access(file_path, os.R_OK)
+        )
+
+        # Short-circuit evaluation: the size is only queried once the path
+        # checks above have all passed.
+        return is_readable_file and os.path.getsize(file_path) <= size_limit
\ No newline at end of file
diff --git a/src/utils/test_helper_functions.py b/src/utils/test_helper_functions.py
deleted file mode 100644
index e73ec07..0000000
--- a/src/utils/test_helper_functions.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# test_helper_functions.py
-import os
-from helper_functions import Utils
-
-# 创建一个测试文件
-test_content = "这是一个测试文件，用于验证工具函数。\nThis is a test file to verify utility functions."
-test_file_path = "test_file.txt"
-
-# 写入测试文件
-with open(test_file_path, "w", encoding="utf-8") as f:
-    f.write(test_content)
-
-# 测试文件大小格式化
-file_size = os.path.getsize(test_file_path)
-formatted_size = Utils.format_file_size(file_size)
-print(f"文件大小: {file_size} 字节")
-print(f"格式化大小: {formatted_size}")
-
-# 测试文件编码检测
-detected_encoding = Utils.detect_encoding(test_file_path)
-print(f"检测到的编码: {detected_encoding}")
-
-# 测试文件哈希计算
-file_hash = Utils.calculate_file_hash(test_file_path)
-print(f"文件哈希值: {file_hash}")
-
-# 清理测试文件
-os.remove(test_file_path)
-print("测试完成，临时文件已清理。")
\ No newline at end of file
-- 
2.34.1