3 changed files with 159 additions and 46 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -5,4 +5,4 @@ PyQt5>=5.15.0
 requests>=2.25.1
 beautifulsoup4>=4.11.0
 pillow>=9.0.0
-chardet>=4.0.0
+chardet>=4.0.0
--- a/src/file_parser.py
+++ b/src/file_parser.py
@ -4,54 +4,132 @@ from typing import Union
 class FileParser:
    @staticmethod
    def parse_file(file_path: str) -> str:
-        """
-        主解析函数，根据文件扩展名路由到具体解析器
-        - 调用validate_file_path(file_path)验证路径
-        - 根据扩展名调用对应解析函数
-        - 统一异常处理
-        """
-        # TODO: 实现主解析函数逻辑
-        pass
+        """Parse a file by routing it to the parser matching its extension.
+
+        Args:
+            file_path: Path of the file to parse.
+
+        Returns:
+            The text returned by the format-specific parser.
+
+        Raises:
+            ValueError: If ``validate_file_path`` rejects ``file_path``.
+            Exception: If the extension is not ``.txt``, ``.docx`` or ``.pdf``,
+                or if the selected parser fails. The original error is
+                chained as ``__cause__``.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        # Extensions are matched case-insensitively (".PDF" behaves like ".pdf").
+        ext = os.path.splitext(file_path)[1].lower()
+
+        try:
+            parsers = {
+                '.txt': FileParser.parse_txt,
+                '.docx': FileParser.parse_docx,
+                '.pdf': FileParser.parse_pdf,
+            }
+            parser = parsers.get(ext)
+            if parser is None:
+                raise ValueError(f"Unsupported file format: {ext}")
+            return parser(file_path)
+        except Exception as e:
+            # One uniform error surface for callers; ``from e`` keeps the
+            # root cause and its traceback attached.
+            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e
    
    @staticmethod
    def parse_txt(file_path: str) -> str:
-        """
-        解析纯文本文件
-        - 自动检测编码（utf-8, gbk等）
-        - 处理不同换行符
-        - 返回纯文本内容
-        """
-        # TODO: 实现实现txt文件解析逻辑
-        pass
+        """Read a plain-text file and return its content with LF line endings.
+
+        The encoding comes from ``Utils.detect_encoding`` when
+        ``utils.helper_functions`` is importable; otherwise ``chardet`` is run
+        on the first 1024 bytes. Bytes that cannot be decoded are dropped.
+
+        Args:
+            file_path: Path of the text file to read.
+
+        Returns:
+            The decoded text, with CRLF and CR normalized to LF.
+
+        Raises:
+            ValueError: If ``validate_file_path`` rejects ``file_path``.
+        """
+        import codecs
+
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        try:
+            from utils.helper_functions import Utils
+        except ImportError:
+            # Helper module unavailable: sniff the encoding here instead.
+            import chardet
+            with open(file_path, 'rb') as f:
+                raw_data = f.read(1024)
+            encoding = chardet.detect(raw_data)['encoding']
+        else:
+            encoding = Utils.detect_encoding(file_path)
+
+        # Detection may only have seen a prefix of the file. A pure-ASCII
+        # prefix says nothing about later bytes, and UTF-8 is a superset of
+        # ASCII, so decoding as UTF-8 avoids silently dropping every
+        # non-ASCII character under errors='ignore'.
+        if not encoding or encoding.lower() == 'ascii':
+            encoding = 'utf-8'
+        else:
+            try:
+                codecs.lookup(encoding)
+            except LookupError:
+                # The detector named a codec this Python does not provide.
+                encoding = 'utf-8'
+
+        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
+            content = f.read()
+
+        # Normalize any remaining CRLF / CR sequences to LF.
+        return content.replace('\r\n', '\n').replace('\r', '\n')
    
    @staticmethod
    def parse_docx(file_path: str) -> str:
-        """
-        解析Word文档
-        - 提取所有段落文本
-        - 保留基本格式（换行）
-        - 忽略图片、表格等非文本元素
-        """
-        # TODO: 实现docx文件解析逻辑
-        pass
+        """Extract the paragraph text of a Word ``.docx`` document.
+
+        Only ``Document.paragraphs`` is read; each paragraph becomes one
+        line of the result.
+
+        Args:
+            file_path: Path of the ``.docx`` file.
+
+        Returns:
+            The paragraph texts joined with ``"\\n"``.
+
+        Raises:
+            ValueError: If ``validate_file_path`` rejects ``file_path``.
+            ImportError: If the ``python-docx`` package is not installed.
+            Exception: If the document cannot be opened or read; the
+                original error is chained as ``__cause__``.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        # python-docx is an optional dependency, imported lazily so the other
+        # parsers keep working without it.
+        try:
+            from docx import Document
+        except ImportError as import_err:
+            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from import_err
+
+        try:
+            doc = Document(file_path)
+            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
+        except Exception as e:
+            # ``from e`` preserves the underlying failure for debugging.
+            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e
    
    @staticmethod
    def parse_pdf(file_path: str) -> str:
-        """
-        解析PDF文档
-        - 提取文本内容
-        - 保留基本格式（换行）
-        """
-        # TODO: 实现PDF文件解析逻辑
-        pass
+        """Extract the text of every page of a PDF document.
+
+        Args:
+            file_path: Path of the ``.pdf`` file.
+
+        Returns:
+            The text of each page, every page followed by ``"\\n"``.
+
+        Raises:
+            ValueError: If ``validate_file_path`` rejects ``file_path``.
+            ImportError: If the ``PyPDF2`` package is not installed.
+            Exception: If the PDF cannot be opened or read; the original
+                error is chained as ``__cause__``.
+        """
+        if not FileParser.validate_file_path(file_path):
+            raise ValueError(f"Invalid file path: {file_path}")
+
+        # PyPDF2 is an optional dependency, imported lazily so the other
+        # parsers keep working without it.
+        try:
+            import PyPDF2
+        except ImportError as import_err:
+            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from import_err
+
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                # ``or ""`` guards against a page yielding no text object
+                # (NOTE(review): confirm whether the installed PyPDF2 can
+                # return None here). Pages must be read while the file is open.
+                page_texts = [
+                    (page.extract_text() or "") + "\n"
+                    for page in pdf_reader.pages
+                ]
+            return "".join(page_texts)
+        except Exception as e:
+            # ``from e`` preserves the underlying failure for debugging.
+            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e
    
    @staticmethod
    def validate_file_path(file_path: str) -> bool:
-        """
-        验证文件路径的有效性
-        - 检查文件是否存在
-        - 检查文件是否可读
-        - 检查文件大小是否合理
-        """
-        # TODO: 实现文件路径验证逻辑
-        pass
+        """Return whether ``file_path`` is a readable regular file of at most 10 MB.
+
+        Args:
+            file_path: Candidate path to check.
+
+        Returns:
+            ``True`` when the path is an existing regular file, is readable
+            by the current process and is no larger than 10 MB; ``False``
+            otherwise, including for ``None`` or other non-path values.
+        """
+        max_size_bytes = 10 * 1024 * 1024  # 10 MB upper bound
+
+        # A validator should answer False rather than raise TypeError when
+        # handed None or another non-path value.
+        if not isinstance(file_path, (str, bytes, os.PathLike)):
+            return False
+
+        # isfile() is False for missing paths and for directories alike, so
+        # a separate exists() check is unnecessary.
+        if not os.path.isfile(file_path):
+            return False
+
+        if not os.access(file_path, os.R_OK):
+            return False
+
+        try:
+            file_size = os.path.getsize(file_path)
+        except OSError:
+            # The file vanished or became inaccessible between the checks.
+            return False
+
+        return file_size <= max_size_bytes
--- a/src/utils/helper_functions.py
+++ b/src/utils/helper_functions.py
@ -11,8 +11,21 @@ class Utils:
        - 尝试多种编码格式
        - 返回最可能的编码
        """
-        # TODO: 实现编码检测逻辑
-        pass
+        import chardet
+        
+        # Sample only the first 1024 bytes of the file for encoding detection
+        with open(file_path, 'rb') as f:
+            raw_data = f.read(1024)
+            
+        # Ask chardet for its best guess at the encoding of the sample
+        result = chardet.detect(raw_data)
+        encoding = result['encoding']
+        
+        # Fall back to utf-8 when chardet cannot determine an encoding
+        if encoding is None:
+            encoding = 'utf-8'
+            
+        return encoding
    
    @staticmethod
    def format_file_size(size_bytes: int) -> str:
@ -20,9 +33,25 @@ class Utils:
        格式化文件大小
        - 将字节数转换为可读格式
        - 返回格式化字符串
+        参数:
+            size_bytes (int): 需要格式化的文件大小，单位为字节
+        返回:
+            str: 格式化后的文件大小字符串，如 "1.5 MB"
        """
-        # TODO: 实现文件大小格式化逻辑
-        pass
+        # Zero bytes needs no scaling and is printed without a decimal part.
+        if size_bytes == 0:
+            return "0 B"
+
+        units = ["B", "KB", "MB", "GB", "TB"]
+
+        # Scale by magnitude so a negative size (e.g. a size delta) lands on
+        # the same unit as its positive counterpart instead of always staying
+        # in bytes. Work on a local copy rather than rebinding the parameter.
+        sign = "-" if size_bytes < 0 else ""
+        value = float(abs(size_bytes))
+        unit_index = 0
+        # Stop at the largest unit even if the value is still >= 1024.
+        while value >= 1024.0 and unit_index < len(units) - 1:
+            value /= 1024.0
+            unit_index += 1
+
+        # One decimal place, e.g. "1.5 MB".
+        return f"{sign}{value:.1f} {units[unit_index]}"
    
    @staticmethod
    def calculate_file_hash(file_path: str) -> str:
@ -31,5 +60,11 @@ class Utils:
        - 使用SHA256算法
        - 返回哈希字符串
        """
-        # TODO: 实现文件哈希计算逻辑
-        pass
+        digest = hashlib.sha256()
+
+        # Feed the file to the hasher in 4096-byte chunks so large files are
+        # never held in memory all at once.
+        with open(file_path, "rb") as stream:
+            while True:
+                block = stream.read(4096)
+                if block == b"":
+                    break
+                digest.update(block)
+
+        return digest.hexdigest()