# Curriculum_Design/src/file_parser.py

import os
from typing import Union

class FileParser:
    """Extract plain text from ``.txt``, ``.docx`` and ``.pdf`` files.

    Every method is a static method; the class only acts as a namespace.
    ``parse_file`` is the general entry point and dispatches on the file
    extension to ``parse_txt``, ``parse_docx`` or ``parse_pdf``.
    """

    # Largest file size, in bytes, accepted by validate_file_path (10 MB).
    MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024

    # Encodings tried, in this order, by detect_file_encoding once no
    # byte-order mark has been found.
    _CANDIDATE_ENCODINGS = ('utf-8', 'gbk', 'gb2312')

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a file and return its text, choosing the parser by extension.

        Args:
            file_path: Path to a ``.txt``, ``.docx`` or ``.pdf`` file. The
                extension is matched case-insensitively.

        Returns:
            The text extracted from the file.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            Exception: If the extension is unsupported or the selected parser
                fails. The original error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.txt':
                return FileParser.parse_txt(file_path)
            if ext == '.docx':
                return FileParser.parse_docx(file_path)
            if ext == '.pdf':
                return FileParser.parse_pdf(file_path)
            # Raised inside the try on purpose: callers receive the same
            # wrapped "Error parsing file ..." message for unsupported formats.
            raise ValueError(f"Unsupported file format: {ext}")
        except Exception as e:
            # Uniform error wrapping; chain the cause so the traceback is kept.
            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a text file and return its content with ``\\n`` line endings.

        The encoding is chosen by ``detect_file_encoding``. Bytes that still
        cannot be decoded are dropped (``errors='ignore'``) so that reading is
        best-effort rather than failing.

        Args:
            file_path: Path to the text file.

        Returns:
            The decoded file content, with ``\\r\\n`` and ``\\r`` replaced by
            ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        encoding = FileParser.detect_file_encoding(file_path)

        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalise every line-ending style to "\n".
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def detect_file_encoding(file_path: str) -> str:
        """Guess the text encoding of a file.

        The file is read once in binary mode. Detection order:

        1. A byte-order mark (UTF-8, UTF-32, UTF-16), which selects a codec
           that also strips the mark when decoding.
        2. The first of ``utf-8``, ``gbk``, ``gb2312`` that decodes the whole
           file without error.
        3. ``chardet`` on the first 1024 bytes, if that package is installed
           and names a codec Python knows.
        4. ``'utf-8'`` as the final default.

        Args:
            file_path: Path to the file to inspect.

        Returns:
            A codec name usable as the ``encoding`` argument of ``open``.
        """
        # Local import, in line with the other function-scope imports here.
        import codecs

        with open(file_path, 'rb') as f:
            raw_data = f.read()

        if raw_data.startswith(codecs.BOM_UTF8):
            return 'utf-8-sig'
        # UTF-32 must be tested before UTF-16: the UTF-32-LE mark starts with
        # the same two bytes as the UTF-16-LE mark.
        if raw_data.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            return 'utf-32'
        if raw_data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            return 'utf-16'

        for encoding in FileParser._CANDIDATE_ENCODINGS:
            try:
                raw_data.decode(encoding)
            except UnicodeDecodeError:
                continue
            return encoding

        # chardet is optional; without it fall back to the default.
        try:
            import chardet
        except ImportError:
            return 'utf-8'

        result = chardet.detect(raw_data[:1024])
        guessed = result.get('encoding') if result else None
        if guessed:
            try:
                # Reject names Python has no codec for, so that the caller's
                # open() does not fail with LookupError.
                codecs.lookup(guessed)
            except LookupError:
                return 'utf-8'
            return guessed

        return 'utf-8'

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Return the paragraph text of a ``.docx`` file, one paragraph per line.

        Only ``doc.paragraphs`` is read; nothing else in the document is
        extracted by this method.

        Args:
            file_path: Path to the ``.docx`` file.

        Returns:
            The text of all paragraphs joined with ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            ImportError: If the ``python-docx`` package is not installed.
            Exception: If the document cannot be opened or read. The original
                error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # python-docx is an optional dependency, imported only when needed.
        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Return the text of every page of a PDF, each followed by ``\\n``.

        Args:
            file_path: Path to the ``.pdf`` file.

        Returns:
            The concatenated page texts, with a newline after each page.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            ImportError: If the ``PyPDF2`` package is not installed.
            Exception: If the PDF cannot be opened or read. The original
                error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # PyPDF2 is an optional dependency, imported only when needed.
        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Defensive: treat a falsy extraction result as empty text so
                # that a page without text cannot break the join below.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e

        return "".join(text + "\n" for text in page_texts)

    @staticmethod
    def validate_file_path(file_path: str) -> bool:
        """Check that a path points to a readable regular file of acceptable size.

        Args:
            file_path: Path to check.

        Returns:
            ``True`` if the path is an existing file (not a directory), is
            readable, and is no larger than ``MAX_FILE_SIZE_BYTES``;
            ``False`` otherwise.
        """
        try:
            # isfile() is already False for paths that do not exist.
            if not os.path.isfile(file_path):
                return False

            if not os.access(file_path, os.R_OK):
                return False

            return os.path.getsize(file_path) <= FileParser.MAX_FILE_SIZE_BYTES
        except OSError:
            # E.g. the file disappeared between the checks above.
            return False