"""Plain-text extraction from .txt, .docx and .pdf files."""

import codecs
import os
from typing import Union


class FileParser:
    """Namespace of static helpers that turn a supported file into text.

    The file type is chosen from the (case-insensitive) file extension.
    ``python-docx`` and ``PyPDF2`` are imported lazily, so they are only
    required when a ``.docx`` / ``.pdf`` file is actually parsed.
    """

    # Largest file (in bytes) that validate_file_path() accepts: 10 MB.
    MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024

    # Number of leading bytes handed to chardet when it is used as fallback.
    _CHARDET_SAMPLE_BYTES = 1024

    # Byte-order marks mapped to a codec that consumes the mark on decode.
    # The UTF-32 marks come first because the UTF-32-LE mark starts with the
    # UTF-16-LE mark.
    _BOM_ENCODINGS = (
        (codecs.BOM_UTF32_LE, 'utf-32'),
        (codecs.BOM_UTF32_BE, 'utf-32'),
        (codecs.BOM_UTF8, 'utf-8-sig'),
        (codecs.BOM_UTF16_LE, 'utf-16'),
        (codecs.BOM_UTF16_BE, 'utf-16'),
    )

    # Encodings probed, in order, when the file carries no byte-order mark.
    _CANDIDATE_ENCODINGS = ('utf-8', 'gbk', 'gb2312')

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a file into text, dispatching on its extension.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The text extracted from the file.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path().
            Exception: If the extension is unsupported or the selected
                parser fails; the original error is attached as
                ``__cause__``.
        """
        # Validate the file path.
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # Extract the extension, ignoring case.
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        parsers = {
            '.txt': FileParser.parse_txt,
            '.docx': FileParser.parse_docx,
            '.pdf': FileParser.parse_pdf,
        }

        # Dispatch to the parser registered for this extension.
        try:
            parser = parsers.get(ext)
            if parser is None:
                # Raised inside the try on purpose: unsupported formats have
                # always been reported through the unified wrapper below.
                raise ValueError(f"Unsupported file format: {ext}")
            return parser(file_path)
        except Exception as e:
            # Unified error reporting; keep the cause for debugging.
            raise Exception(f"Error parsing file {file_path}: {e}") from e

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a text file and return its content with ``\\n`` newlines.

        The encoding is chosen by detect_file_encoding(); bytes that still
        cannot be decoded with it are dropped rather than raising.

        Args:
            file_path: Path of the text file.

        Returns:
            The decoded file content with line endings normalised to ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path().
        """
        # Validate the file path.
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # Pick the encoding to decode the file with.
        encoding = FileParser.detect_file_encoding(file_path)

        # Read the file content.
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalise line endings to \n.
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def detect_file_encoding(file_path: str) -> str:
        """Guess the text encoding of a file.

        Detection order: a byte-order mark, then strict decoding with
        UTF-8, GBK and GB2312, then ``chardet`` (only if it is installed),
        and finally a default of ``'utf-8'``.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            The name of a codec usable with ``open(..., encoding=...)``.
        """
        # Read the bytes once instead of re-opening the file per candidate.
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        # A byte-order mark is authoritative. Using the BOM-aware codec also
        # keeps the mark itself out of the decoded text.
        for bom, encoding in FileParser._BOM_ENCODINGS:
            if raw_data.startswith(bom):
                return encoding

        # Try UTF-8 first, then the encodings common on Chinese Windows.
        for encoding in FileParser._CANDIDATE_ENCODINGS:
            try:
                raw_data.decode(encoding)
            except UnicodeDecodeError:
                continue
            return encoding

        # Fall back to chardet when it is available.
        try:
            import chardet
        except ImportError:
            return 'utf-8'

        sample = raw_data[:FileParser._CHARDET_SAMPLE_BYTES]
        result = chardet.detect(sample)
        guess = result.get('encoding') if result else None
        if guess:
            try:
                # Only trust a guess that Python actually has a codec for.
                codecs.lookup(guess)
            except LookupError:
                return 'utf-8'
            return guess

        # Default to UTF-8.
        return 'utf-8'

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Extract the paragraph text of a ``.docx`` file.

        Args:
            file_path: Path of the ``.docx`` file.

        Returns:
            The text of every paragraph, joined with ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path().
            ImportError: If ``python-docx`` is not installed.
            Exception: If the document cannot be opened or read; the
                original error is attached as ``__cause__``.
        """
        # Validate the file path.
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # python-docx is an optional dependency, so import it lazily.
        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        # Open the document and join all paragraph texts with newlines.
        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {e}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Extract the text of every page of a ``.pdf`` file.

        Args:
            file_path: Path of the ``.pdf`` file.

        Returns:
            The text of each page, each followed by ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path().
            ImportError: If ``PyPDF2`` is not installed.
            Exception: If the PDF cannot be opened or read; the original
                error is attached as ``__cause__``.
        """
        # Validate the file path.
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # PyPDF2 is an optional dependency, so import it lazily.
        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        # Open the PDF and collect the text of every page.
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # "or ''" guards against a page yielding None instead of text.
                return "".join(
                    f"{page.extract_text() or ''}\n"
                    for page in pdf_reader.pages
                )
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {e}") from e

    @staticmethod
    def validate_file_path(file_path: str) -> bool:
        """Return whether ``file_path`` is a readable, reasonably small file.

        Args:
            file_path: Path to check.

        Returns:
            ``True`` if the path is an existing regular file, is readable
            and is at most ``MAX_FILE_SIZE_BYTES`` bytes; ``False``
            otherwise, including when the argument is not a usable path.
        """
        try:
            # isfile() is False for missing paths and for directories.
            if not os.path.isfile(file_path):
                return False

            # The file must be readable.
            if not os.access(file_path, os.R_OK):
                return False

            # The file must not exceed the size limit.
            return os.path.getsize(file_path) <= FileParser.MAX_FILE_SIZE_BYTES
        except (OSError, TypeError, ValueError):
            # Non-path argument (e.g. None), or the file vanished or became
            # inaccessible between the checks above.
            return False