|
|
|
|
@ -4,54 +4,132 @@ from typing import Union
|
|
|
|
|
class FileParser:
    """Extract plain text from ``.txt``, ``.docx`` and ``.pdf`` files.

    Every method is a ``@staticmethod``; the class only serves as a
    namespace. ``parse_file`` is the usual entry point and routes to the
    format-specific parser based on the file extension.
    """

    # Largest file size (in bytes) that validate_file_path accepts: 10 MB.
    MAX_FILE_SIZE = 10 * 1024 * 1024

    # Encodings tried in order by _detect_encoding when neither the project
    # helper nor chardet can be imported. 'utf-8-sig' also decodes plain
    # UTF-8 without a BOM.
    _FALLBACK_ENCODINGS = ('utf-8-sig', 'gbk')

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a file, picking the parser from its extension.

        The extension is compared case-insensitively; ``.txt``, ``.docx``
        and ``.pdf`` are supported.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The text extracted by the matching ``parse_*`` method.

        Raises:
            ValueError: If ``validate_file_path(file_path)`` is false.
            Exception: For an unsupported extension or any error raised by
                the selected parser. The message is prefixed with
                ``"Error parsing file <path>: "`` and the original error is
                attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        # Looked up on the class at call time so overridden or patched
        # parsers are honoured.
        parsers = {
            '.txt': FileParser.parse_txt,
            '.docx': FileParser.parse_docx,
            '.pdf': FileParser.parse_pdf,
        }

        try:
            parser = parsers.get(ext)
            if parser is None:
                # Deliberately raised inside the try block: callers have
                # always received this wrapped in the generic Exception
                # below, so that contract is preserved.
                raise ValueError(f"Unsupported file format: {ext}")
            return parser(file_path)
        except Exception as e:
            # Single uniform error for every parsing failure; chain the
            # cause so the original traceback is not lost.
            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e

    @staticmethod
    def _detect_encoding(file_path: str) -> str:
        """Choose the text encoding used to read ``file_path``.

        Strategies, first available wins:

        1. ``Utils.detect_encoding`` from ``utils.helper_functions`` (its
           return value is passed through unchanged).
        2. ``chardet.detect`` on the first 1024 bytes, defaulting to
           ``'utf-8'`` when chardet reports no encoding.
        3. Standard-library fallback: strictly decode the whole file with
           each entry of ``_FALLBACK_ENCODINGS`` and return the first one
           that succeeds, else ``'utf-8'``.
        """
        try:
            from utils.helper_functions import Utils
        except ImportError:
            pass
        else:
            return Utils.detect_encoding(file_path)

        try:
            import chardet
        except ImportError:
            chardet = None

        with open(file_path, 'rb') as f:
            if chardet is not None:
                return chardet.detect(f.read(1024))['encoding'] or 'utf-8'
            raw_data = f.read()

        for candidate in FileParser._FALLBACK_ENCODINGS:
            try:
                raw_data.decode(candidate)
            except UnicodeDecodeError:
                continue
            return candidate
        return 'utf-8'

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a plain-text file and return its content.

        The encoding is chosen by ``_detect_encoding``. Bytes that cannot
        be decoded with that encoding are dropped (``errors='ignore'``),
        and ``\\r\\n`` / ``\\r`` line endings are normalised to ``\\n``.

        Args:
            file_path: Path of the text file.

        Returns:
            The decoded text with ``\\n`` line endings.

        Raises:
            ValueError: If ``validate_file_path(file_path)`` is false.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        encoding = FileParser._detect_encoding(file_path)

        # Best-effort read: undecodable bytes are skipped rather than
        # aborting the whole parse.
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalise any remaining carriage returns to '\n'.
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Extract the paragraph text of a Word ``.docx`` document.

        Only ``Document.paragraphs`` is read; the paragraph texts are
        joined with ``\\n``. Nothing else in the document is inspected.

        Args:
            file_path: Path of the ``.docx`` file.

        Returns:
            The paragraph texts joined by newlines.

        Raises:
            ValueError: If ``validate_file_path(file_path)`` is false.
            ImportError: If the ``python-docx`` package is not installed.
            Exception: If opening or reading the document fails; the
                original error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # Imported lazily so the dependency is only needed for .docx input.
        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Extract the text of every page of a PDF document.

        Each page's text is followed by a ``\\n``. A page for which
        ``extract_text()`` yields nothing contributes only that newline.

        Args:
            file_path: Path of the ``.pdf`` file.

        Returns:
            The concatenated page texts, one trailing newline per page.

        Raises:
            ValueError: If ``validate_file_path(file_path)`` is false.
            ImportError: If the ``PyPDF2`` package is not installed.
            Exception: If opening or reading the PDF fails; the original
                error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # Imported lazily so the dependency is only needed for .pdf input.
        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ''` guards against extract_text() returning None,
                # which would otherwise raise TypeError on concatenation.
                return ''.join(
                    (page.extract_text() or '') + '\n'
                    for page in pdf_reader.pages
                )
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e

    @staticmethod
    def validate_file_path(file_path: str) -> bool:
        """Report whether ``file_path`` can be handed to a parser.

        A path is valid when it names an existing regular file (not a
        directory) that is readable and no larger than ``MAX_FILE_SIZE``
        bytes.

        Args:
            file_path: Path to check. Values that are not ``str``,
                ``bytes`` or ``os.PathLike`` (for example ``None``) are
                rejected instead of raising.

        Returns:
            ``True`` if every check passes, otherwise ``False``.
        """
        if not isinstance(file_path, (str, bytes, os.PathLike)):
            return False

        try:
            # isfile() is already False for missing paths and directories.
            if not os.path.isfile(file_path):
                return False
            if not os.access(file_path, os.R_OK):
                return False
            return os.path.getsize(file_path) <= FileParser.MAX_FILE_SIZE
        except (OSError, ValueError):
            # e.g. the file vanished between the checks.
            return False
|