|
|
import os
|
|
|
from typing import Union
|
|
|
|
|
|
class FileParser:
    """Extract plain text from ``.txt``, ``.docx`` and ``.pdf`` files.

    Every method is a ``@staticmethod``; the class only serves as a namespace.
    """

    # Default upper bound (in bytes) on the size of a file accepted by
    # ``validate_file_path``: 10 MB.
    MAX_FILE_SIZE = 10 * 1024 * 1024

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse *file_path* with the parser matching its extension.

        Args:
            file_path: Path of the file to parse. The extension is compared
                case-insensitively against ``.txt``, ``.docx`` and ``.pdf``.

        Returns:
            The text extracted from the file.

        Raises:
            ValueError: If ``validate_file_path`` rejects the path.
            Exception: If the extension is unsupported or the selected parser
                fails. The original error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        # Extension -> parser dispatch table.
        parsers = {
            '.txt': FileParser.parse_txt,
            '.docx': FileParser.parse_docx,
            '.pdf': FileParser.parse_pdf,
        }

        try:
            parser = parsers.get(ext)
            if parser is None:
                raise ValueError(f"Unsupported file format: {ext}")
            return parser(file_path)
        except Exception as e:
            # Single error type for every parsing failure; chain the cause so
            # the underlying traceback is not lost.
            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a text file and return its content with ``\\n`` line endings.

        The encoding is chosen by ``detect_file_encoding``. Bytes that cannot
        be decoded with that encoding are dropped (``errors='ignore'``).

        Args:
            file_path: Path of the text file.

        Returns:
            The decoded content with ``\\r\\n`` and ``\\r`` replaced by ``\\n``.

        Raises:
            ValueError: If ``validate_file_path`` rejects the path.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        encoding = FileParser.detect_file_encoding(file_path)

        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalise every line ending to \n.
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def _can_decode(file_path: str, encoding: str) -> bool:
        """Return True if the whole file decodes strictly with *encoding*."""
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                f.read()
        except UnicodeDecodeError:
            return False
        return True

    @staticmethod
    def detect_file_encoding(file_path: str) -> str:
        """Guess the text encoding of *file_path*.

        Detection order:

        1. A byte-order mark at the start of the file (``utf-8-sig``,
           ``utf-16`` or ``utf-32``), so that the BOM is consumed by the
           decoder instead of leaking into the text.
        2. Strict trial decoding as ``utf-8``, then ``gbk`` (common on
           Chinese Windows), then ``gb2312``.
        3. ``chardet``, if it is installed and names a codec Python knows.
        4. ``'utf-8'`` as the final default.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            The name of a codec usable with ``open(..., encoding=...)``.
        """
        # BOM signatures. The UTF-32 marks come first because the UTF-32-LE
        # mark starts with the same two bytes as the UTF-16-LE mark.
        bom_table = (
            (b'\xff\xfe\x00\x00', 'utf-32'),
            (b'\x00\x00\xfe\xff', 'utf-32'),
            (b'\xef\xbb\xbf', 'utf-8-sig'),
            (b'\xff\xfe', 'utf-16'),
            (b'\xfe\xff', 'utf-16'),
        )
        with open(file_path, 'rb') as f:
            head = f.read(4)
        for bom, name in bom_table:
            if head.startswith(bom):
                return name

        # GB2312 is a subset of GBK; it stays in the list only to keep the
        # same candidate order as before.
        for candidate in ('utf-8', 'gbk', 'gb2312'):
            if FileParser._can_decode(file_path, candidate):
                return candidate

        # Optional dependency: fall through to the default when it is missing.
        try:
            import chardet
        except ImportError:
            return 'utf-8'

        import codecs

        with open(file_path, 'rb') as f:
            sample = f.read(64 * 1024)
        result = chardet.detect(sample)
        guess = result.get('encoding') if result else None
        if guess:
            try:
                # Only return names that open() will actually accept.
                codecs.lookup(guess)
            except LookupError:
                pass
            else:
                return guess

        return 'utf-8'

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Return the paragraph text of a ``.docx`` file joined by ``\\n``.

        Only ``Document.paragraphs`` is read.

        Args:
            file_path: Path of the ``.docx`` file.

        Returns:
            The text of every paragraph, one paragraph per line.

        Raises:
            ValueError: If ``validate_file_path`` rejects the path.
            ImportError: If ``python-docx`` is not installed.
            Exception: If the document cannot be opened or read. The original
                error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # python-docx is imported lazily so the class works without it for
        # the other formats.
        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Return the text of every page of a PDF, each followed by ``\\n``.

        Args:
            file_path: Path of the ``.pdf`` file.

        Returns:
            The concatenated page texts. A page for which ``extract_text()``
            yields nothing contributes just the newline.

        Raises:
            ValueError: If ``validate_file_path`` rejects the path.
            ImportError: If ``PyPDF2`` is not installed.
            Exception: If the PDF cannot be opened or read. The original
                error is attached as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        # PyPDF2 is imported lazily so the class works without it for the
        # other formats.
        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Extract while the file is still open. ``or ""`` guards
                # against a page whose extraction yields None.
                pages = [
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                ]
            return "".join(pages)
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e

    @staticmethod
    def validate_file_path(file_path: str, max_size: int = MAX_FILE_SIZE) -> bool:
        """Check that *file_path* names a readable regular file of sane size.

        Args:
            file_path: Path to check.
            max_size: Largest accepted size in bytes (inclusive). Defaults to
                ``MAX_FILE_SIZE`` (10 MB).

        Returns:
            True if the path is an existing regular file, is readable
            according to ``os.access``, and is not larger than *max_size*;
            False otherwise, including when *file_path* is not a path-like
            value.
        """
        # Reject None and other non-path values instead of letting os.path
        # raise TypeError.
        if not isinstance(file_path, (str, bytes, os.PathLike)):
            return False

        # isfile() is False for missing paths and for directories.
        if not os.path.isfile(file_path):
            return False

        if not os.access(file_path, os.R_OK):
            return False

        try:
            file_size = os.path.getsize(file_path)
        except OSError:
            # The file vanished or became inaccessible after the checks above.
            return False

        return file_size <= max_size