You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Curriculum_Design/src/file_parser.py

166 lines
5.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
from typing import Union
class FileParser:
    """Extract plain text from ``.txt``, ``.docx`` and ``.pdf`` files.

    Every method is a ``@staticmethod``; the class only acts as a namespace.
    """

    # Largest file size (in bytes) that validate_file_path() accepts: 10 MB.
    MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a file and return its text, dispatching on the extension.

        Args:
            file_path: Path to a ``.txt``, ``.docx`` or ``.pdf`` file. The
                extension is matched case-insensitively.

        Returns:
            The extracted text.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            Exception: If the extension is unsupported or the format-specific
                parser fails. The original error is kept as ``__cause__``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.txt':
                return FileParser.parse_txt(file_path)
            if ext == '.docx':
                return FileParser.parse_docx(file_path)
            if ext == '.pdf':
                return FileParser.parse_pdf(file_path)
            raise ValueError(f"Unsupported file format: {ext}")
        except Exception as e:
            # Single wrapping point for all parser errors; chain the cause so
            # the underlying traceback is not lost.
            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a text file and return its content with ``\\n`` line endings.

        The encoding is chosen by ``detect_file_encoding``; bytes that still
        cannot be decoded are dropped (``errors='ignore'``).

        Args:
            file_path: Path to the text file.

        Returns:
            The decoded text with ``\\r\\n`` and ``\\r`` normalised to ``\\n``.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        encoding = FileParser.detect_file_encoding(file_path)
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalise every line ending to \n.
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def detect_file_encoding(file_path: str) -> str:
        """Guess the text encoding of a file.

        Candidates are tried in order: UTF-8 (``utf-8-sig`` when the file
        starts with a UTF-8 byte order mark, so the BOM is not returned as
        part of the text), GBK, GB2312. If none decodes the whole file and
        ``chardet`` is installed, its guess for the first 1024 bytes is used.

        Args:
            file_path: Path to the file to inspect.

        Returns:
            A codec name; ``'utf-8'`` when nothing better can be determined.
        """
        # Read the bytes once instead of reopening the file per candidate.
        with open(file_path, 'rb') as f:
            raw = f.read()

        # GBK is the common encoding on Chinese-locale Windows.
        candidates = ['utf-8', 'gbk', 'gb2312']
        if raw.startswith(b'\xef\xbb\xbf'):
            # With plain 'utf-8' the BOM would survive as a leading U+FEFF.
            candidates[0] = 'utf-8-sig'

        for candidate in candidates:
            try:
                raw.decode(candidate)
            except UnicodeDecodeError:
                continue
            return candidate

        # Optional fallback: chardet, when available.
        try:
            import chardet
        except ImportError:
            return 'utf-8'

        result = chardet.detect(raw[:1024])
        if result and result['encoding']:
            return result['encoding']
        return 'utf-8'

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Return the paragraph text of a ``.docx`` file, joined by ``\\n``.

        Only ``doc.paragraphs`` is read.

        Args:
            file_path: Path to the ``.docx`` file.

        Returns:
            All paragraph texts joined with newlines.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            ImportError: If ``python-docx`` is not installed.
            Exception: If the document cannot be opened or read.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        try:
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Return the text of every page of a PDF, each followed by ``\\n``.

        Args:
            file_path: Path to the ``.pdf`` file.

        Returns:
            The concatenated page texts.

        Raises:
            ValueError: If ``file_path`` fails ``validate_file_path``.
            ImportError: If ``PyPDF2`` is not installed.
            Exception: If the PDF cannot be opened or read.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against a None/empty result so a page without
                # extractable text cannot abort the whole document.
                page_texts = [
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                ]
            return "".join(page_texts)
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e

    @staticmethod
    def validate_file_path(file_path: str) -> bool:
        """Check that a path is an existing, readable, reasonably small file.

        Args:
            file_path: Path to check. Empty or ``None`` is treated as invalid.

        Returns:
            ``True`` if the path is a regular file, is readable, and is no
            larger than ``MAX_FILE_SIZE_BYTES``; ``False`` otherwise.
        """
        if not file_path:
            return False
        # isfile() is already False for missing paths and for directories.
        if not os.path.isfile(file_path):
            return False
        if not os.access(file_path, os.R_OK):
            return False
        try:
            file_size = os.path.getsize(file_path)
        except OSError:
            # The file vanished or became inaccessible after the checks above.
            return False
        return file_size <= FileParser.MAX_FILE_SIZE_BYTES