You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Curriculum_Design/src/file_parser.py

341 lines
12 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import os
import zipfile
import tempfile
from typing import Union, List, Tuple
class FileParser:
    """Convert supported documents (.txt, .docx, .pdf, .html) to plain text.

    Every method is a static method; the class only serves as a namespace.
    """

    # Files larger than this many bytes are rejected by validate_file_path.
    MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB

    # Image extensions collected by extract_images_from_docx.
    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # Byte-order marks mapped to a codec that consumes the mark while decoding.
    # The UTF-32 LE mark begins with the UTF-16 LE mark, so UTF-32 comes first.
    _BOM_ENCODINGS = (
        (b'\xff\xfe\x00\x00', 'utf-32'),
        (b'\x00\x00\xfe\xff', 'utf-32'),
        (b'\xef\xbb\xbf', 'utf-8-sig'),
        (b'\xff\xfe', 'utf-16'),
        (b'\xfe\xff', 'utf-16'),
    )

    @staticmethod
    def _select_parser(ext: str):
        """Return the parse function for a lower-cased extension, or None."""
        return {
            '.txt': FileParser.parse_txt,
            '.docx': FileParser.parse_docx,
            '.pdf': FileParser.parse_pdf,
            '.html': FileParser.parse_html,
        }.get(ext)

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a file and return its text content.

        Args:
            file_path: Path of the file to parse.

        Returns:
            The extracted text.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
            Exception: If the extension is unsupported or the underlying
                parser fails; the original error is attached as the cause.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        try:
            parser = FileParser._select_parser(ext)
            if parser is None:
                raise ValueError(f"Unsupported file format: {ext}")
            return parser(file_path)
        except Exception as e:
            # Single error type/message for callers; keep the cause for debugging.
            raise Exception(f"Error parsing file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_and_convert_to_txt(file_path: str, output_dir: Union[str, None] = None) -> dict:
        """Parse a file and write its text to a newly created .txt file.

        Args:
            file_path: Path of the input file.
            output_dir: Directory in which the .txt file is created (it is
                created if missing). If None or empty, the system temporary
                directory is used.

        Returns:
            dict: On success:
                - 'success': True
                - 'txt_path': path of the generated .txt file
                - 'images': list of (file name, binary data) tuples; only
                  filled for .docx input, otherwise empty
                - 'content': the extracted text
                - 'original_ext': lower-cased extension of the input file
                - 'is_temp_file': always True
              On failure:
                - 'success': False
                - 'error': error message

        Note:
            The generated file is created with ``delete=False`` and is NOT
            removed automatically; the caller is responsible for deleting it.
        """
        try:
            if not FileParser.validate_file_path(file_path):
                return {
                    'success': False,
                    'error': f"Invalid file path: {file_path}"
                }

            _, ext = os.path.splitext(file_path)
            ext = ext.lower()

            parser = FileParser._select_parser(ext)
            if parser is None:
                return {
                    'success': False,
                    'error': f"Unsupported file format: {ext}"
                }

            content = parser(file_path)
            # Image extraction is only implemented for DOCX; PDF and HTML
            # images are skipped for now.
            if ext == '.docx':
                images = FileParser.extract_images_from_docx(file_path)
            else:
                images = []

            # Honor the requested output directory.
            target_dir = output_dir or None
            if target_dir is not None:
                os.makedirs(target_dir, exist_ok=True)

            base_name = os.path.splitext(os.path.basename(file_path))[0]
            with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8',
                                             suffix=f'_{base_name}_converted.txt',
                                             dir=target_dir,
                                             delete=False) as temp_file:
                temp_file.write(content)
                txt_path = temp_file.name

            return {
                'success': True,
                'txt_path': txt_path,
                'images': images,
                'content': content,
                'original_ext': ext,
                'is_temp_file': True  # marks the file as a generated temp file
            }
        except Exception as e:
            # This entry point reports failures in the result dict, never raises.
            return {
                'success': False,
                'error': str(e)
            }

    @staticmethod
    def parse_txt(file_path: str) -> str:
        """Read a text file and return its content with '\\n' line endings.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        encoding = FileParser.detect_file_encoding(file_path)
        # errors='ignore': undecodable bytes are dropped instead of raising.
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        # Normalize all line endings to \n.
        return content.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def detect_file_encoding(file_path: str) -> str:
        """Guess the text encoding of a file.

        Order of checks: byte-order mark, then strict decoding as UTF-8, GBK
        and GB2312, then chardet (if installed) on the first 1024 bytes.

        Returns:
            A codec name usable with ``open(..., encoding=...)``; 'utf-8' when
            nothing else matched.
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read()

        # A BOM is decisive; the returned codecs strip it while decoding so it
        # does not leak into the text as '\ufeff'.
        for bom, bom_encoding in FileParser._BOM_ENCODINGS:
            if raw_data.startswith(bom):
                return bom_encoding

        # UTF-8 first, then the encodings common on Chinese Windows systems.
        for candidate in ('utf-8', 'gbk', 'gb2312'):
            try:
                raw_data.decode(candidate)
            except UnicodeDecodeError:
                continue
            return candidate

        # chardet is optional; fall back to UTF-8 when it is not installed.
        try:
            import chardet
        except ImportError:
            return 'utf-8'

        result = chardet.detect(raw_data[:1024])
        if result and result['encoding']:
            return result['encoding']
        return 'utf-8'

    @staticmethod
    def extract_images_from_docx(file_path: str) -> List[Tuple[str, bytes]]:
        """Extract embedded images from a Word document.

        Args:
            file_path: Path of the .docx file.

        Returns:
            A list of (image file name, image binary data) tuples for the
            non-empty png/jpg/jpeg/gif/bmp entries under ``word/media/``.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
            Exception: If the archive cannot be read; the original error is
                attached as the cause.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        images = []
        try:
            # A .docx file is a ZIP archive and can be opened directly.
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                for entry in zip_file.infolist():
                    entry_name = entry.filename
                    if not entry_name.startswith('word/media/') or entry.file_size <= 0:
                        continue
                    image_ext = os.path.splitext(entry_name)[1].lower()
                    if image_ext in FileParser._IMAGE_EXTENSIONS:
                        images.append((os.path.basename(entry_name),
                                       zip_file.read(entry_name)))
            return images
        except Exception as e:
            raise Exception(f"Error extracting images from docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_docx(file_path: str) -> str:
        """Extract the paragraph text of a DOCX file, keeping blank lines.

        Whitespace-only paragraphs become empty lines; paragraphs are joined
        with '\\n'.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
            ImportError: If python-docx is not installed.
            Exception: If the document cannot be parsed.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        try:
            from docx import Document
        except ImportError as e:
            raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") from e

        try:
            doc = Document(file_path)
            paragraph_texts = (paragraph.text for paragraph in doc.paragraphs)
            lines = [text if text.strip() else "" for text in paragraph_texts]
            return '\n'.join(lines)
        except Exception as e:
            raise Exception(f"Error parsing docx file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_pdf(file_path: str) -> str:
        """Extract the text of a PDF file, one blank line between pages.

        Pages for which no text is extracted are skipped, so the result has
        no leading, trailing or doubled page separators.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
            ImportError: If PyPDF2 is not installed.
            Exception: If the PDF cannot be parsed.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        try:
            import PyPDF2
        except ImportError as e:
            raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") from e

        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Text must be extracted while the file is still open.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            return "\n\n".join(text for text in page_texts if text)
        except Exception as e:
            raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") from e

    @staticmethod
    def parse_html(file_path: str) -> str:
        """Extract the visible text of an HTML file.

        ``<script>`` and ``<style>`` elements are removed; the remaining text
        is split into lines and into phrases separated by two spaces, blank
        pieces are dropped, and the rest is joined with '\\n'.

        Raises:
            ValueError: If ``file_path`` fails validate_file_path.
            ImportError: If beautifulsoup4 is not installed.
            Exception: If the file cannot be read or parsed.
        """
        if not FileParser.validate_file_path(file_path):
            raise ValueError(f"Invalid file path: {file_path}")

        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError("BeautifulSoup4 library is required for parsing .html files. Please install it using 'pip install beautifulsoup4'") from e

        try:
            encoding = FileParser.detect_file_encoding(file_path)
            with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
                html_content = f.read()

            soup = BeautifulSoup(html_content, 'html.parser')
            for tag in soup(["script", "style"]):
                tag.decompose()

            text = soup.get_text()

            # Split phrases on a DOUBLE space so ordinary sentences stay on one
            # line; splitting on a single space would put every word on its
            # own line. Spelled as " " * 2 so whitespace-collapsing tools
            # cannot silently change it.
            phrase_separator = " " * 2
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip()
                      for line in lines
                      for phrase in line.split(phrase_separator))
            return '\n'.join(chunk for chunk in chunks if chunk)
        except Exception as e:
            raise Exception(f"Error parsing html file {file_path}: {str(e)}") from e

    @staticmethod
    def validate_file_path(file_path: str) -> bool:
        """Return True if ``file_path`` is a readable regular file within the size limit.

        Returns False (never raises) for None or other non-path values,
        missing paths, directories, unreadable files, and files larger than
        ``FileParser.MAX_FILE_SIZE`` bytes.
        """
        # Reject None, ints, etc. up front; the os.path functions would raise
        # TypeError or treat an int as a file descriptor.
        if not isinstance(file_path, (str, bytes, os.PathLike)):
            return False

        try:
            # isfile() is False for missing paths, so no separate exists() check.
            if not os.path.isfile(file_path):
                return False
            if not os.access(file_path, os.R_OK):
                return False
            return os.path.getsize(file_path) <= FileParser.MAX_FILE_SIZE
        except (OSError, ValueError):
            # E.g. the file disappeared between the checks.
            return False