From 8a07ae41013d3276db79ad7ad4feef2d9b827e6e Mon Sep 17 00:00:00 2001 From: mamingyi <80972090@qq.com> Date: Thu, 9 Oct 2025 11:38:00 +0800 Subject: [PATCH 1/2] 1111 --- src/utils/helper_functions.py | 47 ++++++++++++++++++++++++++---- src/utils/test_helper_functions.py | 29 ++++++++++++++++++ 2 files changed, 70 insertions(+), 6 deletions(-) create mode 100644 src/utils/test_helper_functions.py diff --git a/src/utils/helper_functions.py b/src/utils/helper_functions.py index af3cb2f..db80af9 100644 --- a/src/utils/helper_functions.py +++ b/src/utils/helper_functions.py @@ -11,8 +11,21 @@ class Utils: - 尝试多种编码格式 - 返回最可能的编码 """ - # TODO: 实现编码检测逻辑 - pass + import chardet + + # 读取文件的前1024字节用于编码检测 + with open(file_path, 'rb') as f: + raw_data = f.read(1024) + + # 使用chardet检测编码 + result = chardet.detect(raw_data) + encoding = result['encoding'] + + # 如果chardet无法确定编码,则默认使用utf-8 + if encoding is None: + encoding = 'utf-8' + + return encoding @staticmethod def format_file_size(size_bytes: int) -> str: @@ -20,9 +33,25 @@ class Utils: 格式化文件大小 - 将字节数转换为可读格式 - 返回格式化字符串 + 参数: + size_bytes (int): 需要格式化的文件大小,单位为字节 + 返回: + str: 格式化后的文件大小字符串,如 "1.5 MB" """ - # TODO: 实现文件大小格式化逻辑 - pass + # 如果文件大小为0字节,直接返回 "0 B" + if size_bytes == 0: + return "0 B" + + # 定义文件大小单位列表 + size_names = ["B", "KB", "MB", "GB", "TB"] + i = 0 + # 当文件大小大于等于1024且未到达最大单位时,循环除以1024 + while size_bytes >= 1024.0 and i < len(size_names) - 1: + size_bytes /= 1024.0 + i += 1 + + # 返回格式化后的字符串,保留一位小数 + return f"{size_bytes:.1f} {size_names[i]}" @staticmethod def calculate_file_hash(file_path: str) -> str: @@ -31,5 +60,11 @@ class Utils: - 使用SHA256算法 - 返回哈希字符串 """ - # TODO: 实现文件哈希计算逻辑 - pass \ No newline at end of file + sha256_hash = hashlib.sha256() + + # 分块读取文件以避免大文件占用过多内存 + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256_hash.update(chunk) + + return sha256_hash.hexdigest() \ No newline at end of file diff --git a/src/utils/test_helper_functions.py b/src/utils/test_helper_functions.py new file mode 100644 index 0000000..e73ec07 --- /dev/null +++ b/src/utils/test_helper_functions.py @@ -0,0 +1,29 @@ +# test_helper_functions.py +import os +from helper_functions import Utils + +# 创建一个测试文件 +test_content = "这是一个测试文件,用于验证工具函数。\nThis is a test file to verify utility functions." +test_file_path = "test_file.txt" + +# 写入测试文件 +with open(test_file_path, "w", encoding="utf-8") as f: + f.write(test_content) + +# 测试文件大小格式化 +file_size = os.path.getsize(test_file_path) +formatted_size = Utils.format_file_size(file_size) +print(f"文件大小: {file_size} 字节") +print(f"格式化大小: {formatted_size}") + +# 测试文件编码检测 +detected_encoding = Utils.detect_encoding(test_file_path) +print(f"检测到的编码: {detected_encoding}") + +# 测试文件哈希计算 +file_hash = Utils.calculate_file_hash(test_file_path) +print(f"文件哈希值: {file_hash}") + +# 清理测试文件 +os.remove(test_file_path) +print("测试完成,临时文件已清理。") \ No newline at end of file -- 2.34.1 From 8a56312ba9ed63d7c12cee2c2087c8c56436556d Mon Sep 17 00:00:00 2001 From: mamingyi <80972090@qq.com> Date: Sat, 11 Oct 2025 18:22:36 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E9=A9=AC=E6=98=8E=E4=B9=89=E5=AE=8C?= =?UTF-8?q?=E6=AF=95=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- src/file_parser.py | 156 +++++++++++++++++++++-------- src/utils/test_helper_functions.py | 29 ------ 3 files changed, 118 insertions(+), 69 deletions(-) delete mode 100644 src/utils/test_helper_functions.py diff --git a/requirements.txt b/requirements.txt index 96c661b..17e7112 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,4 @@ PyQt5>=5.15.0 requests>=2.25.1 beautifulsoup4>=4.11.0 pillow>=9.0.0 -chardet>=4.0.0 \ No newline at end of file +chardet>=4.0.0 diff --git a/src/file_parser.py b/src/file_parser.py index 84b3c04..fc44729 100644 --- a/src/file_parser.py +++ b/src/file_parser.py @@ -4,54 +4,132 @@ from typing import Union class FileParser: @staticmethod def parse_file(file_path: str) -> str: - """ - 主解析函数,根据文件扩展名路由到具体解析器 - - 调用validate_file_path(file_path)验证路径 - - 根据扩展名调用对应解析函数 - - 统一异常处理 - """ - # TODO: 实现主解析函数逻辑 - pass + + # 验证文件路径 + if not FileParser.validate_file_path(file_path): + raise ValueError(f"Invalid file path: {file_path}") + + # 获取文件扩展名 + _, ext = os.path.splitext(file_path) + ext = ext.lower() + + # 根据扩展名调用对应的解析函数 + try: + if ext == '.txt': + return FileParser.parse_txt(file_path) + elif ext == '.docx': + return FileParser.parse_docx(file_path) + elif ext == '.pdf': + return FileParser.parse_pdf(file_path) + else: + raise ValueError(f"Unsupported file format: {ext}") + except Exception as e: + # 统一异常处理 + raise Exception(f"Error parsing file {file_path}: {str(e)}") @staticmethod def parse_txt(file_path: str) -> str: - """ - 解析纯文本文件 - - 自动检测编码(utf-8, gbk等) - - 处理不同换行符 - - 返回纯文本内容 - """ - # TODO: 实现实现txt文件解析逻辑 - pass + # 验证文件路径 + if not FileParser.validate_file_path(file_path): + raise ValueError(f"Invalid file path: {file_path}") + + # 导入工具函数来检测编码 + try: + from utils.helper_functions import Utils + except ImportError: + # 如果无法导入,使用默认方法检测编码 + import chardet + with open(file_path, 'rb') as f: + raw_data = f.read(1024) + encoding = chardet.detect(raw_data)['encoding'] or 'utf-8' + else: + # 使用工具函数检测编码 + encoding = Utils.detect_encoding(file_path) + + # 读取文件内容 + with open(file_path, 'r', encoding=encoding, errors='ignore') as f: + content = f.read() + + # 统一换行符为\n + content = content.replace('\r\n', '\n').replace('\r', '\n') + + return content @staticmethod def parse_docx(file_path: str) -> str: - """ - 解析Word文档 - - 提取所有段落文本 - - 保留基本格式(换行) - - 忽略图片、表格等非文本元素 - """ - # TODO: 实现docx文件解析逻辑 - pass + + # 验证文件路径 + if not FileParser.validate_file_path(file_path): + raise ValueError(f"Invalid file path: {file_path}") + + # 尝试导入python-docx库 + try: + from docx import Document + except ImportError: + raise ImportError("python-docx library is required for parsing .docx files. Please install it using 'pip install python-docx'") + + # 打开并解析docx文件 + try: + doc = Document(file_path) + + # 提取所有段落文本 + paragraphs = [] + for paragraph in doc.paragraphs: + paragraphs.append(paragraph.text) + + # 用换行符连接所有段落 + content = '\n'.join(paragraphs) + + return content + except Exception as e: + raise Exception(f"Error parsing docx file {file_path}: {str(e)}") @staticmethod def parse_pdf(file_path: str) -> str: - """ - 解析PDF文档 - - 提取文本内容 - - 保留基本格式(换行) - """ - # TODO: 实现PDF文件解析逻辑 - pass + + # 验证文件路径 + if not FileParser.validate_file_path(file_path): + raise ValueError(f"Invalid file path: {file_path}") + + # 尝试导入PyPDF2库 + try: + import PyPDF2 + except ImportError: + raise ImportError("PyPDF2 library is required for parsing .pdf files. Please install it using 'pip install PyPDF2'") + + # 打开并解析pdf文件 + try: + content = "" + with open(file_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + + # 提取每一页的文本 + for page in pdf_reader.pages: + content += page.extract_text() + content += "\n" + + return content + except Exception as e: + raise Exception(f"Error parsing pdf file {file_path}: {str(e)}") @staticmethod def validate_file_path(file_path: str) -> bool: - """ - 验证文件路径的有效性 - - 检查文件是否存在 - - 检查文件是否可读 - - 检查文件大小是否合理 - """ - # TODO: 实现文件路径验证逻辑 - pass \ No newline at end of file + + # 检查文件是否存在 + if not os.path.exists(file_path): + return False + + # 检查是否为文件(而非目录) + if not os.path.isfile(file_path): + return False + + # 检查文件是否可读 + if not os.access(file_path, os.R_OK): + return False + + # 检查文件大小是否合理(小于10MB) + file_size = os.path.getsize(file_path) + if file_size > 10 * 1024 * 1024: # 10MB + return False + + return True \ No newline at end of file diff --git a/src/utils/test_helper_functions.py b/src/utils/test_helper_functions.py deleted file mode 100644 index e73ec07..0000000 --- a/src/utils/test_helper_functions.py +++ /dev/null @@ -1,29 +0,0 @@ -# test_helper_functions.py -import os -from helper_functions import Utils - -# 创建一个测试文件 -test_content = "这是一个测试文件,用于验证工具函数。\nThis is a test file to verify utility functions." -test_file_path = "test_file.txt" - -# 写入测试文件 -with open(test_file_path, "w", encoding="utf-8") as f: - f.write(test_content) - -# 测试文件大小格式化 -file_size = os.path.getsize(test_file_path) -formatted_size = Utils.format_file_size(file_size) -print(f"文件大小: {file_size} 字节") -print(f"格式化大小: {formatted_size}") - -# 测试文件编码检测 -detected_encoding = Utils.detect_encoding(test_file_path) -print(f"检测到的编码: {detected_encoding}") - -# 测试文件哈希计算 -file_hash = Utils.calculate_file_hash(test_file_path) -print(f"文件哈希值: {file_hash}") - -# 清理测试文件 -os.remove(test_file_path) -print("测试完成,临时文件已清理。") \ No newline at end of file -- 2.34.1