|
|
|
|
@ -19,7 +19,6 @@ from fastapi import (
|
|
|
|
|
File,
|
|
|
|
|
HTTPException,
|
|
|
|
|
UploadFile,
|
|
|
|
|
Form,
|
|
|
|
|
)
|
|
|
|
|
from pydantic import BaseModel, Field, field_validator
|
|
|
|
|
|
|
|
|
|
@ -748,212 +747,6 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
|
|
|
logger.error(f"Error deleting file {file_path}: {str(e)}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
async def pipeline_enqueue_file_uploadType(rag: LightRAG, file_path: Path, uploadType: Dict[str, Any] = None) -> bool:
|
|
|
|
|
"""将文件添加到处理队列,并包含元数据
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
rag: LightRAG实例
|
|
|
|
|
file_path: 要保存的文件路径
|
|
|
|
|
uploadType: 要与文档一起存储的元数据字典
|
|
|
|
|
返回:
|
|
|
|
|
bool: 如果文件成功入队则返回True,否则返回False
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
content = ""
|
|
|
|
|
ext = file_path.suffix.lower()
|
|
|
|
|
|
|
|
|
|
file = None
|
|
|
|
|
async with aiofiles.open(file_path, "rb") as f:
|
|
|
|
|
file = await f.read()
|
|
|
|
|
|
|
|
|
|
# 根据文件类型处理
|
|
|
|
|
match ext:
|
|
|
|
|
case (
|
|
|
|
|
".txt"
|
|
|
|
|
| ".md"
|
|
|
|
|
| ".html"
|
|
|
|
|
| ".htm"
|
|
|
|
|
| ".tex"
|
|
|
|
|
| ".json"
|
|
|
|
|
| ".xml"
|
|
|
|
|
| ".yaml"
|
|
|
|
|
| ".yml"
|
|
|
|
|
| ".rtf"
|
|
|
|
|
| ".odt"
|
|
|
|
|
| ".epub"
|
|
|
|
|
| ".csv"
|
|
|
|
|
| ".log"
|
|
|
|
|
| ".conf"
|
|
|
|
|
| ".ini"
|
|
|
|
|
| ".properties"
|
|
|
|
|
| ".sql"
|
|
|
|
|
| ".bat"
|
|
|
|
|
| ".sh"
|
|
|
|
|
| ".c"
|
|
|
|
|
| ".cpp"
|
|
|
|
|
| ".py"
|
|
|
|
|
| ".java"
|
|
|
|
|
| ".js"
|
|
|
|
|
| ".ts"
|
|
|
|
|
| ".swift"
|
|
|
|
|
| ".go"
|
|
|
|
|
| ".rb"
|
|
|
|
|
| ".php"
|
|
|
|
|
| ".css"
|
|
|
|
|
| ".scss"
|
|
|
|
|
| ".less"
|
|
|
|
|
):
|
|
|
|
|
try:
|
|
|
|
|
# 尝试以UTF-8解码
|
|
|
|
|
content = file.decode("utf-8")
|
|
|
|
|
|
|
|
|
|
# 验证内容
|
|
|
|
|
if not content or len(content.strip()) == 0:
|
|
|
|
|
logger.error(f"文件内容为空: {file_path.name}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# 检查内容是否看起来像二进制数据的字符串表示
|
|
|
|
|
if content.startswith("b'") or content.startswith('b"'):
|
|
|
|
|
logger.error(
|
|
|
|
|
f"文件 {file_path.name} 似乎包含二进制数据的字符串表示而不是文本"
|
|
|
|
|
)
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
except UnicodeDecodeError:
|
|
|
|
|
logger.error(
|
|
|
|
|
f"文件 {file_path.name} 不是有效的UTF-8编码文本。请在处理前将其转换为UTF-8。"
|
|
|
|
|
)
|
|
|
|
|
return False
|
|
|
|
|
case ".pdf":
|
|
|
|
|
if global_args.document_loading_engine == "DOCLING":
|
|
|
|
|
if not pm.is_installed("docling"): # type: ignore
|
|
|
|
|
pm.install("docling")
|
|
|
|
|
from docling.document_converter import DocumentConverter # type: ignore
|
|
|
|
|
|
|
|
|
|
converter = DocumentConverter()
|
|
|
|
|
result = converter.convert(file_path)
|
|
|
|
|
content = result.document.export_to_markdown()
|
|
|
|
|
else:
|
|
|
|
|
if not pm.is_installed("pypdf2"): # type: ignore
|
|
|
|
|
pm.install("pypdf2")
|
|
|
|
|
from PyPDF2 import PdfReader # type: ignore
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
pdf_file = BytesIO(file)
|
|
|
|
|
reader = PdfReader(pdf_file)
|
|
|
|
|
for page in reader.pages:
|
|
|
|
|
content += page.extract_text() + "\n"
|
|
|
|
|
case ".docx":
|
|
|
|
|
if global_args.document_loading_engine == "DOCLING":
|
|
|
|
|
if not pm.is_installed("docling"): # type: ignore
|
|
|
|
|
pm.install("docling")
|
|
|
|
|
from docling.document_converter import DocumentConverter # type: ignore
|
|
|
|
|
|
|
|
|
|
converter = DocumentConverter()
|
|
|
|
|
result = converter.convert(file_path)
|
|
|
|
|
content = result.document.export_to_markdown()
|
|
|
|
|
else:
|
|
|
|
|
if not pm.is_installed("python-docx"): # type: ignore
|
|
|
|
|
try:
|
|
|
|
|
pm.install("python-docx")
|
|
|
|
|
except Exception:
|
|
|
|
|
pm.install("docx")
|
|
|
|
|
from docx import Document # type: ignore
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
docx_file = BytesIO(file)
|
|
|
|
|
doc = Document(docx_file)
|
|
|
|
|
content = "\n".join(
|
|
|
|
|
[paragraph.text for paragraph in doc.paragraphs]
|
|
|
|
|
)
|
|
|
|
|
case ".pptx":
|
|
|
|
|
if global_args.document_loading_engine == "DOCLING":
|
|
|
|
|
if not pm.is_installed("docling"): # type: ignore
|
|
|
|
|
pm.install("docling")
|
|
|
|
|
from docling.document_converter import DocumentConverter # type: ignore
|
|
|
|
|
|
|
|
|
|
converter = DocumentConverter()
|
|
|
|
|
result = converter.convert(file_path)
|
|
|
|
|
content = result.document.export_to_markdown()
|
|
|
|
|
else:
|
|
|
|
|
if not pm.is_installed("python-pptx"): # type: ignore
|
|
|
|
|
pm.install("pptx")
|
|
|
|
|
from pptx import Presentation # type: ignore
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
pptx_file = BytesIO(file)
|
|
|
|
|
prs = Presentation(pptx_file)
|
|
|
|
|
for slide in prs.slides:
|
|
|
|
|
for shape in slide.shapes:
|
|
|
|
|
if hasattr(shape, "text"):
|
|
|
|
|
content += shape.text + "\n"
|
|
|
|
|
case ".xlsx":
|
|
|
|
|
if global_args.document_loading_engine == "DOCLING":
|
|
|
|
|
if not pm.is_installed("docling"): # type: ignore
|
|
|
|
|
pm.install("docling")
|
|
|
|
|
from docling.document_converter import DocumentConverter # type: ignore
|
|
|
|
|
|
|
|
|
|
converter = DocumentConverter()
|
|
|
|
|
result = converter.convert(file_path)
|
|
|
|
|
content = result.document.export_to_markdown()
|
|
|
|
|
else:
|
|
|
|
|
if not pm.is_installed("openpyxl"): # type: ignore
|
|
|
|
|
pm.install("openpyxl")
|
|
|
|
|
from openpyxl import load_workbook # type: ignore
|
|
|
|
|
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
xlsx_file = BytesIO(file)
|
|
|
|
|
wb = load_workbook(xlsx_file)
|
|
|
|
|
for sheet in wb:
|
|
|
|
|
content += f"Sheet: {sheet.title}\n"
|
|
|
|
|
for row in sheet.iter_rows(values_only=True):
|
|
|
|
|
content += (
|
|
|
|
|
"\t".join(
|
|
|
|
|
str(cell) if cell is not None else ""
|
|
|
|
|
for cell in row
|
|
|
|
|
)
|
|
|
|
|
+ "\n"
|
|
|
|
|
)
|
|
|
|
|
content += "\n"
|
|
|
|
|
case _:
|
|
|
|
|
logger.error(
|
|
|
|
|
f"不支持的文件类型: {file_path.name} (扩展名 {ext})"
|
|
|
|
|
)
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
# 将文件与元数据一起添加到RAG队列
|
|
|
|
|
if content:
|
|
|
|
|
# 检查内容是否只包含空白字符
|
|
|
|
|
if not content.strip():
|
|
|
|
|
logger.warning(
|
|
|
|
|
f"文件只包含空白字符。file_paths={file_path.name}"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# 使用带元数据的文档入队函数
|
|
|
|
|
await rag.apipeline_enqueue_documents_with_uploadType(
|
|
|
|
|
content,
|
|
|
|
|
file_paths=file_path.name,
|
|
|
|
|
uploadType=uploadType
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
logger.info(f"成功获取并入队文件: {file_path.name} 及其元数据")
|
|
|
|
|
return True
|
|
|
|
|
else:
|
|
|
|
|
logger.error(f"无法从文件中提取内容: {file_path.name}")
|
|
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error(f"处理或入队文件时出错 {file_path.name} 及其元数据: {str(e)}")
|
|
|
|
|
logger.error(traceback.format_exc())
|
|
|
|
|
finally:
|
|
|
|
|
# 如果是临时文件则删除
|
|
|
|
|
if file_path.name.startswith(temp_prefix):
|
|
|
|
|
try:
|
|
|
|
|
file_path.unlink()
|
|
|
|
|
except Exception as e:
|
|
|
|
|
logger.error(f"删除文件时出错 {file_path}: {str(e)}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def pipeline_index_file(rag: LightRAG, file_path: Path):
|
|
|
|
|
"""Index a file
|
|
|
|
|
@ -963,10 +756,6 @@ async def pipeline_index_file(rag: LightRAG, file_path: Path):
|
|
|
|
|
file_path: Path to the saved file
|
|
|
|
|
"""
|
|
|
|
|
try:
|
|
|
|
|
"""
|
|
|
|
|
pipeline_enqueue_file:该函数负责读取文件内容,根据文件类型进行不同的处理,然后将处理后的内容添加到rag队列中。
|
|
|
|
|
它支持多种文件类型,包括文本文件、PDF、Word文档、PowerPoint演示文稿、Excel电子表格等。
|
|
|
|
|
"""
|
|
|
|
|
if await pipeline_enqueue_file(rag, file_path):
|
|
|
|
|
await rag.apipeline_process_enqueue_documents()
|
|
|
|
|
|
|
|
|
|
@ -974,23 +763,6 @@ async def pipeline_index_file(rag: LightRAG, file_path: Path):
|
|
|
|
|
logger.error(f"Error indexing file {file_path.name}: {str(e)}")
|
|
|
|
|
logger.error(traceback.format_exc())
|
|
|
|
|
|
|
|
|
|
async def pipeline_index_file_uploadType(rag: LightRAG, file_path: Path, uploadType: Dict[str, Any] = None):
    """Enqueue a single file (with its uploadType metadata) and run the pipeline.

    Delegates extraction and queueing to ``pipeline_enqueue_file_uploadType``;
    only when enqueueing succeeds does it trigger processing of the queued
    documents. Errors are logged, never raised to the caller.

    Args:
        rag: LightRAG instance
        file_path: Path to the saved file
        uploadType: Optional dictionary containing uploadType for the document
    """
    try:
        enqueued = await pipeline_enqueue_file_uploadType(rag, file_path, uploadType)
        if enqueued:
            await rag.apipeline_process_enqueue_documents()
    except Exception as e:
        logger.error(f"Error indexing file {file_path.name} with uploadType: {str(e)}")
        logger.error(traceback.format_exc())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
|
|
|
|
|
"""Index multiple files sequentially to avoid high CPU load
|
|
|
|
|
@ -1272,7 +1044,7 @@ def create_document_routes(
|
|
|
|
|
"/upload", response_model=InsertResponse, dependencies=[Depends(combined_auth)]
|
|
|
|
|
)
|
|
|
|
|
async def upload_to_input_dir(
|
|
|
|
|
background_tasks: BackgroundTasks, file: UploadFile = File(...), field_type: str=Form(None) ,relation_type: str=Form(None)
|
|
|
|
|
background_tasks: BackgroundTasks, file: UploadFile = File(...)
|
|
|
|
|
):
|
|
|
|
|
"""
|
|
|
|
|
Upload a file to the input directory and index it.
|
|
|
|
|
@ -1314,13 +1086,7 @@ def create_document_routes(
|
|
|
|
|
shutil.copyfileobj(file.file, buffer)
|
|
|
|
|
|
|
|
|
|
# Add to background tasks
|
|
|
|
|
# 创建元数据字典存储额外参数
|
|
|
|
|
uploadType = {
|
|
|
|
|
"field_type": field_type,
|
|
|
|
|
"relation_type": relation_type
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
background_tasks.add_task(pipeline_index_file_uploadType, rag, file_path, uploadType)
|
|
|
|
|
background_tasks.add_task(pipeline_index_file, rag, file_path)
|
|
|
|
|
|
|
|
|
|
return InsertResponse(
|
|
|
|
|
status="success",
|
|
|
|
|
@ -1711,7 +1477,6 @@ def create_document_routes(
|
|
|
|
|
error=doc_status.error,
|
|
|
|
|
metadata=doc_status.metadata,
|
|
|
|
|
file_path=doc_status.file_path,
|
|
|
|
|
uploadType=doc_status.uploadType,
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
return response
|