from langchain.document_loaders import (
    CSVLoader,
    PDFMinerLoader,
    TextLoader,
    UnstructuredEPubLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredPowerPointLoader,
    UnstructuredWordDocumentLoader,
    JSONLoader,
    DirectoryLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma, FAISS
import os
import glob
from args import args


def embed_data(embeddings_model_name, original_data_path, preprocessed_data_path):
    """Load the local FAISS vector store if it exists, otherwise build and save it."""
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)  # embedding model
    if does_vectorstore_exist(preprocessed_data_path):
        print('++++++++++++++++ Loading existing local knowledge base ++++++++++++++++')
        # The index already exists on disk, so load it.
        # db = Chroma(persist_directory=preprocessed_data_path, embedding_function=embeddings)
        db = FAISS.load_local(preprocessed_data_path, embeddings)
    else:
        print('++++++++++++++++ Building and saving local knowledge base ++++++++++++++++')
        # No index yet: load and split the raw documents, then persist the index locally.
        texts = load_data(original_data_path)  # load and split documents
        # db = Chroma.from_documents(texts, embeddings, persist_directory=preprocessed_data_path)
        db = FAISS.from_documents(texts, embeddings)
        db.save_local(preprocessed_data_path)
    return db


def load_data(original_data_path):
    """Load every supported file type from the data directory and split it into chunks."""
    data_map = {
        ".csv": CSVLoader,
        ".json": JSONLoader,
        ".doc": UnstructuredWordDocumentLoader,
        ".docx": UnstructuredWordDocumentLoader,
        ".txt": TextLoader,
        ".md": UnstructuredMarkdownLoader,
        ".epub": UnstructuredEPubLoader,
        ".pdf": PDFMinerLoader,
        ".ppt": UnstructuredPowerPointLoader,
        ".pptx": UnstructuredPowerPointLoader,
        ".html": UnstructuredHTMLLoader,
    }
    data_all = []
    for key, value in data_map.items():
        if key == ".csv":
            # CSV files in this corpus are GBK-encoded
            loader_kwargs = {"encoding": "gbk"}
        elif key == ".txt":
            # Let the text loader auto-detect the encoding
            loader_kwargs = {"autodetect_encoding": True}
        else:
            loader_kwargs = None
        loader = DirectoryLoader(
            original_data_path,
            glob=f"*{key}",
            show_progress=True,        # show a progress bar
            use_multithreading=True,   # load files in parallel
            loader_cls=value,
            silent_errors=True,        # skip files that fail to load
            loader_kwargs=loader_kwargs,
        )
        data_one = loader.load()
        data_all.append(data_one)
    # length_function: how chunk length is measured; chunk_size: maximum chunk size;
    # chunk_overlap: maximum overlap between adjacent chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        length_function=len,
    )
    texts = []
    for one in data_all:
        if not one:
            continue
        texts.extend(text_splitter.split_documents(one))
    return texts


# Rebuild the vector store from the raw documents and overwrite the saved index.
def dBgx(original_data_path, preprocessed_data_path, embeddings_model_name):
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)  # embedding model
    texts = load_data(original_data_path)  # load and split documents
    # db = Chroma.from_documents(texts, embeddings, persist_directory=preprocessed_data_path)
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(preprocessed_data_path)
    return db


def does_vectorstore_exist(persist_directory):
    """A saved FAISS index consists of an 'index.pkl' and an 'index.faiss' file."""
    return (os.path.exists(os.path.join(persist_directory, 'index.pkl'))
            and os.path.exists(os.path.join(persist_directory, 'index.faiss')))
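

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumptions not present in the module above: the two
# paths and the embedding model name are hypothetical placeholders, and the
# `args` module must supply `chunk_size` and `chunk_overlap` as used in
# `load_data`. Substitute your own values.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build the FAISS index on the first run, or load it on later runs.
    db = embed_data(
        embeddings_model_name="sentence-transformers/all-MiniLM-L6-v2",  # example model
        original_data_path="data/original",          # hypothetical source folder
        preprocessed_data_path="data/vectorstore",   # hypothetical index folder
    )
    # Retrieve the chunks most similar to a query from the knowledge base.
    for doc in db.similarity_search("What does this knowledge base contain?", k=3):
        print(doc.page_content[:200])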