From f6921886795625705cc7ef506c60e3a337bb39bf Mon Sep 17 00:00:00 2001
From: Frieren <1692219062wang@gmail.com>
Date: Sat, 18 May 2024 00:32:32 +0800
Subject: [PATCH] complete RAG, History RAG
---
.config/config.properties | 2 +-
.idea/workspace.xml | 50 ++++++++++++++++++----
requirements.txt | 14 +++----
src/base/PdfLoader.py | 15 -------
src/base/README.md | 10 -----
src/base/ZhuPuAiEmbeddings.py | 47 +++++++++++++++++++++
src/base/chinese_text_splitter.py | 59 --------------------------
src/base/embedding.py | 26 ++++++++++++
src/serve/HistoryRAG.py | 66 +++++++++++++++++++++++++++++
src/serve/RAG.py | 69 +++++++++++++++++++++----------
src/serve/embedding.py | 28 -------------
11 files changed, 238 insertions(+), 148 deletions(-)
delete mode 100644 src/base/PdfLoader.py
delete mode 100644 src/base/README.md
create mode 100644 src/base/ZhuPuAiEmbeddings.py
delete mode 100644 src/base/chinese_text_splitter.py
create mode 100644 src/base/embedding.py
create mode 100644 src/serve/HistoryRAG.py
delete mode 100644 src/serve/embedding.py
diff --git a/.config/config.properties b/.config/config.properties
index 408d0f0..c33e868 100644
--- a/.config/config.properties
+++ b/.config/config.properties
@@ -1 +1 @@
-API_KEY=9d2d36b2c8b20bb329fd44fe058b7ac2.UF14LzIdcAH2Ob21
\ No newline at end of file
+API_KEY=86df1c93a84174062eaf6a38e331efc1.AJV5y2jkKASbZ3YD
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 47574ce..04ac642 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -4,7 +4,11 @@
-
+
+
+
+
+
@@ -26,6 +30,20 @@
+ {
+ "lastFilter": {
+ "state": "OPEN",
+ "assignee": "YDzzz"
+ }
+}
+
+
+
@@ -42,11 +60,13 @@
+
+
+
+
+
+
@@ -109,7 +135,15 @@
1715530309702
-
+
+
+ 1715963552332
+
+
+
+ 1715963552332
+
+
@@ -126,7 +160,7 @@
-
+
@@ -141,10 +175,12 @@
-
+
+
-
-
+
+
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f5aee2c..74d3fdb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
-langchain~=0.1.19
-httpx
-pyjwt
-transformers
-text2vec~=1.2.9
-chardet
-pypdf~=4.2.0
\ No newline at end of file
+langchain>=0.1.20
+pypdf>=4.2.0
+zhipuai~=2.0.1.20240423.1
+langchainhub~=0.1.15
+chromadb
+httpx-sse~=0.4.0
+chardet~=5.2.0
\ No newline at end of file
diff --git a/src/base/PdfLoader.py b/src/base/PdfLoader.py
deleted file mode 100644
index 25bdd12..0000000
--- a/src/base/PdfLoader.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from langchain_community.document_loaders import PyPDFLoader
-
-
-class PDFLoader:
- @staticmethod
- def loader(file_path: str):
- content = ""
- loader = PyPDFLoader(file_path)
- for page in loader.load():
- content += page.page_content
- return content
-
-
-if __name__ == '__main__':
- print(PDFLoader.loader("C:\\Users\\16922\\Desktop\\文档1.pdf"))
diff --git a/src/base/README.md b/src/base/README.md
deleted file mode 100644
index 81bde49..0000000
--- a/src/base/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-## Document Loader Interface
-| Method Name | Explanation |
-|-------------|-----------------------------------------------------------------------------------------------------------------------------|
-| lazy_load | Used to load documents one by one lazily. Use for production code. |
-| alazy_load | Async variant of lazy_load |
-| load | Used to load all the documents into memory eagerly. Use for prototyping or interactive work. |
-| aload | Used to load all the documents into memory eagerly. Use for prototyping or interactive work. Added in 2024-04 to LangChain. |
-- The load methods is a convenience method meant solely for prototyping work – it just invokes list(self.lazy_load()).
-- The alazy_load has a default implementation that will delegate to lazy_load. If you’re using async, we recommend overriding the default implementation and providing a native async implementation.
-
diff --git a/src/base/ZhuPuAiEmbeddings.py b/src/base/ZhuPuAiEmbeddings.py
new file mode 100644
index 0000000..5fbb914
--- /dev/null
+++ b/src/base/ZhuPuAiEmbeddings.py
@@ -0,0 +1,47 @@
+import os
+from abc import ABC
+from typing import List
+
+from zhipuai import ZhipuAI
+
+from src.init.property import Property
+
+from langchain_core.embeddings import Embeddings
+
+os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
+client = ZhipuAI()
+
+
+def _text_qualify(embedding_text):
+    """Embed one piece of text with the ZhipuAI ``embedding-2`` model.
+
+    :param embedding_text: a plain ``str``, or any object exposing the text
+        to embed through a ``page_content`` attribute.
+    :return: ``response.data[0].embedding`` for the submitted text.
+    """
+    # isinstance (not ``type(...) ==``) so that str subclasses are sent as
+    # text instead of being probed for a ``page_content`` attribute.
+    if isinstance(embedding_text, str):
+        e_t = embedding_text
+    else:
+        e_t = embedding_text.page_content
+    # Token usage, if needed, is available as response.usage.total_tokens.
+    response = client.embeddings.create(
+        model="embedding-2",
+        input=e_t,
+    )
+    return response.data[0].embedding
+
+
+class ZhuPuAiEmbedding(Embeddings, ABC):
+    """LangChain ``Embeddings`` adapter that delegates to ``_text_qualify``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        # One embedding request per input; results stay in input order.
+        return [_text_qualify(item) for item in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        return _text_qualify(text)
diff --git a/src/base/chinese_text_splitter.py b/src/base/chinese_text_splitter.py
deleted file mode 100644
index 414fd41..0000000
--- a/src/base/chinese_text_splitter.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from langchain.text_splitter import CharacterTextSplitter
-import re
-from typing import List
-
-
-class ChineseTextSplitter(CharacterTextSplitter):
- def __init__(self, pdf: bool = False, sentence_size: int = 250, **kwargs):
- super().__init__(**kwargs)
- self.pdf = pdf
- self.sentence_size = sentence_size
-
- def split_text1(self, text: str) -> List[str]:
- if self.pdf:
- text = re.sub(r"\n{3,}", "\n", text)
- text = re.sub('\s', ' ', text)
- text = text.replace("\n\n", "")
- sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :;
- sent_list = []
- for ele in sent_sep_pattern.split(text):
- if sent_sep_pattern.match(ele) and sent_list:
- sent_list[-1] += ele
- elif ele:
- sent_list.append(ele)
- return sent_list
-
- def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑
- if self.pdf:
- text = re.sub(r"\n{3,}", r"\n", text)
- text = re.sub('\s', " ", text)
- text = re.sub("\n\n", "", text)
-
- text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符
- text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号
- text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号
- text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
- # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
- text = text.rstrip() # 段尾如果有多余的\n就去掉它
- # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
- ls = [i for i in text.split("\n") if i]
- for ele in ls:
- if len(ele) > self.sentence_size:
- ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
- ele1_ls = ele1.split("\n")
- for ele_ele1 in ele1_ls:
- if len(ele_ele1) > self.sentence_size:
- ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
- ele2_ls = ele_ele2.split("\n")
- for ele_ele2 in ele2_ls:
- if len(ele_ele2) > self.sentence_size:
- ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
- ele2_id = ele2_ls.index(ele_ele2)
- ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
- ele2_id + 1:]
- ele_id = ele1_ls.index(ele_ele1)
- ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
-
- id = ls.index(ele)
- ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
- return ls
\ No newline at end of file
diff --git a/src/base/embedding.py b/src/base/embedding.py
new file mode 100644
index 0000000..02fb86a
--- /dev/null
+++ b/src/base/embedding.py
@@ -0,0 +1,26 @@
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+from src.base.ZhuPuAiEmbeddings import ZhuPuAiEmbedding
+
+
+class SentenceEmbedding:
+    """Load a PDF, split it into overlapping chunks and index them in Chroma."""
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
+        chunks = splitter.split_documents(PyPDFLoader(file_path).load())
+        self.vectorstore = Chroma.from_documents(chunks, ZhuPuAiEmbedding())
+
+    def get_vectorstore(self):
+        return self.vectorstore
+
+    def search(self, query: str) -> str:
+        hits = self.vectorstore.similarity_search(query)
+        return hits[0].page_content if hits else ""  # no hit: "" instead of IndexError
+
+
+if __name__ == '__main__':
+ a = SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf")  # NOTE(review): hard-coded, machine-specific path
+ print(a.search("We will dedicate a segment of our project to discussing these ethical issues"))
diff --git a/src/serve/HistoryRAG.py b/src/serve/HistoryRAG.py
new file mode 100644
index 0000000..2fe1a98
--- /dev/null
+++ b/src/serve/HistoryRAG.py
@@ -0,0 +1,66 @@
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains.history_aware_retriever import create_history_aware_retriever
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain_core.messages import HumanMessage
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+
+from src.serve.RAG import RAG
+
+__contextualize_q_system_prompt = """Given a chat history and the latest user question \
+which might reference context in the chat history, formulate a standalone question \
+which can be understood without the chat history. Do NOT answer the question, \
+just reformulate it if needed and otherwise return it as is."""
+_contextualize_q_prompt = ChatPromptTemplate.from_messages(
+ [
+ ("system", __contextualize_q_system_prompt),
+ MessagesPlaceholder("chat_history"),
+ ("human", "{input}"),
+ ]
+)
+
+__qa_system_prompt = """You are an assistant for question-answering tasks. \
+Use the following pieces of retrieved context to answer the question. \
+If you don't know the answer, just say that you don't know. \
+Use three sentences maximum and keep the answer concise.\
+
+{context}"""
+_qa_prompt = ChatPromptTemplate.from_messages(
+ [
+ ("system", __qa_system_prompt),
+ MessagesPlaceholder("chat_history"),
+ ("human", "{input}"),
+ ]
+)
+
+
+class HistoryRAG(RAG):
+    """RAG variant whose chain also receives the accumulated chat history."""
+
+    def __init__(self, file_path):
+        super().__init__(file_path)
+        self.__chat_history = []
+        history_aware_retriever = create_history_aware_retriever(
+            HistoryRAG._llm, self._retriever, _contextualize_q_prompt
+        )
+        question_answer_chain = create_stuff_documents_chain(HistoryRAG._llm, _qa_prompt)
+        self.__rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+    def get_chat(self, question: str):
+        from langchain_core.messages import AIMessage  # only needed to record the reply
+        answer = self.__rag_chain.invoke({"input": question, "chat_history": self.__chat_history})["answer"]
+        # Record the reply as an AIMessage: a bare str in the history is not an AI turn.
+        self.__chat_history.extend([HumanMessage(content=question), AIMessage(content=answer)])
+        return answer
+
+    def clear_history(self):
+        self.__chat_history.clear()
+
+    def select_prompt(self, prompt_index: int = 1):
+        """No-op override: this chain always uses the module-level prompts."""
+
+
+if __name__ == '__main__':
+ hr = HistoryRAG("C:\\Users\\16922\\Desktop\\文档1.pdf")  # NOTE(review): hard-coded, machine-specific path
+ print(hr.get_chat("what can Multimodal Agent AI systems do?"))
+ print(hr.get_chat(input()))  # the second question is read from stdin
+
diff --git a/src/serve/RAG.py b/src/serve/RAG.py
index d5cde14..d98f6b8 100644
--- a/src/serve/RAG.py
+++ b/src/serve/RAG.py
@@ -1,11 +1,28 @@
import os
-from langchain_community.document_loaders import PyPDFLoader
from langchain_core.callbacks import StreamingStdOutCallbackHandler, CallbackManager
-
-from src.init.property import Property
from langchain_community.chat_models.zhipuai import ChatZhipuAI
+from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
+from langchain import hub
+from langchain_core.runnables import RunnablePassthrough
+
+from src.init.property import Property
+from src.base.embedding import SentenceEmbedding
+
+# Initialise the prompt templates
+_prompt = {
+ 'prompt_1': ChatPromptTemplate.from_template(
+ """你是问答任务的助手。使用以下检索到的上下文来回答问题。如果你不知道答案,就说你不知道。最多使用三个句子并保持答案简洁。
+
+ {context}
+
+ Question: {question}
+
+ Helpful Answer:"""
+ ),
+ 'prompt_2': hub.pull("rlm/rag-prompt")  # pulled from the LangChain hub when this module is imported
+}
def format_docs(docs):
@@ -16,14 +33,8 @@ class RAG:
# 初始化ZHIPU API KEY
os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
- # 初始化prompt工程
- __prompt = ChatPromptTemplate.from_messages([
- ("system", "You are a world class technical documentation writer."),
- ("user", "{input}")
- ])
-
# 初始化模型
- __llm = ChatZhipuAI(
+ _llm = ChatZhipuAI(
temperature=0.95,
model="glm-4"
)
@@ -35,22 +46,38 @@ class RAG:
callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)
- # 构建langchain
- chain = __prompt | __llm
-
def __init__(self, file_path: str):
+ try:
+ global _prompt
+ self._example_prompt = _prompt["prompt_1"]
+ except NameError:  # NOTE(review): swallowed silently; self._example_prompt is then never set here
+ pass
self.__file_path = file_path
- loader = PyPDFLoader(self.__file_path)
- file = loader.load()
+ self.__sentenceEmbedding = SentenceEmbedding(file_path)
+ self._retriever = self.__sentenceEmbedding.get_vectorstore().as_retriever(search_type="similarity",
+ search_kwargs={"k": 5})
+
+ try:
+ self.__rag_chain = (
+ {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
+ | self._example_prompt
+ | RAG._llm
+ | StrOutputParser()
+ )
+ except AttributeError:  # NOTE(review): swallowed silently; self.__rag_chain is then missing when get_chat() runs
+ pass
- def get_answer(self, message) -> str:
- return RAG.__llm.invoke(message).content
+    def get_chat(self, message: str):
+        for piece in self.__rag_chain.stream(message):  # print each streamed chunk as it arrives
+            print(piece, end="", flush=True)
- def get_streaming_chat(self, message) -> str:
- return RAG.__streaming_chat.invoke(message).content
+    def select_prompt(self, prompt_index: int = 1):
+        self._example_prompt = _prompt["prompt_" + str(prompt_index)]
+        inputs = {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
+        self.__rag_chain = inputs | self._example_prompt | RAG._llm | StrOutputParser()  # rebuild: the chain from __init__ kept the old prompt
if __name__ == '__main__':
r = RAG("C:\\Users\\16922\\Desktop\\文档1.pdf")
- print(r.get_streaming_chat("hello"))
- print(r.get_streaming_chat("what can you do"))
+ r.select_prompt(2)
+ r.get_chat("what can Multimodal Agent AI systems do?")
diff --git a/src/serve/embedding.py b/src/serve/embedding.py
deleted file mode 100644
index c7b71f4..0000000
--- a/src/serve/embedding.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.vectorstores import Chroma
-from text2vec import SentenceModel
-from src.base.chinese_text_splitter import ChineseTextSplitter
-
-
-class SentenceEmbedding:
- __model = SentenceModel('shibing624/text2vec-base-chinese')
- def __init__(self, file_path: str):
- self.file_path = file_path
- content = ""
- loader = PyPDFLoader(file_path)
- for page in loader.load():
- content += page.page_content
- sentences = ChineseTextSplitter(True).split_text(content)
- embeddings = SentenceEmbedding.__model.encode(sentences)
- self.vectorstore = Chroma.add_texts(iter(sentences), embeddings)
-
- def get_vectorstore(self):
- return self.vectorstore
-
- def search(self, query:str):
- embeddings = SentenceEmbedding.__model.encode(query)
- self.vectorstore
-
-
-if __name__ == '__main__':
- SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf").