complete RAG, History RAG

detached2
Frieren 10 months ago
parent aa5bcef264
commit f692188679

@ -1 +1 @@
API_KEY=9d2d36b2c8b20bb329fd44fe058b7ac2.UF14LzIdcAH2Ob21
API_KEY=86df1c93a84174062eaf6a38e331efc1.AJV5y2jkKASbZ3YD

@ -4,7 +4,11 @@
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="second commit" />
<list default="true" id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="complete RAG, History RAG">
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/serve/RAG.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/serve/RAG.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@ -26,6 +30,20 @@
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
<option name="RESET_MODE" value="HARD" />
</component>
<component name="GitHubPullRequestSearchHistory">{
&quot;lastFilter&quot;: {
&quot;state&quot;: &quot;OPEN&quot;,
&quot;assignee&quot;: &quot;YDzzz&quot;
}
}</component>
<component name="GithubPullRequestsUISettings">
<option name="selectedUrlAndAccountId">
<UrlAndAccount>
<option name="accountId" value="c877182e-149b-4367-a384-4c094fd3c76e" />
<option name="url" value="https://github.com/YDzzz/LLM.git" />
</UrlAndAccount>
</option>
</component>
<component name="MarkdownSettingsMigration">
<option name="stateVersion" value="1" />
</component>
@ -42,11 +60,13 @@
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"Python.HistoryRAG.executor": "Run",
"Python.RAG.executor": "Run",
"Python.embedding.executor": "Run",
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"git-widget-placeholder": "91444e75",
"com.google.cloudcode.ide_session_index": "20240518_0001",
"git-widget-placeholder": "detached",
"last_opened_file_path": "G:/code/py/LLM",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
@ -84,6 +104,12 @@
<workItem from="1715345875476" duration="11056000" />
<workItem from="1715441734127" duration="1201000" />
<workItem from="1715516642242" duration="6557000" />
<workItem from="1715878258483" duration="36000" />
<workItem from="1715878326308" duration="244000" />
<workItem from="1715878583964" duration="209000" />
<workItem from="1715935000078" duration="18704000" />
<workItem from="1715964027274" duration="201000" />
<workItem from="1715964238142" duration="2613000" />
</task>
<task id="LOCAL-00001" summary="init">
<option name="closed" value="true" />
@ -109,7 +135,15 @@
<option name="project" value="LOCAL" />
<updated>1715530309702</updated>
</task>
<option name="localTasksCounter" value="4" />
<task id="LOCAL-00004" summary="complete RAG, History RAG">
<option name="closed" value="true" />
<created>1715963552332</created>
<option name="number" value="00004" />
<option name="presentableId" value="LOCAL-00004" />
<option name="project" value="LOCAL" />
<updated>1715963552332</updated>
</task>
<option name="localTasksCounter" value="5" />
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
@ -126,7 +160,7 @@
<entry key="branch">
<value>
<list>
<option value="master" />
<option value="detached" />
</list>
</value>
</entry>
@ -141,10 +175,12 @@
<component name="VcsManagerConfiguration">
<MESSAGE value="init" />
<MESSAGE value="second commit" />
<option name="LAST_COMMIT_MESSAGE" value="second commit" />
<MESSAGE value="complete RAG, History RAG" />
<option name="LAST_COMMIT_MESSAGE" value="complete RAG, History RAG" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/LLM$RAG.coverage" NAME="RAG Coverage Results" MODIFIED="1715346172432" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$embedding.coverage" NAME="embedding Coverage Results" MODIFIED="1715354020292" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$RAG.coverage" NAME="RAG Coverage Results" MODIFIED="1715966784292" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$embedding.coverage" NAME="embedding Coverage Results" MODIFIED="1715952325378" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$HistoryRAG.coverage" NAME="HistoryRAG Coverage Results" MODIFIED="1715963407636" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
</component>
</project>

@ -1,7 +1,7 @@
langchain~=0.1.19
httpx
pyjwt
transformers
text2vec~=1.2.9
chardet
pypdf~=4.2.0
langchain>=0.1.20
pypdf>=4.2.0
zhipuai~=2.0.1.20240423.1
langchainhub~=0.1.15
chromadb
httpx-sse~=0.4.0
chardet~=5.2.0

@ -1,15 +0,0 @@
from langchain_community.document_loaders import PyPDFLoader
class PDFLoader:
    """Thin wrapper that flattens a PDF file into a single text string."""

    @staticmethod
    def loader(file_path: str):
        """Load every page of the PDF at ``file_path`` and concatenate the page texts."""
        pages = PyPDFLoader(file_path).load()
        return "".join(page.page_content for page in pages)
if __name__ == '__main__':
    # Manual smoke test: print the flattened text of a local PDF.
    print(PDFLoader.loader("C:\\Users\\16922\\Desktop\\文档1.pdf"))

@ -1,10 +0,0 @@
## Document Loader Interface
| Method Name | Explanation |
|-------------|-----------------------------------------------------------------------------------------------------------------------------|
| lazy_load | Used to load documents one by one lazily. Use for production code. |
| alazy_load | Async variant of lazy_load |
| load | Used to load all the documents into memory eagerly. Use for prototyping or interactive work. |
| aload | Used to load all the documents into memory eagerly. Use for prototyping or interactive work. Added in 2024-04 to LangChain. |
- The load method is a convenience method meant solely for prototyping work — it just invokes list(self.lazy_load()).
- The alazy_load method has a default implementation that delegates to lazy_load. If you're using async, we recommend overriding the default implementation and providing a native async implementation.

@ -0,0 +1,47 @@
import os
from abc import ABC
from typing import List
from zhipuai import ZhipuAI
from src.init.property import Property
from langchain_core.embeddings import Embeddings
os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
client = ZhipuAI()
def _text_qualify(embedding_text):
    """
    Embed one piece of text with the ZhipuAI Embedding API (1024 dimensions).

    :param embedding_text: either a plain string or a LangChain Document-like
        object exposing ``page_content``.
    :return: the embedding vector (``response.data[0].embedding``).
    """
    # BUG FIX / idiom: use isinstance() instead of `type(x) == str`, which is
    # non-idiomatic and rejects str subclasses.
    if isinstance(embedding_text, str):
        e_t = embedding_text
    else:
        e_t = embedding_text.page_content
    # Token usage, if needed, is available at response.usage.total_tokens.
    response = client.embeddings.create(
        model="embedding-2",
        input=e_t,
    )
    return response.data[0].embedding
class ZhuPuAiEmbedding(Embeddings, ABC):
    """LangChain ``Embeddings`` adapter backed by the ZhipuAI embedding-2 API."""

    def __init__(self):
        super().__init__()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each document independently via the remote API."""
        return [_text_qualify(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return _text_qualify(text)

@ -1,59 +0,0 @@
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
class ChineseTextSplitter(CharacterTextSplitter):
    """Split Chinese text into sentences, optionally pre-cleaning PDF output.

    Sentences longer than ``sentence_size`` are recursively re-split on
    progressively weaker separators (comma/period, whitespace runs, spaces).
    """

    def __init__(self, pdf: bool = False, sentence_size: int = 250, **kwargs):
        super().__init__(**kwargs)
        self.pdf = pdf  # True when the text came from a PDF (needs whitespace cleanup)
        self.sentence_size = sentence_size  # soft cap on sentence length, in characters

    def split_text1(self, text: str) -> List[str]:
        """Simple split on terminal punctuation (legacy variant)."""
        if self.pdf:
            # Collapse PDF extraction artifacts: newline runs and stray whitespace.
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r"\s", " ", text)
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                # Separator token: glue the punctuation back onto the previous sentence.
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def split_text(self, text: str) -> List[str]:  # 此处需要进一步优化逻辑
        """Sentence split with recursive re-splitting of over-long sentences."""
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r"\s", " ", text)
            text = re.sub("\n\n", "", text)
        # Break after single-character sentence terminators.
        text = re.sub(r'([;.!?。!?\?])([^”’])', r"\1\n\2", text)
        # English ellipsis ("......").
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
        # Chinese ellipsis ("……").  BUG FIX: the ellipsis character was missing,
        # so r'(\{2})' matched a literal "{2}" instead of "……".
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)
        text = re.sub(r'([;!?。!?\?]["’”」』]{0,2})([^;!?,。!?\?])', r'\1\n\2', text)
        # A closing quote ends a sentence only when preceded by a terminator, so
        # the \n goes after the quote; the rules above carefully preserve quotes.
        text = text.rstrip()  # drop any trailing newlines at the end of the text
        # Semicolons, dashes and English double quotes are deliberately ignored
        # here; adjust the patterns above if they must count as separators.
        ls = [i for i in text.split("\n") if i]
        for ele in ls:
            if len(ele) > self.sentence_size:
                # Too long: re-split on commas / periods.
                ele1 = re.sub(r'([,.]["’”」』]{0,2})([^,.])', r'\1\n\2', ele)
                ele1_ls = ele1.split("\n")
                for ele_ele1 in ele1_ls:
                    if len(ele_ele1) > self.sentence_size:
                        # Still too long: re-split on newline / double-space runs.
                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                        ele2_ls = ele_ele2.split("\n")
                        for ele_ele2 in ele2_ls:
                            if len(ele_ele2) > self.sentence_size:
                                # Last resort: split on single spaces.
                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                ele2_id = ele2_ls.index(ele_ele2)
                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
                                    ele2_id + 1:]
                        ele_id = ele1_ls.index(ele_ele1)
                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
                # Renamed from `id` to avoid shadowing the builtin.
                idx = ls.index(ele)
                ls = ls[:idx] + [i for i in ele1_ls if i] + ls[idx + 1:]
        return ls

@ -0,0 +1,26 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.base.ZhuPuAiEmbeddings import ZhuPuAiEmbedding
class SentenceEmbedding:
    """Builds a Chroma vector store over a PDF and answers similarity queries."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        # Load the PDF, chunk it, and index the chunks with ZhipuAI embeddings.
        pages = PyPDFLoader(file_path).load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, add_start_index=True)
        chunks = splitter.split_documents(pages)
        self.vectorstore = Chroma.from_documents(chunks, ZhuPuAiEmbedding())

    def get_vectorstore(self):
        """Expose the underlying Chroma vector store."""
        return self.vectorstore

    def search(self, query: str) -> str:
        """Return the text of the chunk most similar to ``query``."""
        matches = self.vectorstore.similarity_search(query)
        return matches[0].page_content
if __name__ == '__main__':
    # Manual smoke test against a local PDF (requires ZhipuAI API access).
    a = SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf")
    print(a.search("We will dedicate a segment of our project to discussing these ethical issues"))

@ -0,0 +1,66 @@
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from src.serve.RAG import RAG
# System prompt that rewrites a follow-up question into a standalone question
# (it must NOT answer, only reformulate).
__contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Prompt for the history-aware retriever: chat history + latest user input.
_contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", __contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# System prompt for the final answering step over the retrieved {context}.
__qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\
{context}"""

# QA prompt fed to the stuff-documents chain.
_qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", __qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
class HistoryRAG(RAG):
    """RAG pipeline that rewrites follow-up questions using the chat history."""

    def __init__(self, file_path):
        global _contextualize_q_prompt
        global _qa_prompt
        super().__init__(file_path)
        self.__chat_history = []  # alternating HumanMessage / AIMessage turns
        # Rewrites the incoming question into a standalone one before retrieval.
        history_aware_retriever = create_history_aware_retriever(
            HistoryRAG._llm, self._retriever, _contextualize_q_prompt
        )
        question_answer_chain = create_stuff_documents_chain(HistoryRAG._llm, _qa_prompt)
        self.__rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    def get_chat(self, question: str):
        """Answer ``question`` with history-aware retrieval and record the exchange."""
        ai_msg = self.__rag_chain.invoke({"input": question, "chat_history": self.__chat_history})
        # BUG FIX: store the AI turn as an AIMessage (it was appended as a bare
        # str), otherwise the chat_history placeholder receives a malformed turn.
        self.__chat_history.extend(
            [HumanMessage(content=question), AIMessage(content=ai_msg["answer"])]
        )
        return ai_msg["answer"]

    def clear_history(self):
        """Forget all previous conversation turns."""
        self.__chat_history.clear()

    def select_prompt(self, prompt_index: int = 1):
        # Prompt switching is not supported for the history-aware chain.
        pass
if __name__ == '__main__':
    # Manual smoke test: requires a local PDF and ZhipuAI API access.
    hr = HistoryRAG("C:\\Users\\16922\\Desktop\\文档1.pdf")
    print(hr.get_chat("what can Multimodal Agent AI systems do?"))
    print(hr.get_chat(input()))

@ -1,11 +1,28 @@
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.callbacks import StreamingStdOutCallbackHandler, CallbackManager
from src.init.property import Property
from langchain_community.chat_models.zhipuai import ChatZhipuAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from src.init.property import Property
from src.base.embedding import SentenceEmbedding
# Prompt registry: RAG.select_prompt() switches between these by key.
_prompt = {
    # Chinese QA prompt: answer from the retrieved context, max three sentences.
    'prompt_1': ChatPromptTemplate.from_template(
        """你是问答任务的助手。使用以下检索到的上下文来回答问题。如果你不知道答案,就说你不知道。最多使用三个句子并保持答案简洁。
{context}
Question: {question}
Helpful Answer:"""
    ),
    # Community RAG prompt pulled from the LangChain hub at import time.
    'prompt_2': hub.pull("rlm/rag-prompt")
}
def format_docs(docs):
@ -16,14 +33,8 @@ class RAG:
# 初始化ZHIPU API KEY
os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
# 初始化prompt工程
__prompt = ChatPromptTemplate.from_messages([
("system", "You are a world class technical documentation writer."),
("user", "{input}")
])
# 初始化模型
__llm = ChatZhipuAI(
_llm = ChatZhipuAI(
temperature=0.95,
model="glm-4"
)
@ -35,22 +46,38 @@ class RAG:
callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)
# 构建langchain
chain = __prompt | __llm
def __init__(self, file_path: str):
    """Build the retrieval QA chain for the PDF at ``file_path``."""
    try:
        global _prompt
        # Default to the Chinese QA prompt; select_prompt() can switch it later.
        self._example_prompt = _prompt["prompt_1"]
    except NameError:
        # _prompt missing at import time: leave _example_prompt unset.
        pass
    self.__file_path = file_path
    # NOTE(review): this load() result is never used — SentenceEmbedding below
    # loads the PDF again; presumably dead code, confirm before removing.
    loader = PyPDFLoader(self.__file_path)
    file = loader.load()
    self.__sentenceEmbedding = SentenceEmbedding(file_path)
    # Top-5 similarity retriever over the document's vector store.
    self._retriever = self.__sentenceEmbedding.get_vectorstore().as_retriever(search_type="similarity",
                                                                              search_kwargs={"k": 5})
    try:
        # context -> formatted retrieved docs; question -> passed through as-is.
        self.__rag_chain = (
            {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
            | self._example_prompt
            | RAG._llm
            | StrOutputParser()
        )
    except AttributeError:
        # _example_prompt was never set (see NameError branch above).
        pass
def get_answer(self, message) -> str:
    """One-shot, non-retrieval answer from the underlying chat model."""
    # BUG FIX: the class attribute was renamed from __llm to _llm in this
    # revision; RAG.__llm (mangled to _RAG__llm) would raise AttributeError.
    return RAG._llm.invoke(message).content
def get_chat(self, message: str):
    """Stream the RAG chain's answer for ``message`` to stdout (returns None)."""
    for chunk in self.__rag_chain.stream(message):
        # Print chunks as they arrive so the answer appears incrementally.
        print(chunk, end="", flush=True)
def get_streaming_chat(self, message) -> str:
    """Answer ``message`` via the streaming-callback chat model."""
    # NOTE(review): relies on the class attribute __streaming_chat declared
    # outside this view — confirm it survived the __llm -> _llm rename.
    return RAG.__streaming_chat.invoke(message).content
def select_prompt(self, prompt_index: int = 1):
    """Make prompt number ``prompt_index`` (key ``prompt_<n>`` in _prompt) active."""
    # 'global' is unnecessary for a read-only lookup; a KeyError still surfaces
    # for unknown indices, exactly as before.
    self._example_prompt = _prompt[f"prompt_{prompt_index}"]
if __name__ == '__main__':
    # Manual smoke test: requires a local PDF and ZhipuAI API access.
    r = RAG("C:\\Users\\16922\\Desktop\\文档1.pdf")
    print(r.get_streaming_chat("hello"))
    print(r.get_streaming_chat("what can you do"))
    # Switch to the hub prompt and stream a retrieval-augmented answer.
    r.select_prompt(2)
    r.get_chat("what can Multimodal Agent AI systems do?")

@ -1,28 +0,0 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from text2vec import SentenceModel
from src.base.chinese_text_splitter import ChineseTextSplitter
class SentenceEmbedding:
    # Embeds a PDF's sentences with a local text2vec model into a Chroma store.
    # NOTE(review): several parts look unfinished/broken — see inline notes.

    # Shared sentence-embedding model (Chinese text2vec base).
    __model = SentenceModel('shibing624/text2vec-base-chinese')

    def __init__(self, file_path: str):
        self.file_path = file_path
        content = ""
        loader = PyPDFLoader(file_path)
        # Concatenate the text of every PDF page.
        for page in loader.load():
            content += page.page_content
        # Split into sentences (pdf=True enables whitespace cleanup), then embed.
        sentences = ChineseTextSplitter(True).split_text(content)
        embeddings = SentenceEmbedding.__model.encode(sentences)
        # NOTE(review): Chroma.add_texts is called on the class, not an instance,
        # and its return value is not a vector store — this looks broken;
        # confirm the intended construction (e.g. Chroma.from_texts).
        self.vectorstore = Chroma.add_texts(iter(sentences), embeddings)

    def get_vectorstore(self):
        # Expose the underlying store.
        return self.vectorstore

    def search(self, query: str):
        embeddings = SentenceEmbedding.__model.encode(query)
        # NOTE(review): the bare attribute access below has no effect and the
        # method returns None — implementation appears unfinished.
        self.vectorstore
if __name__ == '__main__':
SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf").
Loading…
Cancel
Save