RAG #1

Open
pbfhtyfu6 wants to merge 0 commits from detached into main

@@ -0,0 +1,2 @@ .config/config.properties
API_KEY=
FILE_PATH=
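
These two keys drive the whole demo: API_KEY is the ZhipuAI key that src/init/property.py loads (and the code exports as ZHIPUAI_API_KEY), and FILE_PATH is the document that HistoryRAG indexes at startup. A filled-in sketch with placeholder values (not real credentials):

API_KEY=<your ZhipuAI API key>
FILE_PATH=C:\Users\you\Desktop\doc.pdf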

@@ -0,0 +1,8 @@ .idea/LLM.iml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

@@ -0,0 +1,20 @@ .idea/inspectionProfiles/Project_Default.xml
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<Languages>
<language minSize="161" name="Python" />
</Languages>
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="2">
<item index="0" class="java.lang.String" itemvalue="sentence_transformers" />
<item index="1" class="java.lang.String" itemvalue="cpm_kernels" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

@@ -0,0 +1,6 @@ .idea/inspectionProfiles/profiles_settings.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

@@ -0,0 +1,7 @@ .idea/misc.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="LLM" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="LLM" project-jdk-type="Python SDK" />
</project>

@@ -0,0 +1,8 @@ .idea/modules.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/LLM.iml" filepath="$PROJECT_DIR$/.idea/LLM.iml" />
</modules>
</component>
</project>

@@ -0,0 +1,6 @@ .idea/vcs.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

@@ -0,0 +1,212 @@ .idea/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="complete RAG, History RAG">
<change beforePath="$PROJECT_DIR$/.config/config.properties" beforeDir="false" afterPath="$PROJECT_DIR$/.config/config.properties" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
<change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/serve/HistoryRAG.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/serve/HistoryRAG.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/src/serve/RAG.py" beforeDir="false" afterPath="$PROJECT_DIR$/src/serve/RAG.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="Git.Settings">
<option name="RECENT_BRANCH_BY_REPOSITORY">
<map>
<entry key="$PROJECT_DIR$" value="detached" />
</map>
</option>
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
<option name="RESET_MODE" value="HARD" />
</component>
<component name="GitHubPullRequestSearchHistory">{
&quot;lastFilter&quot;: {
&quot;state&quot;: &quot;OPEN&quot;,
&quot;assignee&quot;: &quot;YDzzz&quot;
}
}</component>
<component name="GithubPullRequestsUISettings">
<option name="selectedUrlAndAccountId">
<UrlAndAccount>
<option name="accountId" value="c877182e-149b-4367-a384-4c094fd3c76e" />
<option name="url" value="https://github.com/YDzzz/LLM.git" />
</UrlAndAccount>
</option>
</component>
<component name="MarkdownSettingsMigration">
<option name="stateVersion" value="1" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 8
}</component>
<component name="ProjectId" id="2gEthR7lQkfXVFalbjvQftZcFMK" />
<component name="ProjectLevelVcsManager" settingsEditedManually="true">
<ConfirmationsSetting value="2" id="Add" />
</component>
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"Python.HistoryRAG.executor": "Run",
"Python.RAG.executor": "Run",
"Python.embedding.executor": "Run",
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"com.google.cloudcode.ide_session_index": "20240528_0001",
"git-widget-placeholder": "032994f8",
"last_opened_file_path": "G:/code/py/LLM",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
"vue.rearranger.settings.migration": "true"
}
}]]></component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="G:\code\py\LLM\src\base" />
<recent name="G:\code\py\LLM" />
</key>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-python-sdk-d68999036c7f-b11f5e8da5ad-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.14475.56" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="" />
<created>1715274683455</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1715274683455</updated>
<workItem from="1715274700837" duration="4418000" />
<workItem from="1715336208466" duration="4087000" />
<workItem from="1715343547428" duration="1181000" />
<workItem from="1715345875476" duration="11056000" />
<workItem from="1715441734127" duration="1201000" />
<workItem from="1715516642242" duration="6557000" />
<workItem from="1715878258483" duration="36000" />
<workItem from="1715878326308" duration="244000" />
<workItem from="1715878583964" duration="209000" />
<workItem from="1715935000078" duration="18704000" />
<workItem from="1715964027274" duration="201000" />
<workItem from="1715964238142" duration="2613000" />
<workItem from="1715966887877" duration="22000" />
<workItem from="1716089691296" duration="2747000" />
<workItem from="1716101749458" duration="1744000" />
<workItem from="1716103616899" duration="1362000" />
<workItem from="1716718682372" duration="7000" />
<workItem from="1716789003601" duration="30000" />
<workItem from="1716896259454" duration="607000" />
<workItem from="1716896875841" duration="3460000" />
</task>
<task id="LOCAL-00001" summary="init">
<option name="closed" value="true" />
<created>1715279142669</created>
<option name="number" value="00001" />
<option name="presentableId" value="LOCAL-00001" />
<option name="project" value="LOCAL" />
<updated>1715279142669</updated>
</task>
<task id="LOCAL-00002" summary="second commit">
<option name="closed" value="true" />
<created>1715530268064</created>
<option name="number" value="00002" />
<option name="presentableId" value="LOCAL-00002" />
<option name="project" value="LOCAL" />
<updated>1715530268064</updated>
</task>
<task id="LOCAL-00003" summary="second commit">
<option name="closed" value="true" />
<created>1715530309702</created>
<option name="number" value="00003" />
<option name="presentableId" value="LOCAL-00003" />
<option name="project" value="LOCAL" />
<updated>1715530309702</updated>
</task>
<task id="LOCAL-00004" summary="complete RAG, History RAG">
<option name="closed" value="true" />
<created>1715963552332</created>
<option name="number" value="00004" />
<option name="presentableId" value="LOCAL-00004" />
<option name="project" value="LOCAL" />
<updated>1715963552332</updated>
</task>
<task id="LOCAL-00005" summary="complete RAG, History RAG">
<option name="closed" value="true" />
<created>1715966892912</created>
<option name="number" value="00005" />
<option name="presentableId" value="LOCAL-00005" />
<option name="project" value="LOCAL" />
<updated>1715966892912</updated>
</task>
<task id="LOCAL-00006" summary="complete RAG, History RAG">
<option name="closed" value="true" />
<created>1716789017012</created>
<option name="number" value="00006" />
<option name="presentableId" value="LOCAL-00006" />
<option name="project" value="LOCAL" />
<updated>1716789017012</updated>
</task>
<option name="localTasksCounter" value="7" />
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="Vcs.Log.Tabs.Properties">
<option name="TAB_STATES">
<map>
<entry key="MAIN">
<value>
<State>
<option name="FILTERS">
<map>
<entry key="branch">
<value>
<list>
<option value="detached" />
</list>
</value>
</entry>
</map>
</option>
</State>
</value>
</entry>
</map>
</option>
</component>
<component name="VcsManagerConfiguration">
<MESSAGE value="init" />
<MESSAGE value="second commit" />
<MESSAGE value="complete RAG, History RAG" />
<option name="LAST_COMMIT_MESSAGE" value="complete RAG, History RAG" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/LLM$RAG.coverage" NAME="RAG Coverage Results" MODIFIED="1716897848448" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$embedding.coverage" NAME="embedding Coverage Results" MODIFIED="1715952325378" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$HistoryRAG.coverage" NAME="HistoryRAG Coverage Results" MODIFIED="1716901838902" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
</component>
</project>

@@ -1,9 +0,0 @@ LICENSE
MIT License
Copyright (c) <year> <copyright holders>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@@ -1,2 +0,0 @@ README.md
# LLM

@@ -0,0 +1,7 @@ requirements.txt
langchain>=0.1.20
pypdf>=4.2.0
zhipuai~=2.0.1.20240423.1
langchainhub~=0.1.15
httpx-sse~=0.4.0
chardet~=5.2.0
gradio
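
Install with `pip install -r requirements.txt`; note that gradio is left unpinned while the other dependencies carry version constraints.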

@@ -0,0 +1,47 @@ src/base/ZhuPuAiEmbeddings.py
import os
from abc import ABC
from typing import List

from zhipuai import ZhipuAI
from src.init.property import Property
from langchain_core.embeddings import Embeddings

os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
client = ZhipuAI()


def _text_qualify(embedding_text):
    """
    Embed a string (or a LangChain Document) with the ZhipuAI embedding API,
    which returns a 1024-dimension vector.
    :param embedding_text: a plain string, or an object with a page_content attribute
    :return: the embedding vector as a list of floats
    """
    if isinstance(embedding_text, str):
        e_t = embedding_text
    else:
        e_t = embedding_text.page_content
    # token usage is available as response.usage.total_tokens,
    # the embedding vector as response.data[0].embedding
    response = client.embeddings.create(
        model="embedding-2",
        input=e_t,
    )
    return response.data[0].embedding


class ZhuPuAiEmbedding(Embeddings, ABC):
    """LangChain Embeddings adapter backed by the ZhipuAI embedding-2 model."""

    def __init__(self):
        super().__init__()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [_text_qualify(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return _text_qualify(text)
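
A minimal usage sketch of the adapter, assuming a valid API_KEY in .config/config.properties (the query text is illustrative):

from src.base.ZhuPuAiEmbeddings import ZhuPuAiEmbedding

emb = ZhuPuAiEmbedding()
vector = emb.embed_query("What is retrieval-augmented generation?")
print(len(vector))  # embedding-2 returns a 1024-dimension vector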

@@ -0,0 +1,26 @@ src/base/embedding.py
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.base.ZhuPuAiEmbeddings import ZhuPuAiEmbedding


class SentenceEmbedding:
    """Load a PDF, split it into overlapping chunks, and index them in Chroma."""

    def __init__(self, file_path: str):
        self.file_path = file_path
        docs = PyPDFLoader(file_path).load()
        # 1000-character chunks with 200 characters of overlap; start indexes are
        # kept so each chunk can be traced back to its position in the source
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
        sentences = text_splitter.split_documents(docs)
        self.vectorstore = Chroma.from_documents(sentences, ZhuPuAiEmbedding())

    def get_vectorstore(self):
        return self.vectorstore

    def search(self, query: str) -> str:
        # return the page content of the closest match
        docs = self.vectorstore.similarity_search(query)
        return docs[0].page_content


if __name__ == '__main__':
    a = SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf")
    print(a.search("We will dedicate a segment of our project to discussing these ethical issues"))
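
Note on the splitter settings: chunk_size=1000 with chunk_overlap=200 means consecutive chunks share roughly 200 characters, i.e. the stride is about 800 characters, so a 5,000-character page yields about ceil((5000 - 200) / 800) = 6 chunks. The recursive splitter prefers paragraph and sentence boundaries, so real counts vary.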

@@ -0,0 +1,31 @@ src/init/property.py
import os


class Property(object):
    __props = {}
    # resolve .config/config.properties relative to this file, two levels up
    filepath = os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, ".config", "config.properties")
    with open(filepath, "r") as f:
        for line in f:
            stripped = line.strip()
            if stripped and not stripped.startswith('#'):
                key_map = stripped.split('=')
                key_name = key_map[0].strip()
                # re-join so values that themselves contain '=' survive the split
                key_value = '='.join(key_map[1:]).strip()
                __props[key_name] = key_value

    @staticmethod
    def get_property(property_name):
        try:
            return Property.__props[property_name]
        except KeyError:
            print("there is no property named " + repr(property_name))
            return None


def main():
    print(Property.get_property("API_KEY"))


if __name__ == '__main__':
    main()

@@ -0,0 +1,114 @@ src/serve/HistoryRAG.py
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from src.serve.RAG import RAG
from src.init.property import Property
import gradio as gr

__contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

_contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", __contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

__qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\
{context}"""

_qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", __qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


class HistoryRAG(RAG):
    def __init__(self, file_path):
        super().__init__(file_path)
        self.__chat_history = []
        # first rewrite the question into a standalone one using the history,
        # then retrieve against the rewritten question
        history_aware_retriever = create_history_aware_retriever(
            HistoryRAG._llm, self._retriever, _contextualize_q_prompt
        )
        question_answer_chain = create_stuff_documents_chain(HistoryRAG._llm, _qa_prompt)
        self.__rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    def get_chat(self, question: str):
        ai_msg = self.__rag_chain.invoke({"input": question, "chat_history": self.__chat_history})
        # wrap the answer in an AIMessage so the history stays a list of messages
        self.__chat_history.extend([HumanMessage(content=question), AIMessage(content=ai_msg["answer"])])
        return ai_msg["answer"]

    def clear_history(self):
        self.__chat_history.clear()

    def select_prompt(self, prompt_index: int = 1):
        # prompt switching is not supported for the history-aware chain
        pass


if __name__ == '__main__':
    file_path = Property.get_property("FILE_PATH")
    hr = HistoryRAG(file_path)
    iface = gr.Interface(fn=hr.get_chat, inputs="text", outputs="text", title="ChatGLM4 Chatbot",
                         description="Ask questions about a custom document and get answers")
    iface.launch()
# print("welcome to use RAG question, input exit() to end")
# try:
# file_path = input("please input file path:").strip('"')
# if not len(file_path):
# raise ValueError("path not be empty")
# except ValueError:
# print("arise error" + repr(ValueError))
# finally:
# hr = HistoryRAG(file_path)
# while True:
# chat = input("user:")
# if chat == "exit()":
# break
# print("system:" + hr.get_chat(chat))
#
#
# with gr.Blocks() as demo:
# gr.HTML("""<h1 align="center">ChatGLM4</h1>""")
# chatbot = gr.Chatbot()
#
# with gr.Row():
# with gr.Column(scale=4):
# with gr.Column(scale=12):
# user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10, container=False)
# with gr.Column(min_width=32, scale=1):
# submitBtn = gr.Button("Submit")
# with gr.Column(scale=1):
# emptyBtn = gr.Button("Clear History")
# max_length = gr.Slider(0, 32768, value=8192, step=1.0, label="Maximum length", interactive=True)
# top_p = gr.Slider(0, 1, value=0.8, step=0.01, label="Top P", interactive=True)
# temperature = gr.Slider(0.01, 1, value=0.6, step=0.01, label="Temperature", interactive=True)
#
#
# def user(query, history):
# return "", history + [[parse_text(query), ""]]
#
#
# submitBtn.click(user, [user_input, chatbot], [user_input, chatbot], queue=False).then(
# [chatbot, max_length, top_p, temperature], chatbot
# )
# emptyBtn.click(lambda: None, None, chatbot, queue=False)
#
# demo.queue()
# demo.launch(server_name="127.0.0.1", server_port=7870, inbrowser=True, share=False)
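
For review, a minimal sketch of the conversation loop this class enables; the file path and questions are illustrative:

from src.serve.HistoryRAG import HistoryRAG

hr = HistoryRAG("docs/sample.pdf")  # hypothetical path
print(hr.get_chat("What is the document about?"))
print(hr.get_chat("Summarize that in one sentence."))  # "that" is resolved via chat_history
hr.clear_history()  # start a fresh conversation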

@@ -0,0 +1,84 @@ src/serve/RAG.py
import os

from langchain_core.callbacks import StreamingStdOutCallbackHandler, CallbackManager
from langchain_community.chat_models.zhipuai import ChatZhipuAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from src.init.property import Property
from src.base.embedding import SentenceEmbedding

# initialize the prompt templates: a local QA prompt plus rlm/rag-prompt from the LangChain hub
_prompt = {
    'prompt_1': ChatPromptTemplate.from_template(
        """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
{context}
Question: {question}
Helpful Answer:"""
    ),
    'prompt_2': hub.pull("rlm/rag-prompt")
}


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


class RAG:
    # set the ZhipuAI API key for the SDK
    os.environ["ZHIPUAI_API_KEY"] = Property.get_property("API_KEY")
    # initialize the chat models
    _llm = ChatZhipuAI(
        temperature=0.95,
        model="glm-4"
    )
    __streaming_chat = ChatZhipuAI(
        model="glm-4",
        temperature=0.5,
        streaming=True,
        callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    )

    def __init__(self, file_path: str):
        self._example_prompt = _prompt["prompt_1"]
        self.__file_path = file_path
        self.__sentenceEmbedding = SentenceEmbedding(file_path)
        # retrieve the 5 most similar chunks for every question
        self._retriever = self.__sentenceEmbedding.get_vectorstore().as_retriever(search_type="similarity",
                                                                                  search_kwargs={"k": 5})
        self.__build_chain()

    def __build_chain(self):
        self.__rag_chain = (
                {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
                | self._example_prompt
                | RAG._llm
                | StrOutputParser()
        )

    def get_chat(self, message: str):
        # stream the answer to stdout chunk by chunk
        for chunk in self.__rag_chain.stream(message):
            print(chunk, end="", flush=True)

    def select_prompt(self, prompt_index: int = 1):
        # switch prompts and rebuild the chain so the change actually takes effect
        self._example_prompt = _prompt["prompt_" + str(prompt_index)]
        self.__build_chain()


if __name__ == '__main__':
    r = RAG("C:\\Users\\16922\\Desktop\\文档1.pdf")
    r.select_prompt(2)
    r.get_chat("what can Multimodal Agent AI systems do?")
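
The pipe syntax in __build_chain is LangChain's LCEL composition: the dict step fans the question out to the retriever (whose documents are joined by format_docs) and a passthrough, the prompt fills its {context} and {question} slots, and the parser extracts the text. A rough unrolled equivalent, with all objects passed in explicitly (the function name is illustrative):

from langchain_core.output_parsers import StrOutputParser

def rag_answer(retriever, prompt, llm, question: str) -> str:
    # hypothetical step-by-step version of the LCEL chain in RAG.__build_chain
    docs = retriever.invoke(question)                     # top-5 similar chunks
    context = "\n\n".join(d.page_content for d in docs)   # what format_docs does
    messages = prompt.invoke({"context": context, "question": question})
    return StrOutputParser().invoke(llm.invoke(messages))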