Merge remote-tracking branch 'origin/detached' into detached

detached2
Frieren 10 months ago
commit aa5bcef264

@ -1,4 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="LLM" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="LLM" project-jdk-type="Python SDK" />
</project>

@ -1,33 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="" />
<list default="true" id="db7f4434-7cc2-4790-b42e-92159334a8b4" name="Changes" comment="second commit" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="Git.Settings">
<option name="RECENT_BRANCH_BY_REPOSITORY">
<map>
<entry key="$PROJECT_DIR$" value="master" />
</map>
</option>
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
<option name="RESET_MODE" value="HARD" />
</component>
<component name="ProjectColorInfo"><![CDATA[{
"associatedIndex": 8
}]]></component>
<component name="MarkdownSettingsMigration">
<option name="stateVersion" value="1" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 8
}</component>
<component name="ProjectId" id="2gEthR7lQkfXVFalbjvQftZcFMK" />
<component name="ProjectLevelVcsManager" settingsEditedManually="true" />
<component name="ProjectLevelVcsManager" settingsEditedManually="true">
<ConfirmationsSetting value="2" id="Add" />
</component>
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"Python.RAG.executor": "Run",
"Python.embedding.executor": "Run",
"RunOnceActivity.OpenProjectViewOnStart": "true",
"RunOnceActivity.ShowReadmeOnStart": "true",
"git-widget-placeholder": "master",
"git-widget-placeholder": "91444e75",
"last_opened_file_path": "G:/code/py/LLM",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
"vue.rearranger.settings.migration": "true"
}
}]]></component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="G:\code\py\LLM\src\base" />
<recent name="G:\code\py\LLM" />
</key>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
@ -43,8 +78,73 @@
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1715274683455</updated>
<workItem from="1715274700837" duration="5000" />
<workItem from="1715274700837" duration="4418000" />
<workItem from="1715336208466" duration="4087000" />
<workItem from="1715343547428" duration="1181000" />
<workItem from="1715345875476" duration="11056000" />
<workItem from="1715441734127" duration="1201000" />
<workItem from="1715516642242" duration="6557000" />
</task>
<task id="LOCAL-00001" summary="init">
<option name="closed" value="true" />
<created>1715279142669</created>
<option name="number" value="00001" />
<option name="presentableId" value="LOCAL-00001" />
<option name="project" value="LOCAL" />
<updated>1715279142669</updated>
</task>
<task id="LOCAL-00002" summary="second commit">
<option name="closed" value="true" />
<created>1715530268064</created>
<option name="number" value="00002" />
<option name="presentableId" value="LOCAL-00002" />
<option name="project" value="LOCAL" />
<updated>1715530268064</updated>
</task>
<task id="LOCAL-00003" summary="second commit">
<option name="closed" value="true" />
<created>1715530309702</created>
<option name="number" value="00003" />
<option name="presentableId" value="LOCAL-00003" />
<option name="project" value="LOCAL" />
<updated>1715530309702</updated>
</task>
<option name="localTasksCounter" value="4" />
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="Vcs.Log.Tabs.Properties">
<option name="TAB_STATES">
<map>
<entry key="MAIN">
<value>
<State>
<option name="FILTERS">
<map>
<entry key="branch">
<value>
<list>
<option value="master" />
</list>
</value>
</entry>
</map>
</option>
</State>
</value>
</entry>
</map>
</option>
</component>
<component name="VcsManagerConfiguration">
<MESSAGE value="init" />
<MESSAGE value="second commit" />
<option name="LAST_COMMIT_MESSAGE" value="second commit" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/LLM$RAG.coverage" NAME="RAG Coverage Results" MODIFIED="1715346172432" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
<SUITE FILE_PATH="coverage/LLM$embedding.coverage" NAME="embedding Coverage Results" MODIFIED="1715354020292" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src/serve" />
</component>
</project>

@ -1,4 +1,7 @@
langchain~=0.1.19
httpx-sse
langchainhub
pyjwt
httpx
pyjwt
transformers
text2vec~=1.2.9
chardet
pypdf~=4.2.0

@ -0,0 +1,15 @@
from langchain_community.document_loaders import PyPDFLoader
class PDFLoader:
    """Helper for pulling the plain text out of a PDF file."""

    @staticmethod
    def loader(file_path: str):
        """Return the text of every page of ``file_path`` as one string.

        The pages are read with ``PyPDFLoader`` and their ``page_content``
        values are concatenated in page order with no separator.
        """
        pages = PyPDFLoader(file_path).load()
        return "".join(page.page_content for page in pages)
if __name__ == '__main__':
    # Manual smoke test: print the extracted text of a sample PDF.
    sample_pdf = "C:\\Users\\16922\\Desktop\\文档1.pdf"
    print(PDFLoader.loader(sample_pdf))

@ -1,25 +1,59 @@
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List
from langchain.text_splitter import CharacterTextSplitter
class ChineseTextSplitter(CharacterTextSplitter):
    """Sentence-level splitter for Chinese (and mixed Chinese/English) text.

    ``split_text`` is the primary entry point; ``split_text1`` is the older
    single-regex implementation kept for callers that still use it.
    """

    def __init__(self, pdf: bool = False, sentence_size: int = 250, **kwargs):
        """Create a splitter.

        Args:
            pdf: when true, whitespace is normalised before splitting (runs
                of blank lines are collapsed and every whitespace character
                becomes a single space).
            sentence_size: length, in characters, above which ``split_text``
                tries to break a sentence further.
            **kwargs: forwarded unchanged to ``CharacterTextSplitter``.
        """
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text1(self, text: str) -> List[str]:
        """Split ``text`` into sentences using a single separator regex.

        Terminators (optionally followed by up to two closing quotes) are
        glued back onto the sentence they end.

        Returns:
            The non-empty sentences in their original order.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub(r'\s', ' ', text)
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')  # del
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                # A separator token: append it to the sentence it terminates.
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def split_text(self, text: str) -> List[str]:  # TODO: this logic needs further optimisation
        """Split ``text`` into sentences, breaking over-long ones further.

        The text is first cut after sentence terminators and ellipses. Any
        sentence longer than ``self.sentence_size`` is then cut at
        commas/periods, then at runs of spaces, then at single spaces. A piece
        with no such break point is returned as is, so results may still
        exceed ``sentence_size``.

        Returns:
            The non-empty pieces in their original order.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub(r'\s', " ", text)
            # Kept from the original pipeline; after the substitution above
            # no newline is left for this pattern to match.
            text = re.sub("\n\n", "", text)
        # Single-character terminators. The fullwidth exclamation and question
        # marks are written as escapes so they cannot be flattened to their
        # ASCII look-alikes again.
        text = re.sub(r'([;.!?。\uFF01\uFF1F])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)
        # Chinese ellipsis (two U+2026 characters).
        text = re.sub(r'(\u2026{2})([^"’”」』])', r"\1\n\2", text)
        # When a terminator is followed by closing quotes, the quotes end the
        # sentence, so the break goes after them; the rules above deliberately
        # left such quotes attached.
        text = re.sub(r'([;!?。\uFF01\uFF1F]["’”」』]{0,2})([^;!?,。\uFF01\uFF1F])', r'\1\n\2', text)
        # Drop any trailing newline left at the end of the text.
        text = text.rstrip()
        # Dashes and ASCII double quotes are not treated as sentence
        # boundaries; extend the rules above if that is needed.
        pieces: List[str] = []
        for sentence in text.split("\n"):
            if not sentence:
                continue
            if len(sentence) > self.sentence_size:
                pieces.extend(self._split_long(sentence))
            else:
                pieces.append(sentence)
        return pieces

    def _split_long(self, sentence: str) -> List[str]:
        """Break one over-long sentence at progressively weaker boundaries."""
        limit = self.sentence_size
        result: List[str] = []
        # Level 1: after a comma or period (plus optional closing quotes).
        by_comma = re.sub(r'([,.]["’”」』]{0,2})([^,.])', r'\1\n\2', sentence)
        for clause in by_comma.split("\n"):
            if len(clause) <= limit:
                if clause:
                    result.append(clause)
                continue
            # Level 2: after a run of two or more spaces.
            by_gap = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', clause)
            for chunk in by_gap.split("\n"):
                if len(chunk) <= limit:
                    if chunk:
                        result.append(chunk)
                    continue
                # Level 3: after any single space.
                by_space = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', chunk)
                result.extend(part for part in by_space.split("\n") if part)
        return result

@ -0,0 +1,28 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from text2vec import SentenceModel
from src.base.chinese_text_splitter import ChineseTextSplitter
class _Text2VecEmbeddings:
    """Adapter that gives a text2vec model the two methods a LangChain
    vector store calls on its embedding object."""

    def __init__(self, model):
        self._model = model

    def embed_documents(self, texts):
        """Embed a batch of texts; returns one vector (list of floats) per text."""
        # assumes encode() returns a numpy array for list input - TODO confirm
        return self._model.encode(list(texts)).tolist()

    def embed_query(self, text):
        """Embed a single query string; returns one vector (list of floats)."""
        return self._model.encode(text).tolist()


class SentenceEmbedding:
    """Index the sentences of one PDF in a Chroma vector store."""

    # Shared by all instances; created when the class body is executed.
    __model = SentenceModel('shibing624/text2vec-base-chinese')

    def __init__(self, file_path: str):
        """Read ``file_path``, split it into sentences and index them.

        Args:
            file_path: path of the PDF to index.

        Raises:
            ValueError: if no sentence could be extracted from the file.
        """
        self.file_path = file_path
        pages = PyPDFLoader(file_path).load()
        content = "".join(page.page_content for page in pages)
        sentences = ChineseTextSplitter(True).split_text(content)
        if not sentences:
            raise ValueError(f"no text could be extracted from {file_path!r}")
        # from_texts is the classmethod constructor; add_texts only exists on
        # an already-built store and cannot be called on the class itself.
        self.vectorstore = Chroma.from_texts(
            texts=sentences,
            embedding=_Text2VecEmbeddings(SentenceEmbedding.__model),
        )

    def get_vectorstore(self):
        """Return the Chroma store built in ``__init__``."""
        return self.vectorstore

    def search(self, query: str, k: int = 4):
        """Return the ``k`` indexed sentences most similar to ``query``.

        Args:
            query: free-text query.
            k: number of results to return.

        Returns:
            The documents returned by the store's ``similarity_search``.
        """
        return self.vectorstore.similarity_search(query, k=k)
if __name__ == '__main__':
    # Manual smoke test: build the index for a sample PDF. The original
    # statement ended in a dangling "." which is a syntax error.
    SentenceEmbedding("C:\\Users\\16922\\Desktop\\文档1.pdf")
Loading…
Cancel
Save