From f79396743677a12ba3e8552ceb62410e66292009 Mon Sep 17 00:00:00 2001 From: Frieren <1692219062wang@gmail.com> Date: Fri, 10 May 2024 02:25:42 +0800 Subject: [PATCH] init --- .idea/LLM.iml | 8 +++ .idea/inspectionProfiles/Project_Default.xml | 20 ++++++++ .../inspectionProfiles/profiles_settings.xml | 6 +++ .idea/misc.xml | 4 ++ .idea/modules.xml | 8 +++ .idea/vcs.xml | 6 +++ .idea/workspace.xml | 50 +++++++++++++++++++ README.md | 0 requirements.txt | 4 ++ src/base/chinese_text_splitter.py | 25 ++++++++++ 10 files changed, 131 insertions(+) create mode 100644 .idea/LLM.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 src/base/chinese_text_splitter.py diff --git a/.idea/LLM.iml b/.idea/LLM.iml new file mode 100644 index 0000000..d0876a7 --- /dev/null +++ b/.idea/LLM.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..02399c7 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,20 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..7830771 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..9f996f8 --- /dev/null +++ 
class ChineseTextSplitter(CharacterTextSplitter):
    """Split text into sentences at sentence-ending punctuation.

    A sentence terminator, together with up to two closing quotation marks
    that directly follow it, stays attached to the sentence it ends.

    Args:
        pdf: When true, whitespace in the input is normalised before
            splitting (see ``split_text``).
        **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
    """

    # Group 1 matches either a sentence terminator followed by up to two
    # closing quotes, or a zero-width position in front of one or two opening
    # quotes / at the end of the text. It is a capturing group so that
    # ``re.split`` returns the separators as well as the text between them.
    # Compiled once here instead of on every ``split_text`` call.
    _SENT_SEP_PATTERN = re.compile(
        r'([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))'
    )

    def __init__(self, pdf: bool = False, **kwargs) -> None:
        super().__init__(**kwargs)
        self.pdf = pdf

    def split_text(self, text: str) -> List[str]:
        """Return the sentences found in *text*.

        When ``self.pdf`` is true, runs of three or more newlines are first
        collapsed to a single newline, and then every whitespace character
        (newlines included) is replaced by a space.

        Args:
            text: The text to split.

        Returns:
            The non-empty sentences in their original order. An empty input
            yields an empty list.
        """
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            # Raw string: "\s" in a normal string literal is an invalid
            # escape sequence and triggers a warning on modern Python.
            text = re.sub(r"\s", " ", text)
            # NOTE: a former ``text.replace("\n\n", "")`` at this point was
            # dropped; no newline survives the substitution above, so it
            # could never match.

        is_separator = self._SENT_SEP_PATTERN.match
        sentences: List[str] = []
        for piece in self._SENT_SEP_PATTERN.split(text):
            if sentences and is_separator(piece):
                # Separator pieces are glued onto the sentence they close.
                # The pattern also matches (zero-width) at the start of a
                # piece that begins with an opening quote, so such a piece is
                # appended to the preceding sentence as well.
                sentences[-1] += piece
            elif piece:
                sentences.append(piece)
        return sentences