From f79396743677a12ba3e8552ceb62410e66292009 Mon Sep 17 00:00:00 2001
From: Frieren <1692219062wang@gmail.com>
Date: Fri, 10 May 2024 02:25:42 +0800
Subject: [PATCH] init
---
.idea/LLM.iml | 8 +++
.idea/inspectionProfiles/Project_Default.xml | 20 ++++++++
.../inspectionProfiles/profiles_settings.xml | 6 +++
.idea/misc.xml | 4 ++
.idea/modules.xml | 8 +++
.idea/vcs.xml | 6 +++
.idea/workspace.xml | 50 +++++++++++++++++++
README.md | 0
requirements.txt | 4 ++
src/base/chinese_text_splitter.py | 25 ++++++++++
10 files changed, 131 insertions(+)
create mode 100644 .idea/LLM.iml
create mode 100644 .idea/inspectionProfiles/Project_Default.xml
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/vcs.xml
create mode 100644 .idea/workspace.xml
create mode 100644 README.md
create mode 100644 requirements.txt
create mode 100644 src/base/chinese_text_splitter.py
diff --git a/.idea/LLM.iml b/.idea/LLM.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/LLM.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..02399c7
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..7830771
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..9f996f8
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..d70da44
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,50 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1715274683455
+
+
+ 1715274683455
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ecdfcaa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+langchain~=0.1.19
+httpx-sse
+langchainhub
+pyjwt
\ No newline at end of file
diff --git a/src/base/chinese_text_splitter.py b/src/base/chinese_text_splitter.py
new file mode 100644
index 0000000..14914ec
--- /dev/null
+++ b/src/base/chinese_text_splitter.py
@@ -0,0 +1,58 @@
+import re
+from typing import List
+
+from langchain.text_splitter import CharacterTextSplitter
+
+# Sentence separator, compiled once at import time.
+# First alternative: a sentence-ending mark plus up to two closing quotes.
+# Second alternative: a zero-width match before opening quotes or at the end
+# of the text.
+_SENT_SEP_PATTERN = re.compile(
+    r'([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))')
+
+
+class ChineseTextSplitter(CharacterTextSplitter):
+    """Split text into sentences at Chinese/Western sentence-ending marks.
+
+    Each sentence keeps its terminating mark and up to two closing quotes
+    that directly follow it.
+    """
+
+    def __init__(self, pdf: bool = False, **kwargs):
+        """Create the splitter.
+
+        Args:
+            pdf: When true, ``split_text`` normalises whitespace before
+                splitting.
+            **kwargs: Forwarded unchanged to ``CharacterTextSplitter``.
+        """
+        super().__init__(**kwargs)
+        self.pdf = pdf
+
+    def split_text(self, text: str) -> List[str]:
+        """Return the sentences of ``text`` in their original order.
+
+        Args:
+            text: The text to split.
+
+        Returns:
+            The non-empty pieces produced by the separator pattern, with
+            each separator match appended to the piece before it.
+        """
+        if self.pdf:
+            # Collapse runs of three or more newlines to one, then turn every
+            # whitespace character into a plain space.
+            text = re.sub(r"\n{3,}", "\n", text)
+            # Raw string: "\s" in a normal literal is an invalid escape.
+            text = re.sub(r"\s", " ", text)
+            # The former text.replace("\n\n", "") was dropped: no newline
+            # survives the substitution above, so it never had any effect.
+        sent_list: List[str] = []
+        for ele in _SENT_SEP_PATTERN.split(text):
+            # Pieces the pattern matches at their start (separators, empty
+            # strings, text opening with a quote) extend the previous entry.
+            if _SENT_SEP_PATTERN.match(ele) and sent_list:
+                sent_list[-1] += ele
+            elif ele:
+                sent_list.append(ele)
+        return sent_list
\ No newline at end of file