From 7fcd005f06ad1f8a97e70084544a2ac47f6f08b6 Mon Sep 17 00:00:00 2001
From: jialin <jialinkuang@126.com>
Date: Mon, 25 Nov 2024 13:31:42 +0800
Subject: [PATCH] chore: audio

---
 config/routes.ts                              |  16 +-
 src/locales/en-US/playground.ts               |   9 +-
 src/locales/zh-CN/playground.ts               |   7 +-
 .../llmodels/components/advance-config.tsx    |   5 +
 src/pages/llmodels/components/table-list.tsx  |  85 ++++++++---
 src/pages/llmodels/config/index.ts            |   3 +-
 src/pages/llmodels/config/types.ts            |   2 +
 src/pages/playground/apis/index.ts            |  29 ++++
 .../playground/components/ground-images.tsx   |   2 +-
 .../playground/components/ground-stt.tsx      |   7 +-
 .../playground/components/ground-tts.tsx      | 140 ++++--------------
 src/pages/playground/speech.tsx               |   4 +-
 src/utils/fetch-chunk-data.ts                 |   1 +
 13 files changed, 159 insertions(+), 151 deletions(-)

diff --git a/config/routes.ts b/config/routes.ts
index 6e7f45a2..537e802f 100644
--- a/config/routes.ts
+++ b/config/routes.ts
@@ -25,14 +25,6 @@ export default [
         icon: 'Comment',
         component: './playground/index'
       },
-      {
-        name: 'speech',
-        title: 'Speech',
-        path: '/playground/speech',
-        key: 'speech',
-        icon: 'Comment',
-        component: './playground/speech'
-      },
       {
         name: 'text2images',
         title: 'Text2Images',
@@ -41,6 +33,14 @@ export default [
         icon: 'Comment',
         component: './playground/images'
       },
+      {
+        name: 'speech',
+        title: 'Speech',
+        path: '/playground/speech',
+        key: 'speech',
+        icon: 'Comment',
+        component: './playground/speech'
+      },
       {
         name: 'embedding',
         title: 'embedding',
diff --git a/src/locales/en-US/playground.ts b/src/locales/en-US/playground.ts
index 2207ff7e..d82a84ef 100644
--- a/src/locales/en-US/playground.ts
+++ b/src/locales/en-US/playground.ts
@@ -75,5 +75,12 @@ export default {
   'playground.rerank.rank': 'Rank',
   'playground.rerank.score': 'Score',
   'playground.rerank.query.holder': 'Input your query',
-  'playground.image.prompt': 'Input Prompt'
+  'playground.image.prompt': 'Input Prompt',
+  'playground.audio.texttospeech': 'Text to Speech',
+  'playground.audio.speechtotext': 'Speech to Text',
+  'playground.audio.texttospeech.tips': 'Generated speech will appear here',
+  'playground.audio.speechtotext.tips':
+    'Upload an audio file or start recording',
+  'playground.audio.enablemic':
+    "Enable microphone access in your browser's settings."
 };
diff --git a/src/locales/zh-CN/playground.ts b/src/locales/zh-CN/playground.ts
index 789aff9b..e2b3e7cd 100644
--- a/src/locales/zh-CN/playground.ts
+++ b/src/locales/zh-CN/playground.ts
@@ -75,5 +75,10 @@ export default {
   'playground.rerank.rank': '排序',
   'playground.rerank.score': '分数',
   'playground.rerank.query.holder': '输入查询',
-  'playground.image.prompt': '输入提示'
+  'playground.image.prompt': '输入提示',
+  'playground.audio.texttospeech': '文本转语音',
+  'playground.audio.speechtotext': '语音转文本',
+  'playground.audio.texttospeech.tips': '生成的语音将出现在这里',
+  'playground.audio.speechtotext.tips': '上传音频文件或开始录音',
+  'playground.audio.enablemic': '请允许浏览器访问麦克风，以便开始录音'
 };
diff --git a/src/pages/llmodels/components/advance-config.tsx b/src/pages/llmodels/components/advance-config.tsx
index 8a1954d2..797c33b8 100644
--- a/src/pages/llmodels/components/advance-config.tsx
+++ b/src/pages/llmodels/components/advance-config.tsx
@@ -264,6 +264,11 @@ const AdvanceConfig: React.FC<AdvanceConfigProps> = (props) => {
                 value: backendOptionsMap.vllm,
                 disabled:
                   source === modelSourceMap.local_path_value ? false : isGGUF
+              },
+              {
+                label: 'vox-box',
+                value: backendOptionsMap.voxBox,
+                disabled: false
               }
             ]}
             disabled={
diff --git a/src/pages/llmodels/components/table-list.tsx b/src/pages/llmodels/components/table-list.tsx
index b3cfb3a3..6834e67d 100644
--- a/src/pages/llmodels/components/table-list.tsx
+++ b/src/pages/llmodels/components/table-list.tsx
@@ -423,6 +423,66 @@ const Models: React.FC<ModelsProps> = ({
     []
   );
 
+  /**
+   * Renders the capability tag placed after a row's text in the models table.
+   *
+   * At most one tag is rendered. Flags are checked in priority order:
+   * `reranker`, `embedding_only`, `text_to_speech`, `speech_to_text`; the
+   * first truthy flag decides the label.
+   *
+   * The 'Reranker' and 'Embedding Only' labels are fixed English strings;
+   * the two audio labels come from the locale messages.
+   *
+   * @param record - the model row being rendered
+   * @returns a `Tag` element, or `null` when none of the flags is set
+   */
+  const renderModelTags = useCallback(
+    (record: ListItem) => {
+      // Single tag definition so every capability looks identical.
+      const renderTag = (label: React.ReactNode) => (
+        <Tag
+          style={{
+            margin: 0,
+            opacity: 0.8,
+            transform: 'scale(0.9)'
+          }}
+          color="geekblue"
+        >
+          {label}
+        </Tag>
+      );
+
+      // Checked first, so a reranker never shows the embedding label.
+      if (record.reranker) {
+        return renderTag('Reranker');
+      }
+
+      if (record.embedding_only) {
+        return renderTag('Embedding Only');
+      }
+
+      // Audio labels are localized through the playground.audio.* keys.
+      if (record.text_to_speech) {
+        return renderTag(
+          intl.formatMessage({ id: 'playground.audio.texttospeech' })
+        );
+      }
+
+      if (record.speech_to_text) {
+        return renderTag(
+          intl.formatMessage({ id: 'playground.audio.speechtotext' })
+        );
+      }
+
+      // No capability flag set: render nothing after the row's text.
+      return null;
+    },
+    // `intl` is read inside the callback, so it has to be listed here;
+    // with an empty dependency list the callback would keep using the
+    // `intl` object captured on the first render.
+    [intl]
+  );
+
   const renderChildren = useCallback(
     (list: any, parent?: any) => {
       return (
@@ -548,30 +608,7 @@ const Models: React.FC<ModelsProps> = ({
                   <AutoTooltip ghost>
                     <span className="m-r-5">{text}</span>
                   </AutoTooltip>
-                  {record.reranker && (
-                    <Tag
-                      style={{
-                        margin: 0,
-                        opacity: 0.8,
-                        transform: 'scale(0.9)'
-                      }}
-                      color="geekblue"
-                    >
-                      Reranker
-                    </Tag>
-                  )}
-                  {record.embedding_only && !record.reranker && (
-                    <Tag
-                      style={{
-                        margin: 0,
-                        opacity: 0.8,
-                        transform: 'scale(0.9)'
-                      }}
-                      color="geekblue"
-                    >
-                      Embedding Only
-                    </Tag>
-                  )}
+                  {renderModelTags(record)}
                 </span>
               );
             }}
diff --git a/src/pages/llmodels/config/index.ts b/src/pages/llmodels/config/index.ts
index 03f82bf6..b2dd28ed 100644
--- a/src/pages/llmodels/config/index.ts
+++ b/src/pages/llmodels/config/index.ts
@@ -69,7 +69,8 @@ export const ollamaModelOptions = [
 
 export const backendOptionsMap = {
   llamaBox: 'llama-box',
-  vllm: 'vllm'
+  vllm: 'vllm',
+  voxBox: 'vox-box'
 };
 
 export const modelSourceMap: Record<string, string> = {
diff --git a/src/pages/llmodels/config/types.ts b/src/pages/llmodels/config/types.ts
index 17f0468e..477f35b6 100644
--- a/src/pages/llmodels/config/types.ts
+++ b/src/pages/llmodels/config/types.ts
@@ -11,6 +11,8 @@ export interface ListItem {
   model_scope_model_id: string;
   embedding_only?: boolean;
   ready_replicas: number;
+  speech_to_text?: boolean;
+  text_to_speech?: boolean;
   replicas: number;
   s3Address: string;
   name: string;
diff --git a/src/pages/playground/apis/index.ts b/src/pages/playground/apis/index.ts
index 2ea35218..3bff3a4f 100644
--- a/src/pages/playground/apis/index.ts
+++ b/src/pages/playground/apis/index.ts
@@ -10,6 +10,10 @@ export const OPENAI_MODELS = '/v1-openai/models';
 
 export const RERANKER_API = '/rerank';
 
+export const AUDIO_TEXT_TO_SPEECH_API = '/v1-openai/audio/speech';
+
+export const AUDIO_SPEECH_TO_TEXT_API = '/v1-openai/audio/transcriptions';
+
 export async function execChatCompletions(params: any) {
   return request(`${CHAT_API}`, {
     method: 'POST',
@@ -81,3 +85,28 @@ export const createImages = async (
   }
   return res.json();
 };
+
+// ============ audio ============
+// POST `params` (minus `signal`) as JSON; throws on a non-2xx response.
+// NOTE(review): reply is still parsed as JSON; confirm it is never raw audio.
+const postAudioJson = async (url: string, params: any, options?: any) => {
+  const { signal = options?.signal, ...payload } = params ?? {};
+  const res = await fetch(url, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(payload),
+    signal
+  });
+  if (!res.ok) {
+    throw new Error('Network response was not ok');
+  }
+  return res.json();
+};
+
+/** Requests speech synthesis for `params` from AUDIO_TEXT_TO_SPEECH_API. */
+export const textToSpeech = async (params: any, options?: any) =>
+  postAudioJson(AUDIO_TEXT_TO_SPEECH_API, params, options);
+
+/** Requests a transcription for `params` from AUDIO_SPEECH_TO_TEXT_API. */
+export const speechToText = async (params: any, options?: any) =>
+  postAudioJson(AUDIO_SPEECH_TO_TEXT_API, params, options);
diff --git a/src/pages/playground/components/ground-images.tsx b/src/pages/playground/components/ground-images.tsx
index 00582695..0b38550a 100644
--- a/src/pages/playground/components/ground-images.tsx
+++ b/src/pages/playground/components/ground-images.tsx
@@ -255,7 +255,7 @@ const GroundImages: React.FC<MessageProps> = forwardRef((props, ref) => {
 
       const result: any = await fetchChunkedData({
         data: params,
-        // url: 'http://192.168.50.27:40639/v1/images/generations',
+        // url: 'http://192.168.1.3:40487/v1/images/generations',
         url: CREAT_IMAGE_API,
         signal: requestToken.current.signal,
         headers: {
diff --git a/src/pages/playground/components/ground-stt.tsx b/src/pages/playground/components/ground-stt.tsx
index cd06e142..0c1b6e63 100644
--- a/src/pages/playground/components/ground-stt.tsx
+++ b/src/pages/playground/components/ground-stt.tsx
@@ -321,7 +321,9 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
     return (
       <div className="tips-text">
         <IconFont type={'icon-audio'} style={{ fontSize: 20 }}></IconFont>
-        <span>Upload an audio file or start recording</span>
+        <span>
+          {intl.formatMessage({ id: 'playground.audio.speechtotext.tips' })}
+        </span>
       </div>
     );
   };
@@ -469,7 +471,7 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
                   fontWeight: 500
                 }}
               >
-                Enable microphone access in your browser&rsquo;s settings.
+                {intl.formatMessage({ id: 'playground.audio.enablemic' })}
               </span>
             </div>
           )}
@@ -544,6 +546,7 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
         payLoad={{
           messages: viewCodeMessage
         }}
+        api="audio/transcriptions"
         parameters={parameters}
         onCancel={handleCloseViewCode}
         title={intl.formatMessage({ id: 'playground.viewcode' })}
diff --git a/src/pages/playground/components/ground-tts.tsx b/src/pages/playground/components/ground-tts.tsx
index 5d3a2799..1ca6be05 100644
--- a/src/pages/playground/components/ground-tts.tsx
+++ b/src/pages/playground/components/ground-tts.tsx
@@ -1,24 +1,21 @@
 import IconFont from '@/components/icon-font';
 import SpeechContent from '@/components/speech-content';
 import useOverlayScroller from '@/hooks/use-overlay-scroller';
-import { fetchChunkedData, readStreamData } from '@/utils/fetch-chunk-data';
+import { fetchChunkedData } from '@/utils/fetch-chunk-data';
 import { ThunderboltOutlined } from '@ant-design/icons';
 import { useIntl, useSearchParams } from '@umijs/max';
 import { Spin } from 'antd';
 import classNames from 'classnames';
-import _ from 'lodash';
 import 'overlayscrollbars/overlayscrollbars.css';
 import {
   forwardRef,
   memo,
   useEffect,
   useImperativeHandle,
-  useMemo,
   useRef,
   useState
 } from 'react';
 import { CHAT_API } from '../apis';
-import { Roles, generateMessages } from '../config';
 import { TTSParamsConfig as paramsConfig } from '../config/params-config';
 import { MessageItem } from '../config/types';
 import '../style/ground-left.less';
@@ -43,7 +40,16 @@ const initialValues = {
 const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
   const { modelList } = props;
   const messageId = useRef<number>(0);
-  const [messageList, setMessageList] = useState<MessageItem[]>([]);
+  const [messageList, setMessageList] = useState<
+    {
+      prompt: string;
+      voice: string;
+      format: string;
+      speed: number;
+      uid: number;
+      autoplay: boolean;
+    }[]
+  >([]);
 
   const intl = useIntl();
   const [searchParams] = useSearchParams();
@@ -78,51 +84,10 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
     };
   });
 
-  const viewCodeMessage = useMemo(() => {
-    return generateMessages([
-      { role: Roles.System, content: systemMessage },
-      ...messageList
-    ]);
-  }, [messageList, systemMessage]);
-
   const setMessageId = () => {
     messageId.current = messageId.current + 1;
   };
 
-  const handleNewMessage = (message?: { role: string; content: string }) => {
-    const newMessage = message || {
-      role:
-        _.last(messageList)?.role === Roles.User ? Roles.Assistant : Roles.User,
-      content: ''
-    };
-    messageList.push({
-      ...newMessage,
-      uid: messageId.current + 1
-    });
-    setMessageId();
-    setMessageList([...messageList]);
-  };
-
-  const joinMessage = (chunk: any) => {
-    setTokenResult({
-      ...(chunk?.usage ?? {})
-    });
-
-    if (!chunk || !_.get(chunk, 'choices', []).length) {
-      return;
-    }
-    contentRef.current =
-      contentRef.current + _.get(chunk, 'choices.0.delta.content', '');
-    setMessageList([
-      ...messageList,
-      ...currentMessageRef.current,
-      {
-        role: Roles.Assistant,
-        content: contentRef.current,
-        uid: messageId.current
-      }
-    ]);
-  };
   const handleStopConversation = () => {
     controllerRef.current?.abort?.();
     setLoading(false);
@@ -134,39 +99,15 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
       setLoading(true);
       setMessageId();
       setTokenResult(null);
+      setCurrentPrompt(current?.content || '');
 
       controllerRef.current?.abort?.();
       controllerRef.current = new AbortController();
       const signal = controllerRef.current.signal;
-      currentMessageRef.current = current
-        ? [
-            {
-              ...current,
-              uid: messageId.current
-            }
-          ]
-        : [];
-
-      contentRef.current = '';
-      setMessageList((pre) => {
-        return [...pre, ...currentMessageRef.current];
-      });
-
-      const messageParams = [
-        { role: Roles.System, content: systemMessage },
-        ...messageList,
-        ...currentMessageRef.current
-      ];
-
-      const messages = generateMessages(messageParams);
 
       const chatParams = {
-        messages: messages,
         ...parameters,
-        stream: true,
-        stream_options: {
-          include_usage: true
-        }
+        prompt: current?.content || currentPrompt
       };
       const result: any = await fetchChunkedData({
         data: chatParams,
@@ -174,26 +115,16 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
         signal
       });
 
-      if (result?.error) {
-        setTokenResult({
-          error: true,
-          errorMessage:
-            result?.data?.error?.message || result?.data?.message || ''
-        });
-        return;
-      }
-      setMessageId();
-      const { reader, decoder } = result;
-      await readStreamData(reader, decoder, (chunk: any) => {
-        if (chunk?.error) {
-          setTokenResult({
-            error: true,
-            errorMessage: chunk?.error?.message || chunk?.message || ''
-          });
-          return;
+      setMessageList([
+        {
+          prompt: current?.content || currentPrompt,
+          voice: parameters.voice,
+          format: parameters.response_format,
+          speed: parameters.speed,
+          uid: messageId.current,
+          autoplay: checkvalueRef.current
         }
-        joinMessage(chunk);
-      });
+      ]);
     } catch (error) {
       // console.log('error:', error);
     } finally {
@@ -210,23 +141,7 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
   };
 
   const handleSendMessage = (message: Omit<MessageItem, 'uid'>) => {
-    // submitMessage(currentMessage);
-    setMessageId();
-    setLoading(true);
-
-    setTimeout(() => {
-      setMessageList([
-        {
-          prompt: message.content,
-          voice: parameters.voice,
-          format: parameters.response_format,
-          speed: parameters.speed,
-          uid: messageId.current,
-          autoplay: checkvalueRef.current
-        }
-      ]);
-      setLoading(false);
-    }, 1000);
+    submitMessage(message);
   };
 
   const handleCloseViewCode = () => {
@@ -236,7 +151,6 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
   const handleSelectModel = () => {};
 
   const handleOnCheckChange = (e: any) => {
-    console.log('handleOnCheckChange', e);
     checkvalueRef.current = e.target.checked;
   };
   useEffect(() => {
@@ -287,7 +201,11 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
                       className="font-size-32 text-secondary"
                     ></IconFont>
                   </span>
-                  <span>Generated speech will appear here</span>
+                  <span>
+                    {intl.formatMessage({
+                      id: 'playground.audio.texttospeech.tips'
+                    })}
+                  </span>
                 </div>
               )}
               {loading && (
@@ -314,7 +232,6 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
             disabled={!parameters.model}
             isEmpty={true}
             handleSubmit={handleSendMessage}
-            addMessage={handleNewMessage}
             handleAbortFetch={handleStopConversation}
             clearAll={handleClear}
             setModelSelections={handleSelectModel}
@@ -347,6 +264,7 @@ const GroundLeft: React.FC<MessageProps> = forwardRef((props, ref) => {
         payLoad={{
           prompt: currentPrompt
         }}
+        api="audio/speech"
         parameters={parameters}
         onCancel={handleCloseViewCode}
         title={intl.formatMessage({ id: 'playground.viewcode' })}
diff --git a/src/pages/playground/speech.tsx b/src/pages/playground/speech.tsx
index f18884c8..c3147ac6 100644
--- a/src/pages/playground/speech.tsx
+++ b/src/pages/playground/speech.tsx
@@ -30,12 +30,12 @@ const Playground: React.FC = () => {
   const [loaded, setLoaded] = useState(false);
   const optionsList = [
     {
-      label: 'Text To Speech',
+      label: intl.formatMessage({ id: 'playground.audio.texttospeech' }),
       value: TabsValueMap.Tab1,
       icon: <AudioOutlined />
     },
     {
-      label: 'Speech To Text',
+      label: intl.formatMessage({ id: 'playground.audio.speechtotext' }),
       value: TabsValueMap.Tab2,
       icon: <IconFont type={'icon-audio'}></IconFont>
     }
diff --git a/src/utils/fetch-chunk-data.ts b/src/utils/fetch-chunk-data.ts
index b70b6a24..8802bcaa 100644
--- a/src/utils/fetch-chunk-data.ts
+++ b/src/utils/fetch-chunk-data.ts
@@ -51,6 +51,7 @@ export const fetchChunkedData = async (params: {
       ...params.headers
     }
   });
+  console.log('response====', response);
   if (!response.ok) {
     return {
       error: true,