From 62192d9f712f3cf40a291ba1e90c0d545d05ab1c Mon Sep 17 00:00:00 2001
From: Mohammed Alnasser <mmnasser2000@gmail.com>
Date: Mon, 23 Oct 2023 07:39:28 -0400
Subject: [PATCH] Speech to text | Issue #21 (#375)

* Implemented speech to text.
Tests still need to be checked.

---------

Co-authored-by: Willy Douhard <willy.douhard@gmail.com>
---
 backend/chainlit/config.py                    |  4 +
 .../ask_multiple_files/.chainlit/config.toml  | 75 +++++++++++++++++++
 frontend/package.json                         |  2 +
 frontend/pnpm-lock.yaml                       | 24 ++++++
 .../organisms/chat/inputBox/input.tsx         | 46 +++++++++++-
 frontend/src/state/project.ts                 |  1 +
 6 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 cypress/e2e/ask_multiple_files/.chainlit/config.toml

diff --git a/backend/chainlit/config.py b/backend/chainlit/config.py
index 993f053cbf..005635f0b4 100644
--- a/backend/chainlit/config.py
+++ b/backend/chainlit/config.py
@@ -50,6 +50,9 @@
 # Authorize users to upload files with messages
 multi_modal = true
 
+# Allows user to use speech to text
+# speech_to_text = true
+
 [UI]
 # Name of the app and chatbot.
 name = "Chatbot"
@@ -145,6 +148,7 @@ class Theme(DataClassJsonMixin):
 class FeaturesSettings(DataClassJsonMixin):
     prompt_playground: bool = True
     multi_modal: bool = True
+    speech_to_text: bool = True
 
 
 @dataclass()
diff --git a/cypress/e2e/ask_multiple_files/.chainlit/config.toml b/cypress/e2e/ask_multiple_files/.chainlit/config.toml
new file mode 100644
index 0000000000..2006dfbc52
--- /dev/null
+++ b/cypress/e2e/ask_multiple_files/.chainlit/config.toml
@@ -0,0 +1,75 @@
+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+
+# Enable third parties caching (e.g LangChain cache)
+cache = false
+
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+
+[features]
+# Show the prompt playground
+prompt_playground = true
+
+# Authorize users to upload files with messages
+multi_modal = true
+
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+
+# Show the readme while the conversation is empty.
+show_readme_as_default = true
+
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+
+# Large size content are by default collapsed for a cleaner ui
+default_collapse_content = true
+
+# The default value for the expand messages settings.
+default_expand_messages = false
+
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+
+# Link to your github repo. This will add a github button in the UI's header.
+# github = ""
+
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+
+# Allow users to use speech to text (this key belongs under [features], not [UI])
+# speech_to_text = true
+
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+
+    [UI.theme.light.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+    #background = "#FAFAFA"
+    #paper = "#FFFFFF"
+
+    [UI.theme.dark.primary]
+        #main = "#F80061"
+        #dark = "#980039"
+        #light = "#FFE7EB"
+
+
+[meta]
+generated_by = "0.7.301"
diff --git a/frontend/package.json b/frontend/package.json
index 84afa984ab..98b94c8595 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -27,6 +27,7 @@
     "react-hotkeys-hook": "^4.4.1",
     "react-markdown": "^8.0.7",
     "react-router-dom": "^6.15.0",
+    "react-speech-recognition": "^3.10.0",
     "recoil": "^0.7.6",
     "remark-gfm": "^3.0.1",
     "socket.io-client": "^4.7.2",
@@ -39,6 +40,7 @@
     "@types/lodash": "^4.14.199",
     "@types/node": "^20.5.7",
     "@types/react": "^18.2.0",
+    "@types/react-speech-recognition": "^3.9.2",
     "@types/uuid": "^9.0.3",
     "@vitejs/plugin-react-swc": "^3.3.2",
     "typescript": "^5.2.2",
diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml
index 6d20aa3ec0..058c4bdc86 100644
--- a/frontend/pnpm-lock.yaml
+++ b/frontend/pnpm-lock.yaml
@@ -50,6 +50,9 @@ dependencies:
   react-router-dom:
     specifier: ^6.15.0
     version: 6.15.0(react-dom@18.2.0)(react@18.2.0)
+  react-speech-recognition:
+    specifier: ^3.10.0
+    version: 3.10.0(react@18.2.0)
   recoil:
     specifier: ^0.7.6
     version: 0.7.6(react-dom@18.2.0)(react@18.2.0)
@@ -82,6 +85,9 @@ devDependencies:
   '@types/react':
     specifier: ^18.2.0
     version: 18.2.0
+  '@types/react-speech-recognition':
+    specifier: ^3.9.2
+    version: 3.9.2
   '@types/uuid':
     specifier: ^9.0.3
     version: 9.0.3
@@ -870,6 +876,10 @@ packages:
       '@types/ms': 0.7.32
     dev: false
 
+  /@types/dom-speech-recognition@0.0.2:
+    resolution: {integrity: sha512-GZbxBsBYBMAbpPDYg64KhAr/V8MbrlsNqrjnZJGikLdNqlQbTu+u548jg7c31ZI30/vXAX+v31t/aDr4soiBEg==}
+    dev: true
+
   /@types/hast@2.3.6:
     resolution: {integrity: sha512-47rJE80oqPmFdVDCD7IheXBrVdwuBgsYwoczFvKmwfo2Mzsnt+V9OONsYauFmICb6lQPpCuXYJWejBNs4pDJRg==}
     dependencies:
@@ -901,6 +911,12 @@ packages:
   /@types/prop-types@15.7.8:
     resolution: {integrity: sha512-kMpQpfZKSCBqltAJwskgePRaYRFukDkm1oItcAbC3gNELR20XIBcN9VRgg4+m8DKsTfkWeA4m4Imp4DDuWy7FQ==}
 
+  /@types/react-speech-recognition@3.9.2:
+    resolution: {integrity: sha512-LS13Z4A8nluGWyT1NQncWoyaWARJdEojxmcRvaFDT9nTHpRkMgPjaYBJIc/9GBRYYFy8TQGaiCmUdH2g4M9INg==}
+    dependencies:
+      '@types/dom-speech-recognition': 0.0.2
+    dev: true
+
   /@types/react-transition-group@4.4.7:
     resolution: {integrity: sha512-ICCyBl5mvyqYp8Qeq9B5G/fyBSRC0zx3XM3sCC6KkcMsNeAHqXBKkmat4GqdJET5jtYUpZXrxI5flve5qhi2Eg==}
     dependencies:
@@ -1830,6 +1846,14 @@ packages:
       react: 18.2.0
     dev: false
 
+  /react-speech-recognition@3.10.0(react@18.2.0):
+    resolution: {integrity: sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw==}
+    peerDependencies:
+      react: '>=16.8.0'
+    dependencies:
+      react: 18.2.0
+    dev: false
+
   /react-transition-group@4.4.5(react-dom@18.2.0)(react@18.2.0):
     resolution: {integrity: sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==}
     peerDependencies:
diff --git a/frontend/src/components/organisms/chat/inputBox/input.tsx b/frontend/src/components/organisms/chat/inputBox/input.tsx
index acd7846368..cf3935b0a6 100644
--- a/frontend/src/components/organisms/chat/inputBox/input.tsx
+++ b/frontend/src/components/organisms/chat/inputBox/input.tsx
@@ -1,6 +1,11 @@
 import { useCallback, useEffect, useRef, useState } from 'react';
+import SpeechRecognition, {
+  useSpeechRecognition
+} from 'react-speech-recognition';
 import { useRecoilState, useSetRecoilState } from 'recoil';
 
+import KeyboardVoiceIcon from '@mui/icons-material/KeyboardVoice';
+import StopCircleIcon from '@mui/icons-material/StopCircle';
 import SendIcon from '@mui/icons-material/Telegram';
 import TuneIcon from '@mui/icons-material/Tune';
 import { Box, IconButton, Stack, TextField } from '@mui/material';
@@ -18,7 +23,7 @@ import HistoryButton from 'components/organisms/chat/history';
 
 import { attachmentsState } from 'state/chat';
 import { chatHistoryState } from 'state/chatHistory';
-import { chatSettingsOpenState } from 'state/project';
+import { chatSettingsOpenState, projectSettingsState } from 'state/project';
 
 import UploadButton from './UploadButton';
 
@@ -55,6 +60,16 @@ const Input = ({
 
   const [value, setValue] = useState('');
   const [isComposing, setIsComposing] = useState(false);
+  const [isRecording, setIsRecording] = useState(false);
+  const { transcript, browserSupportsSpeechRecognition } =
+    useSpeechRecognition();
+
+  const [pSettings] = useRecoilState(projectSettingsState);
+  const showTextToSpeech =
+    (pSettings?.features.speech_to_text === undefined
+      ? true
+      : pSettings?.features.speech_to_text) && browserSupportsSpeechRecognition;
+  const [lastTranscript, setLastTranscript] = useState('');
 
   useEffect(() => {
     const pasteEvent = (event: ClipboardEvent) => {
@@ -104,6 +119,13 @@ const Input = ({
     }
   }, [loading, disabled]);
 
+  useEffect(() => {
+    if (lastTranscript.length < transcript.length) {
+      setValue((text) => text + transcript.slice(lastTranscript.length));
+    }
+    setLastTranscript(transcript);
+  }, [transcript]);
+
   const submit = useCallback(() => {
     if (value === '' || disabled) {
       return;
@@ -169,6 +191,28 @@ const Input = ({
           <TuneIcon />
         </IconButton>
       )}
+      {showTextToSpeech &&
+        (isRecording ? (
+          <IconButton
+            onClick={() => {
+              setIsRecording(false);
+              SpeechRecognition.stopListening();
+            }}
+          >
+            <StopCircleIcon />
+          </IconButton>
+        ) : (
+          <IconButton
+            onClick={() => {
+              setIsRecording(true);
+              SpeechRecognition.startListening({
+                continuous: true
+              });
+            }}
+          >
+            <KeyboardVoiceIcon />
+          </IconButton>
+        ))}
       <UploadButton
         disabled={disabled}
         fileSpec={fileSpec}
diff --git a/frontend/src/state/project.ts b/frontend/src/state/project.ts
index fbaf80f066..f0687acb1b 100644
--- a/frontend/src/state/project.ts
+++ b/frontend/src/state/project.ts
@@ -21,6 +21,7 @@ export interface IProjectSettings {
   };
   features: {
     multi_modal?: boolean;
+    speech_to_text?: boolean;
   };
   userEnv: string[];
   dataPersistence: boolean;