Skip to content

Commit

Permalink
Add new features and improve WhisperModule
Browse files Browse the repository at this point in the history
This commit introduces several enhancements and new features to the WhisperModule and the application UI. In `App.xaml`, a new property `RenderOptions.BitmapScalingMode` is added to the `SettingsCheckbox` style. `WhisperModule.cs` sees the most changes, with new namespaces imported, new properties and methods added to the `WhisperModuleSettings` class, and updates to existing methods to handle new settings and state changes. The `WhisperModule` class now loads settings from a JSON file on initialization and saves settings when the application is closing. Two new sound files are added to the project in `MagicChatbox.csproj`. The UI in `MainWindow.xaml` is updated with a new section for configuring speech-to-text settings. Lastly, the `OnClosing` method in `MainWindow.xaml.cs` is updated to save the settings of the `WhisperModule` when the application is closing.
  • Loading branch information
BoiHanny committed Mar 15, 2024
1 parent 715d575 commit f34a5d6
Show file tree
Hide file tree
Showing 7 changed files with 348 additions and 16 deletions.
1 change: 1 addition & 0 deletions vrcosc-magicchatbox/App.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -1153,6 +1153,7 @@
<Style x:Key="SettingsCheckbox" TargetType="{x:Type CheckBox}">
<Setter Property="SnapsToDevicePixels" Value="False" />
<Setter Property="OverridesDefaultStyle" Value="true" />
<Setter Property="RenderOptions.BitmapScalingMode" Value="NearestNeighbor" />
<Setter Property="FocusVisualStyle" Value="{x:Null}" />
<Setter Property="Cursor" Value="Hand" />
<Setter Property="Template">
Expand Down
225 changes: 209 additions & 16 deletions vrcosc-magicchatbox/Classes/Modules/WhisperModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@
using System.Collections.Generic;
using System.Linq;
using vrcosc_magicchatbox.ViewModels;
using Newtonsoft.Json;
using System.Media;
using System.Reflection;
using System.Windows;

namespace vrcosc_magicchatbox.Classes.Modules
{
public partial class WhisperModuleSettings : ObservableObject
{
private const string SettingsFileName = "WhisperModuleSettings.json";

[ObservableProperty]
private List<RecordingDeviceInfo> availableDevices;

Expand All @@ -28,9 +34,118 @@ public partial class WhisperModuleSettings : ObservableObject
[ObservableProperty]
private bool isRecording = false;

[ObservableProperty]
private List<string> speechToTextLanguages = new List<string>();

[ObservableProperty]
private string selectedSpeechToTextLanguage;

[ObservableProperty]
private bool autoLanguageDetection = true;

[ObservableProperty]
private int silenceAutoTurnOffDuration = 3000;

// Populates the bindable lists at construction time so the UI has values
// immediately: enumerates the available audio input devices and fills the
// speech-to-text language list (setting a default selection when none exists).
// NOTE(review): JsonConvert also runs this ctor during LoadSettings, so
// deserialized property values overwrite these defaults afterwards — confirm.
public WhisperModuleSettings()
{
    RefreshDevices();
    RefreshSpeechToTextLanguages();
}

/// <summary>
/// Persists the current settings as indented JSON to
/// <see cref="SettingsFileName"/> in the application's working directory.
/// </summary>
public void SaveSettings()
{
    string json = JsonConvert.SerializeObject(this, Formatting.Indented);
    File.WriteAllText(SettingsFileName, json);
}

/// <summary>
/// Loads persisted settings from <see cref="SettingsFileName"/>, falling back to
/// a fresh default instance when the file is missing, empty, or contains invalid JSON.
/// </summary>
/// <returns>The deserialized settings, never null.</returns>
public static WhisperModuleSettings LoadSettings()
{
    if (!File.Exists(SettingsFileName))
        return new WhisperModuleSettings();

    try
    {
        var settingsJson = File.ReadAllText(SettingsFileName);

        // DeserializeObject returns null for an empty or literal-"null" file;
        // never hand callers a null settings object.
        return JsonConvert.DeserializeObject<WhisperModuleSettings>(settingsJson)
               ?? new WhisperModuleSettings();
    }
    catch (JsonException)
    {
        // A corrupt settings file must not crash startup — recover with defaults.
        // The bad file is overwritten on the next SaveSettings call.
        return new WhisperModuleSettings();
    }
}

/// <summary>
/// Rebuilds the list of selectable speech-to-text languages and, when no
/// language has been chosen yet, defaults the selection to the first entry.
/// </summary>
private void RefreshSpeechToTextLanguages()
{
    // Ordered by a hypothetical "most commonly used" metric, adjust as needed.
    string[] languages =
    {
        "English",   "Chinese",    "Spanish",    "Hindi",      "Arabic",     "Portuguese",
        "Bengali",   "Russian",    "Japanese",   "French",     "German",     "Korean",
        "Italian",   "Turkish",    "Polish",     "Dutch",      "Indonesian", "Thai",
        "Swedish",   "Danish",     "Norwegian",  "Finnish",    "Vietnamese", "Czech",
        "Greek",     "Romanian",   "Hungarian",  "Slovak",     "Ukrainian",  "Bulgarian",
        "Croatian",  "Serbian",    "Lithuanian", "Latvian",    "Estonian",   "Slovenian",
        "Hebrew",    "Persian",    "Armenian",   "Azerbaijani","Kazakh",     "Uzbek",
        "Tajik",     "Georgian",   "Mongolian",  "Afrikaans",  "Swahili",    "Maori",
        "Nepali",    "Marathi",    "Kannada",    "Tamil",      "Telugu",     "Malay",
        "Malayalam", "Bosnian",    "Macedonian", "Albanian",   "Filipino",   "Tagalog",
        "Urdu",      "Welsh",      "Icelandic",  "Maltese",    "Galician",   "Belarusian",
        "Catalan",
    };

    SpeechToTextLanguages = languages.ToList();

    // Only apply the default when the user has not already picked a language
    // (e.g. one restored from the settings file).
    if (string.IsNullOrWhiteSpace(SelectedSpeechToTextLanguage))
        SelectedSpeechToTextLanguage = SpeechToTextLanguages.FirstOrDefault();
}

public string GetSelectedDeviceName()
Expand All @@ -45,6 +160,8 @@ public string GetSelectedDeviceName()
}
}





public void RefreshDevices()
Expand Down Expand Up @@ -81,6 +198,7 @@ public partial class WhisperModule : ObservableObject
private DateTime lastSoundTimestamp = DateTime.Now;
private bool isCurrentlySpeaking = false;
private DateTime speakingStartedTimestamp = DateTime.Now;
private bool isProcessingShortPause = false;

public event Action<string> TranscriptionReceived;

Expand All @@ -89,17 +207,35 @@ public partial class WhisperModule : ObservableObject

// Restores persisted settings from disk, wires up reaction to setting changes,
// and prepares the audio-capture pipeline. Order matters: Settings must exist
// before subscribing, and InitializeWaveIn reads the selected device index.
public WhisperModule()
{
    Settings = WhisperModuleSettings.LoadSettings();
    // Re-initializes capture when the selected input device changes
    // (handled in Settings_PropertyChanged below).
    Settings.PropertyChanged += Settings_PropertyChanged;
    InitializeWaveIn();
}

/// <summary>
/// Flushes the current Whisper settings to disk; intended to be called
/// when the application is shutting down.
/// </summary>
public void OnApplicationClosing() => Settings.SaveSettings();

// Reacts to settings changes. Only the capture-device selection requires
// action here: the capture pipeline is torn down and rebuilt against the
// newly selected input device. All other property changes are ignored.
private void Settings_PropertyChanged(object sender, System.ComponentModel.PropertyChangedEventArgs e)
{
    if (e.PropertyName != nameof(Settings.SelectedDeviceIndex))
        return;

    StopRecording();
    InitializeWaveIn();
}

private void InitializeWaveIn()
{
waveIn?.Dispose();
waveIn?.Dispose(); // Dispose any existing instance

if (settings.SelectedDeviceIndex == -1)
{
UpdateUI("No valid audio input device selected.", false);
throw new InvalidOperationException("No valid audio input device selected.");
// Consider handling this scenario without throwing an exception,
// perhaps by disabling recording functionality until a valid device is selected.
return;
}

waveIn = new WaveInEvent
Expand Down Expand Up @@ -132,6 +268,7 @@ public void StartRecording()
}
UpdateUI("Ready to speak?", true);
waveIn.StartRecording();
//PlaySound("start.wav");
settings.IsRecording = true;
}

Expand All @@ -158,40 +295,96 @@ public void StopRecording()
{
ProcessAudioStreamAsync(audioStream);
}
audioStream = new MemoryStream();
}

//private void PlaySound(string soundFileName)
//{
// var assembly = Assembly.GetExecutingAssembly();
// string resourceName = assembly.GetName().Name + ".Sounds." + soundFileName;

// using (Stream stream = assembly.GetManifestResourceStream(resourceName))
// {
// if (stream == null)
// {
// throw new InvalidOperationException("Could not find resource sound file: " + resourceName);
// }

// SoundPlayer player = new SoundPlayer(stream);
// player.Play();
// }
//}



// Audio-capture callback: applies a noise gate to each incoming buffer, buffers
// audio while speech is detected, and on silence decides between transcribing a
// short pause (keep recording) or stopping entirely once the silence exceeds
// Settings.SilenceAutoTurnOffDuration.
//
// NOTE(review): this text appears to be a diff-page scrape — several statements
// exist in both their pre-change form (lowercase `settings`, combined else-if)
// and post-change form (`Settings` property, split silence handling). The
// duplicated declarations below will not compile as-is; reconcile against the
// repository's actual file before editing.
private void OnDataAvailable(object sender, WaveInEventArgs e)
{
    float maxAmplitude = CalculateMaxAmplitude(e.Buffer, e.BytesRecorded);
    bool isLoudEnough = maxAmplitude > settings.NoiseGateThreshold; // NOTE(review): apparent OLD (removed) line

    settings.IsNoiseGateOpen = isLoudEnough; // NOTE(review): apparent OLD (removed) line
    bool isLoudEnough = maxAmplitude > Settings.NoiseGateThreshold; // NOTE(review): NEW line — duplicates the declaration above
    Settings.IsNoiseGateOpen = isLoudEnough;

    if (isLoudEnough)
    {
        if (!isCurrentlySpeaking)
        {
            speakingStartedTimestamp = DateTime.Now; // Mark the start of speaking
            speakingStartedTimestamp = DateTime.Now; // NOTE(review): duplicated line (old/new diff residue)
            isCurrentlySpeaking = true;
        }

        // Update elapsed speaking time continuously while speaking
        var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
        UpdateUI($"Speaking detected, recording... (Duration: {speakingDuration:0.0}s)", true);

        audioStream.Write(e.Buffer, 0, e.BytesRecorded);
        lastSoundTimestamp = DateTime.Now;
    }
    // NOTE(review): the next two else-if headers are the old and new versions of
    // the same branch — only one can exist in valid C#.
    else if (isCurrentlySpeaking && DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds > 500)
    else if (isCurrentlySpeaking)
    {
        // NOTE(review): the next five statements look like the OLD branch body
        // (immediate processing on any >500 ms pause); the silenceDuration logic
        // that follows appears to be its replacement.
        var speakingDuration = DateTime.Now.Subtract(speakingStartedTimestamp).TotalSeconds;
        isCurrentlySpeaking = false;
        UpdateUI($"Processing audio... (Duration: {speakingDuration:0.0}s)", true);
        ProcessAudioStreamAsync(audioStream);
        audioStream = new MemoryStream(); // Reset the stream for new data after processing
        var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;

        // Short pause (>500 ms but within the auto-off window): transcribe what
        // has been buffered so far while continuing to record.
        if (silenceDuration > 500 && silenceDuration <= Settings.SilenceAutoTurnOffDuration)
        {
            if (!isProcessingShortPause)
            {
                isProcessingShortPause = true;
                // Offload to a background task since we can't await in this event handler
                Task.Run(() => ProcessShortPauseAsync()).ContinueWith(_ =>
                {
                    // Use Dispatcher.Invoke to ensure that the following actions are performed on the UI thread.
                    Application.Current.Dispatcher.Invoke(() =>
                    {
                        // Actions to take after processing the short pause, ensuring thread safety for UI operations
                        isProcessingShortPause = false; // NOTE(review): also reset inside ProcessShortPauseAsync — confirm the double reset is intended
                        // Any other UI updates or state changes that need to be made safely on the UI thread
                    });
                });

            }
        }
        // Prolonged silence: shut the recorder down entirely.
        else if (silenceDuration > Settings.SilenceAutoTurnOffDuration)
        {
            isCurrentlySpeaking = false;
            UpdateUI($"Silence detected for more than {Settings.SilenceAutoTurnOffDuration / 1000.0} seconds, stopping recording...", true);
            StopRecording();
        }
    }
}

// Transcribes the audio buffered up to a short pause in speech, then resets the
// capture state (fresh stream, cleared flag, new silence timestamp) on the UI
// thread so that recording can continue seamlessly. Invoked from OnDataAvailable
// via Task.Run because the NAudio data callback cannot await.
private async Task ProcessShortPauseAsync()
{
    await ProcessAudioStreamAsync(audioStream);
    // Ensure the continuation logic here is thread-safe, especially if updating the UI
    // NOTE(review): App.Current here vs Application.Current elsewhere — same
    // dispatcher presumably, but worth unifying for consistency; confirm.
    App.Current.Dispatcher.Invoke(() =>
    {
        // NOTE(review): the caller's ContinueWith also clears this flag — confirm
        // the double reset is intentional.
        isProcessingShortPause = false;
        audioStream = new MemoryStream(); // Reset for new data
        lastSoundTimestamp = DateTime.Now; // Reset timestamp
        // Optionally update the UI or reset flags
    });
}





private async void UpdateUI(string message, bool isVisible)
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabelTxt = message;
Expand All @@ -215,7 +408,7 @@ private float CalculateMaxAmplitude(byte[] buffer, int bytesRecorded)
return samples.Max(sample => Math.Abs(sample / 32768f));
}

private async void ProcessAudioStreamAsync(MemoryStream stream)
private async Task ProcessAudioStreamAsync(MemoryStream stream)
{
if (stream.Length == 0)
{
Expand Down Expand Up @@ -249,7 +442,7 @@ private async Task<string> TranscribeAudioAsync(Stream audioStream)
await audioStream.CopyToAsync(writer);
}

var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath));
var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection?null:Settings.SelectedSpeechToTextLanguage));

return response;
}
Expand Down
8 changes: 8 additions & 0 deletions vrcosc-magicchatbox/MagicChatbox.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
<None Remove="Img\MagicOSC_icon.png" />
<None Remove="Json\voices.json" />
<None Remove="NLog.config" />
<None Remove="Sounds\start.wav" />
<None Remove="Sounds\stop.wav" />
</ItemGroup>

<ItemGroup>
Expand Down Expand Up @@ -154,6 +156,12 @@
<Resource Include="Img\Icons\yes.png" />
<Resource Include="Img\MagicOSC_icon.png" />
<Resource Include="Img\Icons\NetworkStats_ico.png" />
<Resource Include="Sounds\start.wav">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</Resource>
<Resource Include="Sounds\stop.wav">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</Resource>
<Content Include="NLog.config">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
Expand Down
Loading

0 comments on commit f34a5d6

Please sign in to comment.