Skip to content

Commit

Permalink
Add new features and improve WhisperModule
Browse files Browse the repository at this point in the history
This commit introduces several enhancements and new features to the WhisperModule and the application UI. In `App.xaml`, a new property `RenderOptions.BitmapScalingMode` is added to the `SettingsCheckbox` style. `WhisperModule.cs` sees the most changes, with new namespaces imported, new properties and methods added to the `WhisperModuleSettings` class, and updates to existing methods to handle new settings and state changes. The `WhisperModule` class now loads settings from a JSON file on initialization and saves settings when the application is closing. Two new sound files are added to the project in `MagicChatbox.csproj`. The UI in `MainWindow.xaml` is updated with a new section for configuring speech-to-text settings. Lastly, the `OnClosing` method in `MainWindow.xaml.cs` is updated to save the settings of the `WhisperModule` when the application is closing.
  • Loading branch information
BoiHanny committed Mar 15, 2024
1 parent 715d575 commit f34a5d6
Show file tree
Hide file tree
Showing 7 changed files with 348 additions and 16 deletions.
1 change: 1 addition & 0 deletions vrcosc-magicchatbox/App.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -1153,6 +1153,7 @@
<Style x:Key="SettingsCheckbox" TargetType="{x:Type CheckBox}">
<Setter Property="SnapsToDevicePixels" Value="False" />
<Setter Property="OverridesDefaultStyle" Value="true" />
<Setter Property="RenderOptions.BitmapScalingMode" Value="NearestNeighbor" />
<Setter Property="FocusVisualStyle" Value="{x:Null}" />
<Setter Property="Cursor" Value="Hand" />
<Setter Property="Template">
Expand Down
225 changes: 209 additions & 16 deletions vrcosc-magicchatbox/Classes/Modules/WhisperModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,17 @@
using System.Collections.Generic;
using System.Linq;
using vrcosc_magicchatbox.ViewModels;
using Newtonsoft.Json;
using System.Media;
using System.Reflection;
using System.Windows;

namespace vrcosc_magicchatbox.Classes.Modules
{
public partial class WhisperModuleSettings : ObservableObject
{
private const string SettingsFileName = "WhisperModuleSettings.json";

[ObservableProperty]
private List<RecordingDeviceInfo> availableDevices;

Expand All @@ -28,9 +34,118 @@ public partial class WhisperModuleSettings : ObservableObject
[ObservableProperty]
private bool isRecording = false;

[ObservableProperty]
private List<string> speechToTextLanguages = new List<string>();

[ObservableProperty]
private string selectedSpeechToTextLanguage;

[ObservableProperty]
private bool autoLanguageDetection = true;

[ObservableProperty]
private int silenceAutoTurnOffDuration = 3000;

// Populates the bindable lists at construction time so the UI has values
// immediately: enumerates the available audio input devices and fills the
// speech-to-text language list (setting a default selection when none exists).
// NOTE(review): JsonConvert also runs this ctor during LoadSettings, so
// deserialized property values overwrite these defaults afterwards — confirm.
public WhisperModuleSettings()
{
    RefreshDevices();
    RefreshSpeechToTextLanguages();
}

/// <summary>
/// Persists the current settings as indented JSON to
/// <see cref="SettingsFileName"/> in the application's working directory.
/// </summary>
public void SaveSettings()
{
    string json = JsonConvert.SerializeObject(this, Formatting.Indented);
    File.WriteAllText(SettingsFileName, json);
}

/// <summary>
/// Loads persisted settings from <see cref="SettingsFileName"/>, falling back to
/// a fresh default instance when the file is missing, empty, or contains invalid JSON.
/// </summary>
/// <returns>The deserialized settings, never null.</returns>
public static WhisperModuleSettings LoadSettings()
{
    if (!File.Exists(SettingsFileName))
        return new WhisperModuleSettings();

    try
    {
        var settingsJson = File.ReadAllText(SettingsFileName);

        // DeserializeObject returns null for an empty or literal-"null" file;
        // never hand callers a null settings object.
        return JsonConvert.DeserializeObject<WhisperModuleSettings>(settingsJson)
               ?? new WhisperModuleSettings();
    }
    catch (JsonException)
    {
        // A corrupt settings file must not crash startup — recover with defaults.
        // The bad file is overwritten on the next SaveSettings call.
        return new WhisperModuleSettings();
    }
}

/// <summary>
/// Rebuilds the list of selectable speech-to-text languages and, when no
/// language has been chosen yet, defaults the selection to the first entry.
/// </summary>
private void RefreshSpeechToTextLanguages()
{
    // Ordered by a hypothetical "most commonly used" metric, adjust as needed.
    string[] languages =
    {
        "English",   "Chinese",    "Spanish",    "Hindi",      "Arabic",     "Portuguese",
        "Bengali",   "Russian",    "Japanese",   "French",     "German",     "Korean",
        "Italian",   "Turkish",    "Polish",     "Dutch",      "Indonesian", "Thai",
        "Swedish",   "Danish",     "Norwegian",  "Finnish",    "Vietnamese", "Czech",
        "Greek",     "Romanian",   "Hungarian",  "Slovak",     "Ukrainian",  "Bulgarian",
        "Croatian",  "Serbian",    "Lithuanian", "Latvian",    "Estonian",   "Slovenian",
        "Hebrew",    "Persian",    "Armenian",   "Azerbaijani","Kazakh",     "Uzbek",
        "Tajik",     "Georgian",   "Mongolian",  "Afrikaans",  "Swahili",    "Maori",
        "Nepali",    "Marathi",    "Kannada",    "Tamil",      "Telugu",     "Malay",
        "Malayalam", "Bosnian",    "Macedonian", "Albanian",   "Filipino",   "Tagalog",
        "Urdu",      "Welsh",      "Icelandic",  "Maltese",    "Galician",   "Belarusian",
        "Catalan",
    };

    SpeechToTextLanguages = languages.ToList();

    // Only apply the default when the user has not already picked a language
    // (e.g. one restored from the settings file).
    if (string.IsNullOrWhiteSpace(SelectedSpeechToTextLanguage))
        SelectedSpeechToTextLanguage = SpeechToTextLanguages.FirstOrDefault();
}

public string GetSelectedDeviceName()
Expand All @@ -45,6 +160,8 @@ public string GetSelectedDeviceName()
}
}





public void RefreshDevices()
Expand Down Expand Up @@ -81,6 +198,7 @@ public partial class WhisperModule : ObservableObject
private DateTime lastSoundTimestamp = DateTime.Now;
private bool isCurrentlySpeaking = false;
private DateTime speakingStartedTimestamp = DateTime.Now;
private bool isProcessingShortPause = false;

public event Action<string> TranscriptionReceived;

Expand All @@ -89,17 +207,35 @@ public partial class WhisperModule : ObservableObject

// Restores persisted settings from disk, wires up reaction to setting changes,
// and prepares the audio-capture pipeline. Order matters: Settings must exist
// before subscribing, and InitializeWaveIn reads the selected device index.
public WhisperModule()
{
    Settings = WhisperModuleSettings.LoadSettings();
    // Re-initializes capture when the selected input device changes
    // (handled in Settings_PropertyChanged below).
    Settings.PropertyChanged += Settings_PropertyChanged;
    InitializeWaveIn();
}

/// <summary>
/// Flushes the current Whisper settings to disk; intended to be called
/// when the application is shutting down.
/// </summary>
public void OnApplicationClosing() => Settings.SaveSettings();

// Reacts to settings changes. Only the capture-device selection requires
// action here: the capture pipeline is torn down and rebuilt against the
// newly selected input device. All other property changes are ignored.
private void Settings_PropertyChanged(object sender, System.ComponentModel.PropertyChangedEventArgs e)
{
    if (e.PropertyName != nameof(Settings.SelectedDeviceIndex))
        return;

    StopRecording();
    InitializeWaveIn();
}

private void InitializeWaveIn()
{
waveIn?.Dispose();
waveIn?.Dispose(); // Dispose any existing instance

if (settings.SelectedDeviceIndex == -1)
{
UpdateUI("No valid audio input device selected.", false);
throw new InvalidOperationException("No valid audio input device selected.");
// Consider handling this scenario without throwing an exception,
// perhaps by disabling recording functionality until a valid device is selected.
return;
}

waveIn = new WaveInEvent
Expand Down Expand Up @@ -132,6 +268,7 @@ public void StartRecording()
}
UpdateUI("Ready to speak?", true);
waveIn.StartRecording();
//PlaySound("start.wav");
settings.IsRecording = true;
}

Expand All @@ -158,40 +295,96 @@ public void StopRecording()
{
ProcessAudioStreamAsync(audioStream);
}
audioStream = new MemoryStream();
}

//private void PlaySound(string soundFileName)
//{
// var assembly = Assembly.GetExecutingAssembly();
// string resourceName = assembly.GetName().Name + ".Sounds." + soundFileName;

// using (Stream stream = assembly.GetManifestResourceStream(resourceName))
// {
// if (stream == null)
// {
// throw new InvalidOperationException("Could not find resource sound file: " + resourceName);
// }

// SoundPlayer player = new SoundPlayer(stream);
// player.Play();
// }
//}



// Audio-capture callback: applies a noise gate to each incoming buffer, buffers
// audio while speech is detected, and on silence decides between transcribing a
// short pause (keep recording) or stopping entirely once the silence exceeds
// Settings.SilenceAutoTurnOffDuration.
//
// NOTE(review): this text appears to be a diff-page scrape — several statements
// exist in both their pre-change form (lowercase `settings`, combined else-if)
// and post-change form (`Settings` property, split silence handling). The
// duplicated declarations below will not compile as-is; reconcile against the
// repository's actual file before editing.
private void OnDataAvailable(object sender, WaveInEventArgs e)
{
    float maxAmplitude = CalculateMaxAmplitude(e.Buffer, e.BytesRecorded);
    bool isLoudEnough = maxAmplitude > settings.NoiseGateThreshold; // NOTE(review): apparent OLD (removed) line

    settings.IsNoiseGateOpen = isLoudEnough; // NOTE(review): apparent OLD (removed) line
    bool isLoudEnough = maxAmplitude > Settings.NoiseGateThreshold; // NOTE(review): NEW line — duplicates the declaration above
    Settings.IsNoiseGateOpen = isLoudEnough;

    if (isLoudEnough)
    {
        if (!isCurrentlySpeaking)
        {
            speakingStartedTimestamp = DateTime.Now; // Mark the start of speaking
            speakingStartedTimestamp = DateTime.Now; // NOTE(review): duplicated line (old/new diff residue)
            isCurrentlySpeaking = true;
        }

        // Update elapsed speaking time continuously while speaking
        var speakingDuration = (DateTime.Now - speakingStartedTimestamp).TotalSeconds;
        UpdateUI($"Speaking detected, recording... (Duration: {speakingDuration:0.0}s)", true);

        audioStream.Write(e.Buffer, 0, e.BytesRecorded);
        lastSoundTimestamp = DateTime.Now;
    }
    // NOTE(review): the next two else-if headers are the old and new versions of
    // the same branch — only one can exist in valid C#.
    else if (isCurrentlySpeaking && DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds > 500)
    else if (isCurrentlySpeaking)
    {
        // NOTE(review): the next five statements look like the OLD branch body
        // (immediate processing on any >500 ms pause); the silenceDuration logic
        // that follows appears to be its replacement.
        var speakingDuration = DateTime.Now.Subtract(speakingStartedTimestamp).TotalSeconds;
        isCurrentlySpeaking = false;
        UpdateUI($"Processing audio... (Duration: {speakingDuration:0.0}s)", true);
        ProcessAudioStreamAsync(audioStream);
        audioStream = new MemoryStream(); // Reset the stream for new data after processing
        var silenceDuration = DateTime.Now.Subtract(lastSoundTimestamp).TotalMilliseconds;

        // Short pause (>500 ms but within the auto-off window): transcribe what
        // has been buffered so far while continuing to record.
        if (silenceDuration > 500 && silenceDuration <= Settings.SilenceAutoTurnOffDuration)
        {
            if (!isProcessingShortPause)
            {
                isProcessingShortPause = true;
                // Offload to a background task since we can't await in this event handler
                Task.Run(() => ProcessShortPauseAsync()).ContinueWith(_ =>
                {
                    // Use Dispatcher.Invoke to ensure that the following actions are performed on the UI thread.
                    Application.Current.Dispatcher.Invoke(() =>
                    {
                        // Actions to take after processing the short pause, ensuring thread safety for UI operations
                        isProcessingShortPause = false; // NOTE(review): also reset inside ProcessShortPauseAsync — confirm the double reset is intended
                        // Any other UI updates or state changes that need to be made safely on the UI thread
                    });
                });

            }
        }
        // Prolonged silence: shut the recorder down entirely.
        else if (silenceDuration > Settings.SilenceAutoTurnOffDuration)
        {
            isCurrentlySpeaking = false;
            UpdateUI($"Silence detected for more than {Settings.SilenceAutoTurnOffDuration / 1000.0} seconds, stopping recording...", true);
            StopRecording();
        }
    }
}

// Transcribes the audio buffered up to a short pause in speech, then resets the
// capture state (fresh stream, cleared flag, new silence timestamp) on the UI
// thread so that recording can continue seamlessly. Invoked from OnDataAvailable
// via Task.Run because the NAudio data callback cannot await.
private async Task ProcessShortPauseAsync()
{
    await ProcessAudioStreamAsync(audioStream);
    // Ensure the continuation logic here is thread-safe, especially if updating the UI
    // NOTE(review): App.Current here vs Application.Current elsewhere — same
    // dispatcher presumably, but worth unifying for consistency; confirm.
    App.Current.Dispatcher.Invoke(() =>
    {
        // NOTE(review): the caller's ContinueWith also clears this flag — confirm
        // the double reset is intentional.
        isProcessingShortPause = false;
        audioStream = new MemoryStream(); // Reset for new data
        lastSoundTimestamp = DateTime.Now; // Reset timestamp
        // Optionally update the UI or reset flags
    });
}





private async void UpdateUI(string message, bool isVisible)
{
ViewModel.Instance.IntelliChatModule.Settings.IntelliChatUILabelTxt = message;
Expand All @@ -215,7 +408,7 @@ private float CalculateMaxAmplitude(byte[] buffer, int bytesRecorded)
return samples.Max(sample => Math.Abs(sample / 32768f));
}

private async void ProcessAudioStreamAsync(MemoryStream stream)
private async Task ProcessAudioStreamAsync(MemoryStream stream)
{
if (stream.Length == 0)
{
Expand Down Expand Up @@ -249,7 +442,7 @@ private async Task<string> TranscribeAudioAsync(Stream audioStream)
await audioStream.CopyToAsync(writer);
}

var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath));
var response = await OpenAIModule.Instance.OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(new AudioTranscriptionRequest(tempFilePath, language: Settings.AutoLanguageDetection?null:Settings.SelectedSpeechToTextLanguage));

return response;
}
Expand Down
8 changes: 8 additions & 0 deletions vrcosc-magicchatbox/MagicChatbox.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@
<None Remove="Img\MagicOSC_icon.png" />
<None Remove="Json\voices.json" />
<None Remove="NLog.config" />
<None Remove="Sounds\start.wav" />
<None Remove="Sounds\stop.wav" />
</ItemGroup>

<ItemGroup>
Expand Down Expand Up @@ -154,6 +156,12 @@
<Resource Include="Img\Icons\yes.png" />
<Resource Include="Img\MagicOSC_icon.png" />
<Resource Include="Img\Icons\NetworkStats_ico.png" />
<Resource Include="Sounds\start.wav">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</Resource>
<Resource Include="Sounds\stop.wav">
<CopyToOutputDirectory>Never</CopyToOutputDirectory>
</Resource>
<Content Include="NLog.config">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
Expand Down
Loading

0 comments on commit f34a5d6

Please sign in to comment.