Adding SpeechToText Sample

Neuronlab · Oct 3, 2016 · 709257d · 709257d
1 parent a7e74d4
commit 709257d
Show file tree

Hide file tree

Showing 28 changed files with 1,405 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -253,3 +253,6 @@ paket-files/
 
 # Node
 /**/node_modules
+
+# Visual Studio Code
+.vscode/
diff --git a/CSharp/intelligence-SpeechToText/App_Start/WebApiConfig.cs b/CSharp/intelligence-SpeechToText/App_Start/WebApiConfig.cs
@@ -0,0 +1,33 @@
+namespace SpeechToText
+{
+    using System.Web.Http;
+    using Newtonsoft.Json;
+    using Newtonsoft.Json.Serialization;
+
+    /// <summary>
+    /// Web API start-up configuration: JSON serialization settings and HTTP routes.
+    /// </summary>
+    public static class WebApiConfig
+    {
+        /// <summary>
+        /// Applies the JSON settings to the Web API formatter and to JsonConvert, then registers
+        /// the attribute routes and the conventional "DefaultApi" route.
+        /// </summary>
+        /// <param name="config">The HTTP configuration to set up.</param>
+        public static void Register(HttpConfiguration config)
+        {
+            // Settings used by the Web API JSON formatter.
+            var formatterSettings = config.Formatters.JsonFormatter.SerializerSettings;
+            formatterSettings.NullValueHandling = NullValueHandling.Ignore;
+            formatterSettings.ContractResolver = new CamelCasePropertyNamesContractResolver();
+            formatterSettings.Formatting = Formatting.Indented;
+
+            // The same choices for direct JsonConvert calls.
+            JsonConvert.DefaultSettings = CreateDefaultJsonSettings;
+
+            // Routes: attribute routes first, then the conventional fallback.
+            config.MapHttpAttributeRoutes();
+            config.Routes.MapHttpRoute(
+                "DefaultApi",
+                "api/{controller}/{id}",
+                new { id = RouteParameter.Optional });
+        }
+
+        /// <summary>
+        /// Creates a fresh settings instance each time JsonConvert asks for its defaults.
+        /// </summary>
+        private static JsonSerializerSettings CreateDefaultJsonSettings()
+        {
+            var settings = new JsonSerializerSettings();
+            settings.ContractResolver = new CamelCasePropertyNamesContractResolver();
+            settings.Formatting = Formatting.Indented;
+            settings.NullValueHandling = NullValueHandling.Ignore;
+            return settings;
+        }
+    }
+}
diff --git a/CSharp/intelligence-SpeechToText/Controllers/MessagesController.cs b/CSharp/intelligence-SpeechToText/Controllers/MessagesController.cs
@@ -0,0 +1,182 @@
+namespace SpeechToText.Controllers
+{
+    using System;
+    using System.Diagnostics;
+    using System.IO;
+    using System.Linq;
+    using System.Net;
+    using System.Net.Http;
+    using System.Net.Http.Headers;
+    using System.Threading.Tasks;
+    using System.Web.Http;
+    using Microsoft.Bot.Connector;
+    using Services;
+
+    [BotAuthentication]
+    public class MessagesController : ApiController
+    {
+        private readonly MicrosoftCognitiveSpeechService speechService = new MicrosoftCognitiveSpeechService();
+
+        /// <summary>
+        /// POST: api/Messages
+        /// Receives an activity from a user. A message activity is answered with the transcription of its
+        /// first audio attachment (or a hint when there is none); every other activity type is passed to
+        /// <see cref="HandleSystemMessage"/>.
+        /// </summary>
+        /// <param name="activity">The incoming activity, bound from the request body.</param>
+        /// <returns>400 Bad Request when no activity was bound from the body; otherwise 200 OK.</returns>
+        public async Task<HttpResponseMessage> Post([FromBody]Activity activity)
+        {
+            if (activity == null)
+            {
+                // Nothing was bound from the body; answer with a client error instead of failing on a null dereference.
+                return this.Request.CreateResponse(HttpStatusCode.BadRequest);
+            }
+
+            if (activity.Type == ActivityTypes.Message)
+            {
+                var connector = new ConnectorClient(new Uri(activity.ServiceUrl));
+                string message;
+
+                try
+                {
+                    // Media type names are case-insensitive, and an attachment may carry no content type at all.
+                    var audioAttachment = activity.Attachments?.FirstOrDefault(a =>
+                        string.Equals(a?.ContentType, "audio/wav", StringComparison.OrdinalIgnoreCase) ||
+                        string.Equals(a?.ContentType, "application/octet-stream", StringComparison.OrdinalIgnoreCase));
+
+                    if (audioAttachment != null)
+                    {
+                        // Release the downloaded audio as soon as it has been transcribed.
+                        using (var stream = await GetImageStream(connector, audioAttachment))
+                        {
+                            var text = await this.speechService.GetTextFromAudioAsync(stream);
+                            message = ProcessText(activity.Text, text);
+                        }
+                    }
+                    else
+                    {
+                        message = "Did you upload an audio file? I'm more of an audible person. Try sending me a wav file";
+                    }
+                }
+                catch (Exception e)
+                {
+                    // Best effort: the user still gets a reply and the failure is traced for diagnosis.
+                    message = "Oops! Something went wrong. Try again later.";
+
+                    Trace.TraceError(e.ToString());
+                }
+
+                Activity reply = activity.CreateReply(message);
+                await connector.Conversations.ReplyToActivityAsync(reply);
+            }
+            else
+            {
+                await this.HandleSystemMessage(activity);
+            }
+
+            var response = this.Request.CreateResponse(HttpStatusCode.OK);
+            return response;
+        }
+
+        /// <summary>
+        /// Builds the reply for a transcription and, when the user also sent text, appends a statistic
+        /// selected by that text.
+        /// </summary>
+        /// <param name="input">
+        /// Optional command (WORD, CHARACTER, SPACE or VOWEL) or any other keyword to count; surrounding
+        /// whitespace is ignored and matching is case-insensitive.
+        /// </param>
+        /// <param name="text">The transcribed text; null is treated as empty when computing statistics.</param>
+        /// <returns>The reply message.</returns>
+        private static string ProcessText(string input, string text)
+        {
+            string message = "You said : " + text + ".";
+
+            input = input?.Trim();
+            if (string.IsNullOrEmpty(input))
+            {
+                return message;
+            }
+
+            // A missing transcription should produce zero counts, not a NullReferenceException.
+            text = text ?? string.Empty;
+
+            // Ordinal, case-insensitive matching keeps the commands recognisable under every server culture;
+            // culture-sensitive upper-casing can map "i" to a different letter (Turkish rules, for example).
+            if (input.Equals("WORD", StringComparison.OrdinalIgnoreCase))
+            {
+                var wordCount = text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length;
+                message += " Word Count: " + wordCount;
+            }
+            else if (input.Equals("CHARACTER", StringComparison.OrdinalIgnoreCase))
+            {
+                var characterCount = text.Count(c => c != ' ');
+                message += " Character Count: " + characterCount;
+            }
+            else if (input.Equals("SPACE", StringComparison.OrdinalIgnoreCase))
+            {
+                var spaceCount = text.Count(c => c == ' ');
+                message += " Space Count: " + spaceCount;
+            }
+            else if (input.Equals("VOWEL", StringComparison.OrdinalIgnoreCase))
+            {
+                var vowelCount = text.Count(c => "AEIOU".IndexOf(char.ToUpperInvariant(c)) >= 0);
+                message += " Vowel Count: " + vowelCount;
+            }
+            else
+            {
+                // Any other input is a keyword: count the space-separated words equal to it.
+                var keywordCount = text.Split(' ').Count(w => w.Equals(input, StringComparison.OrdinalIgnoreCase));
+                message += " Keyword " + input + " found " + keywordCount + " times.";
+            }
+
+            return message;
+        }
+
+        /// <summary>
+        /// Handles the non-message (system) activities. Only ConversationUpdate does any work: when the bot
+        /// itself is among the added members, a greeting is sent. The other cases are placeholders.
+        /// </summary>
+        /// <param name="activity">The activity.</param>
+        /// <returns>Always null; no reply activity is handed back to the caller.</returns>
+        private async Task<Activity> HandleSystemMessage(Activity activity)
+        {
+            switch (activity.Type)
+            {
+                case ActivityTypes.DeleteUserData:
+                    // Implement user deletion here
+                    // If we handle user deletion, return a real message
+                    break;
+                case ActivityTypes.ConversationUpdate:
+                    // Greet the user the first time the bot is added to a conversation.
+                    // An update may list no added members (or no recipient), so guard before enumerating.
+                    var botWasAdded = activity.MembersAdded != null
+                        && activity.Recipient != null
+                        && activity.MembersAdded.Any(m => m != null && m.Id == activity.Recipient.Id);
+
+                    if (botWasAdded)
+                    {
+                        var connector = new ConnectorClient(new Uri(activity.ServiceUrl));
+
+                        var response = activity.CreateReply();
+                        response.Text = "Hi! I am SpeechToText Bot. I can understand the content of any audio and convert it to text. Try sending me a wav file.";
+
+                        await connector.Conversations.ReplyToActivityAsync(response);
+                    }
+
+                    break;
+                case ActivityTypes.ContactRelationUpdate:
+                    // Handle add/remove from contact lists
+                    break;
+                case ActivityTypes.Typing:
+                    // Handle knowing that the user is typing
+                    break;
+                case ActivityTypes.Ping:
+                    break;
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Downloads the content of an attachment. Despite the name, this controller uses it for audio files.
+        /// </summary>
+        /// <param name="connector">Connector whose credentials supply the bearer token for Skype-hosted attachments.</param>
+        /// <param name="imageAttachment">The attachment to download; its ContentUrl must be an absolute URI.</param>
+        /// <returns>An in-memory stream holding the complete attachment content, positioned at the start.</returns>
+        private static async Task<Stream> GetImageStream(ConnectorClient connector, Attachment imageAttachment)
+        {
+            using (var httpClient = new HttpClient())
+            {
+                // The Skype attachment URLs are secured by JwtToken,
+                // you should set the JwtToken of your bot as the authorization header for the GET request your bot initiates to fetch the image.
+                // https://github.com/Microsoft/BotBuilder/issues/662
+                var uri = new Uri(imageAttachment.ContentUrl);
+
+                // Accept skype.com itself or a genuine subdomain only. A bare suffix test would also match
+                // hosts such as "notskype.com" and send the bot's token to a server chosen by the sender.
+                var isSkypeHost = uri.Host.Equals("skype.com", StringComparison.OrdinalIgnoreCase)
+                    || uri.Host.EndsWith(".skype.com", StringComparison.OrdinalIgnoreCase);
+
+                if (isSkypeHost && uri.Scheme == Uri.UriSchemeHttps)
+                {
+                    httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", await GetTokenAsync(connector));
+                    httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/octet-stream"));
+                }
+                else
+                {
+                    httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue(imageAttachment.ContentType));
+                }
+
+                // Read the whole payload while the client is still alive, so the stream handed back
+                // does not depend on a connection owned by the client disposed at the end of this block.
+                var content = await httpClient.GetByteArrayAsync(uri);
+                return new MemoryStream(content);
+            }
+        }
+
+        /// <summary>
+        /// Retrieves the JWT token of the bot from the connector's credentials.
+        /// </summary>
+        /// <param name="connector">Connector client whose credentials are inspected.</param>
+        /// <returns>
+        /// The token, or null when the credentials are not <see cref="MicrosoftAppCredentials"/>.
+        /// </returns>
+        private static async Task<string> GetTokenAsync(ConnectorClient connector)
+        {
+            var appCredentials = connector.Credentials as MicrosoftAppCredentials;
+
+            // Credentials of any other kind cannot provide a token.
+            if (appCredentials == null)
+            {
+                return null;
+            }
+
+            var token = await appCredentials.GetTokenAsync();
+            return token;
+        }
+
+    }
+}
diff --git a/CSharp/intelligence-SpeechToText/Global.asax b/CSharp/intelligence-SpeechToText/Global.asax
@@ -0,0 +1 @@
+<%@ Application Codebehind="Global.asax.cs" Inherits="SpeechToText.WebApiApplication" Language="C#" %>
diff --git a/CSharp/intelligence-SpeechToText/Global.asax.cs b/CSharp/intelligence-SpeechToText/Global.asax.cs
@@ -0,0 +1,12 @@
+namespace SpeechToText
+{
+    using System.Web.Http;
+
+    /// <summary>
+    /// HTTP application class referenced by Global.asax for the SpeechToText project.
+    /// </summary>
+    public class WebApiApplication : System.Web.HttpApplication
+    {
+        /// <summary>
+        /// Hands <see cref="WebApiConfig.Register"/> to the global Web API configuration.
+        /// </summary>
+        protected void Application_Start() => GlobalConfiguration.Configure(WebApiConfig.Register);
+    }
+}
diff --git a/CSharp/intelligence-SpeechToText/Properties/AssemblyInfo.cs b/CSharp/intelligence-SpeechToText/Properties/AssemblyInfo.cs
@@ -0,0 +1,34 @@
+using System.Reflection;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("SpeechToText")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("SpeechToText")]
+[assembly: AssemblyCopyright("Copyright ©  2016")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("a8ba1066-5695-4d71-abb4-65e5a5e0c3d4")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' wildcard, for example: [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]
diff --git a/CSharp/intelligence-SpeechToText/README.md b/CSharp/intelligence-SpeechToText/README.md
@@ -0,0 +1,88 @@
+# Speech To Text Bot Sample
+
+A sample bot that illustrates how to use the Microsoft Cognitive Services Bing Speech API to analyze an audio file and return the text.
+
+[![Deploy to Azure](http://azuredeploy.net/deploybutton.png)](https://azuredeploy.net)
+
+### Prerequisites
+
+The minimum prerequisites to run this sample are:
+* The latest update of Visual Studio 2015. You can download the community version [here](http://www.visualstudio.com) for free.
+* The Bot Framework Emulator. To install the Bot Framework Emulator, download it from [here](https://aka.ms/bf-bc-emulator). Please refer to [this documentation article](https://docs.botframework.com/en-us/csharp/builder/sdkreference/gettingstarted.html#emulator) to know more about the Bot Framework Emulator.
+* **[Recommended]** Visual Studio Code for IntelliSense and debugging, download it from [here](https://code.visualstudio.com/) for free.
+* This sample currently uses a free trial Microsoft Cognitive Services key with limited QPS. Please subscribe to the Bing Speech API service [here](https://www.microsoft.com/cognitive-services/en-us/subscriptions) and update the `MicrosoftSpeechApiKey` key in the [Web.config](Web.config) file to try it out further.
+
+### Usage
+
+Attach an audio file (wav format) and send an optional command as text. 
+Supported Commands:
+* `WORD` - Counts the number of words.
+* `CHARACTER` - Counts the number of characters excluding spaces.
+* `SPACE` - Counts the number of spaces.
+* `VOWEL` - Counts the number of vowels.
+* Any other word will count the occurrences of that word in the transcribed text
+
+### Code Highlights
+
+Microsoft Cognitive Services provides a Speech Recognition API to convert audio into text. Check out [Bing Speech API](https://www.microsoft.com/cognitive-services/en-us/speech-api) for a complete reference of the Speech APIs available. This sample calls the Speech Recognition API through its [REST API](https://www.microsoft.com/cognitive-services/en-us/Speech-api/documentation/API-Reference-REST/BingVoiceRecognition).
+
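+In this sample we are using the API to get the text and send it back to the user. Check out the use of the `MicrosoftCognitiveSpeechService.GetTextFromAudioAsync()` method in the [Controllers/MessagesController](Controllers/MessagesController.cs) class. The snippet below is a simplified version of that code; the controller also accepts `application/octet-stream` attachments and adds an authorization header when downloading Skype-hosted attachments.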
+````C#
+var audioAttachment = activity.Attachments?.FirstOrDefault(a => a.ContentType.Equals("audio/wav"));
+if (audioAttachment != null)
+{
+    using (var client = new HttpClient())
+    {
+        var stream = await client.GetStreamAsync(audioAttachment.ContentUrl);
+        var text = await this.speechService.GetTextFromAudioAsync(stream);
+        message = ProcessText(activity.Text, text);
+    }
+}
+````
+
+and here is the implementation of `MicrosoftCognitiveSpeechService.GetTextFromAudioAsync()` in [Services/MicrosoftCognitiveSpeechService.cs](Services/MicrosoftCognitiveSpeechService.cs)
+````C#
+/// <summary>
+/// Gets text from an audio stream.
+/// </summary>
+/// <param name="audiostream"></param>
+/// <returns>Transcribed text. </returns>
+public async Task<string> GetTextFromAudioAsync(Stream audiostream)
+{
+    var requestUri = @"https://speech.platform.bing.com/recognize?scenarios=smd&appid=D4D52672-91D7-4C74-8AD8-42B1D98141A5&locale=en-US&device.os=bot&version=3.0&format=json&instanceid=565D69FF-E928-4B7E-87DA-9A750B96D9E3&requestid=" + Guid.NewGuid();
+
+    using (var client = new HttpClient())
+    {
+        var token = Authentication.Instance.GetAccessToken();
+        client.DefaultRequestHeaders.Add("Authorization", "Bearer " + token.access_token);
+
+        using (var binaryContent = new ByteArrayContent(StreamToBytes(audiostream)))
+        {
+            binaryContent.Headers.TryAddWithoutValidation("content-type", "audio/wav; codec=\"audio/pcm\"; samplerate=16000");
+
+            var response = await client.PostAsync(requestUri, binaryContent);
+            var responseString = await response.Content.ReadAsStringAsync();
+            dynamic data = JsonConvert.DeserializeObject(responseString);
+            return data.header.name;
+        }
+    }
+}
+````
+
+### Outcome
+
+You will see the following when you connect the Bot to the Emulator and send it an audio file and a command:
+
+Input:
+
+["What's the weather like?"](audio/whatstheweatherlike.wav)
+
+Output:
+
+![Sample Outcome](images/outcome-emulator.png)
+
+### More Information
+
+To get more information about how to get started in Bot Builder for .NET and Microsoft Cognitive Services Bing Speech API please review the following resources:
+* [Bot Builder for .NET](https://docs.botframework.com/en-us/csharp/builder/sdkreference/index.html)
+* [Microsoft Cognitive Services Bing Speech API](https://www.microsoft.com/cognitive-services/en-us/speech-api)
diff --git a/CSharp/intelligence-SpeechToText/Services/AccessTokenInfo.cs b/CSharp/intelligence-SpeechToText/Services/AccessTokenInfo.cs
@@ -0,0 +1,13 @@
+namespace SpeechToText.Services
+{
+    /// <summary>
+    /// Holds the fields of an access token response. The snake_case property names presumably mirror
+    /// the JSON field names returned by the token service; confirm against the code that deserializes
+    /// this type.
+    /// </summary>
+    public class AccessTokenInfo
+    {
+        /// <summary>
+        /// The token value. The speech service sample in the README sends it in the Authorization
+        /// header as "Bearer " followed by this value.
+        /// </summary>
+        public string access_token { get; set; }
+
+        /// <summary>The token type reported with the token. NOTE(review): expected values are not shown here.</summary>
+        public string token_type { get; set; }
+
+        /// <summary>Token lifetime. NOTE(review): presumably in seconds; the unit is not established in this file.</summary>
+        public int expires_in { get; set; }
+
+        /// <summary>The scope reported with the token.</summary>
+        public string scope { get; set; }
+    }
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		<%@ Application Codebehind="Global.asax.cs" Inherits="SpeechToText.WebApiApplication" Language="C#" %>