Skip to content

Commit

Permalink
Adding SpeechToText Sample
Browse files Browse the repository at this point in the history
  • Loading branch information
iassal committed Oct 3, 2016
1 parent a7e74d4 commit 709257d
Show file tree
Hide file tree
Showing 28 changed files with 1,405 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,6 @@ paket-files/

# Node
/**/node_modules

# Visual Studio Code
.vscode/
33 changes: 33 additions & 0 deletions CSharp/intelligence-SpeechToText/App_Start/WebApiConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
namespace SpeechToText
{
using System.Web.Http;
using Newtonsoft.Json;
using Newtonsoft.Json.Serialization;

/// <summary>
/// Web API bootstrap: JSON serialization defaults and route registration.
/// </summary>
public static class WebApiConfig
{
    /// <summary>
    /// Applies the JSON settings and registers the attribute and convention routes.
    /// </summary>
    /// <param name="config">The Web API configuration to populate.</param>
    public static void Register(HttpConfiguration config)
    {
        // Settings used by the Web API JSON formatter.
        var formatterSettings = config.Formatters.JsonFormatter.SerializerSettings;
        formatterSettings.NullValueHandling = NullValueHandling.Ignore;
        formatterSettings.ContractResolver = new CamelCasePropertyNamesContractResolver();
        formatterSettings.Formatting = Formatting.Indented;

        // Same conventions for code that calls JsonConvert directly.
        JsonConvert.DefaultSettings = CreateDefaultSerializerSettings;

        // Web API configuration and services

        // Web API routes
        config.MapHttpAttributeRoutes();
        config.Routes.MapHttpRoute(
            "DefaultApi",
            "api/{controller}/{id}",
            new { id = RouteParameter.Optional });
    }

    /// <summary>
    /// Builds a fresh settings instance: camelCase names, indented output, nulls omitted.
    /// </summary>
    private static JsonSerializerSettings CreateDefaultSerializerSettings()
    {
        var settings = new JsonSerializerSettings();
        settings.ContractResolver = new CamelCasePropertyNamesContractResolver();
        settings.Formatting = Formatting.Indented;
        settings.NullValueHandling = NullValueHandling.Ignore;
        return settings;
    }
}
}
182 changes: 182 additions & 0 deletions CSharp/intelligence-SpeechToText/Controllers/MessagesController.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
namespace SpeechToText.Controllers
{
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Threading.Tasks;
using System.Web.Http;
using Microsoft.Bot.Connector;
using Services;

[BotAuthentication]
public class MessagesController : ApiController
{
private readonly MicrosoftCognitiveSpeechService speechService = new MicrosoftCognitiveSpeechService();

/// <summary>
/// POST: api/Messages
/// Receives an activity from the connector. For message activities the first audio
/// attachment (if any) is downloaded and transcribed, and the outcome is sent back
/// as a reply; every other activity type is passed to HandleSystemMessage.
/// </summary>
/// <param name="activity">The incoming activity, bound from the request body.</param>
/// <returns>
/// 400 Bad Request when no activity could be bound from the body; otherwise 200 OK.
/// Transcription failures are reported to the user in the reply, not through the status code.
/// </returns>
public async Task<HttpResponseMessage> Post([FromBody]Activity activity)
{
    // An empty or unparsable body binds to null; answer with 400 rather than crashing with a 500.
    if (activity == null)
    {
        return this.Request.CreateResponse(HttpStatusCode.BadRequest);
    }

    if (activity.Type == ActivityTypes.Message)
    {
        var connector = new ConnectorClient(new Uri(activity.ServiceUrl));
        string message;

        try
        {
            // Media type names are case-insensitive, and an attachment may carry no content type at all,
            // so compare with the null-safe static Equals and ignore case.
            var audioAttachment = activity.Attachments?.FirstOrDefault(a =>
                a != null
                && (string.Equals(a.ContentType, "audio/wav", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(a.ContentType, "application/octet-stream", StringComparison.OrdinalIgnoreCase)));

            if (audioAttachment != null)
            {
                // Dispose the downloaded stream once the transcription call has completed.
                using (var stream = await GetImageStream(connector, audioAttachment))
                {
                    var text = await this.speechService.GetTextFromAudioAsync(stream);
                    message = ProcessText(activity.Text, text);
                }
            }
            else
            {
                message = "Did you upload an audio file? I'm more of an audible person. Try sending me a wav file";
            }
        }
        catch (Exception e)
        {
            // Best effort: tell the user something failed and keep the details in the trace log.
            message = "Oops! Something went wrong. Try again later.";

            Trace.TraceError(e.ToString());
        }

        Activity reply = activity.CreateReply(message);
        await connector.Conversations.ReplyToActivityAsync(reply);
    }
    else
    {
        await this.HandleSystemMessage(activity);
    }

    return this.Request.CreateResponse(HttpStatusCode.OK);
}

/// <summary>
/// Builds the reply for a transcription, optionally extended with a statistic
/// selected by the text the user sent along with the audio.
/// </summary>
/// <param name="input">
/// Optional command typed by the user (case-insensitive, surrounding whitespace ignored):
/// WORD, CHARACTER, SPACE or VOWEL. Any other non-empty value is treated as a keyword
/// whose occurrences are counted among the space-separated words of <paramref name="text"/>.
/// </param>
/// <param name="text">The transcribed text; null is treated as an empty transcription.</param>
/// <returns>"You said : {text}." followed by the requested statistic, if any.</returns>
private static string ProcessText(string input, string text)
{
    // A null transcription must not crash the statistics below.
    text = text ?? string.Empty;

    var message = "You said : " + text + ".";

    var command = input?.Trim();
    if (string.IsNullOrEmpty(command))
    {
        return message;
    }

    // Invariant casing keeps command matching and letter counting independent of the
    // server culture (e.g. under Turkish casing rules 'i' does not upper-case to 'I').
    switch (command.ToUpperInvariant())
    {
        case "WORD":
            // Consecutive spaces produce empty entries, which are not words.
            return message + " Word Count: " + text.Split(' ').Count(w => w.Length > 0);

        case "CHARACTER":
            return message + " Character Count: " + text.Count(c => c != ' ');

        case "SPACE":
            return message + " Space Count: " + text.Count(c => c == ' ');

        case "VOWEL":
            return message + " Vowel Count: " + text.ToUpperInvariant().Count(c => "AEIOU".IndexOf(c) >= 0);

        default:
            var keywordCount = text.Split(' ').Count(w => string.Equals(w, command, StringComparison.OrdinalIgnoreCase));
            return message + " Keyword " + command + " found " + keywordCount + " times.";
    }
}

/// <summary>
/// Handles non-message (system) activities. Only ConversationUpdate currently has
/// behavior: the bot greets the conversation when it is itself among the added members.
/// </summary>
/// <param name="activity">The activity.</param>
/// <returns>Always null at present; no reply activity is produced for the caller.</returns>
private async Task<Activity> HandleSystemMessage(Activity activity)
{
    switch (activity.Type)
    {
        case ActivityTypes.DeleteUserData:
            // Implement user deletion here
            // If we handle user deletion, return a real message
            break;
        case ActivityTypes.ConversationUpdate:
            // Greet the user the first time the bot is added to a conversation.
            // MembersAdded may be absent (for example when members were only removed),
            // so guard against null before searching it.
            var botId = activity.Recipient?.Id;
            var botWasAdded = activity.MembersAdded != null
                && activity.MembersAdded.Any(m => m != null && m.Id == botId);

            if (botWasAdded)
            {
                var connector = new ConnectorClient(new Uri(activity.ServiceUrl));

                var response = activity.CreateReply();
                response.Text = "Hi! I am SpeechToText Bot. I can understand the content of any audio and convert it to text. Try sending me a wav file.";

                await connector.Conversations.ReplyToActivityAsync(response);
            }

            break;
        case ActivityTypes.ContactRelationUpdate:
            // Handle add/remove from contact lists
            break;
        case ActivityTypes.Typing:
            // Handle knowing that the user is typing
            break;
        case ActivityTypes.Ping:
            break;
    }

    return null;
}

/// <summary>
/// Downloads the content of an attachment and returns it as an in-memory stream.
/// </summary>
/// <param name="connector">Connector whose credentials are used to authorize Skype downloads.</param>
/// <param name="imageAttachment">The attachment to download; its ContentUrl must be an absolute URI.</param>
/// <returns>A readable, seekable stream positioned at the start of the attachment content.</returns>
private static async Task<Stream> GetImageStream(ConnectorClient connector, Attachment imageAttachment)
{
    var uri = new Uri(imageAttachment.ContentUrl);

    using (var httpClient = new HttpClient())
    {
        // The Skype attachment URLs are secured by JwtToken,
        // you should set the JwtToken of your bot as the authorization header for the GET request your bot initiates to fetch the image.
        // https://github.com/Microsoft/BotBuilder/issues/662
        //
        // Match the host exactly or as a true sub-domain: a plain EndsWith("skype.com") would also
        // accept hosts such as "evilskype.com" and send the bot's token to them.
        var isSkypeHost = uri.Host.Equals("skype.com", StringComparison.OrdinalIgnoreCase)
            || uri.Host.EndsWith(".skype.com", StringComparison.OrdinalIgnoreCase);

        if (isSkypeHost && uri.Scheme == Uri.UriSchemeHttps)
        {
            httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("Bearer", await GetTokenAsync(connector));
            httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/octet-stream"));
        }
        else
        {
            httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue(imageAttachment.ContentType));
        }

        // Read the whole payload while the client is still alive, so the stream handed
        // back to the caller does not depend on an HttpClient that has been disposed.
        var content = await httpClient.GetByteArrayAsync(uri);
        return new MemoryStream(content);
    }
}

/// <summary>
/// Obtains the bot's JWT token from the connector's credentials.
/// </summary>
/// <param name="connector">The connector client whose credentials are inspected.</param>
/// <returns>
/// The token when the connector carries <c>MicrosoftAppCredentials</c>; otherwise null.
/// </returns>
private static async Task<string> GetTokenAsync(ConnectorClient connector)
{
    var appCredentials = connector.Credentials as MicrosoftAppCredentials;

    // Any other credential type cannot issue a token here.
    if (appCredentials == null)
    {
        return null;
    }

    var token = await appCredentials.GetTokenAsync();
    return token;
}

}
}
1 change: 1 addition & 0 deletions CSharp/intelligence-SpeechToText/Global.asax
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<%@ Application Codebehind="Global.asax.cs" Inherits="SpeechToText.WebApiApplication" Language="C#" %>
12 changes: 12 additions & 0 deletions CSharp/intelligence-SpeechToText/Global.asax.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
namespace SpeechToText
{
using System.Web.Http;

/// <summary>
/// ASP.NET application class referenced by Global.asax.
/// </summary>
public class WebApiApplication : System.Web.HttpApplication
{
    /// <summary>
    /// Application start-up hook: hands the Web API configuration to <c>WebApiConfig.Register</c>.
    /// </summary>
    protected void Application_Start() => GlobalConfiguration.Configure(WebApiConfig.Register);
}
}
34 changes: 34 additions & 0 deletions CSharp/intelligence-SpeechToText/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
using System.Reflection;
using System.Runtime.InteropServices;

// General information about the assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with the assembly.
[assembly: AssemblyTitle("SpeechToText")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("SpeechToText")]
[assembly: AssemblyCopyright("Copyright © 2016")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("a8ba1066-5695-4d71-abb4-65e5a5e0c3d4")]

// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values, or you can let the Build Number and Revision
// default by using '*' in AssemblyVersion, for example "1.0.*".
// Both attributes below currently pin all four values explicitly.
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
88 changes: 88 additions & 0 deletions CSharp/intelligence-SpeechToText/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Speech To Text Bot Sample

A sample bot that illustrates how to use the Microsoft Cognitive Services Bing Speech API to analyze an audio file and return the text.

[![Deploy to Azure](http://azuredeploy.net/deploybutton.png)](https://azuredeploy.net)

### Prerequisites

The minimum prerequisites to run this sample are:
* The latest update of Visual Studio 2015. You can download the community version [here](http://www.visualstudio.com) for free.
* The Bot Framework Emulator. To install the Bot Framework Emulator, download it from [here](https://aka.ms/bf-bc-emulator). Please refer to [this documentation article](https://docs.botframework.com/en-us/csharp/builder/sdkreference/gettingstarted.html#emulator) to know more about the Bot Framework Emulator.
* **[Recommended]** Visual Studio Code for IntelliSense and debugging, download it from [here](https://code.visualstudio.com/) for free.
* This sample currently uses a free trial Microsoft Cognitive Services key with limited QPS. Please subscribe to the Bing Speech API service [here](https://www.microsoft.com/cognitive-services/en-us/subscriptions) and update the `MicrosoftSpeechApiKey` key in the [Web.config](Web.config) file to try it out further.

### Usage

Attach an audio file (wav format) and send an optional command as text.
Supported Commands:
* `WORD` - Counts the number of words.
* `CHARACTER` - Counts the number of characters excluding spaces.
* `SPACE` - Counts the number of spaces.
* `VOWEL` - Counts the number of vowels.
* Any other word will count the occurrences of that word in the transcribed text

### Code Highlights

Microsoft Cognitive Services provides a Speech Recognition API to convert audio into text. Check out [Bing Speech API](https://www.microsoft.com/cognitive-services/en-us/speech-api) for a complete reference of the Speech APIs available. In this sample we call the Speech Recognition API through its [REST API](https://www.microsoft.com/cognitive-services/en-us/Speech-api/documentation/API-Reference-REST/BingVoiceRecognition).

In this sample we are using the API to get the text and send it back to the user. Check out the use of the `MicrosoftCognitiveSpeechService.GetTextFromAudioAsync()` method in the [Controllers/MessagesController](Controllers/MessagesController.cs) class.
````C#
var audioAttachment = activity.Attachments?.FirstOrDefault(a => a.ContentType.Equals("audio/wav"));
if (audioAttachment != null)
{
using (var client = new HttpClient())
{
var stream = await client.GetStreamAsync(audioAttachment.ContentUrl);
var text = await this.speechService.GetTextFromAudioAsync(stream);
message = ProcessText(activity.Text, text);
}
}
````

and here is the implementation of `MicrosoftCognitiveSpeechService.GetTextFromAudioAsync()` in [Services/MicrosoftCognitiveSpeechService.cs](Services/MicrosoftCognitiveSpeechService.cs)
````C#
/// <summary>
/// Gets text from an audio stream.
/// </summary>
/// <param name="audiostream"></param>
/// <returns>Transcribed text. </returns>
public async Task<string> GetTextFromAudioAsync(Stream audiostream)
{
var requestUri = @"https://speech.platform.bing.com/recognize?scenarios=smd&appid=D4D52672-91D7-4C74-8AD8-42B1D98141A5&locale=en-US&device.os=bot&version=3.0&format=json&instanceid=565D69FF-E928-4B7E-87DA-9A750B96D9E3&requestid=" + Guid.NewGuid();

using (var client = new HttpClient())
{
var token = Authentication.Instance.GetAccessToken();
client.DefaultRequestHeaders.Add("Authorization", "Bearer " + token.access_token);

using (var binaryContent = new ByteArrayContent(StreamToBytes(audiostream)))
{
binaryContent.Headers.TryAddWithoutValidation("content-type", "audio/wav; codec=\"audio/pcm\"; samplerate=16000");

var response = await client.PostAsync(requestUri, binaryContent);
var responseString = await response.Content.ReadAsStringAsync();
dynamic data = JsonConvert.DeserializeObject(responseString);
return data.header.name;
}
}
}
````

### Outcome

You will see the following when you connect the Bot to the Emulator and send it an audio file and a command:

Input:

["What's the weather like?"](audio/whatstheweatherlike.wav)

Output:

![Sample Outcome](images/outcome-emulator.png)

### More Information

To get more information about how to get started in Bot Builder for .NET and Microsoft Cognitive Services Bing Speech API please review the following resources:
* [Bot Builder for .NET](https://docs.botframework.com/en-us/csharp/builder/sdkreference/index.html)
* [Microsoft Cognitive Services Bing Speech API](https://www.microsoft.com/cognitive-services/en-us/speech-api)
13 changes: 13 additions & 0 deletions CSharp/intelligence-SpeechToText/Services/AccessTokenInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace SpeechToText.Services
{
/// <summary>
/// Plain data holder describing an access token.
/// NOTE(review): the snake_case property names presumably mirror the fields of the
/// token endpoint's JSON response so they bind without mapping attributes - confirm
/// against the code that deserializes this type before renaming anything.
/// </summary>
public class AccessTokenInfo
{
/// <summary>The token value itself.</summary>
public string access_token { get; set; }

/// <summary>The kind of token (presumably e.g. a bearer token - TODO confirm).</summary>
public string token_type { get; set; }

/// <summary>Token lifetime; the unit is not visible here (presumably seconds - TODO confirm).</summary>
public int expires_in { get; set; }

/// <summary>The scope the token was issued for.</summary>
public string scope { get; set; }
}
}
Loading

0 comments on commit 709257d

Please sign in to comment.