From c075832928ced6bac71e65eb16d49f862a189265 Mon Sep 17 00:00:00 2001
From: Dmytro Struk <13853051+dmytrostruk@users.noreply.github.com>
Date: Wed, 19 Jun 2024 03:01:42 -0700
Subject: [PATCH] .Net: Examples with FrugalGPT techniques for LLM cost and
performance improvements (#6815)
### Motivation and Context
This PR shows how to use FrugalGPT techniques to reduce cost and improve
LLM-related task performance with Semantic Kernel filters.
More information about FrugalGPT: https://arxiv.org/abs/2305.05176
### Contribution Checklist
- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---
.../Concepts/Optimization/FrugalGPT.cs | 308 ++++++++++++++++++
dotnet/samples/Concepts/README.md | 4 +
2 files changed, 312 insertions(+)
create mode 100644 dotnet/samples/Concepts/Optimization/FrugalGPT.cs
diff --git a/dotnet/samples/Concepts/Optimization/FrugalGPT.cs b/dotnet/samples/Concepts/Optimization/FrugalGPT.cs
new file mode 100644
index 000000000000..f5ede1764789
--- /dev/null
+++ b/dotnet/samples/Concepts/Optimization/FrugalGPT.cs
@@ -0,0 +1,308 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Runtime.CompilerServices;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+using Microsoft.SemanticKernel.Embeddings;
+using Microsoft.SemanticKernel.Memory;
+using Microsoft.SemanticKernel.PromptTemplates.Handlebars;
+using Microsoft.SemanticKernel.Services;
+
+namespace Optimization;
+
+/// <summary>
+/// This example shows how to use FrugalGPT techniques to reduce cost and improve LLM-related task performance.
+/// More information here: https://arxiv.org/abs/2305.05176.
+/// </summary>
+public sealed class FrugalGPT(ITestOutputHelper output) : BaseTest(output)
+{
+    /// <summary>
+    /// One of the FrugalGPT techniques is to reduce prompt size when using few-shot prompts.
+    /// If a prompt contains a lot of examples that help the LLM provide the best result, it's possible to send only a few of them to reduce the number of tokens.
+    /// Vector similarity can be used to pick the best examples from the example set for a specific request.
+    /// The following example shows how to optimize an email classification request by reducing prompt size with vector similarity search.
+    /// </summary>
+ [Fact]
+ public async Task ReducePromptSizeAsync()
+ {
+ // Define email classification examples with email body and labels.
+        var examples = new List<string>
+ {
+ "Hey, just checking in to see how you're doing! - Personal",
+ "Can you pick up some groceries on your way back home? We need milk and bread. - Personal, Tasks",
+ "Happy Birthday! Wishing you a fantastic day filled with love and joy. - Personal",
+ "Let's catch up over coffee this Saturday. It's been too long! - Personal, Events",
+ "Please review the attached document and provide your feedback by EOD. - Work",
+ "Our team meeting is scheduled for 10 AM tomorrow in the main conference room. - Work",
+ "The quarterly financial report is due next Monday. Ensure all data is updated. - Work, Tasks",
+ "Can you send me the latest version of the project plan? Thanks! - Work",
+ "You're invited to our annual summer picnic! RSVP by June 25th. - Events",
+ "Join us for a webinar on digital marketing trends this Thursday at 3 PM. - Events",
+ "Save the date for our charity gala on September 15th. We hope to see you there! - Events",
+ "Don't miss our customer appreciation event next week. Sign up now! - Events, Notifications",
+ "Your order has been shipped and will arrive by June 20th. - Notifications",
+ "We've updated our policies. Please review the changes. - Notifications",
+ "Your username was successfully changed. If this wasn't you, contact support immediately. - Notifications",
+ "The system upgrade will occur this weekend. - Notifications, Work",
+ "Don't forget to submit your timesheet by 5 PM today. - Tasks, Work",
+ "Pick up the dry cleaning before they close at 7 PM. - Tasks",
+ "Complete the online training module by the end of the week. - Tasks, Work",
+ "Send out the meeting invites for next week's project kickoff. - Tasks, Work"
+ };
+
+ // Initialize kernel with chat completion and embedding generation services.
+ // It's possible to combine different models from different AI providers to achieve the lowest token usage.
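+        // As a sketch (hypothetical configuration, not part of this sample), chat completion and embeddings
+        // could even come from different providers:
+        //   Kernel.CreateBuilder()
+        //       .AddAzureOpenAIChatCompletion(deploymentName: "my-gpt-4", endpoint: "https://my-resource.openai.azure.com/", apiKey: "...")
+        //       .AddOpenAITextEmbeddingGeneration(modelId: "text-embedding-3-small", apiKey: "...")
+        //       .Build();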
+ var kernel = Kernel.CreateBuilder()
+ .AddOpenAIChatCompletion(
+ modelId: "gpt-4",
+ apiKey: TestConfiguration.OpenAI.ApiKey)
+ .AddOpenAITextEmbeddingGeneration(
+ modelId: "text-embedding-3-small",
+ apiKey: TestConfiguration.OpenAI.ApiKey)
+ .Build();
+
+ // Initialize few-shot prompt.
+ var function = kernel.CreateFunctionFromPrompt(
+ new()
+ {
+ Template =
+ """
+ Available classification labels: Personal, Work, Events, Notifications, Tasks
+ Email classification examples:
+ {{#each Examples}}
+ {{this}}
+ {{/each}}
+
+ Email body to classify:
+ {{Request}}
+ """,
+ TemplateFormat = "handlebars"
+ },
+ new HandlebarsPromptTemplateFactory()
+ );
+
+ // Define arguments with few-shot examples and actual email for classification.
+ var arguments = new KernelArguments
+ {
+ ["Examples"] = examples,
+ ["Request"] = "Your dentist appointment is tomorrow at 10 AM. Please remember to bring your insurance card."
+ };
+
+ // Invoke defined function to see initial result.
+ var result = await kernel.InvokeAsync(function, arguments);
+
+ Console.WriteLine(result); // Personal, Notifications
+ Console.WriteLine(result.Metadata?["Usage"]?.AsJson()); // Total tokens: ~430
+
+ // Add few-shot prompt optimization filter.
+        // The filter uses an in-memory store for vector similarity search and a text embedding generation service to generate embeddings.
+ var memoryStore = new VolatileMemoryStore();
+        var textEmbeddingGenerationService = kernel.GetRequiredService<ITextEmbeddingGenerationService>();
+
+ // Register optimization filter.
+ kernel.PromptRenderFilters.Add(new FewShotPromptOptimizationFilter(memoryStore, textEmbeddingGenerationService));
+
+ // Get result again and compare the usage.
+ result = await kernel.InvokeAsync(function, arguments);
+
+ Console.WriteLine(result); // Personal, Notifications
+ Console.WriteLine(result.Metadata?["Usage"]?.AsJson()); // Total tokens: ~150
+ }
+
+    /// <summary>
+    /// The LLM cascade technique invokes multiple LLMs sequentially, starting from a cheaper model.
+    /// The result of each LLM is evaluated and returned if it meets the quality criteria; otherwise, the next LLM in the queue is invoked,
+    /// until the result is acceptable.
+    /// The following example uses mock result generation and evaluation for demonstration purposes.
+    /// Result evaluation examples, including BERTScore, BLEU, METEOR and COMET metrics, can be found here:
+    /// https://github.com/microsoft/semantic-kernel/tree/main/dotnet/samples/Demos/QualityCheck.
+    /// </summary>
+ [Fact]
+ public async Task LLMCascadeAsync()
+ {
+ // Create kernel builder.
+ var builder = Kernel.CreateBuilder();
+
+ // Register chat completion services for demonstration purposes.
+        // This registration is similar to the AddAzureOpenAIChatCompletion and AddOpenAIChatCompletion methods.
+        builder.Services.AddSingleton<IChatCompletionService>(new MockChatCompletionService("model1", "Hi there! I'm doing well, thank you! How about yourself?"));
+        builder.Services.AddSingleton<IChatCompletionService>(new MockChatCompletionService("model2", "Hello! I'm great, thanks for asking. How are you doing today?"));
+        builder.Services.AddSingleton<IChatCompletionService>(new MockChatCompletionService("model3", "Hey! I'm fine, thanks. How's your day going so far?"));
+
+ // Register LLM cascade filter with model execution order, acceptance criteria for result and service for output.
+        // In real use-cases, the execution order should go from cheaper to more expensive models.
+        // If the first model produces an acceptable result, it will be returned immediately.
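+        // For instance, a real ordering might look like this (hypothetical model IDs), so that
+        // the most expensive model is only invoked when cheaper ones fail the acceptance criteria:
+        //   modelExecutionOrder: ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"]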
+ builder.Services.AddSingleton(new LLMCascadeFilter(
+ modelExecutionOrder: ["model1", "model2", "model3"],
+ acceptanceCriteria: result => result.Contains("Hey!"),
+ output: this.Output));
+
+ // Build kernel.
+ var kernel = builder.Build();
+
+ // Send a request.
+ var result = await kernel.InvokePromptAsync("Hi, how are you today?");
+
+ Console.WriteLine($"\nFinal result: {result}");
+
+ // Output:
+ // Executing request with model: model1
+ // Result from model1: Hi there! I'm doing well, thank you! How about yourself?
+ // Result does not meet the acceptance criteria, moving to the next model.
+
+ // Executing request with model: model2
+ // Result from model2: Hello! I'm great, thanks for asking. How are you doing today?
+ // Result does not meet the acceptance criteria, moving to the next model.
+
+ // Executing request with model: model3
+ // Result from model3: Hey! I'm fine, thanks. How's your day going so far?
+ // Returning result as it meets the acceptance criteria.
+
+ // Final result: Hey! I'm fine, thanks. How's your day going so far?
+ }
+
+    /// <summary>
+    /// Few-shot prompt optimization filter which takes all examples from kernel arguments and selects the top N examples
+    /// that are most similar to the original request.
+    /// </summary>
+ private sealed class FewShotPromptOptimizationFilter(
+ IMemoryStore memoryStore,
+ ITextEmbeddingGenerationService textEmbeddingGenerationService) : IPromptRenderFilter
+ {
+        /// <summary>
+        /// Maximum number of examples to use, selected by similarity to the original request.
+        /// </summary>
+ private const int TopN = 5;
+
+        /// <summary>
+        /// Collection name to use in the memory store.
+        /// </summary>
+ private const string CollectionName = "examples";
+
+        public async Task OnPromptRenderAsync(PromptRenderContext context, Func<PromptRenderContext, Task> next)
+ {
+ // Get examples and original request from arguments.
+            var examples = context.Arguments["Examples"] as List<string>;
+ var request = context.Arguments["Request"] as string;
+
+ if (examples is { Count: > 0 } && !string.IsNullOrEmpty(request))
+ {
+                var memoryRecords = new List<MemoryRecord>();
+
+ // Generate embedding for each example.
+ var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(examples);
+
+ // Create memory record instances with example text and embedding.
+ for (var i = 0; i < examples.Count; i++)
+ {
+ memoryRecords.Add(MemoryRecord.LocalRecord(Guid.NewGuid().ToString(), examples[i], "description", embeddings[i]));
+ }
+
+                // Create collection and upsert all memory records for search.
+                // It's possible to do this only once and re-use the same examples for future requests (see the sketch below).
+ await memoryStore.CreateCollectionAsync(CollectionName);
+ await memoryStore.UpsertBatchAsync(CollectionName, memoryRecords).ToListAsync();
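+                // A possible caching sketch (an assumption; this sample always re-creates the collection):
+                // if (!await memoryStore.DoesCollectionExistAsync(CollectionName))
+                // {
+                //     await memoryStore.CreateCollectionAsync(CollectionName);
+                //     await memoryStore.UpsertBatchAsync(CollectionName, memoryRecords).ToListAsync();
+                // }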
+
+ // Generate embedding for original request.
+ var requestEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(request);
+
+ // Find top N examples which are similar to original request.
+ var topNExamples = await memoryStore.GetNearestMatchesAsync(CollectionName, requestEmbedding, TopN).ToListAsync();
+
+                // Override the arguments to use only the top N examples, which will be sent to the LLM.
+ context.Arguments["Examples"] = topNExamples.Select(l => l.Item1.Metadata.Text);
+ }
+
+ // Continue prompt rendering operation.
+ await next(context);
+ }
+ }
+
+    /// <summary>
+    /// Example of LLM cascade filter which invokes a function using multiple LLMs in a specific order,
+    /// until the result meets the specified acceptance criteria.
+    /// </summary>
+ private sealed class LLMCascadeFilter(
+        List<string> modelExecutionOrder,
+        Predicate<string> acceptanceCriteria,
+ ITestOutputHelper output) : IFunctionInvocationFilter
+ {
+        public async Task OnFunctionInvocationAsync(FunctionInvocationContext context, Func<FunctionInvocationContext, Task> next)
+ {
+ // Get registered chat completion services from kernel.
+ var registeredServices = context.Kernel
+                .GetAllServices<IChatCompletionService>()
+ .Select(service => (ModelId: service.GetModelId()!, Service: service));
+
+ // Define order of execution.
+ var order = modelExecutionOrder
+ .Select((value, index) => new { Value = value, Index = index })
+ .ToDictionary(k => k.Value, v => v.Index);
+
+ // Sort services by specified order.
+ var orderedServices = registeredServices.OrderBy(service => order[service.ModelId]);
+
+ // Try to invoke a function with each service and check the result.
+ foreach (var service in orderedServices)
+ {
+ // Define execution settings with model ID.
+                context.Arguments.ExecutionSettings = new Dictionary<string, PromptExecutionSettings>
+ {
+ { PromptExecutionSettings.DefaultServiceId, new() { ModelId = service.ModelId } }
+ };
+
+ output.WriteLine($"Executing request with model: {service.ModelId}");
+
+ // Invoke a function.
+ await next(context);
+
+ // Get a result.
+ var result = context.Result.ToString()!;
+
+ output.WriteLine($"Result from {service.ModelId}: {result}");
+
+ // Check if result meets specified acceptance criteria.
+                // If yes, stop the execution loop so the last result is returned.
+ if (acceptanceCriteria(result))
+ {
+ output.WriteLine("Returning result as it meets the acceptance criteria.");
+ return;
+ }
+
+                // Otherwise, proceed with the next model.
+ output.WriteLine("Result does not meet the acceptance criteria, moving to the next model.\n");
+ }
+
+            // If no LLM returned an acceptable result, the last result will be returned.
+            // It's also possible to throw an exception in such cases if needed:
+            // throw new Exception("Models didn't return a result that meets the acceptance criteria");
+ }
+ }
+
+    /// <summary>
+    /// Mock chat completion service for demonstration purposes.
+    /// </summary>
+ private sealed class MockChatCompletionService(string modelId, string mockResult) : IChatCompletionService
+ {
+        public IReadOnlyDictionary<string, object?> Attributes => new Dictionary<string, object?> { { AIServiceExtensions.ModelIdKey, modelId } };
+
+        public Task<IReadOnlyList<ChatMessageContent>> GetChatMessageContentsAsync(
+ ChatHistory chatHistory,
+ PromptExecutionSettings? executionSettings = null,
+ Kernel? kernel = null,
+ CancellationToken cancellationToken = default)
+ {
+            return Task.FromResult<IReadOnlyList<ChatMessageContent>>([new ChatMessageContent(AuthorRole.Assistant, mockResult)]);
+ }
+
+        public async IAsyncEnumerable<StreamingChatMessageContent> GetStreamingChatMessageContentsAsync(
+ ChatHistory chatHistory,
+ PromptExecutionSettings? executionSettings = null,
+ Kernel? kernel = null,
+ [EnumeratorCancellation] CancellationToken cancellationToken = default)
+ {
+ yield return new StreamingChatMessageContent(AuthorRole.Assistant, mockResult);
+ }
+ }
+}
diff --git a/dotnet/samples/Concepts/README.md b/dotnet/samples/Concepts/README.md
index f0896534852c..7eaa2a8a7ae6 100644
--- a/dotnet/samples/Concepts/README.md
+++ b/dotnet/samples/Concepts/README.md
@@ -100,6 +100,10 @@ Down below you can find the code snippets that demonstrate the usage of many Sem
- [TextMemoryPlugin_GeminiEmbeddingGeneration](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/Memory/TextMemoryPlugin_GeminiEmbeddingGeneration.cs)
- [TextMemoryPlugin_MultipleMemoryStore](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/Memory/TextMemoryPlugin_MultipleMemoryStore.cs)
+## Optimization - Examples of different cost and performance optimization techniques
+
+- [FrugalGPT](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/Optimization/FrugalGPT.cs)
+
## Planners - Examples on using `Planners`
- [FunctionCallStepwisePlanning](https://github.com/microsoft/semantic-kernel/blob/main/dotnet/samples/Concepts/Planners/FunctionCallStepwisePlanning.cs)