diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..89b575ef3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,22 @@ +version: 2 +updates: + - package-ecosystem: "nuget" + directory: "/" + schedule: + interval: "weekly" + day: "tuesday" + ignore: + - dependency-name: "*" + update-types: + - "version-update:semver-major" + + - package-ecosystem: "dotnet-sdk" + directory: "/" + schedule: + interval: "weekly" + day: "tuesday" + ignore: + - dependency-name: "*" + update-types: + - "version-update:semver-major" + \ No newline at end of file diff --git a/.github/workflows/codeQL.yml b/.github/workflows/codeQL.yml index 58eac188b..dae391413 100644 --- a/.github/workflows/codeQL.yml +++ b/.github/workflows/codeQL.yml @@ -66,6 +66,11 @@ jobs: with: dotnet-version: '3.1.x' + - name: Set up .NET 8 + uses: actions/setup-dotnet@v3 + with: + dotnet-version: '8.0.x' + - name: Restore dependencies run: dotnet restore $solution diff --git a/Directory.Packages.props b/Directory.Packages.props index e9c7569ca..844d46238 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -11,15 +11,15 @@ - - + + - - + + - + - + @@ -35,8 +35,8 @@ - - + + @@ -47,9 +47,9 @@ - - - + + + @@ -60,13 +60,14 @@ - - - + + + + - - + + @@ -78,8 +79,8 @@ - - + + @@ -88,7 +89,7 @@ - + @@ -103,9 +104,9 @@ - - - + + + @@ -114,8 +115,8 @@ - - + + diff --git a/README.md b/README.md index 04177195b..c2184bda9 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,18 @@ # Durable Task Framework -The Durable Task Framework (DTFx) is a library that allows users to write long running persistent workflows (referred to as _orchestrations_) in C# using simple async/await coding constructs. It is used heavily within various teams at Microsoft to reliably orchestrate long running provisioning, monitoring, and management operations. The orchestrations scale out linearly by simply adding more worker machines. 
This framework is also used to power the serverless [Durable Functions](https://docs.microsoft.com/azure/azure-functions/durable/durable-functions-overview) extension of [Azure Functions](https://azure.microsoft.com/services/functions/). +> [!IMPORTANT] +> The Durable Task Framework (DTFx) is a community-maintained open-source project. It is actively used in production by many teams, including engineering teams within Microsoft. However, it does not come with official Microsoft support — meaning you cannot open a Microsoft support ticket for DTFx issues. Bugs and feature requests are addressed on a best-effort basis. +> +> If you are starting a new project or need official Microsoft support, we recommend: +> +> - **[Durable Functions](https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-overview)** - for serverless orchestration on [Azure Functions](https://azure.microsoft.com/services/functions/) +> - **[Durable Task SDKs](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/durable-task-overview)** with the **[Durable Task Scheduler](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/durable-task-scheduler)** backend - for self-hosted orchestration on any compute platform (Azure Container Apps, AKS, VMs, etc.) +> +> These alternatives offer full Microsoft support, including the ability to open support tickets. For more details, see [Choosing an orchestration framework](https://learn.microsoft.com/azure/azure-functions/durable/choose-orchestration-framework#community-and-experimental-durable-task-sdks). -By open sourcing this project we hope to give the community a very cost-effective alternative to heavy duty workflow systems. We also hope to build an ecosystem of providers and activities around this simple yet incredibly powerful framework. 
+The Durable Task Framework (DTFx) is a library that allows users to write long-running persistent workflows (referred to as _orchestrations_) in C# using simple async/await coding constructs. It provides similar orchestration primitives to the modern Durable Task SDKs. While DTFx continues to be maintained and used in production, the newer Durable Task SDKs offer additional features, active development, and official Microsoft support. DTFx also requires you to manage hosting and operational infrastructure yourself. + +> **📖 Documentation:** Documentation for this repository is available in the [docs](./docs/README.md) folder. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). @@ -21,13 +31,15 @@ Starting in v2.x, the Durable Task Framework supports an extensible set of backe | DurableTask.SqlServer | [![NuGet](https://img.shields.io/nuget/v/Microsoft.DurableTask.SqlServer.svg?style=flat)](https://www.nuget.org/packages/Microsoft.DurableTask.SqlServer/) | All orchestration state is stored in a [Microsoft SQL Server](https://www.microsoft.com/sql-server/sql-server-2019) or [Azure SQL](https://azure.microsoft.com/products/azure-sql/database/) database with indexed tables and stored procedures for direct interaction. This backend is available for [Durable Functions](https://docs.microsoft.com/azure/azure-functions/durable/). [👉 GitHub Repo](https://github.com/microsoft/durabletask-mssql) | Production ready and actively maintained | | DurableTask.Emulator | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.Emulator.svg?style=flat)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.Emulator/) | This is an in-memory store intended for testing purposes only. It is not designed or recommended for any production workloads. 
| Not actively maintained | -The core programming model for the Durable Task Framework is contained in the [DurableTask.Core](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.Core/) package, which is also under active development. +> [!NOTE] +> The `DurableTask.Emulator` listed above is a legacy in-memory backend for DTFx and is **not** the same as the [Durable Task Scheduler emulator](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/develop-with-durable-task-scheduler?tabs=dedicated&pivots=az-cli#durable-task-scheduler-emulator), which is a supported local development emulator for the Durable Task Scheduler backend. + +The core programming model for the Durable Task Framework is contained in the [DurableTask.Core](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.Core/) package. ## Learning more There are several places where you can learn more about this framework. Note that some are external and not owned by Microsoft: -- [This repo's wiki](https://github.com/Azure/durabletask/wiki), which contains more details about the framework and how it can be used. - The following blog series contains useful information: https://abhikmitra.github.io/blog/durable-task/ - Several useful samples are available here: https://github.com/kaushiksk/durabletask-samples - You can watch a video with some of the original maintainers in [Building Workflows with the Durable Task Framework](https://learn.microsoft.com/shows/on-net/building-workflows-with-the-durable-task-framework). 
diff --git a/Test/DurableTask.AzureStorage.Tests/OrchestrationSessionTests.cs b/Test/DurableTask.AzureStorage.Tests/OrchestrationSessionTests.cs new file mode 100644 index 000000000..126c4b9bc --- /dev/null +++ b/Test/DurableTask.AzureStorage.Tests/OrchestrationSessionTests.cs @@ -0,0 +1,227 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- + +namespace DurableTask.AzureStorage.Tests +{ + using System; + using System.Collections.Generic; + using System.Diagnostics; + using System.Linq; + using System.Reflection; + using System.Threading; + using System.Threading.Tasks; + using DurableTask.AzureStorage.Messaging; + using DurableTask.AzureStorage.Monitoring; + using DurableTask.AzureStorage.Tracking; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Moq; + + /// + /// Tests for shutdown cancellation behavior with extended sessions. + /// + [TestClass] + public class OrchestrationSessionTests + { + /// + /// Verifies that + /// exits immediately when the cancellation token is cancelled. 
+ /// + [TestMethod] + public async Task WaitAsync_CancellationToken_ExitsImmediately() + { + var resetEvent = new AsyncAutoResetEvent(signaled: false); + using var cts = new CancellationTokenSource(); + + TimeSpan longTimeout = TimeSpan.FromSeconds(30); + Task waitTask = resetEvent.WaitAsync(longTimeout, cts.Token); + + Assert.IsFalse(waitTask.IsCompleted, "Wait should not complete immediately"); + + var stopwatch = Stopwatch.StartNew(); + cts.Cancel(); + + bool result = await waitTask; + stopwatch.Stop(); + + Assert.IsFalse(result, "Cancellation should return false (no signal received)"); + Assert.IsTrue( + stopwatch.ElapsedMilliseconds < 5000, + $"Cancellation should complete in under 5s, but took {stopwatch.ElapsedMilliseconds}ms"); + } + + /// + /// Verifies that signaling still returns true when a cancellation token is provided. + /// + [TestMethod] + public async Task WaitAsync_WithCancellationToken_SignalStillWorks() + { + var resetEvent = new AsyncAutoResetEvent(signaled: false); + using var cts = new CancellationTokenSource(); + + Task waitTask = resetEvent.WaitAsync(TimeSpan.FromSeconds(30), cts.Token); + Assert.IsFalse(waitTask.IsCompleted); + + resetEvent.Set(); + + Task winner = await Task.WhenAny(waitTask, Task.Delay(TimeSpan.FromSeconds(5))); + Assert.IsTrue(winner == waitTask, "Signal should wake the waiter"); + Assert.IsTrue(waitTask.Result, "Wait result should be true when signaled"); + } + + /// + /// Verifies that the wait returns false on timeout when a cancellation token is provided but not cancelled. 
+ /// + [TestMethod] + public async Task WaitAsync_WithCancellationToken_TimeoutStillWorks() + { + var resetEvent = new AsyncAutoResetEvent(signaled: false); + using var cts = new CancellationTokenSource(); + + bool result = await resetEvent.WaitAsync(TimeSpan.FromMilliseconds(100), cts.Token); + + Assert.IsFalse(result, "Wait should return false on timeout"); + } + + /// + /// Verifies that all queued waiters return false when the token is cancelled. + /// + [TestMethod] + public async Task WaitAsync_CancellationToken_MultipleWaiters() + { + var resetEvent = new AsyncAutoResetEvent(signaled: false); + using var cts = new CancellationTokenSource(); + + var waiters = new List>(); + for (int i = 0; i < 5; i++) + { + waiters.Add(resetEvent.WaitAsync(TimeSpan.FromSeconds(30), cts.Token)); + } + + foreach (var waiter in waiters) + { + Assert.IsFalse(waiter.IsCompleted); + } + + var stopwatch = Stopwatch.StartNew(); + cts.Cancel(); + + // All waiters should return false (cancelled = not signaled) + await Task.WhenAll( + waiters.Select( + async waiter => + { + bool result = await waiter; + Assert.IsFalse(result, "Cancelled waiter should return false"); + })); + + stopwatch.Stop(); + + Assert.IsTrue( + stopwatch.ElapsedMilliseconds < 5000, + $"All waiters should complete in under 5s, but took {stopwatch.ElapsedMilliseconds}ms"); + } + + /// + /// Verifies that a pre-cancelled token causes WaitAsync to return false immediately. 
+ /// + [TestMethod] + public async Task WaitAsync_AlreadyCancelledToken_ReturnsFalseImmediately() + { + var resetEvent = new AsyncAutoResetEvent(signaled: false); + using var cts = new CancellationTokenSource(); + cts.Cancel(); // Pre-cancel + + var stopwatch = Stopwatch.StartNew(); + bool result = await resetEvent.WaitAsync(TimeSpan.FromSeconds(30), cts.Token); + stopwatch.Stop(); + + Assert.IsFalse(result, "Pre-cancelled token should cause immediate false return"); + Assert.IsTrue( + stopwatch.ElapsedMilliseconds < 5000, + $"Should complete immediately, but took {stopwatch.ElapsedMilliseconds}ms"); + } + + /// + /// Verifies that a pre-cancelled token still returns true if the event is already signaled. + /// + [TestMethod] + public async Task WaitAsync_AlreadySignaledAndCancelled_ReturnsTrue() + { + var resetEvent = new AsyncAutoResetEvent(signaled: true); + using var cts = new CancellationTokenSource(); + cts.Cancel(); + + bool result = await resetEvent.WaitAsync(TimeSpan.FromSeconds(30), cts.Token); + Assert.IsTrue(result, "Already signaled event should return true even with cancelled token"); + } + + /// + /// Verifies that clears all active sessions. + /// + [TestMethod] + public void AbortAllSessions_ClearsActiveSessions() + { + var settings = new AzureStorageOrchestrationServiceSettings(); + var stats = new AzureStorageOrchestrationServiceStats(); + var trackingStore = new Mock(); + + using var manager = new OrchestrationSessionManager( + "testaccount", + settings, + stats, + trackingStore.Object); + + // Use reflection to access the internal sessions dictionary. 
+ var sessionsField = typeof(OrchestrationSessionManager) + .GetField("activeOrchestrationSessions", BindingFlags.NonPublic | BindingFlags.Instance); + var sessions = (Dictionary)sessionsField.GetValue(manager); + + manager.GetStats(out _, out _, out int initialCount); + Assert.AreEqual(0, initialCount, "Should start with no active sessions"); + + sessions["instance1"] = null; + sessions["instance2"] = null; + sessions["instance3"] = null; + + manager.GetStats(out _, out _, out int activeCount); + Assert.AreEqual(3, activeCount, "Should have 3 active sessions"); + + manager.AbortAllSessions(); + + manager.GetStats(out _, out _, out int afterAbortCount); + Assert.AreEqual(0, afterAbortCount, "AbortAllSessions should clear all active sessions"); + } + + /// + /// Verifies that is safe to call with no active sessions. + /// + [TestMethod] + public void AbortAllSessions_NoSessions_DoesNotThrow() + { + var settings = new AzureStorageOrchestrationServiceSettings(); + var stats = new AzureStorageOrchestrationServiceStats(); + var trackingStore = new Mock(); + + using var manager = new OrchestrationSessionManager( + "testaccount", + settings, + stats, + trackingStore.Object); + + manager.AbortAllSessions(); + + manager.GetStats(out _, out _, out int count); + Assert.AreEqual(0, count, "Should still have no active sessions"); + } + } +} diff --git a/Test/DurableTask.Core.Tests/TraceHelperTests.cs b/Test/DurableTask.Core.Tests/TraceHelperTests.cs new file mode 100644 index 000000000..8d52754fa --- /dev/null +++ b/Test/DurableTask.Core.Tests/TraceHelperTests.cs @@ -0,0 +1,69 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#if !NET462 +#nullable enable +namespace DurableTask.Core.Tests +{ + using System.Collections.Generic; + using System.Diagnostics; + using DurableTask.Core.Entities.OperationFormat; + using DurableTask.Core.Tracing; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using DiagnosticsActivityStatusCode = System.Diagnostics.ActivityStatusCode; + using TraceActivityStatusCode = DurableTask.Core.Tracing.ActivityStatusCode; + + [TestClass] + public class TraceHelperTests + { + [TestMethod] + public void EndActivitiesForEntityInvocationResetsSuccessfulStatus() + { + var activities = new List + { + new Activity("entityOperation").Start() + }; + activities[0].SetStatus(TraceActivityStatusCode.Error, "instrumented error"); + + var results = new List + { + new OperationResult() + }; + + TraceHelper.EndActivitiesForProcessingEntityInvocation(activities, results, batchFailureDetails: null); + + Assert.AreEqual(DiagnosticsActivityStatusCode.Ok, activities[0].Status); + } + + [TestMethod] + public void EndActivitiesForEntityInvocationMarksFailures() + { + var activities = new List + { + new Activity("entityOperation").Start() + }; + + var failingResults = new List + { + new OperationResult + { + ErrorMessage = "entity failure" + } + }; + + TraceHelper.EndActivitiesForProcessingEntityInvocation(activities, failingResults, batchFailureDetails: null); + + Assert.AreEqual(DiagnosticsActivityStatusCode.Error, activities[0].Status); + } + } +} +#endif diff --git 
a/Test/DurableTask.ServiceBus.Tests/SessionIdCaseInsensitiveTests.cs b/Test/DurableTask.ServiceBus.Tests/SessionIdCaseInsensitiveTests.cs new file mode 100644 index 000000000..0a89e92dc --- /dev/null +++ b/Test/DurableTask.ServiceBus.Tests/SessionIdCaseInsensitiveTests.cs @@ -0,0 +1,167 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- + +namespace DurableTask.ServiceBus.Tests +{ + using System; + using System.Collections.Concurrent; + using System.Reflection; + using DurableTask.ServiceBus.Settings; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + /// + /// Tests that validate case-insensitive session ID handling in ServiceBusOrchestrationService. + /// + /// Background: Service Bus can change the casing of session IDs during upgrades or failovers. + /// The DurableTask framework must handle session IDs case-insensitively to prevent ghost sessions, + /// orphaned orchestration state, and stuck eternal orchestrations. + /// + [TestClass] + public class SessionIdCaseInsensitiveTests + { + /// + /// Validates that the orchestrationSessions dictionary uses case-insensitive key comparison. + /// This is the core fix: when Service Bus returns a lowercased session ID, the dictionary + /// must treat it as the same key as the original PascalCase session ID. 
+ /// + [TestMethod] + public void OrchestrationSessionsDictionary_ShouldBeCaseInsensitive() + { + // Simulate the dictionary as initialized in ServiceBusOrchestrationService.StartAsync() + var sessions = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + + string pascalCaseId = "System_BillingConsumption_8a376298-1463-4440-905f-a836774c1460"; + string lowerCaseId = "system_billingconsumption_8a376298-1463-4440-905f-a836774c1460"; + + var sessionState = new ServiceBusOrchestrationSession(); + + // Add with PascalCase (as originally created by APIM) + Assert.IsTrue(sessions.TryAdd(pascalCaseId, sessionState)); + + // Attempt to add with lowercase (as returned by Service Bus after upgrade) + // should FAIL because case-insensitive comparison treats them as the same key + Assert.IsFalse(sessions.TryAdd(lowerCaseId, sessionState), + "Lowercase session ID should be treated as duplicate of PascalCase session ID"); + + // Lookup by lowercase should find the PascalCase entry + Assert.IsTrue(sessions.TryGetValue(lowerCaseId, out var retrieved), + "Should be able to look up session by lowercase ID"); + Assert.AreSame(sessionState, retrieved); + + // Removal by lowercase should remove the PascalCase entry + Assert.IsTrue(sessions.TryRemove(lowerCaseId, out var removed), + "Should be able to remove session by lowercase ID"); + Assert.AreSame(sessionState, removed); + Assert.AreEqual(0, sessions.Count, "Dictionary should be empty after removal"); + } + + /// + /// Validates that the orchestrationMessages dictionary uses case-insensitive key comparison. 
+ /// + [TestMethod] + public void OrchestrationMessagesDictionary_ShouldBeCaseInsensitive() + { + var messages = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + + string messageId = "2B9C5D18F1C2416390221C250F38DF94"; + string lowerMessageId = "2b9c5d18f1c2416390221c250f38df94"; + + var message = new DurableTask.ServiceBus.Common.Abstraction.Message(new byte[0]); + + Assert.IsTrue(messages.TryAdd(messageId, message)); + Assert.IsFalse(messages.TryAdd(lowerMessageId, message), + "Lowercase message ID should be treated as duplicate"); + } + + /// + /// 1. Timer message sent with PascalCase session ID + /// 2. Timer message received with lowercase session ID + /// 3. With case-insensitive dictionary, the lookup should succeed + /// + [TestMethod] + public void SessionLookup_WithMixedCaseSessionIds_ShouldSucceed() + { + var sessions = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + + // Simulate the real scenario from api-kw1-prod-01 + string originalSessionId = "System_MoveBillingEvents_a3c79b00"; + string lowercasedSessionId = "system_movebillingevents_a3c79b00"; + + var sessionState = new ServiceBusOrchestrationSession(); + + // Step 1: Session added during LockNextTaskOrchestrationWorkItemAsync with original casing + sessions.TryAdd(originalSessionId, sessionState); + + // Step 2: After ContinueAsNew, timer fires and Service Bus returns lowercase session ID + // The framework looks up the session by the (now lowercased) workItem.InstanceId + bool found = sessions.TryGetValue(lowercasedSessionId, out var retrievedSession); + + Assert.IsTrue(found, + "Session lookup with lowercased ID should find the original PascalCase session. " + + "Without this fix, a ghost session would be created and the orchestration would be stuck forever."); + Assert.AreSame(sessionState, retrievedSession); + } + + /// + /// Validates that the case-insensitive dictionary prevents the ghost session scenario. 
+ /// In the original bug, a lowercased session ID would create a NEW entry in the dictionary, + /// leading to a ghost session with empty state that would immediately die. + /// + [TestMethod] + public void GhostSessionPrevention_DuplicateAddWithDifferentCasing_ShouldFail() + { + var sessions = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + + string[] casingVariants = new[] + { + "System_BillingConsumption_8a376298-1463-4440-905f-a836774c1460", + "system_billingconsumption_8a376298-1463-4440-905f-a836774c1460", + "SYSTEM_BILLINGCONSUMPTION_8A376298-1463-4440-905F-A836774C1460", + "System_billingConsumption_8A376298-1463-4440-905f-A836774c1460", + }; + + // First add should succeed + Assert.IsTrue(sessions.TryAdd(casingVariants[0], new ServiceBusOrchestrationSession())); + + // All other casing variants should be treated as duplicates + for (int i = 1; i < casingVariants.Length; i++) + { + Assert.IsFalse(sessions.TryAdd(casingVariants[i], new ServiceBusOrchestrationSession()), + $"Casing variant '{casingVariants[i]}' should be treated as duplicate of '{casingVariants[0]}'"); + } + + Assert.AreEqual(1, sessions.Count, "Dictionary should contain exactly one entry regardless of casing variants"); + } + + /// + /// Verifies via reflection that ServiceBusOrchestrationService declares the private + /// orchestrationSessions field as a ConcurrentDictionary. The comparer itself is NOT inspected here. + /// + [TestMethod] + public void StartAsync_OrchestrationSessionsDictionary_UsesCaseInsensitiveComparer() + { + // Reflection only checks the declared field type; StartAsync is never invoked, so this test + // does not prove the dictionary is constructed with StringComparer.OrdinalIgnoreCase. 
+ var fieldInfo = typeof(ServiceBusOrchestrationService).GetField( + "orchestrationSessions", + BindingFlags.NonPublic | BindingFlags.Instance); + + Assert.IsNotNull(fieldInfo, + "Expected private field 'orchestrationSessions' on ServiceBusOrchestrationService"); + Assert.AreEqual( + typeof(ConcurrentDictionary), + fieldInfo.FieldType, + "orchestrationSessions should be ConcurrentDictionary"); + } + } +} diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..a50ec54fc --- /dev/null +++ b/docs/README.md @@ -0,0 +1,31 @@ +# Durable Task Framework Documentation + +The Durable Task Framework (DTFx) is an open-source framework for writing long-running, fault-tolerant workflow orchestrations in .NET. It provides the foundation for [Azure Durable Functions](https://learn.microsoft.com/azure/azure-functions/durable/durable-functions-overview) and can be used standalone with various backend storage providers. + +## Quick Links + +| Section | Description | +| ------- | ----------- | +| [Getting Started](getting-started/installation.md) | Installation, quickstart, and choosing a backend | +| [Core Concepts](concepts/core-concepts.md) | Task Hubs, Workers, Clients, and architecture overview | +| [Features](features/retries.md) | Retries, timers, external events, sub-orchestrations, and more | +| [Providers](providers/durable-task-scheduler.md) | Backend storage providers (Durable Task Scheduler, Azure Storage, etc.) 
| +| [Telemetry](telemetry/distributed-tracing.md) | Distributed tracing, logging, and Application Insights | +| [Advanced Topics](advanced/middleware.md) | Middleware, entities, serialization, and testing | +| [Samples](samples/catalog.md) | Sample projects and code patterns | + +## Recommended: Durable Task Scheduler with the modern .NET SDK + +For new projects, we recommend using the **[Durable Task Scheduler](providers/durable-task-scheduler.md)**—a fully managed Azure service that provides: + +- ✅ A more modern [Durable Task .NET SDK](https://github.com/microsoft/durabletask-dotnet) with improved developer experience +- ✅ Zero infrastructure management +- ✅ Built-in monitoring dashboard +- ✅ Highest throughput of all backends +- ✅ 24/7 Microsoft Azure support with SLA + +See [Choosing a Backend](getting-started/choosing-a-backend.md) for a full comparison of all available providers. + +## Support + +See [Support](support.md) for information about getting help with the Durable Task Framework. diff --git a/docs/advanced/README.md b/docs/advanced/README.md new file mode 100644 index 000000000..0101a1443 --- /dev/null +++ b/docs/advanced/README.md @@ -0,0 +1,15 @@ +# Advanced Topics + +This section covers advanced features and techniques for the Durable Task Framework. 
+ +## Topics + +| Topic | Description | +| ----- | ----------- | +| [Middleware](middleware.md) | Intercept and extend orchestration/activity execution with cross-cutting concerns | +| [Serialization](serialization.md) | Custom data converters and serialization patterns | +| [Testing](testing.md) | Unit testing activities, integration testing with the emulator | +| [Entities](entities.md) | Durable Entities guidance (not supported for direct use in DTFx) | + +> [!NOTE] +> For Durable Entities support, see [Azure Durable Functions](https://docs.microsoft.com/azure/azure-functions/durable/durable-functions-entities) or the [Durable Task SDK](https://github.com/microsoft/durabletask-dotnet) with [Durable Task Scheduler](../providers/durable-task-scheduler.md). diff --git a/docs/advanced/entities.md b/docs/advanced/entities.md new file mode 100644 index 000000000..842345794 --- /dev/null +++ b/docs/advanced/entities.md @@ -0,0 +1,31 @@ +# Durable Entities + +Durable Entities provide a way to manage small pieces of state with well-defined operations. Entities are addressable by a unique identifier and can be called from orchestrations or signaled from anywhere. + +## Entity Support in the Durable Task Framework + +> [!IMPORTANT] +> Durable Entities are **not directly supported** for end-user development in the Durable Task Framework. The entity-related APIs that exist in this library (such as `TaskEntity`, `EntityId`, `OrchestrationEntityContext`, etc.) are low-level infrastructure components intended to support [Azure Durable Functions](https://docs.microsoft.com/azure/azure-functions/durable/durable-functions-entities) scenarios. 
+ +## Recommended Alternatives + +If you want to build applications that leverage the capabilities of Durable Entities, consider one of the following options: + +### Azure Durable Functions + +[Azure Durable Functions](https://docs.microsoft.com/azure/azure-functions/durable/durable-functions-entities) provides a complete, high-level programming model for Durable Entities with full support for: + +- Entity classes and function-based entities +- Calling and signaling entities from orchestrations +- Entity state persistence and management +- Distributed locking and critical sections + +### Durable Task SDK with Durable Task Scheduler + +The [Durable Task SDK](https://github.com/microsoft/durabletask-dotnet) used together with the [Durable Task Scheduler](../providers/durable-task-scheduler.md) provides a modern programming model with entity support. This is the recommended approach for new .NET applications that need durable entity capabilities outside of Azure Functions. + +## Next Steps + +- [Durable Task Scheduler](../providers/durable-task-scheduler.md) — Learn about the Durable Task Scheduler backend +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Compare available backend providers + diff --git a/docs/advanced/middleware.md b/docs/advanced/middleware.md new file mode 100644 index 000000000..18b1e73c5 --- /dev/null +++ b/docs/advanced/middleware.md @@ -0,0 +1,544 @@ +# Middleware + +Middleware in the Durable Task Framework allows you to intercept and extend orchestration and activity execution. This is useful for cross-cutting concerns like logging, metrics, authentication, or context propagation. + +## Middleware Delegate Signature + +Middleware is registered as a delegate with the following signature: + +```csharp +using DurableTask.Core.Middleware; + +// Middleware delegate signature +Func<DispatchMiddlewareContext, Func<Task>, Task> +``` + +The `DispatchMiddlewareContext` provides access to execution context via `GetProperty<T>()` and `SetProperty<T>()` methods. 
+ +## Orchestration Middleware + +### Available Context Properties + +Orchestration middleware can access these properties via `context.GetProperty<T>()`: + +| Type | Description | +| ---- | ----------- | +| `OrchestrationInstance` | The orchestration instance (InstanceId, ExecutionId) | +| `TaskOrchestration` | The orchestration implementation (may be null for out-of-process scenarios) | +| `OrchestrationRuntimeState` | History, status, name, version, input, tags, and more | +| `OrchestrationExecutionContext` | Contains orchestration tags | +| `TaskOrchestrationWorkItem` | The work item being processed | + +### Creating Orchestration Middleware + +```csharp +public static class OrchestrationLoggingMiddleware +{ + public static Func<DispatchMiddlewareContext, Func<Task>, Task> Create(ILogger logger) + { + return async (context, next) => + { + var instance = context.GetProperty<OrchestrationInstance>(); + var runtimeState = context.GetProperty<OrchestrationRuntimeState>(); + var instanceId = instance?.InstanceId ?? "unknown"; + var orchestrationName = runtimeState?.Name ?? "unknown"; + + logger.LogInformation("Orchestration {Name} ({InstanceId}) starting execution", + orchestrationName, instanceId); + var stopwatch = Stopwatch.StartNew(); + + try + { + await next(); + logger.LogInformation("Orchestration {Name} ({InstanceId}) completed in {ElapsedMs}ms", + orchestrationName, instanceId, stopwatch.ElapsedMilliseconds); + } + catch (Exception ex) + { + logger.LogError(ex, "Orchestration {Name} ({InstanceId}) failed after {ElapsedMs}ms", + orchestrationName, instanceId, stopwatch.ElapsedMilliseconds); + throw; + } + }; + } +} +``` + +### Registering Orchestration Middleware + +```csharp +var worker = new TaskHubWorker(orchestrationService, loggerFactory); + +// Add middleware using lambda - order matters (first registered = outermost) +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var instance = context.GetProperty<OrchestrationInstance>(); + Console.WriteLine($"Processing orchestration: {instance?.InstanceId}"); + await next(); +}); + +// Or use a 
factory method +worker.AddOrchestrationDispatcherMiddleware( + OrchestrationLoggingMiddleware.Create(logger)); + +await worker.StartAsync(); +``` + +## Activity Middleware + +### Context Properties for Activities + +Activity middleware can access these properties via `context.GetProperty<T>()`: + +| Type | Description | +| ---- | ----------- | +| `OrchestrationInstance` | The parent orchestration instance | +| `TaskActivity` | The activity implementation (may be null for out-of-process scenarios) | +| `TaskScheduledEvent` | Contains activity name, version, input, and event ID | +| `OrchestrationExecutionContext` | Contains orchestration tags (if available) | + +### Creating Activity Middleware + +```csharp +public static class ActivityLoggingMiddleware +{ + public static Func<DispatchMiddlewareContext, Func<Task>, Task> Create(ILogger logger) + { + return async (context, next) => + { + var scheduledEvent = context.GetProperty<TaskScheduledEvent>(); + var instance = context.GetProperty<OrchestrationInstance>(); + var activityName = scheduledEvent?.Name ?? "unknown"; + var instanceId = instance?.InstanceId ?? 
"unknown"; + + logger.LogInformation("Activity {ActivityName} starting for orchestration {InstanceId}", + activityName, instanceId); + var stopwatch = Stopwatch.StartNew(); + + try + { + await next(); + logger.LogInformation("Activity {ActivityName} completed in {ElapsedMs}ms", + activityName, stopwatch.ElapsedMilliseconds); + } + catch (Exception ex) + { + logger.LogError(ex, "Activity {ActivityName} failed after {ElapsedMs}ms", + activityName, stopwatch.ElapsedMilliseconds); + throw; + } + }; + } +} +``` + +### Registering Activity Middleware + +```csharp +var worker = new TaskHubWorker(orchestrationService, loggerFactory); + +// Add middleware using lambda +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + var scheduledEvent = context.GetProperty(); + Console.WriteLine($"Executing activity: {scheduledEvent?.Name}"); + await next(); +}); + +// Or use a factory method +worker.AddActivityDispatcherMiddleware( + ActivityLoggingMiddleware.Create(logger)); + +await worker.StartAsync(); +``` + +## Common Middleware Patterns + +### Metrics Collection + +```csharp +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var runtimeState = context.GetProperty(); + var orchestrationName = runtimeState?.Name ?? "unknown"; + var stopwatch = Stopwatch.StartNew(); + var success = true; + + try + { + await next(); + } + catch + { + success = false; + throw; + } + finally + { + metrics.RecordDuration($"orchestration.{orchestrationName}.duration", stopwatch.Elapsed); + metrics.RecordCounter(success ? 
"orchestration.success" : "orchestration.failure"); + } +}); +``` + +### Context Propagation (Using Tags) + +```csharp +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var executionContext = context.GetProperty(); + + // Extract tenant ID from orchestration tags + string tenantId = "default"; + if (executionContext?.OrchestrationTags?.TryGetValue("TenantId", out var tenant) == true) + { + tenantId = tenant; + } + + // Set ambient context + using (TenantContext.SetCurrent(tenantId)) + { + await next(); + } +}); +``` + +### Exception Handling Considerations + +> [!IMPORTANT] +> Exceptions thrown in middleware cause the work item to be **retried**, not failed. If you want to explicitly fail an orchestration or activity, you must set the result directly. + +```csharp +// CAUTION: This causes infinite retries, NOT a failure! +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + try + { + await next(); + } + catch (Exception ex) + { + // Logging is fine, but re-throwing will cause retries + logger.LogError(ex, "Activity failed"); + throw; // ⚠️ This causes the activity to be retried, not failed! 
+ } +}); +``` + +To properly fail an activity from middleware, use `TaskFailureException` or set the result: + +```csharp +// Option 1: Throw TaskFailureException (gets converted to TaskFailedEvent) +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + try + { + await next(); + } + catch (Exception ex) + { + // This properly fails the activity and reports failure to the orchestration + throw new TaskFailureException(ex.Message, ex, ex.ToString()); + } +}); + +// Option 2: Set the failure result directly +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + var scheduledEvent = context.GetProperty(); + + try + { + await next(); + } + catch (Exception ex) + { + // Explicitly set a failure result + context.SetProperty(new ActivityExecutionResult + { + ResponseEvent = new TaskFailedEvent( + eventId: -1, + taskScheduledEventId: scheduledEvent.EventId, + reason: ex.Message, + details: ex.ToString(), + failureDetails: new FailureDetails(ex)) + }); + // Don't re-throw - we've handled the failure + } +}); +``` + +### Authentication/Authorization + +```csharp +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var executionContext = context.GetProperty(); + + string? userId = null; + executionContext?.OrchestrationTags?.TryGetValue("UserId", out userId); + + if (string.IsNullOrEmpty(userId) || + !await authService.IsAuthorizedAsync(userId, "ExecuteOrchestration")) + { + // Don't throw - that would cause retries. Instead, fail the orchestration explicitly. + context.SetProperty(new OrchestratorExecutionResult + { + Actions = new[] + { + new OrchestrationCompleteOrchestratorAction + { + OrchestrationStatus = OrchestrationStatus.Failed, + Result = $"User {userId ?? "unknown"} is not authorized to execute orchestrations", + FailureDetails = new FailureDetails( + errorType: "UnauthorizedAccessException", + errorMessage: $"User {userId ?? 
"unknown"} is not authorized", + stackTrace: null, + innerFailure: null, + isNonRetriable: true) + } + } + }); + return; // Don't call next() + } + + await next(); +}); +``` + +## Middleware Context + +### Accessing Built-in Properties + +```csharp +// For orchestration middleware +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + // Core identification + var instance = context.GetProperty(); + var instanceId = instance?.InstanceId; + var executionId = instance?.ExecutionId; + + // Orchestration metadata + var runtimeState = context.GetProperty(); + var orchestrationName = runtimeState?.Name; + var orchestrationVersion = runtimeState?.Version; + var input = runtimeState?.Input; + var status = runtimeState?.OrchestrationStatus; + + // Tags + var executionContext = context.GetProperty(); + var tags = executionContext?.OrchestrationTags; + + // The orchestration implementation (may be null for out-of-process execution) + var orchestration = context.GetProperty(); + + await next(); +}); + +// For activity middleware +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + // Parent orchestration instance + var instance = context.GetProperty(); + + // Activity details from the scheduled event + var scheduledEvent = context.GetProperty(); + var activityName = scheduledEvent?.Name; + var activityVersion = scheduledEvent?.Version; + var activityInput = scheduledEvent?.Input; + var eventId = scheduledEvent?.EventId; + + // The activity implementation (may be null for out-of-process execution) + var activity = context.GetProperty(); + + await next(); +}); +``` + +### Setting Custom Properties + +```csharp +// First middleware sets a property +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + // Set a named property for downstream middleware + context.SetProperty("CorrelationId", Guid.NewGuid().ToString()); + await next(); +}); + +// Downstream middleware reads the property 
+worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var correlationId = context.GetProperty<string>("CorrelationId"); + Console.WriteLine($"Correlation ID: {correlationId}"); + await next(); +}); +``` + +## Middleware Ordering + +Middleware executes in a pipeline. The order of registration determines execution order: + +```csharp +// Registration order +worker.AddOrchestrationDispatcherMiddleware(AuthMiddleware); // 1st registered +worker.AddOrchestrationDispatcherMiddleware(LoggingMiddleware); // 2nd registered +worker.AddOrchestrationDispatcherMiddleware(MetricsMiddleware); // 3rd registered + +// Execution order (onion model): +// AuthMiddleware → +// LoggingMiddleware → +// MetricsMiddleware → +// [Orchestration executes] +// ← MetricsMiddleware returns +// ← LoggingMiddleware returns +// ← AuthMiddleware returns +``` + +## Best Practices + +### 1. Keep Middleware Focused + +Each middleware should have a single responsibility: + +```csharp +// Good - single responsibility with factory methods +public static class LoggingMiddleware +{ + public static Func<DispatchMiddlewareContext, Func<Task>, Task> Create(ILogger logger) => /* logging only */; +} + +public static class MetricsMiddleware +{ + public static Func<DispatchMiddlewareContext, Func<Task>, Task> Create(IMetrics metrics) => /* metrics only */; +} + +// Avoid combining multiple concerns in one middleware +``` + +### 2. 
Understand Exception Behavior + +Exceptions thrown in middleware cause **retries**, not failures: + +```csharp +// For activities: Use TaskFailureException to signal failure to orchestration +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + try + { + await next(); + } + catch (MyValidationException ex) + { + // Convert to TaskFailureException to properly fail the activity + throw new TaskFailureException(ex.Message, ex, ex.ToString()); + } + // Other exceptions will cause retries +}); + +// For orchestrations: Set result with failed status +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + try + { + await next(); + } + catch (Exception ex) when (ShouldFailOrchestration(ex)) + { + context.SetProperty(new OrchestratorExecutionResult + { + Actions = new[] + { + new OrchestrationCompleteOrchestratorAction + { + OrchestrationStatus = OrchestrationStatus.Failed, + Result = ex.Message, + FailureDetails = new FailureDetails(ex) + } + } + }); + // Don't re-throw - we've handled the failure + } +}); +``` + +### 3. Use Dependency Injection Patterns + +Capture dependencies via closures or factory methods: + +```csharp +// Using closures +public static Func, Task> CreateTelemetryMiddleware( + TelemetryClient telemetry, + ILogger logger) +{ + return async (context, next) => + { + var instance = context.GetProperty(); + telemetry.TrackEvent("OrchestrationStarted", + new Dictionary { ["InstanceId"] = instance?.InstanceId }); + + await next(); + }; +} + +// Registration +worker.AddOrchestrationDispatcherMiddleware( + CreateTelemetryMiddleware(telemetryClient, logger)); +``` + +### 4. 
Intercepting Execution Results + +Middleware can intercept and modify execution results: + +```csharp +// For orchestrations - intercept or provide custom results +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + await next(); + + // After execution, you can read the result + var result = context.GetProperty(); + // Inspect result.Actions, result.CustomStatus, etc. +}); + +// For activities - intercept or provide custom results +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + await next(); + + // After execution, you can read the result + var result = context.GetProperty(); + // Inspect result.ResponseEvent +}); +``` + +### 5. Out-of-Process Execution + +Middleware can completely replace execution for out-of-process scenarios: + +```csharp +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var runtimeState = context.GetProperty(); + + // Execute orchestration out-of-process and get result + var actions = await ExecuteOutOfProcessAsync(runtimeState); + + // Set the result directly - the default handler will be skipped + context.SetProperty(new OrchestratorExecutionResult + { + Actions = actions, + CustomStatus = "Executed out-of-process" + }); + + // Don't call next() if you're providing the result yourself +}); +``` + +## Next Steps + +- [Entities](entities.md) — Durable Entities pattern +- [Serialization](serialization.md) — Custom data converters +- [Testing](testing.md) — Testing orchestrations diff --git a/docs/advanced/serialization.md b/docs/advanced/serialization.md new file mode 100644 index 000000000..3faabfcd1 --- /dev/null +++ b/docs/advanced/serialization.md @@ -0,0 +1,477 @@ +# Serialization + +The Durable Task Framework uses serialization to persist orchestration state, activity inputs/outputs, and messages between components. Understanding serialization is essential for correct orchestration behavior. 
+ +## Default Serialization + +By default, DTFx uses JSON serialization via Newtonsoft.Json (Json.NET). + +The default `JsonDataConverter` uses these settings: + +```csharp +new JsonSerializerSettings +{ + TypeNameHandling = TypeNameHandling.Objects, + DateParseHandling = DateParseHandling.None, + SerializationBinder = new PackageUpgradeSerializationBinder() +} +``` + +**Key behaviors:** + +- `TypeNameHandling.Objects` — Includes type information for polymorphic deserialization +- `DateParseHandling.None` — Dates are not automatically parsed (preserves as strings) +- `PackageUpgradeSerializationBinder` — Handles type name migration across package versions + +## Custom DataConverter + +### Creating a Custom Converter + +Extend the abstract `DataConverter` class: + +```csharp +using DurableTask.Core.Serializing; +using System.Text.Json; + +public class SystemTextJsonDataConverter : DataConverter +{ + private readonly JsonSerializerOptions _options; + + public SystemTextJsonDataConverter() + { + _options = new JsonSerializerOptions + { + PropertyNamingPolicy = JsonNamingPolicy.CamelCase, + WriteIndented = false + }; + } + + public override string Serialize(object value) + { + return Serialize(value, formatted: false); + } + + public override string Serialize(object value, bool formatted) + { + if (value == null) + { + return null; + } + + var options = formatted + ? 
new JsonSerializerOptions(_options) { WriteIndented = true } + : _options; + + return JsonSerializer.Serialize(value, options); + } + + public override object Deserialize(string data, Type objectType) + { + if (string.IsNullOrEmpty(data)) + { + return null; + } + + return JsonSerializer.Deserialize(data, objectType, _options); + } +} +``` + +### Custom JsonSerializerSettings + +For custom Newtonsoft.Json settings, pass settings to the constructor: + +```csharp +var settings = new JsonSerializerSettings +{ + TypeNameHandling = TypeNameHandling.Auto, + NullValueHandling = NullValueHandling.Ignore, + DateFormatHandling = DateFormatHandling.IsoDateFormat, + ContractResolver = new CamelCasePropertyNamesContractResolver() +}; + +var converter = new JsonDataConverter(settings); +``` + +### Using Custom Converters + +Set custom converters on the `OrchestrationContext`: + +```csharp +public class MyOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + Input input) + { + // Use custom converter for messages (must be JsonDataConverter or subclass) + context.MessageDataConverter = new JsonDataConverter(customSettings); + + // Use custom converter for errors (must be JsonDataConverter or subclass) + context.ErrorDataConverter = new JsonDataConverter(customSettings); + + // Now all serialization uses custom converter + var result = await context.ScheduleTask( + typeof(MyActivity), + input); + + return result.ToString(); + } +} +``` + +> **Note:** The `MessageDataConverter` and `ErrorDataConverter` properties are typed as `JsonDataConverter`, not the base `DataConverter` class. To use completely custom serialization logic, you must subclass `JsonDataConverter` or use the `DataConverter` property on `TaskOrchestration` and `TaskActivity` classes instead. 
+ +## Activity Serialization + +Activities also use `DataConverter`: + +```csharp +public class MyActivity : TaskActivity +{ + public MyActivity() + : base(new CustomJsonDataConverter()) // Pass converter to base constructor + { + } + + protected override Output Execute(TaskContext context, Input input) + { + // input was deserialized with DataConverter + // return value will be serialized with DataConverter + return new Output { Value = input.Value * 2 }; + } +} +``` + +## Serialization Considerations + +### Immutable Types + +Use immutable types for orchestration inputs and outputs: + +```csharp +// Good - immutable record +public record OrderInput(string OrderId, List Items); + +// Good - immutable class +public class OrderInput +{ + public OrderInput(string orderId, List items) + { + OrderId = orderId; + Items = items.ToList(); // Defensive copy + } + + public string OrderId { get; } + public IReadOnlyList Items { get; } +} +``` + +### Polymorphic Types + +When using inheritance, ensure proper type handling: + +```csharp +// Base class +public abstract class PaymentMethod +{ + public string Id { get; set; } +} + +// Derived classes +public class CreditCard : PaymentMethod +{ + public string CardNumber { get; set; } +} + +public class BankTransfer : PaymentMethod +{ + public string AccountNumber { get; set; } +} + +// TypeNameHandling.Objects (default) handles this correctly +var payment = new CreditCard { Id = "1", CardNumber = "4111..." 
}; +var json = converter.Serialize(payment); +// json includes "$type" property for deserialization +``` + +### Circular References + +Avoid circular references in serialized objects: + +```csharp +// Bad - circular reference +public class Node +{ + public string Value { get; set; } + public Node Parent { get; set; } // Can create circular reference + public List Children { get; set; } +} + +// Better - use IDs for references +public class Node +{ + public string Id { get; set; } + public string Value { get; set; } + public string ParentId { get; set; } + public List ChildIds { get; set; } +} +``` + +If you must handle circular references: + +```csharp +var settings = new JsonSerializerSettings +{ + TypeNameHandling = TypeNameHandling.Objects, + ReferenceLoopHandling = ReferenceLoopHandling.Serialize, + PreserveReferencesHandling = PreserveReferencesHandling.Objects +}; +``` + +### Large Payloads + +Avoid large payloads in orchestration state: + +```csharp +// Bad - large payload stored in state +public class BadInput +{ + public byte[] FileContent { get; set; } // Could be megabytes +} + +// Better - store reference, not content +public class BetterInput +{ + public string BlobUri { get; set; } // Reference to blob storage +} + +public override async Task RunTask( + OrchestrationContext context, + BetterInput input) +{ + // Activity downloads content when needed + var content = await context.ScheduleTask( + typeof(DownloadBlobActivity), + input.BlobUri); + + // Process and store result + var resultUri = await context.ScheduleTask( + typeof(UploadResultActivity), + processedContent); + + return resultUri; +} +``` + +### Non-Serializable Types + +Never include `CancellationToken` or other non-serializable runtime types in your input/output classes: + +```csharp +// DANGEROUS - CancellationToken cannot be serialized safely +public class BadActivityInput +{ + public string Data { get; set; } + public CancellationToken CancellationToken { get; set; } // DO NOT DO THIS 
+} + +// Good - pass cancellation token through method parameters, not serialized state +public class GoodActivityInput +{ + public string Data { get; set; } +} +``` + +> [!WARNING] +> Attempting to serialize `CancellationToken` can cause memory corruption, application crashes, and unpredictable behavior. The `CancellationToken` struct contains internal handles and references that are not designed for serialization. + +Other types to avoid in serialized data: + +- `CancellationToken` and `CancellationTokenSource` +- `Task` and `Task<T>` +- `Thread`, `Timer`, and other threading primitives +- `Stream` and its derivatives +- `HttpClient` and other network clients +- Any type holding unmanaged resources or handles + +## Compression + +For large payloads, consider compression: + +```csharp +public class CompressedDataConverter : DataConverter +{ + private readonly JsonDataConverter _inner = JsonDataConverter.Default; + + public override string Serialize(object value) + { + string json = _inner.Serialize(value); + return CompressString(json); + } + + public override object Deserialize(string data, Type objectType) + { + string json = DecompressString(data); + return _inner.Deserialize(json, objectType); + } + + private string CompressString(string text) + { + var bytes = Encoding.UTF8.GetBytes(text); + using var output = new MemoryStream(); + using (var gzip = new GZipStream(output, CompressionLevel.Optimal)) + { + gzip.Write(bytes, 0, bytes.Length); + } + return Convert.ToBase64String(output.ToArray()); + } + + private string DecompressString(string compressed) + { + var bytes = Convert.FromBase64String(compressed); + using var input = new MemoryStream(bytes); + using var gzip = new GZipStream(input, CompressionMode.Decompress); + using var reader = new StreamReader(gzip); + return reader.ReadToEnd(); + } +} +``` + +## Version Compatibility + +### Schema Evolution + +Design for forward and backward compatibility: + +```csharp +// Version 1 +public class OrderV1 +{ + 
public string OrderId { get; set; } + public decimal Amount { get; set; } +} + +// Version 2 - added property +public class OrderV2 +{ + public string OrderId { get; set; } + public decimal Amount { get; set; } + public string Currency { get; set; } = "USD"; // Default for old data +} + +// Version 3 - renamed property +public class OrderV3 +{ + public string OrderId { get; set; } + + [JsonProperty("Amount")] // Map old name + public decimal TotalAmount { get; set; } + + public string Currency { get; set; } = "USD"; +} +``` + +### Type Name Changes + +When moving types between assemblies or namespaces: + +```csharp +// Custom binder to handle type migrations +public class MySerializationBinder : ISerializationBinder +{ + public Type BindToType(string assemblyName, string typeName) + { + // Handle old type names + if (typeName == "OldNamespace.MyType") + { + return typeof(NewNamespace.MyType); + } + + // Fall back to default + return Type.GetType($"{typeName}, {assemblyName}"); + } + + public void BindToName(Type serializedType, out string assemblyName, out string typeName) + { + assemblyName = serializedType.Assembly.FullName; + typeName = serializedType.FullName; + } +} +``` + +## Best Practices + +### 1. Use Simple, Serializable Types + +```csharp +// Good - simple POCO +public class OrderInput +{ + public string OrderId { get; set; } + public List ItemIds { get; set; } + public decimal Total { get; set; } +} + +// Avoid - complex types with behavior +public class OrderInput +{ + private readonly IOrderValidator _validator; // Not serializable + + public void Validate() { /* ... */ } // Behavior belongs elsewhere +} +``` + +### 2. Keep Payloads Small + +```csharp +// Good - minimal data +public class ProcessingInput +{ + public string DocumentId { get; set; } +} + +// Avoid - embedding large data +public class ProcessingInput +{ + public byte[] DocumentContent { get; set; } // Could be huge +} +``` + +### 3. 
Be Explicit About Nullability + +```csharp +public class OrderInput +{ + public string OrderId { get; set; } // Required + public string? CustomerNote { get; set; } // Optional + public List Items { get; set; } = new(); // Never null +} +``` + +### 4. Test Serialization Round-Trips + +```csharp +[Fact] +public void OrderInput_SerializesCorrectly() +{ + var input = new OrderInput + { + OrderId = "order-123", + Items = new List { "item-1", "item-2" } + }; + + var converter = JsonDataConverter.Default; + string json = converter.Serialize(input); + var deserialized = converter.Deserialize(json); + + Assert.Equal(input.OrderId, deserialized.OrderId); + Assert.Equal(input.Items, deserialized.Items); +} +``` + +## Next Steps + +- [Testing](testing.md) — Testing orchestrations +- [Middleware](middleware.md) — Custom middleware +- [Entities](entities.md) — Durable Entities diff --git a/docs/advanced/testing.md b/docs/advanced/testing.md new file mode 100644 index 000000000..83e779611 --- /dev/null +++ b/docs/advanced/testing.md @@ -0,0 +1,632 @@ +# Testing Orchestrations + +Testing durable orchestrations requires special consideration due to their replay-based execution model. This guide covers strategies and patterns for effectively testing your orchestrations and activities. + +## Testing Approaches + +There are three main approaches to testing DTFx code: + +1. **Unit testing** — Test components in isolation with mocks +2. **Integration testing** — Test with the in-memory emulator +3. 
**End-to-end testing** — Test with real backend providers + +## Unit Testing Activities + +Activities are standard async methods, making them straightforward to test: + +```csharp +using Microsoft.VisualStudio.TestTools.UnitTesting; + +[TestClass] +public class ActivityTests +{ + [TestMethod] + public async Task ProcessOrderActivity_ValidOrder_ReturnsConfirmation() + { + // Arrange + var activity = new ProcessOrderActivity( + mockInventoryService.Object, + mockPaymentService.Object); + + var orchestrationInstance = new OrchestrationInstance + { + InstanceId = "test-123", + ExecutionId = Guid.NewGuid().ToString() + }; + var context = new TaskContext(orchestrationInstance); + var input = new OrderInput { OrderId = "order-1", Amount = 99.99m }; + + // Act + var result = await activity.RunAsync(context, input); + + // Assert + Assert.IsNotNull(result); + Assert.AreEqual("Confirmed", result.Status); + } + + [TestMethod] + public async Task ProcessOrderActivity_InvalidOrder_ThrowsException() + { + // Arrange + var activity = new ProcessOrderActivity( + mockInventoryService.Object, + mockPaymentService.Object); + + var orchestrationInstance = new OrchestrationInstance + { + InstanceId = "test-123", + ExecutionId = Guid.NewGuid().ToString() + }; + var context = new TaskContext(orchestrationInstance); + var input = new OrderInput { OrderId = null }; + + // Act & Assert + await Assert.ThrowsExceptionAsync( + () => activity.RunAsync(context, input)); + } +} +``` + +### Testing with Dependencies + +Use dependency injection for testable activities: + +```csharp +public class SendEmailActivity : AsyncTaskActivity +{ + private readonly IEmailService _emailService; + + public SendEmailActivity(IEmailService emailService) + { + _emailService = emailService; + } + + protected override async Task ExecuteAsync( + TaskContext context, + EmailRequest input) + { + return await _emailService.SendAsync(input); + } +} + +[TestClass] +public class SendEmailActivityTests +{ + [TestMethod] + 
public async Task SendEmail_ValidRequest_Succeeds() + { + // Arrange + var mockEmailService = new Mock(); + mockEmailService + .Setup(x => x.SendAsync(It.IsAny())) + .ReturnsAsync(new EmailResult { Success = true, MessageId = "msg-1" }); + + var activity = new SendEmailActivity(mockEmailService.Object); + + // Act + var result = await activity.ExecuteAsync( + context: null!, + input: new EmailRequest { To = "test@example.com", Subject = "Test" }); + + // Assert + Assert.IsTrue(result.Success); + Assert.AreEqual("msg-1", result.MessageId); + mockEmailService.Verify(x => x.SendAsync(It.IsAny()), Times.Once); + } +} +``` + +## Unit Testing Orchestrations + +Orchestrations are harder to unit test due to their stateful nature and use of the `OrchestrationContext`. The recommended approach is to use **integration testing with the emulator** (see below), but you can also extract testable logic into separate classes. + +### Extract Business Logic for Unit Testing + +Extract complex business logic into separate, testable classes. Keep orchestration code thin—focused only on coordination: + +```csharp +// Testable logic class - no orchestration dependencies +public class OrderLogic : IOrderLogic +{ + public void ValidateOrder(OrderInput input) + { + if (string.IsNullOrEmpty(input.OrderId)) + throw new ArgumentException("OrderId is required"); + } + + public NextStep DetermineNextStep(InventoryResult inventory) + { + return inventory.AllAvailable + ? 
NextStep.ProcessPayment + : NextStep.BackOrder; + } +} + +// Unit tests for the extracted logic +[TestClass] +public class OrderLogicTests +{ + [TestMethod] + public void ValidateOrder_MissingOrderId_ThrowsArgumentException() + { + var logic = new OrderLogic(); + var input = new OrderInput { OrderId = null }; + + Assert.ThrowsException( + () => logic.ValidateOrder(input)); + } + + [TestMethod] + public void DetermineNextStep_AllAvailable_ReturnsProcessPayment() + { + var logic = new OrderLogic(); + var inventory = new InventoryResult { AllAvailable = true }; + + var result = logic.DetermineNextStep(inventory); + + Assert.AreEqual(NextStep.ProcessPayment, result); + } +} +``` + +Then use the logic in your orchestration: + +```csharp +public class OrderOrchestration : TaskOrchestration +{ + // Use a static/singleton instance or instantiate directly + // Note: Constructor dependency injection is NOT supported by default + // because the framework uses Activator.CreateInstance() which requires + // a parameterless constructor. + private readonly IOrderLogic _logic = new OrderLogic(); + + public override async Task RunTask( + OrchestrationContext context, + OrderInput input) + { + // Validate using testable logic + _logic.ValidateOrder(input); + + var inventory = await context.ScheduleTask( + typeof(CheckInventoryActivity), + input.Items); + + // Process result using testable logic + var decision = _logic.DetermineNextStep(inventory); + + // ... rest of orchestration + } +} +``` + +> [!IMPORTANT] +> Orchestrations are instantiated by the framework using `Activator.CreateInstance()`, which requires a parameterless constructor. Constructor-based dependency injection is not supported out of the box. If you need DI, you must implement a custom `ObjectCreator` and register it with `AddTaskOrchestrations()`. + +### Why Not Mock OrchestrationContext? + +`OrchestrationContext` is an abstract class with complex internal state management for replay semantics. 
Creating a proper mock requires implementing many methods and simulating the replay behavior correctly. **Integration testing with the emulator is strongly recommended** instead—it's fast, reliable, and tests the actual orchestration behavior. + +## Integration Testing with Emulator + +The emulator provides fast, isolated testing without external dependencies: + +```csharp +using DurableTask.Core; +using DurableTask.Emulator; +using Microsoft.Extensions.Logging; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +[TestClass] +public class OrderOrchestrationIntegrationTests +{ + private ILoggerFactory _loggerFactory; + private LocalOrchestrationService _service; + private TaskHubWorker _worker; + private TaskHubClient _client; + + [TestInitialize] + public async Task Setup() + { + _loggerFactory = LoggerFactory.Create(builder => builder.AddConsole()); + _service = new LocalOrchestrationService(); + _worker = new TaskHubWorker(_service, _loggerFactory); + _client = new TaskHubClient(_service, loggerFactory: _loggerFactory); + + // Register orchestrations and activities + _worker.AddTaskOrchestrations(typeof(OrderOrchestration)); + _worker.AddTaskActivities( + typeof(ValidateOrderActivity), + typeof(ProcessPaymentActivity), + typeof(SendConfirmationActivity)); + + await _worker.StartAsync(); + } + + [TestCleanup] + public async Task Cleanup() + { + await _worker.StopAsync(isForced: true); + } + + [TestMethod] + public async Task OrderOrchestration_ValidOrder_CompletesSuccessfully() + { + // Arrange + var input = new OrderInput + { + OrderId = "order-123", + CustomerId = "customer-456", + Items = new[] { "item-1", "item-2" } + }; + + // Act + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(OrderOrchestration), + input); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + var output = result.GetOutput(); 
+ Assert.AreEqual("Confirmed", output.Status); + } + + [TestMethod] + public async Task OrderOrchestration_InvalidOrder_Fails() + { + // Arrange + var input = new OrderInput { OrderId = null }; + + // Act + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(OrderOrchestration), + input); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Failed, result.OrchestrationStatus); + } +} +``` + +### Testing Timeouts and Timers + +```csharp +[TestMethod] +public async Task ReminderOrchestration_SendsReminderAfterDelay() +{ + // Arrange + var input = new ReminderInput { DelayMinutes = 30 }; + var remindersSent = new List(); + + // Track activity calls + _worker.AddTaskActivities( + new MockSendReminderActivity(reminder => remindersSent.Add(reminder))); + + // Act + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(ReminderOrchestration), + input); + + // Note: Emulator runs timers immediately in test mode + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + Assert.AreEqual(1, remindersSent.Count); +} +``` + +### Testing Sub-Orchestrations + +```csharp +[TestMethod] +public async Task ParentOrchestration_CallsChildOrchestration() +{ + // Arrange + _worker.AddTaskOrchestrations( + typeof(ParentOrchestration), + typeof(ChildOrchestration)); + _worker.AddTaskActivities(typeof(ChildActivity)); + + // Act + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(ParentOrchestration), + new ParentInput { Value = 5 }); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + var output = result.GetOutput(); + Assert.AreEqual(10, output.ProcessedValue); // 
Child doubled the value +} +``` + +### Testing External Events + +```csharp +[TestMethod] +public async Task ApprovalOrchestration_WaitsForApproval() +{ + // Arrange + var input = new ApprovalRequest { RequestId = "req-1", Amount = 500 }; + + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(ApprovalOrchestration), + input); + + // Wait a bit for orchestration to reach the wait point + await Task.Delay(100); + + // Act - send approval event + await _client.RaiseEventAsync( + instance, + "ApprovalResult", + new ApprovalResult { Approved = true, ApprovedBy = "manager@example.com" }); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + var output = result.GetOutput(); + Assert.IsTrue(output.WasApproved); +} +``` + +## Testing Retry Behavior + +```csharp +[TestMethod] +public async Task Orchestration_RetriesFailedActivity() +{ + // Arrange + var failCount = 0; + var failingActivity = new Func>( + async (context, input) => + { + failCount++; + if (failCount < 3) + { + throw new TransientException("Temporary failure"); + } + return "Success"; + }); + + _worker.AddTaskActivities( + TestOrchestrationHost.MakeActivity( + "FailingActivity", + failingActivity)); + + _worker.AddTaskOrchestrations(typeof(RetryingOrchestration)); + + // Act + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(RetryingOrchestration), + "input"); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + Assert.AreEqual(3, failCount); // Failed twice, succeeded on third attempt +} +``` + +## Testing Replay Behavior + +Ensure your orchestrations handle replay correctly: + +```csharp +[TestMethod] +public async Task Orchestration_DoesNotDuplicateSideEffects() +{ + // Arrange + var 
sideEffectCount = 0; + + _worker.AddTaskActivities( + new CountingSideEffectActivity(() => Interlocked.Increment(ref sideEffectCount))); + + _worker.AddTaskOrchestrations(typeof(SideEffectOrchestration)); + + // Act - run orchestration that will replay + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(SideEffectOrchestration), + "input"); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Assert - side effect should only occur once despite replays + Assert.AreEqual(1, sideEffectCount); +} +``` + +## Test Helpers + +### Creating Mock Activities + +```csharp +public static class TestHelpers +{ + public static TaskActivity MakeActivity( + string name, + Func> implementation) + { + return new FuncTaskActivity(implementation) + { + Name = name + }; + } +} + +// Usage +var mockActivity = TestHelpers.MakeActivity( + "ProcessOrder", + async (context, input) => new OrderResult { Status = "Confirmed" }); +``` + +### Test Base Class + +```csharp +public abstract class OrchestrationTestBase +{ + protected ILoggerFactory LoggerFactory; + protected LocalOrchestrationService Service; + protected TaskHubWorker Worker; + protected TaskHubClient Client; + + [TestInitialize] + public virtual async Task TestInitialize() + { + LoggerFactory = Microsoft.Extensions.Logging.LoggerFactory.Create(builder => builder.AddConsole()); + Service = new LocalOrchestrationService(); + Worker = new TaskHubWorker(Service, LoggerFactory); + Client = new TaskHubClient(Service, loggerFactory: LoggerFactory); + + RegisterOrchestrations(Worker); + RegisterActivities(Worker); + + await Worker.StartAsync(); + } + + [TestCleanup] + public virtual async Task TestCleanup() + { + await Worker.StopAsync(isForced: true); + } + + protected abstract void RegisterOrchestrations(TaskHubWorker worker); + protected abstract void RegisterActivities(TaskHubWorker worker); + + protected async Task RunOrchestrationAsync( + Type orchestrationType, 
+ object input, + TimeSpan? timeout = null) + { + var instance = await Client.CreateOrchestrationInstanceAsync( + orchestrationType, + input); + + var result = await Client.WaitForOrchestrationAsync( + instance, + timeout ?? TimeSpan.FromSeconds(30)); + + if (result.OrchestrationStatus == OrchestrationStatus.Failed) + { + throw new Exception( + $"Orchestration failed: {result.FailureDetails?.ErrorMessage}"); + } + + return result.GetOutput(); + } +} +``` + +## Best Practices + +### 1. Use the Emulator for Speed + +```csharp +// Fast - use emulator for most tests +var service = new LocalOrchestrationService(); + +// Slow - only for end-to-end tests +var service = new AzureStorageOrchestrationService(settings); +``` + +### 2. Test Determinism + +Verify orchestrations are deterministic: + +```csharp +[TestMethod] +public async Task Orchestration_IsDeterministic() +{ + // Run the same orchestration multiple times + for (int i = 0; i < 5; i++) + { + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + new Input { Value = 42 }); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + Assert.AreEqual(OrchestrationStatus.Completed, result.OrchestrationStatus); + Assert.AreEqual(84, result.GetOutput()); + } +} +``` + +### 3. Test Edge Cases + +```csharp +[TestMethod] +public async Task Orchestration_HandlesNullInput() +{ + // Test with null + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + input: null); + + var result = await _client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + + // Verify appropriate handling +} + +[TestMethod] +public async Task Orchestration_HandlesEmptyList() +{ + var input = new Input { Items = new List() }; + + var instance = await _client.CreateOrchestrationInstanceAsync( + typeof(ProcessItemsOrchestration), + input); + + // ... +} +``` + +### 4. 
Isolate Tests + +```csharp +[TestInitialize] +public async Task Setup() +{ + // Create fresh service for each test + _service = new LocalOrchestrationService(); + // ... +} +``` + +## Sample Test Project + +See the complete test examples: +- [DurableTask.Samples.Tests](../../Test/DurableTask.Samples.Tests) +- [DurableTask.Core.Tests](../../Test/DurableTask.Core.Tests) +- [DurableTask.AzureStorage.Tests](../../Test/DurableTask.AzureStorage.Tests) + +## Next Steps + +- [Middleware](middleware.md) — Custom middleware +- [Serialization](serialization.md) — Custom serialization +- [Error Handling](../features/error-handling.md) — Exception handling diff --git a/docs/concepts/README.md b/docs/concepts/README.md new file mode 100644 index 000000000..1ad80e857 --- /dev/null +++ b/docs/concepts/README.md @@ -0,0 +1,16 @@ +# Core Concepts + +This section explains the fundamental concepts you need to understand when working with the Durable Task Framework. + +## Suggested Reading Order + +| Topic | Description | +| ----- | ----------- | +| [Core Concepts](core-concepts.md) | Architecture overview: Task Hubs, Workers, Clients | +| [Orchestrations](orchestrations.md) | Creating and managing durable workflows | +| [Activities](activities.md) | Implementing the basic units of work | +| [Replay and Durability](replay-and-durability.md) | How event sourcing enables fault tolerance | +| [Deterministic Constraints](deterministic-constraints.md) | Rules for writing correct orchestration code | + +> [!TIP] +> Start with [Core Concepts](core-concepts.md) for the architecture overview, then read [Replay and Durability](replay-and-durability.md) to understand *why* orchestrations have special constraints. diff --git a/docs/concepts/activities.md b/docs/concepts/activities.md new file mode 100644 index 000000000..803e6a8f9 --- /dev/null +++ b/docs/concepts/activities.md @@ -0,0 +1,354 @@ +# Activities + +Activities are the basic units of work in the Durable Task Framework. 
They perform actual operations like calling APIs, accessing databases, or performing computations. Unlike orchestrations, activities do not need to be deterministic. + +## Creating Activities + +### Type Parameters + +- `TInput` — The input type passed from the orchestration +- `TResult` — The return type sent back to the orchestration + +Note that input and output types must be JSON-serializable. See [serialization](../advanced/serialization.md) for details. + +### Synchronous Activities + +For simple, synchronous work: + +```csharp +using DurableTask.Core; + +public class GreetActivity : TaskActivity +{ + protected override string Execute(TaskContext context, string name) + { + return $"Hello, {name}!"; + } +} +``` + +### Asynchronous Activities + +For async operations (recommended for I/O): + +```csharp +public class CallApiActivity : AsyncTaskActivity +{ + private static readonly HttpClient s_httpClient = new HttpClient(); + + protected override async Task ExecuteAsync( + TaskContext context, + ApiRequest input) + { + using var response = await s_httpClient.PostAsJsonAsync(input.Url, input.Body); + response.EnsureSuccessStatusCode(); + return await response.Content.ReadFromJsonAsync(); + } +} +``` + +## Registration + +### Basic Registration + +```csharp +var worker = new TaskHubWorker(service, loggerFactory); +worker.AddTaskActivities(typeof(GreetActivity), typeof(CallApiActivity)); +await worker.StartAsync(); +``` + +### With Dependency Injection + +Create activity instances with dependencies: + +```csharp +// Using activity factory +worker.AddTaskActivities(new ActivityObjectCreator( + () => new CallApiActivity(httpClient))); + +// Or implement INameVersionObjectManager for full control +``` + +### With Generic Creator + +```csharp +public class MyActivityCreator : ObjectCreator +{ + private readonly IServiceProvider _services; + + public MyActivityCreator(IServiceProvider services) + { + _services = services; + } + + public override TaskActivity Create() + 
{ + // Resolve from DI container + return (TaskActivity)_services.GetRequiredService(Type); + } +} + +// Register +worker.AddTaskActivities(new MyActivityCreator(serviceProvider)); +``` + +## Calling Activities from Orchestrations + +### Basic Call + +```csharp +var result = await context.ScheduleTask(typeof(GreetActivity), "World"); +``` + +### With Retry Options + +```csharp +var retryOptions = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3) +{ + BackoffCoefficient = 2.0, + MaxRetryInterval = TimeSpan.FromMinutes(1), + RetryTimeout = TimeSpan.FromMinutes(10) +}; + +var result = await context.ScheduleWithRetry( + typeof(CallApiActivity), + retryOptions, + apiRequest); +``` + +### Using Typed Proxies + +Generate strongly-typed activity clients: + +```csharp +// Define interface +public interface IOrderActivities +{ + Task ValidateOrder(Order order); + Task ProcessPayment(PaymentRequest request); + Task ShipOrder(ShippingRequest request); +} + +// In orchestration +public override async Task RunTask( + OrchestrationContext context, + Order order) +{ + var activities = context.CreateClient(); + + var isValid = await activities.ValidateOrder(order); + if (!isValid) return new OrderResult { Success = false }; + + var payment = await activities.ProcessPayment(order.Payment); + var tracking = await activities.ShipOrder(order.Shipping); + + return new OrderResult { Success = true, TrackingNumber = tracking }; +} +``` + +> [!IMPORTANT] +> Do not include `TaskContext` or `CancellationToken` parameters in activity interface methods. Only JSON-serializable input and output types are allowed. + +## Activity Best Practices + +### 1. 
Keep Activities Focused + +Each activity should do one thing: + +```csharp +// ✅ Good - single responsibility +public class SendEmailActivity : AsyncTaskActivity { } +public class SaveToDbActivity : AsyncTaskActivity { } + +// ❌ Bad - too many responsibilities +public class DoEverythingActivity : AsyncTaskActivity +{ + // Sends email, saves to DB, calls API, etc. +} +``` + +The exception to this is when performance considerations require batching multiple related operations together to reduce overhead. However, this must be done carefully with attention to error handling and idempotency. + +### 2. Make Activities Idempotent + +Activities may be retried, so design them to be idempotent: + +```csharp +public class ProcessPaymentActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + PaymentRequest input) + { + // Use idempotency key to prevent duplicate charges + return await _paymentService.ProcessAsync( + input, + idempotencyKey: input.OrderId); + } +} +``` + +### 3. Handle Timeouts + +Implement cancellation support: + +```csharp +public class LongRunningActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + Input input) + { + using var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5)); + + try + { + return await DoWorkAsync(input, cts.Token); + } + catch (OperationCanceledException) + { + throw new TimeoutException("Activity timed out"); + } + } +} +``` + +### 4. Log with Context + +Include orchestration context in logs: + +```csharp +public class MyActivity : AsyncTaskActivity +{ + private readonly ILogger _logger; + + protected override async Task ExecuteAsync( + TaskContext context, + Input input) + { + _logger.LogInformation( + "Processing {Input} for orchestration {InstanceId}", + input, + context.OrchestrationInstance.InstanceId); + + // ... do work ... + } +} +``` + +### 5. 
Return Serializable Results + +Ensure return types can be serialized: + +```csharp +// ✅ Good - serializable POCO +public class ActivityResult +{ + public string Status { get; set; } + public int Count { get; set; } + public DateTime ProcessedAt { get; set; } +} + +// ❌ Bad - not serializable +public class BadResult +{ + public HttpClient Client { get; set; } // Can't serialize + public Stream DataStream { get; set; } // Can't serialize +} +``` + +## Activity Execution Model + +### How Activities Run + +1. Orchestration calls `ScheduleTask()` — creates a `TaskScheduled` event in the orchestration history +2. Activity message is placed on the provider-specific work item queue +3. A worker picks up the message and executes the activity (typically as competing consumers) +4. Result is sent back to the orchestration's provider-specific control queue +5. Orchestration replays and sees `TaskCompleted` event in its updated history + +### Activity vs Orchestration Context + +| Feature | Activity (`TaskContext`) | Orchestration (`OrchestrationContext`) | +| ------- | ------------------------ | -------------------------------------- | +| Instance info | ✅ Available | ✅ Available | +| Schedule tasks | ❌ No | ✅ Yes | +| Create timers | ❌ No | ✅ Yes | +| Wait for events | ❌ No | ✅ Yes | +| Determinism required | ❌ No | ✅ Yes | +| Can call external APIs | ✅ Yes | ❌ Should not | + +## Error Handling in Activities + +### Throwing Exceptions + +Unhandled exceptions fail the activity and become `TaskFailedException` in the orchestration: + +```csharp +public class ValidateActivity : TaskActivity +{ + protected override bool Execute(TaskContext context, Order order) + { + if (string.IsNullOrEmpty(order.CustomerId)) + { + throw new ArgumentException("Customer ID is required"); + } + return true; + } +} + +// In orchestration +try +{ + await context.ScheduleTask(typeof(ValidateActivity), order); +} +catch (TaskFailedException ex) +{ + // ex.InnerException contains the original 
ArgumentException +} +``` + +### Returning Errors vs Throwing + +Consider returning error results for expected failures: + +```csharp +public class ProcessOrderActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + Order order) + { + var inventory = await CheckInventoryAsync(order); + + if (!inventory.IsAvailable) + { + // Expected case - return result + return new OrderResult + { + Success = false, + Error = "Insufficient inventory" + }; + } + + // Unexpected case - throw + if (order.TotalAmount < 0) + { + throw new InvalidOperationException("Invalid order amount"); + } + + return new OrderResult { Success = true }; + } +} +``` + +This approach avoids potentially expensive retries for known failure conditions, and also avoids problems with serializing exceptions. + +## Next Steps + +- [Orchestrations](orchestrations.md) — Coordinating activities +- [Retries](../features/retries.md) — Configuring automatic retries +- [Error Handling](../features/error-handling.md) — Comprehensive error handling +- [Replay and Durability](replay-and-durability.md) — Understanding the replay model diff --git a/docs/concepts/core-concepts.md b/docs/concepts/core-concepts.md new file mode 100644 index 000000000..86d0c03b9 --- /dev/null +++ b/docs/concepts/core-concepts.md @@ -0,0 +1,230 @@ +# Core Concepts + +This document explains the fundamental concepts of the Durable Task Framework (DTFx). 
+ +## Architecture Overview + +```text +┌─────────────────────────────────────────────────────────────────┐ +│ Task Hub │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ TaskHubWorker │ │ TaskHubClient │ │ +│ │ │ │ │ │ +│ │ ┌─────────────┐ │ │ • Start │ │ +│ │ │Orchestration│ │ │ • Query │ │ +│ │ │ Handlers │ │ │ • Send Events │ │ +│ │ └─────────────┘ │ │ • Terminate │ │ +│ │ ┌─────────────┐ │ │ │ │ +│ │ │ Activity │ │ └────────┬────────┘ │ +│ │ │ Handlers │ │ │ │ +│ │ └─────────────┘ │ │ │ +│ └────────┬────────┘ │ │ +│ │ │ │ +│ └──────────────────┬───────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────┐ │ +│ │ IOrchestrationService │ │ +│ │ (Backend Provider) │ │ +│ │ │ │ +│ │ • Message Queues (control, work items) │ │ +│ │ • State Storage (history, instances) │ │ +│ │ • Scale Management (partitions, etc.) │ │ +│ └───────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Task Hub + +A **Task Hub** is a logical container for orchestration and activity state. It represents a single deployment unit and includes: + +- **Message Queues** — Queues for orchestration and activity work items +- **History Store** — Persistent storage for orchestration history +- **Instance Store** — Metadata about orchestration instances for querying + +### Key Characteristics + +- Each task hub is **isolated** — orchestrations in different task hubs cannot interact directly +- Multiple workers can connect to the same task hub for **scale-out** +- All connected workers must share the same backend provider configuration and orchestration/activity code +- The task hub name is used as a **namespace** for all stored data + +## TaskHubWorker + +The **TaskHubWorker** hosts and executes orchestrations and activities. 
It: + +- Polls the backend for work items +- Dispatches orchestration and activity code +- Reports completion back to the backend + +### Lifecycle + +```csharp +// Create worker +var orchestrationService = GetSelectedOrchestrationService(); +var worker = new TaskHubWorker(orchestrationService, loggerFactory); + +// Register handlers +worker.AddTaskOrchestrations(typeof(MyOrchestration)); +worker.AddTaskActivities(typeof(MyActivity)); + +// Start processing +await worker.StartAsync(); + +// ... application runs ... + +// Graceful shutdown +await worker.StopAsync(); +``` + +### Scaling + +Multiple workers can connect to the same task hub: + +```text +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Worker 1 │ │ Worker 2 │ │ Worker 3 │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + └────────────────┼────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ Task Hub │ + └─────────────────┘ +``` + +Work is distributed across workers automatically by the selected backend provider. + +## TaskHubClient + +The **TaskHubClient** is used to manage orchestration instances from external code: + +```csharp +var orchestrationService = GetSelectedOrchestrationService(); +var client = new TaskHubClient(orchestrationService, loggerFactory: loggerFactory); + +// Start a new orchestration +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + instanceId: "order-123", + input: new OrderData { ... }); + +// Query status +var state = await client.GetOrchestrationStateAsync(instance); + +// Send an event +await client.RaiseEventAsync(instance, "ApprovalReceived", approvalData); + +// Wait for completion +var result = await client.WaitForOrchestrationAsync(instance, timeout); + +// Terminate +await client.TerminateInstanceAsync(instance, "Cancelled by user"); +``` + +## Orchestrations + +**Orchestrations** are the core workflow definitions. 
They: + +- Define the sequence and logic of work +- Coordinate activities and sub-orchestrations +- Are **durable** — survive process restarts +- Must be **deterministic** — same input produces same sequence of actions + +```csharp +public class OrderOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + OrderInput input) + { + // Orchestration logic here + var validated = await context.ScheduleTask(typeof(ValidateOrder), input); + + if (!validated) + return new OrderResult { Success = false }; + + await context.ScheduleTask(typeof(ProcessPayment), input); + await context.ScheduleTask(typeof(ShipOrder), input); + + return new OrderResult { Success = true }; + } +} +``` + +See [Orchestrations](orchestrations.md) for detailed documentation. + +## Activities + +**Activities** are the units of work that orchestrations schedule. They: + +- Perform the actual work (API calls, database operations, etc.) +- Can be **retried** automatically on failure +- Are **not** required to be deterministic +- Run **at least once** per scheduled invocation (an at-least-once guarantee, so they may occasionally run more than once and should be idempotent) + +```csharp +public class ProcessPaymentActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + PaymentInput input) + { + // Actual work here - call payment API, etc. + var result = await PaymentService.ProcessAsync(input); + return result; + } +} +``` + +See [Activities](activities.md) for detailed documentation. + +## Instance IDs + +Every orchestration instance has a unique **Instance ID**: + +```csharp +// Auto-generated ID +var instance = await client.CreateOrchestrationInstanceAsync(typeof(MyOrchestration), input); +// instance.InstanceId = "abc123..." 
+ +// Custom ID (recommended for idempotency) +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + instanceId: "order-456", // Your custom ID + input: orderData); +``` + +### Best Practices + +- Use **meaningful IDs** like `order-{orderId}` or `user-{userId}-workflow` +- If no meaningful ID is available, use a random GUID and store it so you can look up the instance later +- Avoid reusing IDs for different logical workflows to prevent conflicts + +## Orchestration Status + +Orchestrations can be in one of these states: + +| Status | Description | +| ------ | ----------- | +| `Pending` | Scheduled but not yet started | +| `Running` | Currently executing or waiting | +| `Suspended` | Paused due to external request | +| `Completed` | Finished successfully | +| `Failed` | Ended due to an unhandled exception | +| `Terminated` | Explicitly terminated via API | +| `Canceled` | Not currently implemented | +| `ContinuedAsNew` | Restarted via `ContinueAsNew` (not used in recent versions) | + +```csharp +var state = await client.GetOrchestrationStateAsync(instance); +Console.WriteLine($"Status: {state.OrchestrationStatus}"); +``` + +## Next Steps + +- [Orchestrations](orchestrations.md) — Writing orchestration logic +- [Activities](activities.md) — Writing activity code +- [Replay and Durability](replay-and-durability.md) — How durability works +- [Deterministic Constraints](deterministic-constraints.md) — Rules for orchestration code diff --git a/docs/concepts/deterministic-constraints.md b/docs/concepts/deterministic-constraints.md new file mode 100644 index 000000000..8ef4ff3b3 --- /dev/null +++ b/docs/concepts/deterministic-constraints.md @@ -0,0 +1,290 @@ +# Deterministic Constraints + +Orchestration code must be **deterministic**—it must produce the same sequence of operations every time it runs with the same history. This is required because orchestrations are [replayed](replay-and-durability.md) to rebuild state after interruptions. 
+ +## The Golden Rule + +> **The same input must always produce the same sequence of durable operations.** + +Durable operations include: + +- `ScheduleTask` / `ScheduleWithRetry` +- `CreateTimer` +- `WaitForExternalEvent` +- `CreateSubOrchestrationInstance` +- `ContinueAsNew` + +## What NOT to Do + +### ❌ Don't Use Current Time Directly + +```csharp +// ❌ WRONG - Non-deterministic +if (DateTime.UtcNow > deadline) +{ + await context.ScheduleTask(typeof(ExpiredActivity), input); +} + +// ✅ CORRECT - Use orchestration time +if (context.CurrentUtcDateTime > deadline) +{ + await context.ScheduleTask(typeof(ExpiredActivity), input); +} +``` + +### ❌ Don't Use Random Numbers + +```csharp +// ❌ WRONG - Different on replay +var random = new Random(); +if (random.Next(100) > 50) +{ + await context.ScheduleTask(typeof(ActivityA), input); +} + +// ✅ CORRECT - Get random value from activity +var randomValue = await context.ScheduleTask(typeof(GetRandomNumberActivity), 100); +if (randomValue > 50) +{ + await context.ScheduleTask(typeof(ActivityA), input); +} + +// ✅ OR use a fixed seed +var random = new Random(42); // Fixed seed +if (random.Next(100) > 50) +{ + await context.ScheduleTask(typeof(ActivityA), input); +} +``` + +### ❌ Don't Use GUIDs Directly + +```csharp +// ❌ WRONG - Different GUID on replay +var id = Guid.NewGuid().ToString(); +await context.ScheduleTask(typeof(ProcessActivity), id); + +// ✅ CORRECT - Use orchestration's NewGuid +var id = context.NewGuid().ToString(); +await context.ScheduleTask(typeof(ProcessActivity), id); + +// ✅ Also correct - Get from activity +var id = await context.ScheduleTask(typeof(GenerateIdActivity), null); +``` + +### ❌ Don't Read Environment Variables + +```csharp +// ❌ WRONG - May change between replays +var endpoint = Environment.GetEnvironmentVariable("API_ENDPOINT"); +await context.ScheduleTask(typeof(CallApiActivity), endpoint); + +// ✅ CORRECT - Pass as input or read in activity +// Option 1: Pass as orchestration input 
+await context.ScheduleTask(typeof(CallApiActivity), input.ApiEndpoint); + +// Option 2: Read in activity +await context.ScheduleTask(typeof(CallApiWithConfigActivity), input); +``` + +### ❌ Don't Make Network Calls + +```csharp +// ❌ WRONG - Side effect, non-deterministic +var response = await httpClient.GetAsync("https://api.example.com/data"); +var data = await response.Content.ReadAsStringAsync(); + +// ✅ CORRECT - Use activity for network calls +var data = await context.ScheduleTask(typeof(FetchDataActivity), "https://api.example.com/data"); +``` + +> [!NOTE] +> Awaiting a non-durable task like `httpClient.GetAsync` may cause the orchestration to hang indefinitely. + +### ❌ Don't Access Databases + +```csharp +// ❌ WRONG - Data may change between replays +var user = await dbContext.Users.FindAsync(userId); + +// ✅ CORRECT - Use activity +var user = await context.ScheduleTask(typeof(GetUserActivity), userId); +``` + +> [!NOTE] +> Awaiting a non-durable task like `dbContext.Users.FindAsync` may cause the orchestration to hang indefinitely. + +### ❌ Don't Use Thread.Sleep + +```csharp +// ❌ WRONG - Blocks thread, doesn't persist +Thread.Sleep(TimeSpan.FromMinutes(5)); +await Task.Delay(TimeSpan.FromMinutes(5)); + +// ✅ CORRECT - Use durable timer +await context.CreateTimer(context.CurrentUtcDateTime.AddMinutes(5), true); +``` + +> [!NOTE] +> Awaiting a non-durable task like `Task.Delay` may cause the orchestration to hang indefinitely. + +### ❌ Don't Use Mutable Static Variables + +```csharp +// ❌ WRONG - State not preserved across replays +static int counter = 0; +counter++; +if (counter > 5) { ... } + +// ✅ CORRECT - Use orchestration input/output for state +public override async Task RunTask(OrchestrationContext context, int currentCount) +{ + currentCount++; + if (currentCount > 5) { ... 
} +} +``` + +### ❌ Don't Use Non-Deterministic Collections + +```csharp +// ❌ WRONG - HashSet and Dictionary iteration order is not guaranteed +var items = new HashSet<string> { "a", "b", "c" }; +foreach (var item in items) +{ + await context.ScheduleTask(typeof(ProcessActivity), item); +} + +// ✅ CORRECT - Use ordered collection +var items = new List<string> { "a", "b", "c" }; +foreach (var item in items) +{ + await context.ScheduleTask(typeof(ProcessActivity), item); +} +``` + +### ❌ Don't Use Task.Run or Threading APIs + +```csharp +// ❌ WRONG - Background tasks are non-deterministic and may not complete before replay +await Task.Run(() => ProcessData(input)); + +// ❌ WRONG - Manual thread creation is non-deterministic +var thread = new Thread(() => DoWork()); +thread.Start(); + +// ❌ WRONG - ThreadPool work is non-deterministic +ThreadPool.QueueUserWorkItem(_ => ProcessItem(input)); + +// ✅ CORRECT - Use activities for background work +var result = await context.ScheduleTask(typeof(ProcessDataActivity), input); + +// ✅ CORRECT - Use fan-out pattern for parallel work +var tasks = input.Items.Select(item => + context.ScheduleTask(typeof(ProcessItemActivity), item)); +var results = await Task.WhenAll(tasks); +``` + +> [!NOTE] +> `Task.Run`, `ThreadPool.QueueUserWorkItem`, and manual thread creation introduce non-determinism because: +> +> - The work may complete at different times during replay +> - Background threads don't participate in orchestration checkpointing +> - Results are not captured in the orchestration history + +## What IS Safe + +### ✅ Local Computation + +```csharp +// ✅ Safe - deterministic computation +var sum = input.Values.Sum(); +var filtered = input.Items.Where(x => x.IsActive).ToList(); +var formatted = $"Order {input.OrderId}: {input.Description}"; +``` + +### ✅ Using Context Properties and Methods + +```csharp +// ✅ Safe - consistent across replays +var instanceId = context.OrchestrationInstance.InstanceId; +var currentTime = context.CurrentUtcDateTime; 
+var newId = context.NewGuid(); +``` + +### ✅ Conditional Logic Based on Durable Results + +```csharp +// ✅ Safe - result comes from history during replay +var status = await context.ScheduleTask(typeof(GetStatusActivity), orderId); +if (status == OrderStatus.Approved) +{ + await context.ScheduleTask(typeof(ProcessOrderActivity), orderId); +} +``` + +### ✅ Loops with Deterministic Bounds + +```csharp +// ✅ Safe - loop bounds are deterministic +for (int i = 0; i < input.Items.Count; i++) +{ + await context.ScheduleTask(typeof(ProcessItemActivity), input.Items[i]); +} +``` + +### ✅ Parallel Execution + +```csharp +// ✅ Safe - Task.WhenAll is deterministic +var tasks = input.Items.Select(item => + context.ScheduleTask(typeof(ProcessItemActivity), item)); +var results = await Task.WhenAll(tasks); +``` + +## Summary Table + +| Operation | Allowed in Orchestration? | Alternative | +| --------- | ------------------------- | ----------- | +| `DateTime.UtcNow` | ❌ No | `context.CurrentUtcDateTime` | +| `Guid.NewGuid()` | ❌ No | `context.NewGuid()` | +| `Random.Next()` | ❌ No | Get from activity | +| `Thread.Sleep()` / `Task.Delay()` | ❌ No | `context.CreateTimer()` | +| `Task.Run()` | ❌ No | Use activity or fan-out | +| `ThreadPool.QueueUserWorkItem()` | ❌ No | Use activity | +| Manual thread creation | ❌ No | Use activity | +| HTTP calls | ❌ No | Use activity | +| Database queries | ❌ No | Use activity | +| File I/O | ❌ No | Use activity | +| Environment variables | ⚠️ Avoid | Pass as input or read in activity | +| Static mutable state | ❌ No | Use orchestration state | +| `HashSet` or `Dictionary` iteration | ⚠️ Avoid | Use `List` or sorted collection | +| Local computation | ✅ Yes | — | +| String manipulation | ✅ Yes | — | +| LINQ queries (on local data) | ✅ Yes | — | + +## Detecting Non-Determinism + +### Runtime Detection + +Some non-deterministic issues cause runtime errors: + +```text +NonDeterministicOrchestrationException: The orchestration 'MyOrchestration' +has a 
non-deterministic replay detected. The history expected 'TaskScheduled' +for 'ActivityA' but got 'TaskScheduled' for 'ActivityB'. +``` + +### Static Analysis + +Consider using analyzers or code reviews to catch issues: + +- Review all `DateTime`, `Guid`, `Random` usage +- Search for HTTP client usage +- Check for `Thread.Sleep` or `Task.Delay` +- Check for `Task.Run`, `ThreadPool`, or `new Thread` + +## Next Steps + +- [Replay and Durability](replay-and-durability.md) — Why determinism matters +- [Versioning](../features/versioning.md) — Safely updating orchestration code +- [Error Handling](../features/error-handling.md) — Handling failures deterministically diff --git a/docs/concepts/orchestrations.md b/docs/concepts/orchestrations.md new file mode 100644 index 000000000..148386e0a --- /dev/null +++ b/docs/concepts/orchestrations.md @@ -0,0 +1,331 @@ +# Orchestrations + +Orchestrations are the core building blocks of the Durable Task Framework. They define durable, long-running workflows that coordinate activities, sub-orchestrations, timers, and external events. 
+ +## Creating an Orchestration + +### Basic Structure + +Inherit from `TaskOrchestration<TResult, TInput>`: + +```csharp +using DurableTask.Core; + +public class OrderProcessingOrchestration : TaskOrchestration<OrderResult, OrderInput> +{ + public override async Task<OrderResult> RunTask( + OrchestrationContext context, + OrderInput input) + { + // Orchestration logic here + return new OrderResult { Success = true }; + } +} +``` + +### Type Parameters + +- `TResult` — The return type of the orchestration +- `TInput` — The input type passed when starting the orchestration + +### Registration + +Register orchestrations with the worker: + +```csharp +var worker = new TaskHubWorker(service, loggerFactory); +worker.AddTaskOrchestrations(typeof(OrderProcessingOrchestration)); +await worker.StartAsync(); +``` + +## OrchestrationContext + +The `OrchestrationContext` provides APIs for scheduling durable operations: + +### Scheduling Activities + +```csharp +// Schedule an activity and wait for result +var result = await context.ScheduleTask(typeof(MyActivity), input); + +// Schedule with retry options +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3); + +var result = await context.ScheduleWithRetry( + typeof(MyActivity), + options, + input); +``` + +### Creating Timers + +Timers allow orchestrations to wait for a specific time or duration. They are durable and survive process restarts. + +```csharp +// Wait for a specific time +await context.CreateTimer(context.CurrentUtcDateTime.AddHours(1), true); + +// Use for delays (not Thread.Sleep!) +await context.CreateTimer(context.CurrentUtcDateTime.AddMinutes(5), true); +``` + +> [!IMPORTANT] +> +> - Never use `Thread.Sleep` for delays in orchestrations. +> - Always use `context.CurrentUtcDateTime` for time calculations to ensure determinism. +> - Timers are cancellable using `CancellationToken` and must be cancelled if no longer needed. 
+ +### Waiting for External Events + +Orchestrations can pause and wait for external events sent from client code or other orchestrations. + +```csharp +// Wait indefinitely for an event +var approvalData = await context.WaitForExternalEvent("ApprovalReceived"); + +// Wait with timeout +using var cts = new CancellationTokenSource(); +var timerTask = context.CreateTimer(context.CurrentUtcDateTime.AddDays(1), true, cts.Token); +var eventTask = context.WaitForExternalEvent("ApprovalReceived"); + +var winner = await Task.WhenAny(timerTask, eventTask); +if (winner == eventTask) +{ + // Timer cancelled since event was received (this is important) + cts.Cancel(); + var approval = await eventTask; + // Process approval +} +else +{ + // Timeout - escalate or reject +} +``` + +### Sub-Orchestrations + +```csharp +// Start a sub-orchestration +var subResult = await context.CreateSubOrchestrationInstance( + typeof(SubOrchestration), + subInput); + +// With custom instance ID +var subResult = await context.CreateSubOrchestrationInstance( + typeof(SubOrchestration), + "sub-instance-123", + subInput); +``` + +### Continue As New + +```csharp +// Restart orchestration with new input (eternal orchestrations) +context.ContinueAsNew(newInput); +return default; // Return value is ignored +``` + +## Orchestration Patterns + +### Sequential Execution + +```csharp +public override async Task RunTask(OrchestrationContext context, string input) +{ + var step1 = await context.ScheduleTask(typeof(Step1Activity), input); + var step2 = await context.ScheduleTask(typeof(Step2Activity), step1); + var step3 = await context.ScheduleTask(typeof(Step3Activity), step2); + return step3; +} +``` + +### Fan-Out/Fan-In (Parallel Execution) + +The fan-out/fan-in pattern allows multiple tasks to be executed in parallel, with the orchestration waiting for all to complete before proceeding. 
+ +```csharp +public override async Task RunTask(OrchestrationContext context, int[] inputs) +{ + // Fan-out: Start all tasks in parallel + var tasks = inputs.Select(i => + context.ScheduleTask(typeof(ProcessItemActivity), i)).ToList(); + + // Fan-in: Wait for all to complete + var results = await Task.WhenAll(tasks); + + return results; +} +``` + +### Human Interaction + +```csharp +public override async Task RunTask( + OrchestrationContext context, + ApprovalRequest request) +{ + // Send notification to approver + await context.ScheduleTask(typeof(SendApprovalRequestActivity), request); + + // Wait for approval with timeout + using var cts = new CancellationTokenSource(); + var approvalTask = context.WaitForExternalEvent("Approved"); + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddDays(7), + true, + cts.Token); + + var winner = await Task.WhenAny(approvalTask, timeoutTask); + + if (winner == approvalTask) + { + cts.Cancel(); + return new ApprovalResult { Approved = await approvalTask }; + } + + return new ApprovalResult { Approved = false, TimedOut = true }; +} +``` + +### Monitor Pattern + +```csharp +public override async Task RunTask( + OrchestrationContext context, + MonitorInput input) +{ + int pollingInterval = 30; // seconds + DateTime expiryTime = context.CurrentUtcDateTime.AddHours(2); + + while (context.CurrentUtcDateTime < expiryTime) + { + var status = await context.ScheduleTask( + typeof(CheckJobStatusActivity), + input.JobId); + + if (status.IsComplete) + { + return new MonitorResult { Completed = true, Status = status }; + } + + // Wait before polling again + await context.CreateTimer( + context.CurrentUtcDateTime.AddSeconds(pollingInterval), + true); + + // Optional: exponential backoff + pollingInterval = Math.Min(pollingInterval * 2, 300); + } + + return new MonitorResult { Completed = false, TimedOut = true }; +} +``` + +> [!IMPORTANT] +> +> - Long loops can lead to resource exhaustion. 
Use `ContinueAsNew` for very long-running monitors. +> - Avoid tight polling loops; always include delays via `context.CreateTimer`. + +## Getting Orchestration Information + +### Current Instance ID + +```csharp +string instanceId = context.OrchestrationInstance.InstanceId; +``` + +### Current Time + +Always use `context.CurrentUtcDateTime` instead of `DateTime.UtcNow`: + +```csharp +// ✅ Correct - deterministic +var now = context.CurrentUtcDateTime; + +// ❌ Wrong - non-deterministic +var now = DateTime.UtcNow; +``` + +See [Deterministic Constraints](deterministic-constraints.md) for more details. + +### Replay Detection + +```csharp +if (!context.IsReplaying) +{ + // Only runs during first execution, not during replay + _logger.LogInformation("Processing order {OrderId}", input.OrderId); +} +``` + +See [Replay and Durability](replay-and-durability.md) for more details. + +## Starting Orchestrations + +### From Client Code + +```csharp +var client = new TaskHubClient(service, loggerFactory: loggerFactory); + +// Start with auto-generated instance ID +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(OrderProcessingOrchestration), + new OrderInput { OrderId = "12345" }); + +// Start with custom instance ID +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(OrderProcessingOrchestration), + instanceId: "order-12345", + input: new OrderInput { OrderId = "12345" }); + +// Start at a scheduled time +var instance = await client.CreateScheduledOrchestrationInstanceAsync( + typeof(OrderProcessingOrchestration), + instanceId: "scheduled-order", + input: new OrderInput { OrderId = "12345" }, + startAt: DateTime.UtcNow.AddHours(1)); +``` + +> [!NOTE] +> Not all backends support scheduled orchestrations. 
+ +### Waiting for Completion + +```csharp +var result = await client.WaitForOrchestrationAsync( + instance, + timeout: TimeSpan.FromMinutes(5)); + +if (result.OrchestrationStatus == OrchestrationStatus.Completed) +{ + var output = result.Output; // Serialized result +} +``` + +## Error Handling + +See [Error Handling](../features/error-handling.md) for comprehensive error handling patterns. + +```csharp +public override async Task RunTask(OrchestrationContext context, string input) +{ + try + { + return await context.ScheduleTask(typeof(RiskyActivity), input); + } + catch (TaskFailedException ex) + { + // Activity threw an exception + return await context.ScheduleTask(typeof(CompensationActivity), input); + } +} +``` + +## Next Steps + +- [Activities](activities.md) — Writing activity code +- [Deterministic Constraints](deterministic-constraints.md) — Important rules for orchestration code +- [Replay and Durability](replay-and-durability.md) — Understanding how orchestrations are replayed +- [Features](../features/retries.md) — Retries, timers, events, and more diff --git a/docs/concepts/replay-and-durability.md b/docs/concepts/replay-and-durability.md new file mode 100644 index 000000000..be95ba9d1 --- /dev/null +++ b/docs/concepts/replay-and-durability.md @@ -0,0 +1,249 @@ +# Replay and Durability + +The Durable Task Framework achieves durability through an **event-sourcing** pattern. Understanding how replay works is essential for writing correct orchestrations. + +## How Durability Works + +### The Problem + +Traditional workflows have a problem: if the process crashes, in-progress state is lost. 
+ +```text +Process starts → Workflow runs → CRASH → State lost ❌ +``` + +### The Solution: Event Sourcing + +DTFx persists every decision as an event in the history: + +```text +Orchestration executes → Event recorded → (Crash) → Replay from history → Continue ✅ +``` + +## The Replay Model + +### First Execution + +When an orchestration runs for the first time: + +```csharp +public override async Task RunTask(OrchestrationContext context, string input) +{ + var a = await context.ScheduleTask(typeof(ActivityA), input); // Executes, records TaskScheduled + var b = await context.ScheduleTask(typeof(ActivityB), a); // Executes, records TaskScheduled + return b; +} +``` + +**History after first execution:** + +```text +1. ExecutionStarted { Input: "hello" } +2. TaskScheduled { Name: "ActivityA" } +3. TaskCompleted { Result: "A-result" } +4. TaskScheduled { Name: "ActivityB" } +5. TaskCompleted { Result: "B-result" } +6. ExecutionCompleted { Result: "B-result" } +``` + +### Replay After Crash + +If the process crashes and restarts, the orchestration **replays**: + +1. Framework loads the history from storage +2. The orchestration's `RunTask` method executes again from the beginning +3. Each `await` checks if there's already a result in history +4. If result exists, return it immediately (no actual execution) +5. If no result, schedule the work and wait + +```csharp +// During replay: +var a = await context.ScheduleTask(typeof(ActivityA), input); +// ↑ Sees TaskCompleted in history, returns "A-result" immediately + +var b = await context.ScheduleTask(typeof(ActivityB), a); +// ↑ Sees TaskCompleted in history, returns "B-result" immediately +``` + +### Partial Replay + +If an orchestration is waiting for an activity: + +```text +1. ExecutionStarted { Input: "hello" } +2. TaskScheduled { Name: "ActivityA" } +3. TaskCompleted { Result: "A-result" } +4. 
TaskScheduled { Name: "ActivityB" } +← Activity B is still running +``` + +When Activity B completes, the orchestration replays: + +```csharp +var a = await context.ScheduleTask(typeof(ActivityA), input); +// ↑ Returns "A-result" from history + +var b = await context.ScheduleTask(typeof(ActivityB), a); +// ↑ Finds new TaskCompleted event, returns result + +return b; // Orchestration completes +``` + +## Checkpointing + +### When Checkpoints Occur + +The orchestration state is checkpointed (saved) when: + +1. An `await` yields control back to the framework +2. The orchestration completes or fails +3. `ContinueAsNew` is called and the current execution ends + +### What Gets Saved + +- Complete event history +- Custom status (if set) +- Input and output (if any) + +### What Doesn't Get Saved + +- Local variables (they're rebuilt during replay) +- In-memory state outside the orchestration + +## Understanding Context.IsReplaying + +The `IsReplaying` property tells you if the orchestration is replaying: + +```csharp +public override async Task RunTask(OrchestrationContext context, string input) +{ + // This code runs during EVERY replay + var greeting = $"Hello, {input}"; + + if (!context.IsReplaying) + { + // This only runs during the FIRST execution of this code path + _logger.LogInformation("Processing input: {Input}", input); + } + + var result = await context.ScheduleTask(typeof(MyActivity), greeting); + + return result; +} +``` + +### When to Use IsReplaying + +| Use Case | Use IsReplaying? 
| +| -------- | ---------------- | +| Logging | ✅ Yes - avoid duplicate logs | +| Metrics | ✅ Yes - avoid double-counting | +| Business logic | ❌ No - should work identically during replay | +| Side effects | ❌ No - use activities instead | + +## Why Determinism Matters + +Because orchestrations replay, they **must** produce the same sequence of events every time: + +### Example: Non-Deterministic Code (BAD) + +```csharp +// ❌ WRONG - Different result on each replay +public override async Task RunTask(OrchestrationContext context, string input) +{ + if (DateTime.UtcNow.Hour < 12) // Different on replay! + { + return await context.ScheduleTask(typeof(MorningActivity), input); + } + return await context.ScheduleTask(typeof(EveningActivity), input); +} +``` + +If the orchestration starts at 11:55 AM and replays at 12:05 PM, it will try to match `EveningActivity` against a history containing `MorningActivity` → **crash**. + +### Example: Deterministic Code (GOOD) + +```csharp +// ✅ CORRECT - Same result on every replay +public override async Task RunTask(OrchestrationContext context, string input) +{ + if (context.CurrentUtcDateTime.Hour < 12) // Same value during replay! 
+ { + return await context.ScheduleTask(typeof(MorningActivity), input); + } + return await context.ScheduleTask(typeof(EveningActivity), input); +} +``` + +## History Events + +Common events in the orchestration history: + +| Event | Description | +| ----- | ----------- | +| `ExecutionStarted` | Orchestration started | +| `TaskScheduled` | Activity was scheduled | +| `TaskCompleted` | Activity completed successfully | +| `TaskFailed` | Activity failed | +| `SubOrchestrationInstanceCreated` | Sub-orchestration started | +| `SubOrchestrationInstanceCompleted` | Sub-orchestration completed | +| `TimerCreated` | Timer was created | +| `TimerFired` | Timer elapsed | +| `EventRaised` | External event received | +| `ExecutionCompleted` | Orchestration completed | +| `ExecutionFailed` | Orchestration failed | +| `ExecutionTerminated` | Orchestration was terminated | +| `ContinueAsNew` | Orchestration restarted | + +## Viewing History + +### Via Client + +```csharp +var history = await client.GetOrchestrationHistoryAsync(instance); +foreach (var evt in history) +{ + Console.WriteLine($"{evt.EventType}: {evt.Timestamp}"); +} +``` + +### What History Tells You + +- Exact sequence of operations +- Timing of each step +- Input/output of each activity +- Where failures occurred + +## Performance Implications + +### History Growth + +Every operation adds to the history. Large histories can impact: + +- **Memory** — Full history is loaded into memory during replay +- **Latency** — More events = longer replay time +- **Storage** — More data to persist and transfer (the exact impact depends on the storage provider) + +### Mitigation Strategies + +1. **Use `ContinueAsNew`** for long-running orchestrations: + + ```csharp + if (context.CurrentUtcDateTime > startTime.AddHours(24)) + { + context.ContinueAsNew(newState); // Reset history + return default; + } + ``` + +2. **Batch operations** in activities instead of many small activities + +3. 
**Sub-orchestrations** for logical groupings (separate history) + +4. **Purge completed instances** periodically + +## Next Steps + +- [Deterministic Constraints](deterministic-constraints.md) — Rules for writing deterministic code +- [Eternal Orchestrations](../features/eternal-orchestrations.md) — Managing long-running workflows +- [Versioning](../features/versioning.md) — Updating orchestration code safely diff --git a/docs/features/README.md b/docs/features/README.md new file mode 100644 index 000000000..d92f2125d --- /dev/null +++ b/docs/features/README.md @@ -0,0 +1,15 @@ +# Features + +This section covers the built-in features and patterns available in the Durable Task Framework. + +## Topics + +| Feature | Description | +| ------- | ----------- | +| [Retries](retries.md) | Automatic retry policies for activities and sub-orchestrations | +| [Timers](timers.md) | Durable delays and scheduling with `CreateTimer` | +| [External Events](external-events.md) | Receiving data from outside sources (webhooks, human interaction) | +| [Sub-Orchestrations](sub-orchestrations.md) | Breaking workflows into smaller, reusable pieces | +| [Error Handling](error-handling.md) | Exception handling, compensation, and recovery patterns | +| [Eternal Orchestrations](eternal-orchestrations.md) | Long-running workflows with `ContinueAsNew` | +| [Versioning](versioning.md) | Strategies for updating orchestrations safely | diff --git a/docs/features/error-handling.md b/docs/features/error-handling.md new file mode 100644 index 000000000..fe7cf54d6 --- /dev/null +++ b/docs/features/error-handling.md @@ -0,0 +1,515 @@ +# Error Handling + +The Durable Task Framework provides robust error handling capabilities for orchestrations and activities. This guide covers exception handling, compensation, and recovery patterns. 
+ +## Activity Exceptions + +### Basic Exception Handling + +When an activity throws an exception, it becomes a `TaskFailedException` in the orchestration: + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + try + { + var result = await context.ScheduleTask(typeof(RiskyActivity), input); + return new Result { Success = true, Data = result }; + } + catch (TaskFailedException ex) + { + // ex.InnerException contains the original exception + return new Result { Success = false, Error = ex.InnerException?.Message }; + } +} +``` + +### Exception Details + +```csharp +catch (TaskFailedException ex) +{ + var originalException = ex.InnerException; + var activityName = ex.Name; // "RiskyActivity" + var scheduledEventId = ex.ScheduleId; // Event ID in history + + _logger.LogError(originalException, + "Activity {Activity} failed", activityName); +} +``` + +## Error Propagation Modes + +The `TaskHubWorker.ErrorPropagationMode` property controls how exception information is propagated from failed activities and sub-orchestrations. 
+ +### SerializeExceptions (Default) + +The default mode serializes the original exception and makes it available via `InnerException`: + +```csharp +worker.ErrorPropagationMode = ErrorPropagationMode.SerializeExceptions; + +// In orchestration: +catch (TaskFailedException ex) +{ + // Original exception is deserialized and available + var originalException = ex.InnerException; + + if (originalException is InvalidOperationException invalidOp) + { + // Can catch specific exception types + } +} +``` + +**Limitations:** + +- Not all exception types can be serialized/deserialized correctly +- Custom exceptions may lose data if not properly serializable +- Doesn't work across language boundaries (e.g., polyglot scenarios) + +### UseFailureDetails (Recommended) + +The `UseFailureDetails` mode provides consistent, structured error information via `FailureDetails`: + +```csharp +worker.ErrorPropagationMode = ErrorPropagationMode.UseFailureDetails; +``` + +With this mode: + +- `InnerException` is **always null** +- Error details are available via the `FailureDetails` property +- Works consistently across all exception types and language runtimes + +```csharp +catch (TaskFailedException ex) +{ + // InnerException is null in UseFailureDetails mode + // Use FailureDetails instead + FailureDetails details = ex.FailureDetails; + + string errorType = details.ErrorType; // e.g., "System.InvalidOperationException" + string errorMessage = details.ErrorMessage; // The exception message + string stackTrace = details.StackTrace; // Full stack trace + bool isNonRetriable = details.IsNonRetriable; + + // Check for inner failures (nested exceptions) + FailureDetails innerFailure = details.InnerFailure; +} +``` + +#### Checking Exception Types + +Use `IsCausedBy<T>()` to check exception types without deserializing: + +```csharp +catch (TaskFailedException ex) when (ex.FailureDetails?.IsCausedBy<InvalidOperationException>() == true) +{ + // Handle InvalidOperationException +} +catch (TaskFailedException ex) when 
(ex.FailureDetails?.IsCausedBy<TimeoutException>() == true) +{ + // Handle TimeoutException +} +``` + +#### Sub-Orchestration Failures + +The same pattern applies to `SubOrchestrationFailedException`: + +```csharp +try +{ + await context.CreateSubOrchestrationInstance( + typeof(ChildOrchestration), + input); +} +catch (SubOrchestrationFailedException ex) +{ + FailureDetails details = ex.FailureDetails; + + _logger.LogError( + "Child orchestration failed: {ErrorType}: {Message}", + details.ErrorType, + details.ErrorMessage); +} +``` + +#### When to Use UseFailureDetails + +Use `UseFailureDetails` when: + +- You need consistent error handling across all exception types +- Running orchestrations/activities out-of-process or in other language runtimes +- Custom exceptions may not serialize correctly +- You want to avoid deserialization issues with `InnerException` + +> [!WARNING] +> Changing `ErrorPropagationMode` on an existing deployment can break in-flight orchestrations if they contain exception handling logic that depends on `InnerException`. Plan changes carefully and consider using [versioning strategies](versioning.md). + +#### Custom Exception Properties + +When using `UseFailureDetails`, you can include custom properties from your exceptions in the `FailureDetails.Properties` dictionary by implementing `IExceptionPropertiesProvider`: + +```csharp +public class CustomExceptionPropertiesProvider : IExceptionPropertiesProvider +{ + public IDictionary<string, object?>? 
GetExceptionProperties(Exception exception) + { + // Extract custom properties from known exception types + if (exception is OrderProcessingException orderEx) + { + return new Dictionary + { + ["OrderId"] = orderEx.OrderId, + ["FailureStage"] = orderEx.Stage, + ["RetryCount"] = orderEx.RetryCount + }; + } + + if (exception is ValidationException validationEx) + { + return new Dictionary + { + ["FieldName"] = validationEx.FieldName, + ["ValidationRule"] = validationEx.Rule + }; + } + + // Return null for exceptions without custom properties + return null; + } +} +``` + +Register the provider with the `TaskHubWorker`: + +```csharp +var worker = new TaskHubWorker(orchestrationService, loggerFactory); +worker.ErrorPropagationMode = ErrorPropagationMode.UseFailureDetails; +worker.ExceptionPropertiesProvider = new CustomExceptionPropertiesProvider(); +``` + +Access the custom properties in your orchestration's error handling: + +```csharp +catch (TaskFailedException ex) +{ + FailureDetails details = ex.FailureDetails; + + if (details.Properties != null) + { + if (details.Properties.TryGetValue("OrderId", out var orderId)) + { + _logger.LogError("Order {OrderId} failed: {Message}", orderId, details.ErrorMessage); + } + + if (details.Properties.TryGetValue("RetryCount", out var retryCount) && + retryCount is int count && count >= 3) + { + // Too many retries, escalate + await context.ScheduleTask(typeof(EscalateFailureActivity), details); + } + } +} +``` + +> [!NOTE] +> Property values should be simple, serializable types (strings, numbers, booleans). Complex objects may not serialize correctly across process boundaries. 
+ +### Handling Specific Exception Types + +```csharp +try +{ + await context.ScheduleTask(typeof(PaymentActivity), payment); +} +catch (TaskFailedException ex) when (ex.InnerException is InsufficientFundsException) +{ + // Handle specific business error + await context.ScheduleTask(typeof(NotifyCustomerActivity), + "Payment failed: Insufficient funds"); +} +catch (TaskFailedException ex) when (ex.InnerException is PaymentGatewayException) +{ + // Retry with different gateway + await context.ScheduleTask(typeof(BackupPaymentActivity), payment); +} +catch (TaskFailedException) +{ + // Handle all other failures + throw; +} +``` + +## Automatic Retries + +Use `ScheduleWithRetry` for transient failures: + +```csharp +var retryOptions = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3) +{ + BackoffCoefficient = 2.0, + Handle = ex => ex is TimeoutException || ex is HttpRequestException +}; + +try +{ + await context.ScheduleWithRetry( + typeof(UnreliableActivity), + retryOptions, + input); +} +catch (TaskFailedException ex) +{ + // All retries exhausted +} +``` + +See [Retries](retries.md) for detailed retry configuration. 
+ +## Sub-Orchestration Exceptions + +```csharp +try +{ + await context.CreateSubOrchestrationInstance( + typeof(ChildOrchestration), + input); +} +catch (SubOrchestrationFailedException ex) +{ + // Child orchestration threw an unhandled exception + var failureReason = ex.InnerException?.Message; +} +``` + +## Compensation Patterns + +### Saga Pattern + +Compensate previous steps when a later step fails: + +```csharp +public override async Task RunTask( + OrchestrationContext context, + OrderInput input) +{ + // Track completed steps for compensation + var completedSteps = new List(); + + try + { + // Step 1: Reserve inventory + await context.ScheduleTask(typeof(ReserveInventoryActivity), input); + completedSteps.Add("inventory"); + + // Step 2: Charge payment + await context.ScheduleTask(typeof(ChargePaymentActivity), input); + completedSteps.Add("payment"); + + // Step 3: Ship order (might fail) + await context.ScheduleTask(typeof(ShipOrderActivity), input); + + return new OrderResult { Success = true }; + } + catch (TaskFailedException ex) + { + // Compensate in reverse order + if (completedSteps.Contains("payment")) + { + await context.ScheduleTask(typeof(RefundPaymentActivity), input); + } + + if (completedSteps.Contains("inventory")) + { + await context.ScheduleTask(typeof(ReleaseInventoryActivity), input); + } + + return new OrderResult + { + Success = false, + Error = ex.InnerException?.Message + }; + } +} +``` + +### Compensation with Sub-Orchestrations + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + try + { + await context.CreateSubOrchestrationInstance( + typeof(ProcessOrderOrchestration), + input); + } + catch (SubOrchestrationFailedException) + { + // Run compensation orchestration + await context.CreateSubOrchestrationInstance( + typeof(CompensateOrderOrchestration), + new CompensationInput { OriginalInput = input }); + } +} +``` + +## Error Result Pattern + +Return errors as results instead of 
throwing: + +```csharp +// Activity returns result with error info +public class ProcessOrderActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + Order order) + { + if (!await ValidateOrderAsync(order)) + { + // Return error instead of throwing + return new OrderResult + { + Success = false, + ErrorCode = "VALIDATION_FAILED", + ErrorMessage = "Order validation failed" + }; + } + + // Process order... + return new OrderResult { Success = true, OrderId = newOrderId }; + } +} + +// Orchestration checks result +public override async Task RunTask(OrchestrationContext context, Order input) +{ + var result = await context.ScheduleTask(typeof(ProcessOrderActivity), input); + + if (!result.Success) + { + // Handle error without exception + await context.ScheduleTask(typeof(NotifyErrorActivity), result); + return new Result { Success = false, Error = result.ErrorMessage }; + } + + return new Result { Success = true, OrderId = result.OrderId }; +} +``` + +## Timeout Handling + +### Activity Timeout + +Activities don't have a built-in timeout. 
Handle in orchestration: + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + using var cts = new CancellationTokenSource(); + + var activityTask = context.ScheduleTask(typeof(LongRunningActivity), input); + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(30), + true, + cts.Token); + + var winner = await Task.WhenAny(activityTask, timeoutTask); + + if (winner == activityTask) + { + cts.Cancel(); + return new Result { Success = true, Data = await activityTask }; + } + else + { + // Activity is still running but we've timed out + // Note: Activity will complete eventually, but result is ignored + return new Result { Success = false, TimedOut = true }; + } +} +``` + +### Orchestration-Level Timeout + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + var deadline = context.CurrentUtcDateTime.AddHours(4); + + while (context.CurrentUtcDateTime < deadline) + { + var status = await context.ScheduleTask(typeof(CheckStatusActivity), input); + + if (status.IsComplete) + return new Result { Success = true }; + + await context.CreateTimer(context.CurrentUtcDateTime.AddMinutes(5), true); + } + + return new Result { Success = false, TimedOut = true }; +} +``` + +## Circuit Breaker Pattern + +Prevent repeated failures: + +```csharp +public override async Task RunTask(OrchestrationContext context, State state) +{ + state ??= new State(); + + // Circuit breaker check + if (state.ConsecutiveFailures >= 5) + { + var cooldownEnd = state.LastFailure.AddMinutes(15); + if (context.CurrentUtcDateTime < cooldownEnd) + { + // Circuit is open - wait before retry + await context.CreateTimer(cooldownEnd, true); + } + state.ConsecutiveFailures = 0; // Reset after cooldown + } + + try + { + var result = await context.ScheduleTask(typeof(ExternalServiceActivity), state.Input); + state.ConsecutiveFailures = 0; + return new Result { Success = true, Data = result }; + } + catch 
(TaskFailedException) + { + state.ConsecutiveFailures++; + state.LastFailure = context.CurrentUtcDateTime; + + // Continue to retry with backoff + context.ContinueAsNew(state); + return null; + } +} +``` + +## Best Practices Summary + +| Practice | Description | +| -------- | ----------- | +| **Use `UseFailureDetails` mode** | Prefer `ErrorPropagationMode.UseFailureDetails` for consistent error handling | +| **Use retries for transient failures** | Configure `RetryOptions` for HTTP, timeout errors | +| **Return errors for expected failures** | Use result types instead of exceptions for business errors | +| **Implement compensation** | Use Saga pattern for multi-step transactions | +| **Set timeouts** | Don't let orchestrations wait indefinitely | +| **Log with context** | Include instance ID, activity name, error details | +| **Test failure scenarios** | Verify compensation and recovery logic | + +## Next Steps + +- [Retries](retries.md) — Configuring automatic retries +- [Replay and Durability](../concepts/replay-and-durability.md) — Understanding exception persistence +- [Testing](../advanced/testing.md) — Testing error handling diff --git a/docs/features/eternal-orchestrations.md b/docs/features/eternal-orchestrations.md new file mode 100644 index 000000000..5e2747ade --- /dev/null +++ b/docs/features/eternal-orchestrations.md @@ -0,0 +1,364 @@ +# Eternal Orchestrations + +Eternal orchestrations are long-running workflows that run indefinitely by periodically restarting themselves. This pattern is useful for monitoring, scheduling, and other recurring tasks. 
+ +## The ContinueAsNew Pattern + +### Basic Eternal Orchestration + +```csharp +public class MonitorOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + MonitorInput input) + { + // Do the monitoring work + await context.ScheduleTask(typeof(CheckHealthActivity), input.Target); + + // Wait for next interval + await context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(input.IntervalMinutes), + true); + + // Restart with fresh history + context.ContinueAsNew(input); + + return null; // Has no effect since ContinueAsNew was called + } +} +``` + +### Why ContinueAsNew? + +Without `ContinueAsNew`, orchestration history grows unbounded: + +```text +// After 1000 iterations without ContinueAsNew: +History size: 10,000+ events +Memory usage: High +Replay time: Slow +``` + +An orchestration with an unbounded history can lead to severe performance degradation and process crashes due to OutOfMemoryExceptions. + +With `ContinueAsNew`: + +```text +// After 1000 iterations with ContinueAsNew: +History size: ~10 events (reset each iteration) +Memory usage: Low +Replay time: Fast +``` + +## ContinueAsNew Behavior + +### What Happens + +1. `ContinueAsNew(newInput)` is called +2. Current execution completes when `RunTask` returns +3. New execution starts with: + - Same instance ID + - Fresh (empty) history + - New input provided to `ContinueAsNew` + +### Status Transitions + +```text +Running → ContinuedAsNew → Running (new execution) +``` + +### History Reset + +Old history is usually **replaced**, not appended. The previous execution's history can optionally be retained for auditing (provider-dependent). 
+ +## Common Patterns + +### Periodic Monitoring + +```csharp +public override async Task RunTask( + OrchestrationContext context, + MonitorConfig config) +{ + // Check system health + var health = await context.ScheduleTask( + typeof(CheckHealthActivity), + config.Endpoint); + + // Alert if unhealthy + if (!health.IsHealthy) + { + await context.ScheduleTask( + typeof(SendAlertActivity), + new Alert { Endpoint = config.Endpoint, Status = health }); + } + + // Wait before next check + await context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(config.CheckIntervalMinutes), + true); + + // Continue forever + context.ContinueAsNew(config); + return null; +} +``` + +### Job Queue Processor + +```csharp +public override async Task RunTask( + OrchestrationContext context, + QueueConfig config) +{ + // Get next batch of jobs + var jobs = await context.ScheduleTask>( + typeof(GetPendingJobsActivity), + new GetJobsInput { MaxCount = config.BatchSize }); + + if (jobs.Any()) + { + // Process jobs in parallel + var tasks = jobs.Select(job => + context.ScheduleTask(typeof(ProcessJobActivity), job)); + await Task.WhenAll(tasks); + } + + // Short delay after processing jobs; longer delay when idle, to avoid busy-waiting + var delay = jobs.Any() + ? 
TimeSpan.FromSeconds(1) + : TimeSpan.FromSeconds(30); + + await context.CreateTimer(context.CurrentUtcDateTime.Add(delay), true); + + context.ContinueAsNew(config); + return null; +} +``` + +### Cron Scheduler + +```csharp +public override async Task RunTask( + OrchestrationContext context, + CronSchedule schedule) +{ + // Calculate next run time + var nextRun = GetNextCronTime(schedule.CronExpression, context.CurrentUtcDateTime); + + // Wait until scheduled time + await context.CreateTimer(nextRun, true); + + // Execute the scheduled task + await context.ScheduleTask(typeof(ScheduledTaskActivity), schedule.TaskInput); + + // Continue to next scheduled run + context.ContinueAsNew(schedule); + return null; +} +``` + +### Stateful Aggregator + +```csharp +public class AggregatorOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + AggregatorState state) + { + // Initialize state on first run + state ??= new AggregatorState { Count = 0, Total = 0 }; + + // Wait for data event or periodic save + using var cts = new CancellationTokenSource(); + var eventTask = context.WaitForExternalEvent("NewData"); + var saveTask = context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(5), + true, + cts.Token); + + var winner = await Task.WhenAny(eventTask, saveTask); + cts.Cancel(); + + if (winner == eventTask) + { + // Update aggregations + var data = await eventTask; + state.Count++; + state.Total += data.Value; + state.LastUpdated = context.CurrentUtcDateTime; + } + else + { + // Periodic save + if (state.Count > 0) + { + await context.ScheduleTask( + typeof(SaveAggregationActivity), + state); + } + } + + // Check for termination signal + if (state.ShouldTerminate) + { + return state; // Actually return and complete + } + + // Continue with updated state + context.ContinueAsNew(state); + return null; + } +} +``` + +### With Maximum Iterations + +```csharp +public override async Task RunTask( + OrchestrationContext 
context, + ProcessingState state) +{ + state.Iteration++; + + // Do work + var result = await context.ScheduleTask( + typeof(ProcessBatchActivity), + state.CurrentBatch); + + // Check completion conditions + if (state.Iteration >= state.MaxIterations) + { + return new ProcessingResult + { + Completed = true, + Iterations = state.Iteration + }; + } + + if (!state.HasMoreWork) + { + return new ProcessingResult + { + Completed = true, + Iterations = state.Iteration + }; + } + + // Wait and continue + await context.CreateTimer( + context.CurrentUtcDateTime.AddSeconds(state.DelaySeconds), + true); + + context.ContinueAsNew(state); + return null; // No effect due to ContinueAsNew +} +``` + +## Graceful Termination + +### Using External Events + +```csharp +public override async Task RunTask( + OrchestrationContext context, + Config config) +{ + using var cts = new CancellationTokenSource(); + + // Check for stop signal + Task stopTask = context.WaitForExternalEvent("Stop"); + Task workTask = DoWorkAsync(context, config); + Task timerTask = context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(1), + true, + cts.Token); + + Task winner = await Task.WhenAny(stopTask, workTask, timerTask); + cts.Cancel(); + + if (winner == stopTask) + { + // Graceful shutdown + return new Result { StoppedGracefully = true }; + } + + context.ContinueAsNew(config); + return null; +} +``` + +## Best Practices + +### 1. Be Careful with Tight Loops + +Immediate restarts via `ContinueAsNew` can be useful when processing batches of external events to minimize latency. 
However, be careful to avoid tight loops that do no meaningful work: + +```csharp +// ✅ OK - immediate restart when processing a batch of work +if (pendingItems.Any()) +{ + await ProcessBatchAsync(context, pendingItems); + context.ContinueAsNew(state); // Restart immediately to check for more + return null; +} + +// ✅ Good - add delay when no work to do +await context.CreateTimer(context.CurrentUtcDateTime.AddSeconds(30), true); +context.ContinueAsNew(state); + +// ⚠️ Risky - tight loop with no work and no delay +var items = await context.ScheduleTask>(typeof(GetItemsActivity), null); +if (!items.Any()) +{ + context.ContinueAsNew(state); // Immediately restarts even with no work! + return null; +} +``` + +### 2. Carry Forward Essential State + +```csharp +// ✅ Good - preserves necessary context +context.ContinueAsNew(new State +{ + TotalProcessed = state.TotalProcessed + batchSize, + LastCheckpoint = context.CurrentUtcDateTime, + Config = state.Config +}); + +// ⚠️ Careful - losing important state +context.ContinueAsNew(state.Config); // Lost TotalProcessed +``` + +### 3. Provide Termination Mechanism + +```csharp +// ✅ Good - can be stopped gracefully +if (config.StopRequested || iterationCount > maxIterations) +{ + return finalResult; +} +context.ContinueAsNew(config); +``` + +### 4. Monitor History Size + +If `ContinueAsNew` isn't called frequently enough, history can still grow. 
Consider continuing after a fixed number of operations: + +```csharp +if (state.OperationsSinceRestart > 100) +{ + state.OperationsSinceRestart = 0; + context.ContinueAsNew(state); + return null; +} +``` + +## Next Steps + +- [Timers](timers.md) — Creating durable delays +- [External Events](external-events.md) — Signaling eternal orchestrations +- [Replay and Durability](../concepts/replay-and-durability.md) — Understanding history growth diff --git a/docs/features/external-events.md b/docs/features/external-events.md new file mode 100644 index 000000000..7703f4492 --- /dev/null +++ b/docs/features/external-events.md @@ -0,0 +1,518 @@ +# External Events + +External events allow orchestrations to receive data from outside sources. This enables human interaction patterns, webhooks, and inter-orchestration communication. + +## The Event Pattern + +DTFx uses the `OnEvent` method override combined with `TaskCompletionSource` to handle external events. This pattern provides full control over event handling and is the standard approach in the framework. + +> [!NOTE] +> If you're familiar with [Azure Durable Functions](https://learn.microsoft.com/azure/azure-functions/durable/), note that DTFx does not have a built-in `WaitForExternalEvent()` helper method. Instead, DTFx provides the lower-level `OnEvent` pattern shown below, which Durable Functions builds upon. + +This pattern: + +1. Creates a `TaskCompletionSource` to represent the pending event +2. Awaits the `TaskCompletionSource.Task` in `RunTask` +3. 
Overrides `OnEvent` to receive events and complete the task + +### Basic Event Wait + +```csharp +public class SignalOrchestration : TaskOrchestration +{ + TaskCompletionSource resumeHandle; + + public override async Task RunTask(OrchestrationContext context, string input) + { + // Wait for external signal + string user = await WaitForSignal(); + + // Continue with the workflow + string greeting = await context.ScheduleTask(typeof(SendGreetingTask), user); + return greeting; + } + + async Task WaitForSignal() + { + this.resumeHandle = new TaskCompletionSource(); + string data = await this.resumeHandle.Task; + this.resumeHandle = null; + return data; + } + + public override void OnEvent(OrchestrationContext context, string name, string input) + { + // Complete the pending task when event arrives + this.resumeHandle?.SetResult(input); + } +} +``` + +### Typed Event Data + +For strongly-typed event data, deserialize in `OnEvent`: + +```csharp +public class ApprovalOrchestration : TaskOrchestration +{ + TaskCompletionSource approvalHandle; + + public override async Task RunTask( + OrchestrationContext context, + ApprovalRequest request) + { + // Send approval request + await context.ScheduleTask(typeof(SendApprovalEmailActivity), request); + + // Wait for approval response + this.approvalHandle = new TaskCompletionSource(); + var response = await this.approvalHandle.Task; + this.approvalHandle = null; + + return new ApprovalResult + { + IsApproved = response.IsApproved, + ApprovedBy = response.ApprovedBy + }; + } + + public override void OnEvent(OrchestrationContext context, string name, string input) + { + if (name == "Approval" && this.approvalHandle != null) + { + var response = context.MessageDataConverter.Deserialize(input); + this.approvalHandle.SetResult(response); + } + } +} +``` + +### Wait with Timeout + +Combine with timers to implement timeouts: + +```csharp +public class TimedApprovalOrchestration : TaskOrchestration +{ + TaskCompletionSource eventHandle; 
+ + public override async Task RunTask(OrchestrationContext context, Request input) + { + // Set up the event wait + this.eventHandle = new TaskCompletionSource(); + var eventTask = this.eventHandle.Task; + + // Set up timeout + using var cts = new CancellationTokenSource(); + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddHours(24), + "timeout", + cts.Token); + + // Wait for either event or timeout + var winner = await Task.WhenAny(eventTask, timeoutTask); + + if (winner == eventTask) + { + cts.Cancel(); + var response = await eventTask; + this.eventHandle = null; + return new Result { Response = response, TimedOut = false }; + } + else + { + this.eventHandle = null; + return new Result { TimedOut = true }; + } + } + + public override void OnEvent(OrchestrationContext context, string name, string input) + { + if (name == "UserResponse") + { + this.eventHandle?.SetResult(input); + } + } +} +``` + +### Multiple Event Types + +Handle different event types with named checks: + +```csharp +public class MultiEventOrchestration : TaskOrchestration +{ + TaskCompletionSource<(string EventType, string Data)> eventHandle; + + public override async Task RunTask(OrchestrationContext context, Request input) + { + this.eventHandle = new TaskCompletionSource<(string, string)>(); + + using var cts = new CancellationTokenSource(); + var eventTask = this.eventHandle.Task; + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddDays(7), + "timeout", + cts.Token); + + var winner = await Task.WhenAny(eventTask, timeoutTask); + cts.Cancel(); + this.eventHandle = null; + + if (winner == timeoutTask) + { + return new Result { Status = "TimedOut" }; + } + + var (eventType, data) = await eventTask; + + return eventType switch + { + "Approve" => new Result { Status = "Approved" }, + "Reject" => new Result { Status = "Rejected", Reason = data }, + "Cancel" => new Result { Status = "Cancelled" }, + _ => new Result { Status = "Unknown" } + }; + } + + 
public override void OnEvent(OrchestrationContext context, string name, string input) + { + if (this.eventHandle != null && + (name == "Approve" || name == "Reject" || name == "Cancel")) + { + this.eventHandle.SetResult((name, input)); + } + } +} +``` + +## Sending Events + +### From TaskHubClient + +```csharp +var service = GetOrchestrationService(); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); + +// Send event to a specific orchestration instance +await client.RaiseEventAsync( + instance, // OrchestrationInstance + eventName: "Approval", // Event name (passed to OnEvent) + eventData: new ApprovalData // Event payload (serialized to string) + { + IsApproved = true, + ApprovedBy = "manager@company.com" + }); + +// Using instance ID directly +await client.RaiseEventAsync( + new OrchestrationInstance { InstanceId = "order-12345" }, + "Approval", + new ApprovalData { IsApproved = true }); +``` + +### From Another Orchestration + +Orchestrations cannot directly raise events. Use an activity: + +```csharp +public override async Task RunTask(OrchestrationContext context, SignalInput input) +{ + // Do some work... 
+ + // Use an activity to send the event + await context.ScheduleTask(typeof(SendEventActivity), new SendEventInput + { + TargetInstanceId = input.TargetOrchestrationId, + EventName = "DataReady", + EventData = input.Data + }); +} + +// Activity to send the event +public class SendEventActivity : AsyncTaskActivity +{ + private readonly TaskHubClient _client; + + public SendEventActivity(TaskHubClient client) + { + _client = client; + } + + protected override async Task ExecuteAsync( + TaskContext context, + SendEventInput input) + { + await _client.RaiseEventAsync( + new OrchestrationInstance { InstanceId = input.TargetInstanceId }, + input.EventName, + input.EventData); + return true; + } +} +``` + +### From External Systems (Webhooks) + +```csharp +// In an ASP.NET Core controller +[ApiController] +[Route("api/[controller]")] +public class WebhookController : ControllerBase +{ + private readonly TaskHubClient _client; + + public WebhookController(TaskHubClient client) + { + _client = client; + } + + [HttpPost("approve/{instanceId}")] + public async Task Approve( + string instanceId, + [FromBody] ApprovalRequest request) + { + await _client.RaiseEventAsync( + new OrchestrationInstance { InstanceId = instanceId }, + "Approval", + new ApprovalData + { + IsApproved = request.Approved, + ApprovedBy = User.Identity?.Name + }); + + return Ok(); + } +} +``` + +## Event Patterns + +### Human Approval Workflow + +```csharp +public class ApprovalWorkflow : TaskOrchestration +{ + TaskCompletionSource approvalHandle; + + public override async Task RunTask( + OrchestrationContext context, + ApprovalRequest request) + { + // Step 1: Send approval request email + await context.ScheduleTask(typeof(SendApprovalEmailActivity), new EmailData + { + To = request.ApproverEmail, + Subject = $"Approval needed: {request.Title}", + ApprovalUrl = $"https://myapp.com/approve/{context.OrchestrationInstance.InstanceId}" + }); + + // Step 2: Wait for response with 7-day timeout + 
this.approvalHandle = new TaskCompletionSource(); + + using var cts = new CancellationTokenSource(); + var approvalTask = this.approvalHandle.Task; + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddDays(7), + "timeout", + cts.Token); + + var winner = await Task.WhenAny(approvalTask, timeoutTask); + cts.Cancel(); + + if (winner == timeoutTask) + { + this.approvalHandle = null; + await context.ScheduleTask(typeof(SendTimeoutNotificationActivity), request); + return new ApprovalResult { Status = ApprovalStatus.TimedOut }; + } + + var response = await approvalTask; + this.approvalHandle = null; + + // Step 3: Process the decision + if (response.IsApproved) + { + await context.ScheduleTask(typeof(ProcessApprovalActivity), request); + return new ApprovalResult { Status = ApprovalStatus.Approved }; + } + else + { + await context.ScheduleTask(typeof(ProcessRejectionActivity), new RejectionData + { + Request = request, + Reason = response.RejectionReason + }); + return new ApprovalResult + { + Status = ApprovalStatus.Rejected, + Reason = response.RejectionReason + }; + } + } + + public override void OnEvent(OrchestrationContext context, string name, string input) + { + if (name == "ApprovalResponse" && this.approvalHandle != null) + { + var response = context.MessageDataConverter.Deserialize(input); + this.approvalHandle.SetResult(response); + } + } +} +``` + +### Sequential Multi-Step Events + +```csharp +public class MultiStepOrchestration : TaskOrchestration +{ + TaskCompletionSource currentEventHandle; + string currentEventName; + + public override async Task RunTask(OrchestrationContext context, Request input) + { + // Wait for step 1 + var step1 = await WaitForEvent("Step1Complete"); + await context.ScheduleTask(typeof(ProcessStep1Activity), step1); + + // Wait for step 2 + var step2 = await WaitForEvent("Step2Complete"); + await context.ScheduleTask(typeof(ProcessStep2Activity), step2); + + // Wait for step 3 + var step3 = await 
WaitForEvent("Step3Complete"); + await context.ScheduleTask(typeof(ProcessStep3Activity), step3); + + return new Result { Success = true }; + } + + async Task WaitForEvent(string eventName) + { + this.currentEventName = eventName; + this.currentEventHandle = new TaskCompletionSource(); + var result = await this.currentEventHandle.Task; + this.currentEventHandle = null; + this.currentEventName = null; + return result; + } + + public override void OnEvent(OrchestrationContext context, string name, string input) + { + if (name == this.currentEventName && this.currentEventHandle != null) + { + this.currentEventHandle.SetResult(input); + } + } +} +``` + +## Event Behavior + +### Event Buffering + +Events sent before the orchestration reaches its wait point are **buffered** and delivered when `OnEvent` is called during replay. The framework replays the event from history. + +### Event History + +Events are recorded in the orchestration history: + +```text +EventRaised { Name: "Approval", Input: "{...}" } +``` + +During replay, the `OnEvent` method is called with the same event data from history, one at a time using a single thread, ensuring deterministic behavior. The thread used to call `OnEvent` is the same thread that runs the orchestration code. + +## Best Practices + +### 1. Use Meaningful Event Names + +```csharp +public override void OnEvent(OrchestrationContext context, string name, string input) +{ + // ✅ Good - clear, descriptive names + if (name == "OrderApproved") { ... } + if (name == "PaymentReceived") { ... } + + // ❌ Bad - unclear names + if (name == "Event1") { ... } + if (name == "Data") { ... } +} +``` + +### 2. Include Timeout + +```csharp +// ✅ Good - has timeout +var eventTask = this.eventHandle.Task; +var timeoutTask = context.CreateTimer(deadline, "timeout", cts.Token); +await Task.WhenAny(eventTask, timeoutTask); + +// ⚠️ Risky - waits forever +await this.eventHandle.Task; +``` + +### 3. 
Clean Up Handles + +```csharp +public override async Task RunTask(OrchestrationContext context, Request input) +{ + this.eventHandle = new TaskCompletionSource(); + + try + { + var result = await this.eventHandle.Task; + return new Result { Data = result }; + } + finally + { + // ✅ Always clean up + this.eventHandle = null; + } +} +``` + +### 4. Validate Event Data + +```csharp +public override void OnEvent(OrchestrationContext context, string name, string input) +{ + if (name == "Approval" && this.approvalHandle != null) + { + var response = context.MessageDataConverter.Deserialize(input); + + // Validate before completing + if (string.IsNullOrEmpty(response.ApprovedBy)) + { + // Could log warning or ignore invalid event + return; + } + + this.approvalHandle.SetResult(response); + } +} +``` + +### 5. Document Expected Events + +```csharp +/// +/// Order processing orchestration. +/// +/// Expected external events: +/// - "PaymentConfirmed" (PaymentData): Payment has been processed +/// - "ShippingReady" (ShippingData): Order is ready to ship +/// - "Cancel" (CancellationData): Cancel the order +/// +public class OrderOrchestration : TaskOrchestration +{ + // ... +} +``` + +## Next Steps + +- [Timers](timers.md) — Combining events with timeouts +- [Sub-Orchestrations](sub-orchestrations.md) — Coordinating child workflows +- [Error Handling](error-handling.md) — Handling event failures diff --git a/docs/features/retries.md b/docs/features/retries.md new file mode 100644 index 000000000..2baad2623 --- /dev/null +++ b/docs/features/retries.md @@ -0,0 +1,303 @@ +# Automatic Retries + +The Durable Task Framework supports automatic retries for activities and sub-orchestrations. Retries are handled durably - the retry count and timing survive process restarts. 
+ +## Basic Retry Configuration + +### RetryOptions + +Configure retries using `RetryOptions`: + +```csharp +var retryOptions = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3); +``` + +### Calling with Retry + +```csharp +var result = await context.ScheduleWithRetry( + typeof(UnreliableActivity), + retryOptions, + input); +``` + +## RetryOptions Properties + +| Property | Description | Default | +| -------- | ----------- | ------- | +| `FirstRetryInterval` | Delay before the first retry | Required | +| `MaxNumberOfAttempts` | Maximum total attempts (including first) | Required | +| `BackoffCoefficient` | Multiplier for exponential backoff | 1.0 | +| `MaxRetryInterval` | Maximum delay between retries | `TimeSpan.MaxValue` | +| `RetryTimeout` | Total time allowed for all retries | `TimeSpan.MaxValue` | +| `Handle` | Custom exception filter function | Retry all | + +## Retry Patterns + +### Fixed Interval + +Same delay between each retry: + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(10), + maxNumberOfAttempts: 5); +// BackoffCoefficient defaults to 1.0 +// Delays: 10s, 10s, 10s, 10s +``` + +### Exponential Backoff + +Increasing delays between retries: + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(1), + maxNumberOfAttempts: 5) +{ + BackoffCoefficient = 2.0 +}; +// Delays: 1s, 2s, 4s, 8s +``` + +### Exponential with Max Interval + +Cap the maximum delay: + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(1), + maxNumberOfAttempts: 10) +{ + BackoffCoefficient = 2.0, + MaxRetryInterval = TimeSpan.FromMinutes(1) +}; +// Delays: 1s, 2s, 4s, 8s, 16s, 32s, 60s, 60s, 60s +``` + +### With Timeout + +Limit total retry time: + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 100) // High limit +{ + BackoffCoefficient = 2.0, + MaxRetryInterval = 
TimeSpan.FromMinutes(5), + RetryTimeout = TimeSpan.FromHours(1) // Stop after 1 hour +}; +``` + +## Custom Exception Handling + +### Handle Specific Exceptions + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3) +{ + Handle = exception => + { + // Only retry on transient failures + return exception is HttpRequestException || + exception is TimeoutException; + } +}; +``` + +### Retry Based on Exception Details + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 5) +{ + Handle = exception => + { + if (exception is ApiException apiEx) + { + // Retry on 429 (rate limit) or 5xx (server errors) + return apiEx.StatusCode == 429 || + (int)apiEx.StatusCode >= 500; + } + return false; + } +}; +``` + +### Never Retry Specific Errors + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3) +{ + Handle = exception => + { + // Don't retry validation errors + if (exception is ValidationException) + return false; + + // Don't retry authentication errors + if (exception is AuthenticationException) + return false; + + return true; // Retry everything else + } +}; +``` + +## Sub-Orchestration Retries + +Retry sub-orchestrations with the same pattern: + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromMinutes(1), + maxNumberOfAttempts: 3); + +var result = await context.CreateSubOrchestrationInstanceWithRetry( + typeof(ChildOrchestration), + options, + input); +``` + +## Retry Behavior + +### What Happens During Retry + +1. Activity throws an exception +2. Framework records `TaskFailed` event +3. Retry timer is created (durable) +4. Timer fires, activity is scheduled again +5. If successful, `TaskCompleted` is recorded +6. 
If failed and attempts remain, go to step 2 + +### Durability + +Retries are durable: + +- Retry count survives process restarts +- Timer state is persisted +- No duplicate executions + +### Final Failure + +After all retries are exhausted: + +- `TaskFailedException` is thrown in the orchestration +- It contains the last exception as `InnerException` +- The orchestration can catch and handle it + +```csharp +try +{ + var result = await context.ScheduleWithRetry( + typeof(UnreliableActivity), + retryOptions, + input); +} +catch (TaskFailedException ex) +{ + // All retries failed + _logger.LogError(ex.InnerException, "Activity failed after all retries"); + await context.ScheduleTask(typeof(CompensationActivity), input); +} +``` + +## Best Practices + +### 1. Use Idempotent Activities + +Activities may execute multiple times: + +```csharp +public class PaymentActivity : AsyncTaskActivity +{ + protected override async Task ExecuteAsync( + TaskContext context, + Payment input) + { + // Use idempotency key to prevent duplicate charges + return await _paymentService.ChargeAsync( + input.Amount, + idempotencyKey: input.OrderId); + } +} +``` + +### 2. Don't Retry Non-Transient Errors + +```csharp +var options = new RetryOptions(...) +{ + Handle = ex => !(ex is ValidationException) && + !(ex is NotFoundException) +}; +``` + +### 3. Set Reasonable Timeouts + +```csharp +var options = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 10) +{ + RetryTimeout = TimeSpan.FromMinutes(30) // Don't retry forever +}; +``` + +### 4. 
Consider Circuit Breaker Pattern + +For repeated failures, consider manual circuit breaking: + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + int consecutiveFailures = 0; + + while (consecutiveFailures < 3) + { + try + { + return await context.ScheduleWithRetry( + typeof(MyActivity), + retryOptions, + input); + } + catch (TaskFailedException) + { + consecutiveFailures++; + await context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(5 * consecutiveFailures), + true); + } + } + + throw new Exception("Circuit breaker opened"); +} +``` + +## Comparison with Activity-Level Retry + +Activity-level retries (inside the activity code) are not durable and do not survive orchestration restarts. They also do not appear in the orchestration history. + +| Feature | Orchestration Retry (ScheduleWithRetry) | Activity-Internal Retry | +| ------- | -------------------------------------- | ---------------------- | +| Durable | ✅ Yes | ❌ No | +| Survives crashes | ✅ Yes | ❌ No | +| Visible in history | ✅ Yes | ❌ No | +| Configurable per-call | ✅ Yes | ⚠️ Limited | + +Prefer orchestration-level retries for durability. + +## Next Steps + +- [Error Handling](error-handling.md) — Comprehensive error handling patterns +- [Timers](timers.md) — Durable timers and delays +- [Activities](../concepts/activities.md) — Writing retry-safe activities diff --git a/docs/features/sub-orchestrations.md b/docs/features/sub-orchestrations.md new file mode 100644 index 000000000..5305c580d --- /dev/null +++ b/docs/features/sub-orchestrations.md @@ -0,0 +1,315 @@ +# Sub-Orchestrations + +Sub-orchestrations allow you to break complex workflows into smaller, reusable pieces. A parent orchestration can start child orchestrations and wait for their results. 
+ +## Creating Sub-Orchestrations + +### Basic Usage + +```csharp +public override async Task RunTask( + OrchestrationContext context, + OrderInput input) +{ + // Start a sub-orchestration and wait for result + var paymentResult = await context.CreateSubOrchestrationInstance( + typeof(PaymentOrchestration), + input.PaymentData); + + return new OrderResult { PaymentId = paymentResult.TransactionId }; +} +``` + +### With Custom Instance ID + +```csharp +var result = await context.CreateSubOrchestrationInstance( + typeof(ShippingOrchestration), + instanceId: $"shipping-{input.OrderId}", // Custom ID + input: input.ShippingData); +``` + +### With Retry Options + +```csharp +var retryOptions = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(30), + maxNumberOfAttempts: 3) +{ + BackoffCoefficient = 2.0 +}; + +var result = await context.CreateSubOrchestrationInstanceWithRetry( + typeof(ChildOrchestration), + retryOptions, + input); +``` + +## Sub-Orchestration Patterns + +### Sequential Sub-Orchestrations + +```csharp +public override async Task RunTask( + OrchestrationContext context, + OrderInput input) +{ + // Step 1: Validate + var validationResult = await context.CreateSubOrchestrationInstance( + typeof(ValidationOrchestration), + input); + + if (!validationResult.IsValid) + return new OrderResult { Error = validationResult.Error }; + + // Step 2: Process payment + var paymentResult = await context.CreateSubOrchestrationInstance( + typeof(PaymentOrchestration), + input.PaymentData); + + // Step 3: Fulfill order + var fulfillmentResult = await context.CreateSubOrchestrationInstance( + typeof(FulfillmentOrchestration), + new FulfillmentInput { OrderId = input.OrderId, PaymentId = paymentResult.Id }); + + return new OrderResult { Success = true, TrackingNumber = fulfillmentResult.TrackingNumber }; +} +``` + +### Parallel Sub-Orchestrations (Fan-Out) + +```csharp +public override async Task RunTask( + OrchestrationContext context, + BatchInput input) +{ + // 
Start all sub-orchestrations in parallel + var tasks = input.Items.Select(item => + context.CreateSubOrchestrationInstance( + typeof(ProcessItemOrchestration), + instanceId: $"item-{item.Id}", + input: item)); + + // Wait for all to complete + var results = await Task.WhenAll(tasks); + + return new BatchResult + { + ProcessedCount = results.Length, + SuccessCount = results.Count(r => r.Success), + FailedItems = results.Where(r => !r.Success).Select(r => r.ItemId).ToList() + }; +} +``` + +### Conditional Sub-Orchestrations + +```csharp +public override async Task RunTask( + OrchestrationContext context, + Input input) +{ + // Choose sub-orchestration based on input + if (input.ProcessType == ProcessType.Express) + { + return await context.CreateSubOrchestrationInstance( + typeof(ExpressProcessingOrchestration), + input); + } + else + { + return await context.CreateSubOrchestrationInstance( + typeof(StandardProcessingOrchestration), + input); + } +} +``` + +### Hierarchical Workflows + +```csharp +// Top-level orchestration +public class ProjectOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + ProjectInput input) + { + var results = new List(); + + foreach (var phase in input.Phases) + { + var phaseResult = await context.CreateSubOrchestrationInstance( + typeof(PhaseOrchestration), + phase); + results.Add(phaseResult); + + if (!phaseResult.Success) + break; // Stop on failure + } + + return new ProjectResult { Phases = results }; + } +} + +// Second-level orchestration +public class PhaseOrchestration : TaskOrchestration +{ + public override async Task RunTask( + OrchestrationContext context, + PhaseInput input) + { + // Each phase has multiple tasks as sub-orchestrations + var taskResults = await Task.WhenAll( + input.Tasks.Select(t => + context.CreateSubOrchestrationInstance( + typeof(TaskOrchestration), + t))); + + return new PhaseResult + { + Success = taskResults.All(r => r.Success), + TaskResults = 
taskResults.ToList() + }; + } +} +``` + +## Sub-Orchestration vs Activity + +| Feature | Sub-Orchestration | Activity | +| ------- | ----------------- | -------- | +| **Can call other orchestrations** | ✅ Yes | ❌ Not directly | +| **Can use timers** | ✅ Yes | ❌ No | +| **Can wait for events** | ✅ Yes | ❌ No | +| **Has own history** | ✅ Yes | ❌ No | +| **Overhead** | Higher | Lower | +| **Use for** | Complex workflows | Single operations | + +### When to Use Sub-Orchestrations + +✅ **Use sub-orchestrations when:** + +- The child workflow needs timers, events, or fan-out +- You want isolated history (for debugging, monitoring, or to reduce parent history size) +- You want to distribute orchestration work across multiple worker instances +- The logic is reusable across multiple parent orchestrations +- The child workflow is complex enough to warrant separate management + +❌ **Use activities instead when:** + +- Performing a single operation (API call, DB query) +- No need for durable timers or events +- Simple, stateless work + +## Instance ID Management + +### Auto-Generated IDs + +```csharp +// ID is an automatically generated GUID +var result = await context.CreateSubOrchestrationInstance( + typeof(ChildOrchestration), + input); +``` + +### Custom IDs for Idempotency + +```csharp +// Using custom ID ensures idempotency +var result = await context.CreateSubOrchestrationInstance( + typeof(ChildOrchestration), + instanceId: $"{context.OrchestrationInstance.InstanceId}:child:{input.ItemId}", + input: input); +``` + +### Naming Conventions + +```csharp +// Good patterns for sub-orchestration IDs: +$"{parentId}:payment" // Single child of type +$"{parentId}:item:{itemId}" // Multiple children by item +$"order-{orderId}:fulfillment" // Business-meaningful +``` + +## Error Handling + +### Catching Sub-Orchestration Failures + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + try + { + var result = await 
context.CreateSubOrchestrationInstance( + typeof(RiskyOrchestration), + input); + return result; + } + catch (SubOrchestrationFailedException ex) + { + // Sub-orchestration threw an unhandled exception + await context.ScheduleTask(typeof(CompensationActivity), input); + return new Result { Error = ex.Message }; + } +} +``` + +### Timeout Handling + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + using var cts = new CancellationTokenSource(); + + var subOrchTask = context.CreateSubOrchestrationInstance( + typeof(LongRunningOrchestration), + input); + + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddHours(1), + true, + cts.Token); + + var winner = await Task.WhenAny(subOrchTask, timeoutTask); + + if (winner == subOrchTask) + { + cts.Cancel(); + return await subOrchTask; + } + else + { + // Note: The sub-orchestration continues running! + // Consider terminating it via an activity if needed + return new Result { TimedOut = true }; + } +} +``` + +## Monitoring Sub-Orchestrations + +### Getting Sub-Orchestration Status + +```csharp +// From an activity or external code +var service = GetOrchestrationService(); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); +var state = await client.GetOrchestrationStateAsync( + new OrchestrationInstance { InstanceId = subOrchestrationId }); +``` + +### Viewing in History + +Sub-orchestration events in parent history: + +```text +SubOrchestrationInstanceCreated { InstanceId: "child-123", Name: "PaymentOrchestration" } +SubOrchestrationInstanceCompleted { InstanceId: "child-123", Result: "{...}" } +``` + +## Next Steps + +- [Activities](../concepts/activities.md) — When to use activities instead +- [Error Handling](error-handling.md) — Handling sub-orchestration failures +- [Versioning](versioning.md) — Updating sub-orchestration code diff --git a/docs/features/timers.md b/docs/features/timers.md new file mode 100644 index 000000000..d413947fd 
--- /dev/null +++ b/docs/features/timers.md @@ -0,0 +1,304 @@ +# Durable Timers + +Durable timers allow orchestrations to wait for specified durations or until specific times. Unlike `Thread.Sleep` or `Task.Delay`, durable timers are persisted and survive process restarts. + +## Creating Timers + +### Wait for Duration + +```csharp +// Wait for 5 minutes +await context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(5), + true); +``` + +### Wait Until Specific Time + +```csharp +// Wait until midnight +var midnight = context.CurrentUtcDateTime.Date.AddDays(1); +await context.CreateTimer(midnight, true); +``` + +### Timer with CancellationToken + +```csharp +using var cts = new CancellationTokenSource(); +var timerTask = context.CreateTimer( + context.CurrentUtcDateTime.AddHours(1), + true, + cts.Token); + +// Cancel the timer if needed +cts.Cancel(); +``` + +## Common Patterns + +### Timeout with Fallback + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + using var cts = new CancellationTokenSource(); + + var workTask = context.ScheduleTask(typeof(LongRunningActivity), input); + var timeoutTask = context.CreateTimer( + context.CurrentUtcDateTime.AddMinutes(30), + true, + cts.Token); + + var winner = await Task.WhenAny(workTask, timeoutTask); + + if (winner == workTask) + { + cts.Cancel(); // Cancel the timer + return await workTask; + } + else + { + // Timeout occurred + return new Result { TimedOut = true }; + } +} +``` + +### Approval with Deadline + +```csharp +public override async Task RunTask( + OrchestrationContext context, + ApprovalRequest request) +{ + // Send approval request + await context.ScheduleTask(typeof(SendApprovalEmail), request); + + using var cts = new CancellationTokenSource(); + + // Wait for approval event or 7-day timeout + var approvalTask = context.WaitForExternalEvent("Approved"); + var deadlineTask = context.CreateTimer( + context.CurrentUtcDateTime.AddDays(7), + true, + cts.Token); + 
+ var winner = await Task.WhenAny(approvalTask, deadlineTask); + + if (winner == approvalTask) + { + cts.Cancel(); + var approved = await approvalTask; + return new ApprovalResult { Approved = approved }; + } + else + { + return new ApprovalResult { Approved = false, Expired = true }; + } +} +``` + +### Periodic Polling (Monitor Pattern) + +```csharp +public override async Task RunTask( + OrchestrationContext context, + JobInput input) +{ + var expirationTime = context.CurrentUtcDateTime.AddHours(4); + var pollingInterval = TimeSpan.FromSeconds(30); + + while (context.CurrentUtcDateTime < expirationTime) + { + var status = await context.ScheduleTask( + typeof(CheckJobStatusActivity), + input.JobId); + + if (status.IsComplete) + { + return new JobResult { Success = true, Data = status.Data }; + } + + // Wait before next poll + var nextCheck = context.CurrentUtcDateTime.Add(pollingInterval); + await context.CreateTimer(nextCheck, true); + + // Optional: exponential backoff + pollingInterval = TimeSpan.FromSeconds( + Math.Min(pollingInterval.TotalSeconds * 1.5, 300)); + } + + return new JobResult { Success = false, TimedOut = true }; +} +``` + +### Scheduled Execution + +```csharp +public override async Task RunTask( + OrchestrationContext context, + ScheduledTask input) +{ + // Wait until scheduled time + if (context.CurrentUtcDateTime < input.ScheduledTime) + { + await context.CreateTimer(input.ScheduledTime, true); + } + + // Execute the task + return await context.ScheduleTask( + typeof(ScheduledWorkActivity), + input); +} +``` + +### Cron-like Scheduling + +```csharp +public override async Task RunTask(OrchestrationContext context, CronInput input) +{ + var nextRun = GetNextCronTime(input.CronExpression, context.CurrentUtcDateTime); + + // Wait until next scheduled time + await context.CreateTimer(nextRun, true); + + // Execute scheduled work + await context.ScheduleTask(typeof(CronJobActivity), input); + + // Continue as new for next iteration + 
context.ContinueAsNew(input); +} + +private DateTime GetNextCronTime(string cronExpression, DateTime fromTime) +{ + // Use a cron parsing library like Cronos + var expression = CronExpression.Parse(cronExpression); + return expression.GetNextOccurrence(fromTime, TimeZoneInfo.Utc) + ?? throw new InvalidOperationException("No next occurrence"); +} +``` + +### Reminder/Notification Pattern + +```csharp +public override async Task RunTask(OrchestrationContext context, ReminderInput input) +{ + // Send initial notification + await context.ScheduleTask(typeof(SendReminderActivity), new ReminderData + { + UserId = input.UserId, + Message = input.InitialMessage + }); + + // Send follow-up reminders + foreach (var reminder in input.FollowUpSchedule) + { + await context.CreateTimer( + context.CurrentUtcDateTime.Add(reminder.Delay), + true); + + await context.ScheduleTask(typeof(SendReminderActivity), new ReminderData + { + UserId = input.UserId, + Message = reminder.Message + }); + } +} +``` + +## Timer Behavior + +### Durability + +Timers are persisted as `TimerCreated` events: + +```text +1. ExecutionStarted +2. TimerCreated { FireAt: "2024-01-15T10:00:00Z" } +``` + +When the timer fires, a `TimerFired` event is added: + +```text +3. TimerFired { TimerId: 1 } +``` + +### Replay Behavior + +During replay: + +- Past timers complete immediately (fire time already passed) +- Future timers wait for the scheduled time + +### Minimum Duration + +Very short timers (< 1 second) may not provide precise timing due to: + +- Message processing overhead +- Partition lease renewal intervals +- Clock synchronization + +For precise short delays, use activities. + +## Best Practices + +### 1. Always Use Context Time + +```csharp +// ✅ Correct +await context.CreateTimer(context.CurrentUtcDateTime.AddMinutes(5), true); + +// ❌ Wrong - non-deterministic +await context.CreateTimer(DateTime.UtcNow.AddMinutes(5), true); +``` + +### 2. 
Cancel Unused Timers + +```csharp +using var cts = new CancellationTokenSource(); +var timer = context.CreateTimer(deadline, true, cts.Token); +var work = context.WaitForExternalEvent("Event"); + +var winner = await Task.WhenAny(timer, work); +if (winner == work) +{ + cts.Cancel(); // Important: cancel the timer +} +``` + +> [!NOTE] +> If an orchestration completes while timers are pending, the orchestration will remain in the "Running" state until all timers either fire or are cancelled. + +### 3. Avoid Very Long Timers Without ContinueAsNew + +Very long timers make it harder to version orchestration code. Periodically break up long waits using `ContinueAsNew` if possible. + +```csharp +// For very long waits, consider breaking up with ContinueAsNew +public override async Task RunTask(OrchestrationContext context, WaitInput input) +{ + var remainingWait = input.TotalWait - (context.CurrentUtcDateTime - input.StartTime); + + if (remainingWait > TimeSpan.FromDays(7)) + { + // Wait for a week, then continue as new + await context.CreateTimer( + context.CurrentUtcDateTime.AddDays(7), + true); + context.ContinueAsNew(input); + return; + } + + await context.CreateTimer( + context.CurrentUtcDateTime.Add(remainingWait), + true); + + await context.ScheduleTask(typeof(FinalActivity), input); +} +``` + +## Next Steps + +- [External Events](external-events.md) — Combining timers with events +- [Eternal Orchestrations](eternal-orchestrations.md) — Long-running workflows +- [Replay and Durability](../concepts/replay-and-durability.md) — How timers are persisted diff --git a/docs/features/versioning.md b/docs/features/versioning.md new file mode 100644 index 000000000..0eb6d21c4 --- /dev/null +++ b/docs/features/versioning.md @@ -0,0 +1,498 @@ +# Orchestration Versioning + +When you need to update orchestration code while instances are running, careful versioning strategies are required to avoid breaking in-flight orchestrations.
+ +## The Versioning Problem + +Orchestrations use [replay](../concepts/replay-and-durability.md) to rebuild state. If you change the code while an orchestration is in-flight, replay can fail. + +### Example: Adding an Activity at the Beginning + +```csharp +// Version 1 +public override async Task RunTask(OrchestrationContext context, string input) +{ + var a = await context.ScheduleTask(typeof(ActivityA), input); + var b = await context.ScheduleTask(typeof(ActivityB), a); + return b; +} + +// Version 2 - Added a new activity at the BEGINNING +public override async Task RunTask(OrchestrationContext context, string input) +{ + var validated = await context.ScheduleTask(typeof(ValidateActivity), input); // NEW + var a = await context.ScheduleTask(typeof(ActivityA), validated); + var b = await context.ScheduleTask(typeof(ActivityB), a); + return b; +} +``` + +Suppose an instance started with V1 and completed `ActivityA`. Its history contains: + +```text +TaskScheduled { Name: "ActivityA" } +TaskCompleted { Result: "..." } +``` + +When V2 code replays this history: + +1. V2 expects first task to be `ValidateActivity` +2. History shows first task was `ActivityA` +3. **NonDeterministicOrchestrationException** is thrown + +### Why Adding to the End Is Different + +Adding activities at the **end** of an orchestration is generally safe because: + +- Completed orchestrations are never replayed +- In-flight orchestrations haven't reached that point yet + +```csharp +// Version 2 - Adding at the END is safe +public override async Task RunTask(OrchestrationContext context, string input) +{ + var a = await context.ScheduleTask(typeof(ActivityA), input); + var b = await context.ScheduleTask(typeof(ActivityB), a); + var c = await context.ScheduleTask(typeof(ActivityC), b); // Usually safe to add here + return c; +} +``` + +However, be cautious if in-flight orchestrations are waiting on timers or external events near the end—they may still replay and encounter the new code. 
+ +## Versioning Strategies + +### Strategy 1: Side-by-Side Versioning + +Deploy multiple versions of the orchestration simultaneously using `NameValueObjectCreator`: + +```csharp +// Define both versions as separate classes +public class OrderOrchestrationV1 : TaskOrchestration +{ + public override async Task RunTask(OrchestrationContext context, Input input) + { + // V1 logic + } +} + +public class OrderOrchestrationV2 : TaskOrchestration +{ + public override async Task RunTask(OrchestrationContext context, Input input) + { + // V2 logic with new features + } +} + +// Register both with explicit name and version +worker.AddTaskOrchestrations( + new NameValueObjectCreator( + "OrderOrchestration", "V1", typeof(OrderOrchestrationV1)), + new NameValueObjectCreator( + "OrderOrchestration", "V2", typeof(OrderOrchestrationV2))); +``` + +Start new instances with the new version: + +```csharp +// Start with specific version +var instance = await client.CreateOrchestrationInstanceAsync( + "OrderOrchestration", + "V2", // Version string must match registration + input); +``` + +### Strategy 2: Feature Flags with Version Check + +Check version in orchestration code: + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + var a = await context.ScheduleTask(typeof(ActivityA), input); + + // Only run new code for instances started after cutoff + if (context.OrchestrationInstance.ExecutionId != null && + input.Version >= 2) + { + var b = await context.ScheduleTask(typeof(ActivityB), a); + return new Result { Data = b }; + } + + return new Result { Data = a }; +} +``` + +### Strategy 3: Wait for Completion + +The safest approach for breaking changes: + +1. **Stop starting new instances** of the old version +2. **Wait for all running instances** to complete +3. **Deploy the new version** +4. 
**Resume starting instances** + +```csharp +// Query running instances +var runningInstances = await client.GetOrchestrationStateAsync( + new OrchestrationStateQuery + { + RuntimeStatus = new[] { OrchestrationStatus.Running } + }); + +// Wait for completion +while (runningInstances.Any()) +{ + await Task.Delay(TimeSpan.FromMinutes(1)); + runningInstances = await client.GetOrchestrationStateAsync(...); +} + +// Safe to deploy new version +``` + +### Strategy 4: Graceful Migration + +For long-running orchestrations, add a migration point: + +```csharp +// V1: Add migration check +public override async Task RunTask(OrchestrationContext context, Input input) +{ + // Check if migration is needed + if (input.ShouldMigrate) + { + // Start V2 orchestration with current state + var result = await context.CreateSubOrchestrationInstance( + "OrderOrchestration", + "2.0", + input); + return result; + } + + // Continue with V1 logic + var a = await context.ScheduleTask(typeof(ActivityA), input); + return new Result { Data = a }; +} +``` + +### Strategy 5: Worker-Level Version Filtering + +Configure workers to only process orchestrations matching specific version criteria using `VersioningSettings`. This enables zero-downtime deployments by running multiple worker versions simultaneously. + +#### Setting Up VersioningSettings + +```csharp +using DurableTask.Core.Settings; + +var versioningSettings = new VersioningSettings +{ + Version = "2.0", + MatchStrategy = VersioningSettings.VersionMatchStrategy.CurrentOrOlder, + FailureStrategy = VersioningSettings.VersionFailureStrategy.Reject +}; + +var worker = new TaskHubWorker(orchestrationService, versioningSettings, loggerFactory); +``` + +> [!IMPORTANT] +> The `Version` property serves two purposes: +> +> 1. It defines which orchestrations this worker will process (based on `MatchStrategy`) +> 2. 
It becomes the **default version** for all new orchestrations created without an explicit version + +This means when you start a new orchestration without specifying a version, it will automatically be stamped with the worker's configured version: + +```csharp +// This orchestration will be created with version "2.0" (from VersioningSettings) +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(OrderOrchestration), + input); +``` + +#### Version Match Strategies + +| Strategy | Description | +| -------- | ----------- | +| `None` | Default. Ignore version, process all orchestrations. | +| `Strict` | Only process orchestrations with an **exact** version match. | +| `CurrentOrOlder` | Process orchestrations with version **less than or equal** to the worker version. | + +#### Version Failure Strategies + +| Strategy | Description | +| -------- | ----------- | +| `Reject` | Default. Abandon the work item so another worker can pick it up (or retry later). | +| `Fail` | Fail the orchestration with a `VersionMismatch` error. 
| + +#### Blue-Green Deployment Example + +Run old and new workers simultaneously during deployments: + +```csharp +// OLD worker (handles existing orchestrations) +var oldSettings = new VersioningSettings +{ + Version = "1.0", + MatchStrategy = VersioningSettings.VersionMatchStrategy.Strict, + FailureStrategy = VersioningSettings.VersionFailureStrategy.Reject +}; +var oldWorker = new TaskHubWorker(orchestrationService, oldSettings, loggerFactory); +oldWorker.AddTaskOrchestrations(typeof(OrderOrchestrationV1)); + +// NEW worker (handles new orchestrations) +var newSettings = new VersioningSettings +{ + Version = "2.0", + MatchStrategy = VersioningSettings.VersionMatchStrategy.Strict, + FailureStrategy = VersioningSettings.VersionFailureStrategy.Reject +}; +var newWorker = new TaskHubWorker(orchestrationService, newSettings, loggerFactory); +newWorker.AddTaskOrchestrations(typeof(OrderOrchestrationV2)); + +// Both workers run simultaneously +// - V1 orchestrations are processed by oldWorker +// - V2 orchestrations are processed by newWorker +// Once all V1 orchestrations complete, retire oldWorker +``` + +#### Version Comparison + +Versions are compared using the following rules: + +1. Empty versions are treated as "unversioned" and compare as less than any defined version +2. If both versions can be parsed as `System.Version` (e.g., "1.0.0", "2.1"), numeric comparison is used +3. 
Otherwise, case-insensitive string comparison is used + +```csharp +// Version comparison examples +VersioningSettings.CompareVersions("1.0.0", "1.0.0"); // Returns 0 (equal) +VersioningSettings.CompareVersions("2.0.0", "1.0.0"); // Returns 1 (greater) +VersioningSettings.CompareVersions("1.0.0", "2.0.0"); // Returns -1 (less) +VersioningSettings.CompareVersions("", "1.0.0"); // Returns -1 (empty < defined) +``` + +#### Accessing Version in Orchestrations + +The orchestration version is available via `OrchestrationContext.Version`: + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + // Access the version this orchestration was started with + string version = context.Version; + + if (!context.IsReplaying) + { + _logger.LogInformation("Processing orchestration version: {Version}", version); + } + + // Use version for conditional logic (CompareVersions handles "2.0", "2.1", "3.0", etc.) + if (VersioningSettings.CompareVersions(version, "2.0") >= 0) + { + // V2+ specific logic + } + + // ... 
+} +``` + +## Safe Code Changes + +### Changes That Are Safe + +✅ **Adding activities at the end** (after all existing durable operations): + +```csharp +// Safe - existing orchestrations completed or haven't reached this point +var a = await context.ScheduleTask(typeof(ActivityA), input); +var b = await context.ScheduleTask(typeof(ActivityB), a); +var c = await context.ScheduleTask(typeof(ActivityC), b); // Added at end +``` + +✅ **Changing activity implementation** (not the orchestration code): + +```csharp +// Safe - activity logic doesn't affect replay +public class ActivityA : TaskActivity<string, string> +{ + protected override string Execute(TaskContext context, string input) + { + return input.ToUpper(); // Changed from ToLower() + } +} +``` + +✅ **Adding logging or metrics** (using IsReplaying): + +```csharp +if (!context.IsReplaying) +{ + _logger.LogInformation("Processing..."); // Safe to add +} +``` + +✅ **Changing non-durable code**: + +```csharp +var formatted = input.Trim().ToLower(); // Safe to change +var result = await context.ScheduleTask(typeof(MyActivity), formatted); +``` + +### Changes That Are NOT Safe + +❌ **Removing or reordering activities**: + +```csharp +// V1 +var a = await context.ScheduleTask(typeof(ActivityA), input); +var b = await context.ScheduleTask(typeof(ActivityB), a); + +// V2 - BREAKS replay +var b = await context.ScheduleTask(typeof(ActivityB), input); +var a = await context.ScheduleTask(typeof(ActivityA), b); +``` + +❌ **Changing activity types**: + +```csharp +// V1 +await context.ScheduleTask(typeof(ActivityA), input); + +// V2 - BREAKS replay (different activity name) +await context.ScheduleTask(typeof(ActivityANew), input); +``` + +❌ **Changing conditional logic that affects scheduling**: + +```csharp +// V1 +if (input.Amount > 100) + await context.ScheduleTask(typeof(LargeOrderActivity), input); + +// V2 - BREAKS replay (different threshold) +if (input.Amount > 50) // Changed condition!
+ await context.ScheduleTask(typeof(LargeOrderActivity), input); +``` + +❌ **Adding activities in the middle**: + +```csharp +// V1 +var a = await context.ScheduleTask(typeof(ActivityA), input); +var c = await context.ScheduleTask(typeof(ActivityC), a); + +// V2 - BREAKS replay +var a = await context.ScheduleTask(typeof(ActivityA), input); +var b = await context.ScheduleTask(typeof(ActivityB), a); // Added in middle! +var c = await context.ScheduleTask(typeof(ActivityC), b); +``` + +❌ **Changing retry policies**: + +```csharp +// V1 +var options = new RetryOptions(TimeSpan.FromSeconds(5), maxNumberOfAttempts: 3); +await context.ScheduleWithRetry(typeof(ActivityA), options, input); + +// V2 - BREAKS replay (different retry behavior recorded in history) +var options = new RetryOptions(TimeSpan.FromSeconds(10), maxNumberOfAttempts: 5); +await context.ScheduleWithRetry(typeof(ActivityA), options, input); +``` + +## Orchestration Name Registration + +### Custom Naming + +By default, orchestrations are registered using their class name. Use `NameValueObjectCreator` to specify a custom name: + +```csharp +public class OrderOrchestration : TaskOrchestration { } + +// Register with custom name "OrderProcessing" instead of class name +worker.AddTaskOrchestrations( + new NameValueObjectCreator( + "OrderProcessing", "", typeof(OrderOrchestration))); +``` + +### Side-by-Side Registration + +Use `NameValueObjectCreator` to register multiple versions of the same orchestration: + +```csharp +public class OrderOrchestrationV1 : TaskOrchestration { /* V1 impl */ } + +public class OrderOrchestrationV2 : TaskOrchestration { /* V2 impl */ } +``` + +### Registration + +```csharp +worker.AddTaskOrchestrations( + new NameValueObjectCreator( + "OrderProcessing", + "V1", + typeof(OrderOrchestrationV1)), + new NameValueObjectCreator( + "OrderProcessing", + "V2", + typeof(OrderOrchestrationV2))); +``` + +## Best Practices + +### 1. 
Plan for Versioning from the Start + +```csharp +public class OrderInput +{ + public int Version { get; set; } = 1; // Include version in input + public string OrderId { get; set; } + // ... +} +``` + +### 2. Use Feature Flags for Gradual Rollout + +```csharp +public override async Task RunTask(OrchestrationContext context, Input input) +{ + if (input.Features.UseNewPaymentFlow) + { + return await NewPaymentFlowAsync(context, input); + } + return await LegacyPaymentFlowAsync(context, input); +} +``` + +### 3. Keep Orchestrations Short-Lived When Possible + +Long-running orchestrations are harder to version. Consider: + +- Breaking into sub-orchestrations +- Using `ContinueAsNew` more frequently +- Designing for completion within hours/days, not months + +### 4. Document Breaking Changes + +```csharp +/// <summary> +/// Order processing orchestration. +/// +/// Version History: +/// - V1: Initial version +/// - V2: Added fraud check activity (BREAKING - wait for V1 completion) +/// - V2.1: Updated logging (compatible with V2) +/// </summary> +public class OrderOrchestrationV2_1 : TaskOrchestration { } + +// Register with name and version +worker.AddTaskOrchestrations( + new NameValueObjectCreator<TaskOrchestration>( + "OrderProcessing", "V2.1", typeof(OrderOrchestrationV2_1))); +``` + +## Next Steps + +- [Replay and Durability](../concepts/replay-and-durability.md) — Understanding why versioning matters +- [Deterministic Constraints](../concepts/deterministic-constraints.md) — Writing safe orchestration code +- [Error Handling](error-handling.md) — Handling version mismatch errors diff --git a/docs/getting-started/choosing-a-backend.md b/docs/getting-started/choosing-a-backend.md new file mode 100644 index 000000000..cff2cfa3a --- /dev/null +++ b/docs/getting-started/choosing-a-backend.md @@ -0,0 +1,166 @@ +# Choosing a Backend + +The Durable Task Framework (DTFx) supports multiple backend storage providers. This guide helps you choose the right one for your needs.
+ +## Recommendation: Durable Task Scheduler + +For most new projects, we recommend the **[Durable Task Scheduler](../providers/durable-task-scheduler.md)**—a fully managed Azure service that eliminates infrastructure management and provides the best developer experience. + +## Provider Comparison + +| Feature | [Durable Task Scheduler](../providers/durable-task-scheduler.md) | [Azure Storage](../providers/azure-storage.md) | [MSSQL](../providers/mssql.md) | [Service Bus](../providers/service-bus.md) | [Service Fabric](../providers/service-fabric.md) | [Emulator](../providers/emulator.md) | +| ------- | ---------------------- | ------------- | ----- | ----------- | -------------- | -------- | +| **Type** | ⭐ Managed service | Self-managed | Self-managed | Self-managed | Self-managed | In-memory | +| **Production ready** | ✅ Yes | ✅ Yes | ✅ Yes | ✅ Yes | ✅ Yes | ❌ No | +| **Azure support SLA** | ✅ Yes | ❌ No | ❌ No | ❌ No | ❌ No | ❌ No | +| **Infrastructure** | None required | Storage account | SQL Server database | Service Bus namespace | Service Fabric cluster | None | +| **Throughput** | Very high | Moderate+ | Moderate+ | Moderate | Unknown | N/A | +| **Latency** | Low | Moderate | Low | Moderate+ | Unknown | Very low | +| **Built-in dashboard** | ✅ Yes | ❌ No | ❌ No | ❌ No | ❌ No | ❌ No | +| **Managed identity** | ✅ Yes | ✅ Yes | ✅ Yes | ✅ Yes | N/A | N/A | +| **Local emulator** | ✅ Docker | N/A | ✅ SQL Server | N/A | N/A | ✅ Built-in | +| **Cost model** | Fixed monthly or per-operation | Storage transactions | Database DTUs/vCores | Messaging units | Cluster nodes | Free | + +## When to Use Each Provider + +### Durable Task Scheduler ⭐ Recommended + +**Best for:** + +- ✅ New projects and greenfield development +- ✅ Production workloads requiring enterprise support +- ✅ Teams that want to minimize operational overhead +- ✅ High-throughput scenarios +- ✅ Applications needing built-in monitoring + +**Considerations:** + +- Requires Azure subscription +- Cost 
based on operations (see [pricing](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/durable-task-scheduler-dedicated-sku)) + +👉 **[Get started with Durable Task Scheduler](../providers/durable-task-scheduler.md)** + +--- + +### Azure Storage + +**Best for:** + +- ✅ Existing Azure Storage deployments +- ✅ Cost-sensitive workloads with moderate throughput +- ✅ Scenarios requiring data residency control +- ✅ Teams already managing Azure Storage infrastructure + +**Considerations:** + +- Throughput limited by Azure Storage transaction limits +- Requires management of storage account, queues, tables, and blobs +- No built-in monitoring dashboard + +👉 **[Get started with Azure Storage](../providers/azure-storage.md)** + +--- + +### MSSQL (Microsoft SQL Server) + +**Best for:** + +- ✅ Non-Azure or hybrid deployments +- ✅ Teams with existing SQL Server expertise +- ✅ Scenarios requiring direct database queries against orchestration state +- ✅ Environments with strict BCDR requirements + +**Considerations:** + +- Requires management of SQL Server database +- State is stored in indexed tables with stored procedures for direct querying +- Available for Azure SQL Database, SQL Server, or any compatible MSSQL database + +👉 **[Get started with MSSQL](https://github.com/microsoft/durabletask-mssql)** + +--- + +### Service Bus + +**Best for:** + +- ✅ Existing Service Bus deployments +- ✅ Low(er)-latency message delivery requirements + +**Considerations:** + +- Requires management of Service Bus namespace +- Tracking store requires separate Azure Storage account + +👉 **[Get started with Service Bus](../providers/service-bus.md)** + +--- + +### Service Fabric + +**Best for:** + +- ✅ Existing Service Fabric clusters +- ✅ Integration with Service Fabric stateful services + +**Considerations:** + +- Requires Service Fabric cluster management +- Tightly coupled to Service Fabric ecosystem + +👉 **[Get started with Service 
Fabric](../providers/service-fabric.md)** + +--- + +### Emulator + +**Best for:** + +- ✅ Local development and testing +- ✅ Unit tests and integration tests +- ✅ Learning and experimentation + +**Considerations:** + +- In-memory only—data is lost on restart +- Not suitable for production use +- Single-process only + +👉 **[Get started with Emulator](../providers/emulator.md)** + +--- + +### Netherite ⚠️ Deprecated + +> **Warning:** Netherite is being deprecated and is not recommended for new projects. + +Netherite is an ultra-high performance backend developed by Microsoft Research that uses Azure Event Hubs and Azure Page Blobs with [FASTER](https://www.microsoft.com/research/project/faster/) database technology. + +**Considerations:** + +- ⚠️ Being deprecated—not recommended for new projects +- More complex infrastructure requirements (Event Hubs + Azure Storage) +- Consider migrating to Durable Task Scheduler for similar performance characteristics + +👉 **[Netherite GitHub Repository](https://github.com/microsoft/durabletask-netherite)** + +--- + +## Migration Between Providers + +Each provider stores orchestration state differently, so migrating between providers requires: + +1. **Completing or terminating** all running orchestrations +2. **Reconfiguring** the application with the new provider +3. **Restarting** orchestrations from scratch + +There is no built-in state migration tool between providers. + +## Need Help Deciding? + +- For **enterprise support**, choose [Durable Task Scheduler](../providers/durable-task-scheduler.md) +- For **non-Azure deployments**, choose [MSSQL](https://github.com/microsoft/durabletask-mssql) +- For **lowest cost**, choose [Azure Storage](../providers/azure-storage.md) +- For **local testing**, choose [Emulator](../providers/emulator.md) + +See [Support](../support.md) for more information about getting help. 
diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 000000000..d1e511090 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,91 @@ +# Installation + +This guide covers installing the Durable Task Framework (DTFx) packages for your project. + +## Prerequisites + +- .NET 6.0 or later (.NET 10.0 is currently recommended) +- .NET Framework 4.7.2 or later (for .NET Framework projects) + +## NuGet Packages + +### Core Package + +All DTFx applications require the core package: + +```bash +dotnet add package Microsoft.Azure.DurableTask.Core +``` + +### Backend Providers + +Backend providers implement the storage and messaging layers for DTFx. You can choose one of several backend providers based on your needs. See [Choosing a Backend](choosing-a-backend.md) for guidance. + +#### Durable Task Scheduler (Recommended) + +For new projects, we recommend the fully managed [Durable Task Scheduler](../providers/durable-task-scheduler.md): + +```bash +dotnet add package Microsoft.DurableTask.AzureManagedBackend +``` + +#### Azure Storage + +For self-managed deployments using Azure Storage (queues, tables, blobs): + +```bash +dotnet add package Microsoft.Azure.DurableTask.AzureStorage +``` + +#### Azure Service Bus + +For deployments using Azure Service Bus: + +```bash +dotnet add package Microsoft.Azure.DurableTask.ServiceBus +``` + +#### Azure Service Fabric + +For Service Fabric applications: + +```bash +dotnet add package Microsoft.Azure.DurableTask.AzureServiceFabric +``` + +#### Emulator (Local Development) + +For local development and testing without external dependencies: + +```bash +dotnet add package Microsoft.Azure.DurableTask.Emulator +``` + +### Optional Packages + +#### Application Insights Integration + +For Application Insights telemetry: + +```bash +dotnet add package Microsoft.Azure.DurableTask.ApplicationInsights +``` + +## Package Versions + +All DTFx packages follow semantic 
versioning. We recommend using the latest stable versions: + +| Package | NuGet | +| ------- | ----- | +| DurableTask.Core | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.Core.svg)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.Core/) | +| DurableTask.AzureManagedBackend | [![NuGet](https://img.shields.io/nuget/v/Microsoft.DurableTask.AzureManagedBackend.svg)](https://www.nuget.org/packages/Microsoft.DurableTask.AzureManagedBackend/) | +| DurableTask.AzureStorage | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.AzureStorage.svg)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.AzureStorage/) | +| DurableTask.ServiceBus | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.ServiceBus.svg)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.ServiceBus/) | +| DurableTask.AzureServiceFabric | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.AzureServiceFabric.svg)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.AzureServiceFabric/) | +| DurableTask.Emulator | [![NuGet](https://img.shields.io/nuget/v/Microsoft.Azure.DurableTask.Emulator.svg)](https://www.nuget.org/packages/Microsoft.Azure.DurableTask.Emulator/) | + +## Next Steps + +- [Quickstart](quickstart.md) — Create your first orchestration +- [Choosing a Backend](choosing-a-backend.md) — Compare backend providers +- [Core Concepts](../concepts/core-concepts.md) — Understand the architecture diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 000000000..e4f74c959 --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,173 @@ +# Quickstart + +This guide walks you through creating your first Durable Task Framework (DTFx) orchestration. + +## Overview + +In this quickstart, you'll create: +1. An **activity** that performs a simple greeting +2. An **orchestration** that calls the activity +3. 
A **host** that runs the orchestration + +## Step 1: Create a New Project + +```bash +dotnet new console -n HelloDurableTask +cd HelloDurableTask +``` + +## Step 2: Install Packages + +For this quickstart, we'll use the in-memory emulator: + +```bash +dotnet add package Microsoft.Azure.DurableTask.Core +dotnet add package Microsoft.Azure.DurableTask.Emulator +``` + +> 💡 For production, see [Choosing a Backend](choosing-a-backend.md) to select an appropriate provider. + +## Step 3: Create an Activity + +Activities are the basic unit of work in DTFx. Create a file named `GreetActivity.cs`: + +```csharp +using DurableTask.Core; + +public class GreetActivity : TaskActivity<string, string> +{ + protected override string Execute(TaskContext context, string name) + { + return $"Hello, {name}!"; + } +} +``` + +## Step 4: Create an Orchestration + +Orchestrations coordinate activities. Create a file named `GreetingOrchestration.cs`: + +```csharp +using DurableTask.Core; + +public class GreetingOrchestration : TaskOrchestration<string, string> +{ + public override async Task<string> RunTask(OrchestrationContext context, string input) + { + // Call the GreetActivity + string greeting = await context.ScheduleTask<string>(typeof(GreetActivity), input); + return greeting; + } +} +``` + +## Step 5: Create the Host + +Update `Program.cs` to create and run the orchestration: + +```csharp +using DurableTask.Core; +using DurableTask.Emulator; +using Microsoft.Extensions.Logging; + +// Create logger factory for diagnostics +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +// Create the in-memory orchestration service +var service = new LocalOrchestrationService(); + +// Create and configure the worker +var worker = new TaskHubWorker(service, loggerFactory); +worker.AddTaskOrchestrations(typeof(GreetingOrchestration)); +worker.AddTaskActivities(typeof(GreetActivity)); + +// Start the worker +await worker.StartAsync(); 
+Console.WriteLine("Worker started."); + +// Create a client to start orchestrations +var client = new TaskHubClient(service, loggerFactory: loggerFactory); + +// Start a new orchestration instance +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(GreetingOrchestration), + "World"); + +Console.WriteLine($"Started orchestration: {instance.InstanceId}"); + +// Wait for completion +var result = await client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromSeconds(30)); + +Console.WriteLine($"Result: {result.Output}"); +Console.WriteLine($"Status: {result.OrchestrationStatus}"); + +// Stop the worker +await worker.StopAsync(); +``` + +## Step 6: Run the Application + +```bash +dotnet run +``` + +Expected output: +``` +Worker started. +Started orchestration: <instance-id> +Result: "Hello, World!" +Status: Completed +``` + +## Understanding the Code + +### TaskActivity + +```csharp +public class GreetActivity : TaskActivity<string, string> +``` + +- `TaskActivity<TInput, TResult>` — Base class for activities +- Activities contain the actual work logic +- They can be retried on failure when scheduled with `ScheduleWithRetry` (retry policy is configurable) + +### TaskOrchestration + +```csharp +public class GreetingOrchestration : TaskOrchestration<string, string> +``` + +- `TaskOrchestration<TResult, TInput>` — Base class for orchestrations +- Orchestrations coordinate activities and sub-orchestrations +- They must be [deterministic](../concepts/deterministic-constraints.md) + +### OrchestrationContext + +```csharp +await context.ScheduleTask<string>(typeof(GreetActivity), input); +``` + +- `OrchestrationContext` provides APIs for scheduling work +- `ScheduleTask<T>` — Schedule an activity +- `CreateSubOrchestrationInstance` — Start a sub-orchestration +- `CreateTimer` — Create a durable timer +- `WaitForExternalEvent` — Wait for an external event + +### TaskHubWorker and TaskHubClient + +- `TaskHubWorker` — Hosts orchestrations and activities +- `TaskHubClient` — Starts and manages orchestration instances + +## Next Steps + +- [Choosing a Backend](choosing-a-backend.md) — 
Select a production-ready provider +- [Core Concepts](../concepts/core-concepts.md) — Understand Task Hubs, Workers, and Clients +- [Writing Orchestrations](../concepts/orchestrations.md) — Learn orchestration patterns +- [Writing Activities](../concepts/activities.md) — Learn activity patterns +- [Samples Catalog](../samples/catalog.md) — Explore more examples diff --git a/docs/providers/README.md b/docs/providers/README.md new file mode 100644 index 000000000..e9f200629 --- /dev/null +++ b/docs/providers/README.md @@ -0,0 +1,19 @@ +# Backend Providers + +The Durable Task Framework supports multiple backend storage providers. Choose based on your requirements for management overhead, throughput, and existing infrastructure. + +## Provider Comparison + +| Provider | Type | Best For | +| -------- | ---- | -------- | +| [Durable Task Scheduler](durable-task-scheduler.md) ⭐ | Managed | New projects, production workloads | +| [Azure Storage](azure-storage.md) | Self-managed | Existing Azure Storage infrastructure | +| [MSSQL](mssql.md) | Self-managed | SQL Server / Azure SQL infrastructure | +| [Emulator](emulator.md) | In-memory | Local development and testing | +| [Service Fabric](service-fabric.md) | Self-managed | Applications on Service Fabric clusters | +| [Service Bus](service-bus.md) | Self-managed | Legacy (maintenance mode) | +| [Custom Provider](custom-provider.md) | DIY | Specialized storage requirements | + +> ⭐ **Recommended:** For new projects, use the [Durable Task Scheduler](durable-task-scheduler.md)—a fully managed Azure service with zero infrastructure management and built-in monitoring. + +See [Choosing a Backend](../getting-started/choosing-a-backend.md) for detailed guidance. 
diff --git a/docs/providers/azure-storage.md b/docs/providers/azure-storage.md new file mode 100644 index 000000000..cc9960838 --- /dev/null +++ b/docs/providers/azure-storage.md @@ -0,0 +1,390 @@ +# Azure Storage Provider + +The Azure Storage provider uses Azure Storage queues, tables, and blobs to persist orchestration state. It's a self-managed option suitable for existing Azure Storage deployments. + +## When to Use Azure Storage + +✅ **Good for:** + +- Existing Azure Storage infrastructure +- Cost-sensitive workloads with low-to-moderate throughput +- Internal Azure services in Ring-1 or lower + +⚠️ **Consider [Durable Task Scheduler](durable-task-scheduler.md) instead for:** + +- New projects +- Enterprise support requirements +- High-throughput scenarios +- Zero infrastructure management +- Internal Azure services in Ring-2 or higher + +## Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.AzureStorage +``` + +## Configuration + +### Basic Setup + +```csharp +using DurableTask.AzureStorage; +using DurableTask.Core; +using Microsoft.Extensions.Logging; + +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +var settings = new AzureStorageOrchestrationServiceSettings +{ + StorageConnectionString = "DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...;", + TaskHubName = "MyTaskHub", + LoggerFactory = loggerFactory +}; + +var service = new AzureStorageOrchestrationService(settings); +await service.CreateIfNotExistsAsync(); + +var worker = new TaskHubWorker(service, loggerFactory); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); +``` + +### Using Managed Identity + +```csharp +using Azure.Identity; +using DurableTask.AzureStorage; +using DurableTask.Core; +using Microsoft.Extensions.Logging; + +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + 
builder.SetMinimumLevel(LogLevel.Information); +}); + +// Uses DefaultAzureCredential +var credential = new DefaultAzureCredential(); + +var settings = new AzureStorageOrchestrationServiceSettings +{ + TaskHubName = "MyTaskHub", + StorageAccountClientProvider = new StorageAccountClientProvider("mystorageaccount", credential), + LoggerFactory = loggerFactory +}; + +var service = new AzureStorageOrchestrationService(settings); +``` + +> [!TIP] +> For complete runnable examples using managed identity, see the [Managed Identity Samples](../../samples/ManagedIdentitySample/). + +## Configuration Options + +### Core Settings + +| Setting | Description | Default | +| ------- | ----------- | ------- | +| `TaskHubName` | Name of the task hub (alphanumeric, 3-45 chars) | Required | +| `StorageConnectionString` | Azure Storage connection string | Required* | +| `StorageAccountName` | Storage account name (for managed identity) | Required* | + +*Either `StorageConnectionString` or `StorageAccountName` with credentials is required. 
+ +### Performance Settings + +| Setting | Description | Default | +| ------- | ----------- | ------- | +| `PartitionCount` | Number of control queue partitions (1-16) | 4 | +| `ControlQueueBufferThreshold` | Max messages prefetched and buffered per partition | 64 | +| `MaxConcurrentTaskOrchestrationWorkItems` | Max concurrent orchestrations | 100 | +| `MaxConcurrentTaskActivityWorkItems` | Max concurrent activities | 10 | + +### Partition Management + +| Setting | Description | Default | +| ------- | ----------- | ------- | +| `UseTablePartitionManagement` | Use table-based partition management (recommended) | `true` | +| `UseLegacyPartitionManagement` | Use legacy blob-based partition management | `false` | +| `LeaseRenewInterval` | Interval for renewing partition leases | 10 seconds | +| `LeaseInterval` | Lease duration before expiration | 30 seconds | +| `LeaseAcquireInterval` | Interval for checking partition balance | 10 seconds | + +> [!NOTE] +> Table-based partition management (`UseTablePartitionManagement = true`) is the default and recommended option. It provides better reliability for partition distribution and uses a `{taskhub}Partitions` table instead of blob leases. It's also significantly less expensive in terms of Azure Storage operations. 
+ +### Example Configuration + +```csharp +var settings = new AzureStorageOrchestrationServiceSettings +{ + StorageConnectionString = connectionString, + TaskHubName = "MyTaskHub", + + // Performance tuning + PartitionCount = 8, + ControlQueueBufferThreshold = 128, + MaxConcurrentTaskOrchestrationWorkItems = 200, + MaxConcurrentTaskActivityWorkItems = 200, + + // Lease settings + LeaseInterval = TimeSpan.FromSeconds(30), + LeaseRenewInterval = TimeSpan.FromSeconds(10) +}; +``` + +## Architecture + +### Storage Resources + +The Azure Storage provider creates these resources: + +| Resource Type | Name Pattern | Purpose | +| ------------- | ------------ | ------- | +| **Control Queues** | `{taskhub}-control-{0..N}` | Orchestration messages | +| **Work Item Queue** | `{taskhub}-workitems` | Activity messages | +| **History Table** | `{taskhub}History` | Orchestration history | +| **Instances Table** | `{taskhub}Instances` | Instance metadata | +| **Partitions Table** | `{taskhub}Partitions` | Partition leases (table manager) | +| **Lease Blobs** | `{taskhub}-leases/` | Partition leases (blob manager) | + +### Partitioning + +The Azure Storage provider uses **partitions** to distribute orchestration workloads across workers. Each partition corresponds to exactly one **control queue**. 
+ +#### How Partitioning Works + +- **Orchestrations and entities** are assigned to partitions by hashing the instance ID +- Instance IDs are random GUIDs by default, ensuring even distribution across partitions +- A single orchestration instance is always processed by one partition (and therefore one worker) at a time +- The `PartitionCount` setting (1–16, default 4) determines how many control queues are created + +#### Queue Architecture + +The task hub uses two types of queues: + +| Queue Type | Count | Purpose | Processing | +| ---------- | ----- | ------- | ---------- | +| **Control queues** | `PartitionCount` | Orchestration lifecycle messages | Partitioned — each queue owned by one worker | +| **Work item queue** | 1 | Activity function messages | Shared — all workers compete for messages | + +```text +┌──────────────────────────────────────────────────────────────────────┐ +│ Task Hub │ +│ │ +│ CONTROL QUEUES (partitioned) WORK ITEM QUEUE (shared) │ +│ ┌──────────────────────────────────┐ ┌─────────────────────────┐ │ +│ │ control-00 │ control-01 │ ... 
│ │ workitems │ │ +│ │ (Worker A) │ (Worker B) │ │ │ │ │ +│ │ │ │ │ │ All workers compete │ │ +│ │ • Start │ • Start │ │ │ for activity messages │ │ +│ │ • Timer │ • Timer │ │ │ │ │ +│ │ • Activity │ • Activity │ │ └─────────────────────────┘ │ +│ │ complete │ complete │ │ ▲ │ +│ │ • External │ • External │ │ │ │ +│ │ event │ event │ │ Activities scheduled by │ +│ └────────────┴────────────┴───────┘ orchestrators go here │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────────────────────────┐ │ +│ │ ORCHESTRATION INSTANCES │ │ +│ │ Hash(InstanceID) → Partition │ │ +│ └──────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +#### Control Queue Messages + +Control queues contain orchestration lifecycle messages: + +- **ExecutionStarted** — New orchestration started +- **TaskCompleted** — Activity function completed +- **TimerFired** — Durable timer expired +- **EventRaised** — External event received +- **SubOrchestrationCompleted** — Child orchestration completed + +When messages are dequeued, up to 32 messages are fetched in a single poll. Messages for the same instance are batched together for efficient processing. + +#### Work Item Queue + +The work item queue is a simple, non-partitioned queue for activity function messages: + +- All workers compete to dequeue activity messages +- Activities are **stateless** — any worker can execute any activity +- Activities can scale out infinitely (limited only by worker count) + +#### Partition Count Guidance + +| Workload | Recommended `PartitionCount` | +| -------- | ---------------------------- | +| Development/testing | 1–2 | +| Low-to-moderate throughput | 4 (default) | +| High throughput | 8–16 | + +> [!IMPORTANT] +> Partition count **cannot be changed** after task hub creation. Set it high enough to accommodate future scale-out needs. The maximum number of workers that can process orchestrations concurrently equals the partition count. 
Note that higher partition counts increase Azure Storage costs due to more queue and table operations. + +### Lease Management + +Workers compete for partition ownership using one of two partition managers: + +#### Table Partition Manager (Default) + +When `UseTablePartitionManagement = true` (default): + +- Partition leases are stored in the `{taskhub}Partitions` table +- Uses Azure Table ETags for concurrency control +- Provides better reliability due to transactional updates + +#### Blob Partition Manager (Legacy) + +When `UseTablePartitionManagement = false`: + +- Partition leases are stored as blobs in `{taskhub}-leases/` +- Uses Azure Blob leases for concurrency control +- Available in "safe" (`UseLegacyPartitionManagement = false`) and "legacy" (`UseLegacyPartitionManagement = true`) variants + +#### Partition lifecycle + +1. Workers acquire leases to claim partition ownership +2. Leases are renewed at `LeaseRenewInterval` (default 10s) +3. Leases expire after `LeaseInterval` (default 30s) if not renewed +4. Partitions are automatically balanced across workers + +### Message Processing + +1. **Prefetching**: Messages are prefetched from control queues in batches +2. **Batching**: Messages for the same instance are grouped together +3. **History fetch**: Orchestration history is loaded from Table Storage +4. **Processing**: Orchestration code runs with the loaded history +5. **Checkpoint**: New history and messages are appended + +### Checkpoint Order + +Checkpoints are written in this order to ensure that no data is lost if there is a failure: + +1. New messages → Storage queues +2. New history → Table storage +3. Delete processed messages + +Because the checkpoints aren't atomic, duplicates may occur. The replay model handles history duplicates gracefully. Message duplicates may result in activities being executed multiple times if an unexpected failure occurs. 
+ +## Scaling + +### Horizontal Scaling + +Multiple workers can connect to the same task hub: + +```csharp +// Worker 1, 2, 3... all connect to same task hub +var service = new AzureStorageOrchestrationService(settings); +var worker = new TaskHubWorker(service, loggerFactory); +await worker.StartAsync(); +``` + +Partitions are automatically distributed across workers. + +### Partition Count + +For high-throughput scenarios, increase partition count: + +```csharp +var settings = new AzureStorageOrchestrationServiceSettings +{ + PartitionCount = 16 // More partitions = more parallelism +}; +``` + +> [!WARNING] +> Partition count cannot be changed after task hub creation. + +## Operations + +### Create Task Hub + +```csharp +await service.CreateIfNotExistsAsync(); +``` + +### Delete Task Hub + +```csharp +await service.DeleteAsync(); +``` + +### Purge History + +```csharp +await service.PurgeOrchestrationHistoryAsync( + DateTime.UtcNow.AddDays(-30), // Older than 30 days + OrchestrationStateTimeRangeFilterType.OrchestrationCompletedTimeFilter); +``` + +> [!WARNING] +> Purging history is very expensive and may take hours for large task hubs. It's recommended to purge history frequently and in smaller time ranges. + +## Monitoring + +### Azure Storage Metrics + +Monitor these metrics in Azure portal: + +- Queue message count +- Table transactions +- Blob lease operations + +## Logging + +The Azure Storage provider supports structured logging via `Microsoft.Extensions.Logging`. 
+ +### Enabling Logging + +```csharp +using Microsoft.Extensions.Logging; + +// Create a logger factory +ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.AddFilter("DurableTask.AzureStorage", LogLevel.Information); + builder.AddFilter("DurableTask.Core", LogLevel.Information); +}); + +// Pass logger factory to settings +var settings = new AzureStorageOrchestrationServiceSettings +{ + StorageConnectionString = connectionString, + TaskHubName = "MyTaskHub", + LoggerFactory = loggerFactory +}; + +var service = new AzureStorageOrchestrationService(settings); +``` + +### Log Categories + +| Category | Description | +| -------- | ----------- | +| `DurableTask.AzureStorage` | Azure Storage-specific operations (messages, queues, tables) | +| `DurableTask.Core` | Core framework operations (orchestrations, activities, dispatchers) | + +### Example Log Events + +- `SendingMessage` / `ReceivedMessage` — Queue message operations +- `FetchedInstanceHistory` — History table reads +- `PoisonMessageDetected` — Unprocessable messages +- `PartitionManagerInfo` / `PartitionManagerWarning` — Partition management + +### ETW Event Source + +Events are also published to Event Tracing for Windows (ETW) via the `DurableTask-AzureStorage` event source (GUID: {4C4AD4A2-F396-5E18-01B6-618C12A10433}). + +## Next Steps + +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Compare all providers +- [Durable Task Scheduler](durable-task-scheduler.md) — Recommended managed alternative +- [Core Concepts](../concepts/core-concepts.md) — Learn the fundamentals diff --git a/docs/providers/custom-provider.md b/docs/providers/custom-provider.md new file mode 100644 index 000000000..7b49636b5 --- /dev/null +++ b/docs/providers/custom-provider.md @@ -0,0 +1,269 @@ +# Custom Provider Implementation + +You can implement a custom storage provider by implementing the `IOrchestrationService` interface. 
This allows you to use DTFx with any backend storage system. + +## When to Implement a Custom Provider + +✅ **Good for:** + +- Integrating with proprietary storage systems +- Specialized requirements not met by existing providers +- Research and experimentation + +⚠️ **Consider existing providers first:** + +- [Durable Task Scheduler](durable-task-scheduler.md) — Managed service +- [Azure Storage](azure-storage.md) — Self-managed with Azure Storage +- [Emulator](emulator.md) — Local development + +## Core Interfaces + +### IOrchestrationService + +The primary interface for storage providers: + +```csharp +public interface IOrchestrationService +{ + // Lifecycle + Task StartAsync(); + Task StopAsync(); + Task StopAsync(bool isForced); + Task CreateAsync(); + Task CreateAsync(bool recreateInstanceStore); + Task CreateIfNotExistsAsync(); + Task DeleteAsync(); + Task DeleteAsync(bool deleteInstanceStore); + + // Orchestration dispatcher + int TaskOrchestrationDispatcherCount { get; } + int MaxConcurrentTaskOrchestrationWorkItems { get; } + BehaviorOnContinueAsNew EventBehaviourForContinueAsNew { get; } + + bool IsMaxMessageCountExceeded(int currentMessageCount, OrchestrationRuntimeState runtimeState); + int GetDelayInSecondsAfterOnProcessException(Exception exception); + int GetDelayInSecondsAfterOnFetchException(Exception exception); + + Task<TaskOrchestrationWorkItem> LockNextTaskOrchestrationWorkItemAsync( + TimeSpan receiveTimeout, CancellationToken cancellationToken); + Task RenewTaskOrchestrationWorkItemLockAsync(TaskOrchestrationWorkItem workItem); + Task CompleteTaskOrchestrationWorkItemAsync( + TaskOrchestrationWorkItem workItem, + OrchestrationRuntimeState newOrchestrationRuntimeState, + IList<TaskMessage> outboundMessages, + IList<TaskMessage> orchestratorMessages, + IList<TaskMessage> timerMessages, + TaskMessage continuedAsNewMessage, + OrchestrationState orchestrationState); + Task AbandonTaskOrchestrationWorkItemAsync(TaskOrchestrationWorkItem workItem); + Task 
ReleaseTaskOrchestrationWorkItemAsync(TaskOrchestrationWorkItem workItem); + + // Activity dispatcher + int TaskActivityDispatcherCount { get; } + int MaxConcurrentTaskActivityWorkItems { get; } + + Task<TaskActivityWorkItem> LockNextTaskActivityWorkItem( + TimeSpan receiveTimeout, CancellationToken cancellationToken); + Task<TaskActivityWorkItem> RenewTaskActivityWorkItemLockAsync(TaskActivityWorkItem workItem); + Task CompleteTaskActivityWorkItemAsync(TaskActivityWorkItem workItem, TaskMessage responseMessage); + Task AbandonTaskActivityWorkItemAsync(TaskActivityWorkItem workItem); +} +``` + +### IOrchestrationServiceClient + +For client operations (starting, querying, managing instances): + +```csharp +public interface IOrchestrationServiceClient +{ + Task CreateTaskOrchestrationAsync(TaskMessage creationMessage); + Task CreateTaskOrchestrationAsync(TaskMessage creationMessage, OrchestrationStatus[] dedupeStatuses); + + Task SendTaskOrchestrationMessageAsync(TaskMessage message); + Task SendTaskOrchestrationMessageBatchAsync(params TaskMessage[] messages); + + Task<OrchestrationState> WaitForOrchestrationAsync( + string instanceId, + string executionId, + TimeSpan timeout, + CancellationToken cancellationToken); + + Task ForceTerminateTaskOrchestrationAsync(string instanceId, string reason); + + Task<OrchestrationState> GetOrchestrationStateAsync(string instanceId, string executionId); + Task<IList<OrchestrationState>> GetOrchestrationStateAsync(string instanceId, bool allExecutions); + + Task<string> GetOrchestrationHistoryAsync(string instanceId, string executionId); + Task PurgeOrchestrationHistoryAsync( + DateTime thresholdDateTimeUtc, + OrchestrationStateTimeRangeFilterType timeRangeFilterType); +} +``` + +> [!NOTE] +> Most providers implement both interfaces in a single class. 
+ +## Minimal Implementation + +Here's a skeleton for a custom provider: + +```csharp +public class MyCustomOrchestrationService : IOrchestrationService, IOrchestrationServiceClient +{ + private readonly MyStorageBackend _storage; + + public MyCustomOrchestrationService(string connectionString) + { + _storage = new MyStorageBackend(connectionString); + } + + // Lifecycle + public Task CreateAsync() => CreateIfNotExistsAsync(); + public Task CreateAsync(bool recreateInstanceStore) => CreateIfNotExistsAsync(); + + public async Task CreateIfNotExistsAsync() + { + await _storage.InitializeAsync(); + } + + public async Task DeleteAsync() + { + await _storage.DeleteAllDataAsync(); + } + + public Task DeleteAsync(bool deleteInstanceStore) => DeleteAsync(); + + // Worker lifecycle + public Task StartAsync() + { + // Start background processes if needed + return Task.CompletedTask; + } + + public Task StopAsync() => StopAsync(false); + public Task StopAsync(bool isForced) + { + // Stop background processes + return Task.CompletedTask; + } + + // Work item polling + public async Task<TaskOrchestrationWorkItem> LockNextTaskOrchestrationWorkItemAsync( + TimeSpan receiveTimeout, + CancellationToken cancellationToken) + { + // Poll for orchestration messages + var message = await _storage.DequeueOrchestrationMessageAsync(receiveTimeout, cancellationToken); + if (message == null) return null; + + // Load history + var history = await _storage.LoadHistoryAsync(message.InstanceId); + + return new TaskOrchestrationWorkItem + { + InstanceId = message.InstanceId, + NewMessages = new[] { message }, + OrchestrationRuntimeState = new OrchestrationRuntimeState(history) + }; + } + + public async Task<TaskActivityWorkItem> LockNextTaskActivityWorkItem( + TimeSpan receiveTimeout, + CancellationToken cancellationToken) + { + var message = await _storage.DequeueActivityMessageAsync(receiveTimeout, cancellationToken); + if (message == null) return null; + + return new TaskActivityWorkItem + { + Id = Guid.NewGuid().ToString(), + TaskMessage = 
message + }; + } + + // Orchestration completion + public async Task CompleteTaskOrchestrationWorkItemAsync( + TaskOrchestrationWorkItem workItem, + OrchestrationRuntimeState newState, + IList<TaskMessage> outboundMessages, + IList<TaskMessage> orchestratorMessages, + IList<TaskMessage> timerMessages, + TaskMessage continuedAsNewMessage, + OrchestrationState state) + { + // Save new history + await _storage.SaveHistoryAsync(workItem.InstanceId, newState.Events); + + // Enqueue outbound messages (activities) + foreach (var msg in outboundMessages) + { + await _storage.EnqueueActivityMessageAsync(msg); + } + + // Enqueue orchestrator messages (sub-orchestrations, events) + foreach (var msg in orchestratorMessages) + { + await _storage.EnqueueOrchestrationMessageAsync(msg); + } + + // Handle timers + foreach (var msg in timerMessages) + { + await _storage.ScheduleTimerAsync(msg); + } + + // Handle continue-as-new + if (continuedAsNewMessage != null) + { + await _storage.EnqueueOrchestrationMessageAsync(continuedAsNewMessage); + } + + // Update instance status + await _storage.UpdateInstanceStateAsync(workItem.InstanceId, state); + } + + // ... implement remaining interface methods + + // Capabilities + public int TaskOrchestrationDispatcherCount => 1; + public int MaxConcurrentTaskOrchestrationWorkItems => settings.MaxOrchestrationConcurrency; + public int TaskActivityDispatcherCount => 1; + public int MaxConcurrentTaskActivityWorkItems => settings.MaxActivityConcurrency; + public BehaviorOnContinueAsNew EventBehaviourForContinueAsNew => + BehaviorOnContinueAsNew.Carryover; +} +``` + +## Key Concepts + +### Work Items + +**TaskOrchestrationWorkItem**: Represents orchestration work to process + +- Contains one or more messages that triggered the orchestrator invocation (`ExecutionStartedEvent`, `TaskCompletedEvent`, etc.) 
+- Contains the current orchestration state (the full history) +- Must be completed or abandoned, and should always be released + +**TaskActivityWorkItem**: Represents activity work to execute + +- Contains a single activity task message (`TaskScheduledEvent`) +- Must be completed or abandoned +- Supports lock renewal for long-running activities + +### State Management + +Your provider must manage: + +1. **Message queues** — For orchestration and activity messages +2. **History storage** — For orchestration event history +3. **Instance metadata** — For querying orchestration status +4. **Timer scheduling** — For durable timers + +How you implement these components is entirely up to you. In the ideal case, your storage backend should provide atomic operations (like in the Durable Task Scheduler and MSSQL backend providers) to ensure consistency. If that's not possible (like in Azure Storage), you must gracefully handle potential inconsistencies due to process crashes to ensure there's no data loss. + +## Next Steps + +- [Azure Storage Provider](azure-storage.md) — Reference implementation +- [Core Concepts](../concepts/core-concepts.md) — Understand the architecture +- [Replay and Durability](../concepts/replay-and-durability.md) — Key concepts for providers diff --git a/docs/providers/durable-task-scheduler.md b/docs/providers/durable-task-scheduler.md new file mode 100644 index 000000000..731a17d80 --- /dev/null +++ b/docs/providers/durable-task-scheduler.md @@ -0,0 +1,220 @@ +# Durable Task Scheduler + +The **Durable Task Scheduler** is a fully managed Azure service purpose-built for running durable orchestrations. It provides the best experience for production workloads with zero infrastructure management and built-in enterprise support. + +> ⭐ **Recommended**: For new projects, we recommend the Durable Task Scheduler as your backend provider. 
+ +## Overview + +| Feature | Benefit | +| ------- | ------- | +| **Fully Managed** | No storage accounts, databases, or infrastructure to manage | +| **Built-in Dashboard** | Monitor orchestrations without additional tooling | +| **Highest Throughput** | Purpose-built for durable workflow performance | +| **Azure Support** | 24/7 enterprise support with SLA (with Azure support plan) | +| **Managed Identity** | Secure authentication using Azure AD | +| **Local Emulator** | Docker-based emulator for development | + +For complete documentation on creating and configuring Azure resources, authentication, SKUs, RBAC, and pricing, see the [official Azure documentation](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/durable-task-scheduler). + +## Installation + +```bash +dotnet add package Microsoft.DurableTask.AzureManagedBackend +``` + +## DTFx Code Sample + +```csharp +using DurableTask.Core; +using Microsoft.DurableTask.AzureManagedBackend; +using Microsoft.Extensions.Logging; + +// Get connection string from environment +// Expected format: "Endpoint=https://<scheduler-endpoint>;Authentication=<credential-type>;TaskHub=<task-hub-name>" +string? 
connectionString = Environment.GetEnvironmentVariable("DTS_CONNECTION_STRING"); +if (string.IsNullOrWhiteSpace(connectionString)) +{ + Console.Error.WriteLine("An environment variable named DTS_CONNECTION_STRING is required."); + return; +} + +// Configure logging +ILoggerFactory loggerFactory = LoggerFactory.Create(builder => + builder.AddSimpleConsole(options => + { + options.SingleLine = true; + options.UseUtcTimestamp = true; + options.TimestampFormat = "yyyy-MM-ddTHH:mm:ss.fffZ "; + })); + +// Create the orchestration service for Durable Task Scheduler +AzureManagedOrchestrationService service = new( + AzureManagedOrchestrationServiceOptions.FromConnectionString(connectionString), + loggerFactory); + +// Create and configure the worker +TaskHubWorker worker = new(service, loggerFactory); +worker.AddTaskOrchestrations(typeof(HelloWorldOrchestration)); +worker.AddTaskActivities(typeof(HelloActivity)); + +// Start the worker +await worker.StartAsync(); + +// Create a client and start an orchestration +TaskHubClient client = new(service, null, loggerFactory); +OrchestrationInstance instance = await client.CreateOrchestrationInstanceAsync( + orchestrationType: typeof(HelloWorldOrchestration), + input: null); + +Console.WriteLine($"Started orchestration with ID = '{instance.InstanceId}'"); + +// Wait for completion +OrchestrationState state = await client.WaitForOrchestrationAsync( + instance, + TimeSpan.FromMinutes(1)); + +Console.WriteLine($"Orchestration completed with status: {state.OrchestrationStatus}"); +Console.WriteLine($"Output: {state.Output}"); + +// Clean up +await worker.StopAsync(); +service.Dispose(); +``` + +## Connection String Format + +```text +Endpoint=<scheduler-endpoint-url>;TaskHub=<task-hub-name>;Authentication=<credential-type> +``` + +See [Authentication Types](#authentication-types) for supported credential types. + +## Configuration Options + +The `AzureManagedOrchestrationServiceOptions` class provides configuration for the Durable Task Scheduler backend. 
+ +### Creating Options + +**From connection string (recommended):** + +```csharp +var options = AzureManagedOrchestrationServiceOptions.FromConnectionString(connectionString); +``` + +**Manual construction:** + +```csharp +var options = new AzureManagedOrchestrationServiceOptions( + address: "https://myscheduler.westus3.durabletask.io", + credential: new DefaultAzureCredential()); +options.TaskHubName = "my-task-hub"; +``` + +### Authentication Types + +The connection string `Authentication` property supports these credential types: + +| Value | Credential Type | Use Case | +| ----- | --------------- | -------- | +| `DefaultAzure` | `DefaultAzureCredential` | General purpose; tries multiple auth methods | +| `ManagedIdentity` | `ManagedIdentityCredential` | Azure-hosted apps; add `ClientId` for user-assigned | +| `WorkloadIdentity` | `WorkloadIdentityCredential` | Kubernetes, CI/CD pipelines, SPIFFE | +| `Environment` | `EnvironmentCredential` | Container apps with env var credentials | +| `AzureCLI` | `AzureCliCredential` | Local dev with `az login` | +| `AzurePowerShell` | `AzurePowerShellCredential` | Local dev with `Connect-AzAccount` | +| `VisualStudio` | `VisualStudioCredential` | Local dev from Visual Studio | +| `InteractiveBrowser` | `InteractiveBrowserCredential` | Interactive scenarios (not for production) | +| `None` | No authentication | Local emulator only | + +**User-assigned managed identity example:** + +```text +Endpoint=https://myscheduler.westus3.durabletask.io;TaskHub=default;Authentication=ManagedIdentity;ClientId=00000000-0000-0000-0000-000000000000 +``` + +### Concurrency Settings + +Control how many work items are processed in parallel: + +| Property | Default | Description | +| -------- | ------- | ----------- | +| `MaxConcurrentOrchestrationWorkItems` | `ProcessorCount * 10` | Max parallel orchestration executions | +| `MaxConcurrentActivityWorkItems` | `ProcessorCount * 10` | Max parallel activity executions | + +```csharp +var options 
= AzureManagedOrchestrationServiceOptions.FromConnectionString(connectionString); +options.MaxConcurrentOrchestrationWorkItems = 50; +options.MaxConcurrentActivityWorkItems = 100; +``` + +> [!TIP] +> Increase activity concurrency for I/O-bound workloads. Reduce orchestration concurrency if orchestrations consume significant memory. + +### Large Payload Storage + +For payloads exceeding gRPC message limits, configure Azure Blob Storage to externalize large data: + +```csharp +var options = AzureManagedOrchestrationServiceOptions.FromConnectionString(connectionString); +options.LargePayloadStorageOptions = new LargePayloadStorageOptions("UseDevelopmentStorage=true") +{ + ExternalizeThresholdBytes = 1024, // Externalize payloads larger than 1KB + MaxExternalizedPayloadBytes = 4194304, // Max 4MB payload size + CompressPayloads = true // Compress before storing (default: true) +}; +``` + +| Property | Description | +| -------- | ----------- | +| Constructor (`connectionString`) | Azure Storage connection string or `"UseDevelopmentStorage=true"` for local dev | +| `ExternalizeThresholdBytes` | Payloads larger than this are stored in blob storage | +| `MaxExternalizedPayloadBytes` | Maximum allowed payload size (fails fast if exceeded) | +| `CompressPayloads` | Whether to compress payloads before storing (improves storage efficiency) | + +When enabled, large orchestration inputs, outputs, activity results, and event payloads are automatically stored in blob storage and retrieved transparently. 
+ +### Additional Options + +| Property | Default | Description | +| -------- | ------- | ----------- | +| `TaskHubName` | `"default"` | Name of the task hub (usually set via connection string) | +| `OrchestrationHistoryCacheExpirationPeriod` | 10 minutes | How long orchestration history is cached in memory | +| `ResourceId` | `https://durabletask.io` | OAuth resource ID (change only for sovereign clouds) | + +## Local Development with Emulator + +For local development, use the Docker-based emulator: + +```bash +docker pull mcr.microsoft.com/dts/dts-emulator:latest +docker run -d -p 8080:8080 -p 8082:8082 mcr.microsoft.com/dts/dts-emulator:latest +``` + +Connect to the emulator: + +```csharp +var connectionString = "Endpoint=http://localhost:8080;TaskHub=default;Authentication=None"; +var service = new AzureManagedOrchestrationService( + AzureManagedOrchestrationServiceOptions.FromConnectionString(connectionString), + loggerFactory); +``` + +Access the local dashboard at: `http://localhost:8082` + +## Samples + +For complete working examples, see the [Durable Task Scheduler samples repository](https://github.com/Azure-Samples/Durable-Task-Scheduler/tree/main/samples/dtfx). 
+ +## Additional Resources + +- [Azure Documentation](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/durable-task-scheduler) — Creating resources, configuration, SKUs, RBAC, pricing +- [Quickstart Guide](https://learn.microsoft.com/azure/azure-functions/durable/durable-task-scheduler/quickstart-durable-task-scheduler) +- [Azure Samples Repository](https://github.com/Azure-Samples/Durable-Task-Scheduler/) +- [Support](../support.md) — Enterprise support options + +## Next Steps + +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Compare all providers +- [Quickstart](../getting-started/quickstart.md) — Create your first orchestration +- [Core Concepts](../concepts/core-concepts.md) — Learn the fundamentals diff --git a/docs/providers/emulator.md b/docs/providers/emulator.md new file mode 100644 index 000000000..195ea2082 --- /dev/null +++ b/docs/providers/emulator.md @@ -0,0 +1,56 @@ +# Emulator Provider + +The Emulator provider (`LocalOrchestrationService`) is an in-memory implementation for local development and testing. It requires no external dependencies and is ideal for quick iteration. + +## Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.Emulator +``` + +## Usage + +```csharp +using DurableTask.Core; +using DurableTask.Emulator; +using Microsoft.Extensions.Logging; + +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +// Create in-memory service +var service = new LocalOrchestrationService(); + +// Create worker and client +var worker = new TaskHubWorker(service, loggerFactory); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); + +// ... 
+``` + +## Limitations + +| Limitation | Description | +| ---------- | ----------- | +| **In-memory only** | All state is lost when process exits | +| **Single process** | Cannot share state across processes | + +## Transitioning to Production + +When moving from emulator to production, replace `LocalOrchestrationService` with your chosen provider. The rest of your code remains the same: + +```csharp +// Development +IOrchestrationService service = new LocalOrchestrationService(); + +// Production (example: Azure Storage) +IOrchestrationService service = new AzureStorageOrchestrationService(settings); +``` + +## Next Steps + +- [Quickstart](../getting-started/quickstart.md) — Get started with the emulator +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Select a production provider diff --git a/docs/providers/mssql.md b/docs/providers/mssql.md new file mode 100644 index 000000000..1bc2e25eb --- /dev/null +++ b/docs/providers/mssql.md @@ -0,0 +1,32 @@ +# MSSQL Provider + +The MSSQL provider uses Microsoft SQL Server or Azure SQL Database as the backend storage for orchestration state. This provider is maintained in a separate repository. + +## Repository + +The MSSQL provider is available at: **[https://github.com/microsoft/durabletask-mssql](https://github.com/microsoft/durabletask-mssql)** + +## Features + +- Uses SQL Server or Azure SQL Database for durable storage +- Supports both on-premises SQL Server and Azure SQL +- Includes database migrations for schema management +- Compatible with DTFx and Azure Durable Functions + +## Getting Started + +For installation, configuration, and usage documentation, see the [durabletask-mssql repository](https://github.com/microsoft/durabletask-mssql). 
+ +## When to Use + +Consider the MSSQL provider when: + +- You have existing SQL Server infrastructure +- You need the transactional guarantees of a relational database +- You want to query orchestration state using familiar SQL tools +- Your organization has SQL Server expertise and operational practices + +## Next Steps + +- [durabletask-mssql Documentation](https://github.com/microsoft/durabletask-mssql#readme) +- [Choosing a Backend](../getting-started/choosing-a-backend.md) diff --git a/docs/providers/service-bus.md b/docs/providers/service-bus.md new file mode 100644 index 000000000..5e17bb934 --- /dev/null +++ b/docs/providers/service-bus.md @@ -0,0 +1,164 @@ +# Service Bus Provider + +The Service Bus provider uses Azure Service Bus for orchestration messaging. It's suitable for scenarios requiring Service Bus integration or existing Service Bus infrastructure. + +> [!WARNING] +> The Service Bus provider is in maintenance mode and is not recommended for new projects. Consider using [Durable Task Scheduler](durable-task-scheduler.md) for a managed alternative or the [Azure Storage Provider](azure-storage.md) for self-managed deployments. 
+ +## Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.ServiceBus +``` + +## Configuration + +### Basic Setup + +The Service Bus provider requires separate stores for messaging (Service Bus) and history/state (Azure Storage): + +```csharp +using DurableTask.ServiceBus; +using DurableTask.ServiceBus.Settings; +using DurableTask.ServiceBus.Tracking; +using DurableTask.Core; +using Microsoft.Extensions.Logging; + +string serviceBusConnectionString = "Endpoint=sb://mynamespace.servicebus.windows.net/;SharedAccessKeyName=...;SharedAccessKey=..."; +string storageConnectionString = "DefaultEndpointsProtocol=https;AccountName=...;AccountKey=..."; +string taskHubName = "MyTaskHub"; + +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +// Create the instance store (Azure Table Storage for history) +var instanceStore = new AzureTableInstanceStore(taskHubName, storageConnectionString); + +// Create the blob store (Azure Blob Storage for large messages/sessions) +var blobStore = new AzureStorageBlobStore(taskHubName, storageConnectionString); + +// Configure settings +var settings = new ServiceBusOrchestrationServiceSettings(); + +// Create the orchestration service +var service = new ServiceBusOrchestrationService( + serviceBusConnectionString, + taskHubName, + instanceStore, + blobStore, + settings); + +await service.CreateIfNotExistsAsync(); + +var worker = new TaskHubWorker(service, loggerFactory); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); +``` + +### Using Managed Identity + +The Service Bus provider supports managed identity authentication (.NET Standard 2.0+): + +```csharp +using Azure.Identity; +using DurableTask.ServiceBus; +using DurableTask.ServiceBus.Settings; +using DurableTask.ServiceBus.Tracking; + +string serviceBusNamespace = "mynamespace.servicebus.windows.net"; +Uri storageEndpoint = new 
Uri("https://mystorageaccount.table.core.windows.net"); +Uri blobEndpoint = new Uri("https://mystorageaccount.blob.core.windows.net"); +string taskHubName = "MyTaskHub"; + +var credential = new DefaultAzureCredential(); + +// Create stores with managed identity +var instanceStore = new AzureTableInstanceStore(taskHubName, storageEndpoint, credential); +var blobStore = new AzureStorageBlobStore(taskHubName, blobEndpoint, credential); + +var settings = new ServiceBusOrchestrationServiceSettings(); + +// Create Service Bus connection with managed identity +var service = new ServiceBusOrchestrationService( + serviceBusNamespace, // Just the hostname, not a connection string + credential, + taskHubName, + instanceStore, + blobStore, + settings); +``` + +## Configuration Options + +### ServiceBusOrchestrationServiceSettings + +| Setting | Description | Default | +| ------- | ----------- | ------- | +| `MaxTaskOrchestrationDeliveryCount` | Max delivery attempts for orchestration messages | 10 | +| `MaxTaskActivityDeliveryCount` | Max delivery attempts for activity messages | 10 | +| `MaxTrackingDeliveryCount` | Max delivery attempts for tracking messages | 10 | +| `MaxQueueSizeInMegabytes` | Maximum queue size for Service Bus queues | 1024 | +| `PrefetchCount` | Message prefetch count | 50 | +| `TaskOrchestrationDispatcherSettings` | Orchestration dispatcher configuration | See below | +| `TaskActivityDispatcherSettings` | Activity dispatcher configuration | See below | +| `MessageCompressionSettings` | Message compression configuration | Disabled | +| `JumpStartSettings` | Jump start (stale instance recovery) settings | Enabled | + +### Dispatcher Settings + +Dispatcher settings control concurrency: + +```csharp +var settings = new ServiceBusOrchestrationServiceSettings +{ + TaskOrchestrationDispatcherSettings = + { + MaxConcurrentOrchestrations = 100, + CompressOrchestrationState = true + }, + TaskActivityDispatcherSettings = + { + MaxConcurrentActivities = 100 + } +}; 
+``` + +## Architecture + +### Service Bus Resources + +The provider creates these Service Bus entities: + +| Entity Type | Name Pattern | Purpose | +| ----------- | ------------ | ------- | +| **Orchestrator Queue** | `{taskhub}/orchestrator` | Orchestration messages | +| **Worker Queue** | `{taskhub}/worker` | Activity messages | +| **Tracking Queue** | `{taskhub}/tracking` | Tracking events | + +### Storage Resources + +In addition to Service Bus, the provider uses Azure Storage: + +| Resource | Name Pattern | Purpose | +| -------- | ------------ | ------- | +| **Instance History Table** | `InstanceHistory00{taskhub}` | Orchestration state and execution history | +| **Jump Start Table** | `JumpStart00{taskhub}` | Pending orchestrations for stale instance recovery | +| **Blob Container** | `{taskhub}-dtfx` | Large messages and session state | + +> [!NOTE] +> Unlike the Azure Storage provider (which has separate History and Instances tables), the Service Bus provider stores both instance metadata and history events in a single `InstanceHistory` table. + +## Limitations + +- Requires both Service Bus and Azure Storage +- Limited query capabilities compared to Azure Storage provider +- Less commonly used — smaller community and fewer examples +- No built-in monitoring dashboard + +## Next Steps + +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Compare all providers +- [Durable Task Scheduler](durable-task-scheduler.md) — Recommended managed alternative +- [Azure Storage Provider](azure-storage.md) — Alternative self-managed option diff --git a/docs/providers/service-fabric.md b/docs/providers/service-fabric.md new file mode 100644 index 000000000..5109d288f --- /dev/null +++ b/docs/providers/service-fabric.md @@ -0,0 +1,197 @@ +# Service Fabric Provider + +The Service Fabric provider uses Azure Service Fabric reliable collections for orchestration state. It's designed for applications already running on Service Fabric clusters. 
+ +## Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.AzureServiceFabric +``` + +## Configuration + +### Basic Setup + +The Service Fabric provider includes built-in infrastructure via `TaskHubStatefulService` and `TaskHubProxyListener`: + +```csharp +using DurableTask.AzureServiceFabric; +using DurableTask.AzureServiceFabric.Service; +using DurableTask.Core; +using Microsoft.ServiceFabric.Services.Runtime; + +// In Program.cs +ServiceRuntime.RegisterServiceAsync("StatefulServiceType", context => +{ + var settings = new FabricOrchestrationProviderSettings(); + + var listener = new TaskHubProxyListener( + settings, + RegisterOrchestrations); + + return new TaskHubStatefulService(context, new[] { listener }); +}).GetAwaiter().GetResult(); + +void RegisterOrchestrations(TaskHubWorker worker) +{ + worker.AddTaskOrchestrations(typeof(MyOrchestration)); + worker.AddTaskActivities(typeof(MyActivity)); +} +``` + +### Manual Setup with Provider Factory + +For more control, use the `FabricOrchestrationProviderFactory`: + +```csharp +using DurableTask.AzureServiceFabric; +using DurableTask.Core; +using Microsoft.ServiceFabric.Services.Runtime; + +public class DurableTaskService : StatefulService +{ + private FabricOrchestrationProvider provider; + private TaskHubWorker worker; + + public DurableTaskService(StatefulServiceContext context) : base(context) { } + + protected override async Task RunAsync(CancellationToken cancellationToken) + { + var settings = new FabricOrchestrationProviderSettings(); + + var factory = new FabricOrchestrationProviderFactory( + this.StateManager, + settings); + + provider = factory.CreateProvider(); + + worker = new TaskHubWorker(provider.OrchestrationService, settings.LoggerFactory); + worker.AddTaskOrchestrations(typeof(MyOrchestration)); + worker.AddTaskActivities(typeof(MyActivity)); + + await worker.StartAsync(); + + try + { + await Task.Delay(Timeout.Infinite, cancellationToken); + } + finally + { + await 
worker.StopAsync(); + provider.Dispose(); + } + } +} +``` + +### Service Registration + +Register the service in `Program.cs`: + +```csharp +ServiceRuntime.RegisterServiceAsync( + "DurableTaskServiceType", + context => new DurableTaskService(context)) + .GetAwaiter().GetResult(); +``` + +## Architecture + +### Reliable Collections + +State is stored in Service Fabric reliable collections: + +| Collection Name | Purpose | +| --------------- | ------- | +| `DtfxSfp_Orchestrations` | Orchestration sessions and state | +| `DtfxSfp_Activities` | Pending activity messages | +| `DtfxSfp_InstanceStore` | Instance metadata for queries | +| `DtfxSfp_ExecutionIdStore` | Execution ID mappings | +| `DtfxSfp_ScheduledMessages` | Scheduled timer messages | +| `DtfxSfp_SessionMessages_{id}` | Per-session message queues | + +### Partitioning + +Service Fabric handles partitioning automatically based on your service configuration: + +```xml + + + + + +``` + +## Configuration Options + +| Setting | Description | Default | +| ------- | ----------- | ------- | +| `TaskOrchestrationDispatcherSettings.MaxConcurrentOrchestrations` | Max concurrent orchestrations | 1000 | +| `TaskOrchestrationDispatcherSettings.DispatcherCount` | Number of orchestration dispatchers | 10 | +| `TaskActivityDispatcherSettings.MaxConcurrentActivities` | Max concurrent activities | 1000 | +| `TaskActivityDispatcherSettings.DispatcherCount` | Number of activity dispatchers | 10 | +| `LoggerFactory` | Optional logger factory for diagnostics | null | + +### Example Configuration + +```csharp +var settings = new FabricOrchestrationProviderSettings +{ + TaskOrchestrationDispatcherSettings = + { + MaxConcurrentOrchestrations = 500, + DispatcherCount = 5 + }, + TaskActivityDispatcherSettings = + { + MaxConcurrentActivities = 500, + DispatcherCount = 5 + } +}; +``` + +## Client Access + +### From Within Service Fabric + +Use the `FabricOrchestrationProvider` to get both worker and client: + +```csharp +var factory = 
new FabricOrchestrationProviderFactory(this.StateManager, settings); +var provider = factory.CreateProvider(); + +var worker = new TaskHubWorker(provider.OrchestrationService, settings.LoggerFactory); +var client = new TaskHubClient(provider.OrchestrationServiceClient, loggerFactory: settings.LoggerFactory); + +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + input); +``` + +### From External Applications + +External clients connect via the built-in HTTP API or Service Fabric remoting: + +```csharp +// Create a service proxy +var serviceUri = new Uri("fabric:/MyApp/DurableTaskService"); +var proxy = ServiceProxy.Create(serviceUri); + +// Call methods on the proxy +var instanceId = await proxy.StartOrchestrationAsync(input); +``` + +The `TaskHubProxyListener` exposes an HTTP API via `FabricOrchestrationServiceController` for external access. + +## Limitations + +- Requires Service Fabric cluster +- Tightly coupled to Service Fabric ecosystem +- More complex deployment and management +- No external persistence — state is lost if all replicas are lost + +## Next Steps + +- [Choosing a Backend](../getting-started/choosing-a-backend.md) — Compare all providers +- [Durable Task Scheduler](durable-task-scheduler.md) — Recommended managed alternative +- [Azure Storage Provider](azure-storage.md) — Alternative self-managed option diff --git a/docs/samples/catalog.md b/docs/samples/catalog.md new file mode 100644 index 000000000..4464c8f2d --- /dev/null +++ b/docs/samples/catalog.md @@ -0,0 +1,21 @@ +# Sample Applications + +This page provides an overview of the sample applications included in the repository. These samples demonstrate various patterns and features of the Durable Task Framework. + +For detailed instructions on running each sample, see the README in each project directory. 
+ +## Sample Projects + +| Sample | Description | Key Features | +| ------ | ----------- | ------------ | +| [DurableTask.Samples](../../samples/DurableTask.Samples/) | Core orchestration patterns | Greetings, Cron, Fan-out/Fan-in, Error handling, External events | +| [Correlation.Samples](../../samples/Correlation.Samples/) | Legacy distributed tracing | W3C TraceContext, Application Insights | +| [DistributedTraceSample](../../samples/DistributedTraceSample/) | Modern telemetry integration | OpenTelemetry, Application Insights | +| [ManagedIdentitySample](../../samples/ManagedIdentitySample/) | Azure authentication | Managed Identity with Azure Storage | + +## Additional Resources + +- [Getting Started](../getting-started/quickstart.md) +- [Core Concepts](../concepts/core-concepts.md) +- [Choosing a Backend](../getting-started/choosing-a-backend.md) +- [Testing](../advanced/testing.md) diff --git a/docs/support.md b/docs/support.md new file mode 100644 index 000000000..3afb8efa9 --- /dev/null +++ b/docs/support.md @@ -0,0 +1,49 @@ +# Support + +This document describes support options for the Durable Task Framework (DTFx). + +## Community Support (Open Source) + +The Durable Task Framework is an open-source project maintained by Microsoft. Community support is available through [GitHub Issues](https://github.com/Azure/durabletask/issues) for bug reports, feature requests, and technical questions. + +### Support Policy + +Community support is provided on a **best-effort basis**: + +- ⚠️ **No SLA** — Response times are not guaranteed +- ⚠️ **No 24/7 coverage** — Issues are triaged during business hours +- ⚠️ **Community-driven** — Many answers come from community members +- ⚠️ **Not all providers maintained** — Some backend providers are no longer actively maintained +- ✅ **Open collaboration** — All issues and discussions are public + +See [Choosing a Backend](getting-started/choosing-a-backend.md) for information on the development status of each backend provider. 
+ +This model works well for: + +- Learning and experimentation +- Non-critical workloads +- Development and testing environments +- Projects with in-house expertise + +## Enterprise Support: Durable Task Scheduler + +For production workloads requiring guaranteed support, we recommend using DTFx with the **[Durable Task Scheduler](providers/durable-task-scheduler.md)** as the backend provider. This fully managed Azure service offers enterprise-grade support with the following benefits: + +| Feature | Open Source (BYO Providers) | Durable Task Scheduler | +| ------- | -------------------------- | ---------------------- | +| **Support SLA** | ❌ Best-effort | ✅ Azure support with SLA | +| **24/7 Coverage** | ❌ No | ✅ Yes (with Azure support plan) | +| **Infrastructure** | Self-managed | Fully managed by Azure | +| **Monitoring** | Bring your own tools | Built-in dashboard | +| **Throughput** | Varies by provider | Highest available | +| **Response Time** | Not guaranteed | Based on Azure support tier | + +## Reporting Security Issues + +⚠️ **Do not report security vulnerabilities through public GitHub issues.** + +Please see [SECURITY.md](../SECURITY.md) for instructions on reporting security issues responsibly. + +## Contributing + +We welcome contributions! See the [GitHub repository](https://github.com/Azure/durabletask) for contribution guidelines. diff --git a/docs/telemetry/application-insights.md b/docs/telemetry/application-insights.md new file mode 100644 index 000000000..bab234285 --- /dev/null +++ b/docs/telemetry/application-insights.md @@ -0,0 +1,358 @@ +# Application Insights Integration + +The Durable Task Framework provides deep integration with Azure Application Insights for monitoring, diagnostics, and performance analysis. 
+ +## Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.ApplicationInsights +dotnet add package Microsoft.ApplicationInsights +``` + +## Setup + +### Basic Configuration + +```csharp +using Microsoft.ApplicationInsights; +using Microsoft.ApplicationInsights.Extensibility; +using Microsoft.Azure.DurableTask.ApplicationInsights; +using Microsoft.Extensions.DependencyInjection; + +var services = new ServiceCollection(); + +// Add Application Insights +services.AddApplicationInsightsTelemetryWorkerService(options => +{ + options.ConnectionString = "InstrumentationKey=your-key;..."; +}); + +// Add DurableTask telemetry module for distributed tracing +services.TryAddEnumerable( + ServiceDescriptor.Singleton()); + +var serviceProvider = services.BuildServiceProvider(); +``` + +### ASP.NET Core Integration + +```csharp +// In Program.cs +var builder = WebApplication.CreateBuilder(args); + +// Add Application Insights +builder.Services.AddApplicationInsightsTelemetry(); + +// Add DurableTask telemetry module +builder.Services.TryAddEnumerable( + ServiceDescriptor.Singleton()); + +var app = builder.Build(); +``` + +### Console Application + +```csharp +using Microsoft.ApplicationInsights; +using Microsoft.ApplicationInsights.Extensibility; +using Microsoft.Azure.DurableTask.ApplicationInsights; +using DurableTask.Core; +using DurableTask.Emulator; + +// Configure Application Insights +var configuration = TelemetryConfiguration.CreateDefault(); +configuration.ConnectionString = "InstrumentationKey=..."; + +// Add the DurableTask telemetry module +var module = new DurableTelemetryModule(); +module.Initialize(configuration); + +var telemetryClient = new TelemetryClient(configuration); + +// Create logger factory for diagnostics +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +// Create DTFx components +var service = new LocalOrchestrationService(); +var 
worker = new TaskHubWorker(service, loggerFactory); +worker.AddTaskOrchestrations(typeof(MyOrchestration)); +worker.AddTaskActivities(typeof(MyActivity)); + +await worker.StartAsync(); + +// ... run orchestrations ... + +await worker.StopAsync(); + +// Ensure telemetry is flushed +telemetryClient.Flush(); +await Task.Delay(TimeSpan.FromSeconds(5)); +``` + +## What Gets Tracked + +### Automatic Telemetry + +The `DurableTelemetryModule` automatically tracks: + +| Telemetry Type | Description | +|----------------|-------------| +| **Requests** | Orchestration and activity executions | +| **Dependencies** | Activity calls, sub-orchestrations | +| **Traces** | Log messages from DTFx | +| **Exceptions** | Failures in orchestrations and activities | +| **Custom Events** | Orchestration lifecycle events | + +### Distributed Tracing + +Operations are automatically correlated: + +```text +Request: OrderOrchestration (parent) +├── Dependency: ValidateOrderActivity +├── Dependency: ProcessPaymentActivity +├── Dependency: ShippingOrchestration (sub-orchestration) +│ └── Dependency: CreateShipmentActivity +└── Dependency: SendConfirmationActivity +``` + +## Custom Telemetry + +### Adding Custom Properties + +```csharp +public class OrderOrchestration : TaskOrchestration +{ + private readonly TelemetryClient _telemetryClient; + + public OrderOrchestration(TelemetryClient telemetryClient) + { + _telemetryClient = telemetryClient; + } + + public override async Task RunTask( + OrchestrationContext context, + OrderInput input) + { + // Track custom event + if (!context.IsReplaying) + { + _telemetryClient.TrackEvent("OrderProcessingStarted", new Dictionary + { + ["InstanceId"] = context.OrchestrationInstance.InstanceId, + ["OrderId"] = input.OrderId, + ["CustomerId"] = input.CustomerId + }); + } + + // ... orchestration logic ... 
+ + if (!context.IsReplaying) + { + _telemetryClient.TrackEvent("OrderProcessingCompleted", new Dictionary + { + ["InstanceId"] = context.OrchestrationInstance.InstanceId, + ["OrderId"] = input.OrderId, + ["Status"] = "Success" + }); + } + + return result; + } +} +``` + +### Tracking Metrics + +```csharp +if (!context.IsReplaying) +{ + _telemetryClient.TrackMetric("OrderProcessingDuration", + (context.CurrentUtcDateTime - startTime).TotalMilliseconds); + + _telemetryClient.TrackMetric("OrderItemCount", input.Items.Count); +} +``` + +### Tracking Exceptions + +```csharp +try +{ + await context.ScheduleTask(typeof(RiskyActivity), input); +} +catch (TaskFailedException ex) +{ + if (!context.IsReplaying) + { + _telemetryClient.TrackException(ex.InnerException, new Dictionary + { + ["InstanceId"] = context.OrchestrationInstance.InstanceId, + ["ActivityName"] = ex.Name + }); + } + throw; +} +``` + +## Querying Data + +### Kusto Queries (Log Analytics) + +**Orchestration execution times:** +```kusto +requests +| where name contains "orchestration" +| summarize avg(duration), percentile(duration, 95) by name +| order by avg_duration desc +``` + +**Failed orchestrations:** +```kusto +requests +| where name contains "orchestration" and success == false +| project timestamp, name, duration, customDimensions +| order by timestamp desc +``` + +**Activity performance:** +```kusto +dependencies +| where type == "DurableTask" +| summarize count(), avg(duration) by name +| order by count_ desc +``` + +**End-to-end traces:** +```kusto +union requests, dependencies, traces +| where operation_Id == "your-operation-id" +| order by timestamp asc +| project timestamp, itemType, name, message, duration +``` + +## Live Metrics + +Application Insights Live Metrics shows real-time: + +- Incoming request rate +- Failure rate +- Dependency call duration +- Server response time + +Enable Live Metrics in your configuration: + +```csharp 
+services.AddApplicationInsightsTelemetryWorkerService(options => +{ + options.ConnectionString = "..."; + options.EnableQuickPulseMetricStream = true; +}); +``` + +## Alerts + +Configure alerts for common scenarios: + +### High Failure Rate + +```kusto +requests +| where name contains "orchestration" +| summarize failureCount = countif(success == false), totalCount = count() by bin(timestamp, 5m) +| extend failureRate = failureCount * 100.0 / totalCount +| where failureRate > 5 +``` + +### Long-Running Orchestrations + +```kusto +requests +| where name contains "orchestration" +| where duration > 300000 // 5 minutes +| project timestamp, name, duration, operation_Id +``` + +### Stuck Orchestrations + +Monitor for orchestrations that haven't progressed: + +```kusto +customEvents +| where name == "OrchestrationStarted" +| join kind=leftanti ( + customEvents + | where name == "OrchestrationCompleted" + | project completedInstanceId = tostring(customDimensions["InstanceId"]) +) on $left.customDimensions["InstanceId"] == $right.completedInstanceId +| where timestamp < ago(1h) +``` + +## Sampling + +For high-volume scenarios, configure sampling: + +```csharp +services.AddApplicationInsightsTelemetryWorkerService(options => +{ + options.ConnectionString = "..."; +}); + +services.Configure<TelemetryConfiguration>(config => +{ + config.DefaultTelemetrySink.TelemetryProcessorChainBuilder + .UseAdaptiveSampling(maxTelemetryItemsPerSecond: 5) + .Build(); +}); +``` + +## Best Practices + +### 1. Use IsReplaying for Custom Telemetry + +```csharp +if (!context.IsReplaying) +{ + _telemetryClient.TrackEvent("CustomEvent"); +} +``` + +### 2. Include Correlation IDs + +```csharp +_telemetryClient.TrackEvent("OrderProcessed", new Dictionary<string, string> +{ + ["InstanceId"] = context.OrchestrationInstance.InstanceId, + ["OrderId"] = input.OrderId +}); +``` + +### 3. 
Flush Before Shutdown + +```csharp +await worker.StopAsync(); +telemetryClient.Flush(); +await Task.Delay(TimeSpan.FromSeconds(5)); // Allow time for flush +``` + +### 4. Monitor Key Metrics + +- Orchestration success/failure rate +- Activity duration +- Queue depth (if applicable) +- Concurrent orchestrations + +## Samples + +See the complete working sample: +- [Application Insights Sample](../../samples/DistributedTraceSample/ApplicationInsights) + +## Next Steps + +- [Distributed Tracing](distributed-tracing.md) — OpenTelemetry integration +- [Logging](logging.md) — Structured logging +- [Support](../support.md) — Getting help diff --git a/docs/telemetry/distributed-tracing.md b/docs/telemetry/distributed-tracing.md new file mode 100644 index 000000000..c03684ea5 --- /dev/null +++ b/docs/telemetry/distributed-tracing.md @@ -0,0 +1,285 @@ +# Distributed Tracing + +The Durable Task Framework supports distributed tracing using the standard .NET `ActivitySource` API, compatible with both OpenTelemetry and Application Insights. + +## Overview + +Distributed tracing provides visibility into orchestration execution across services and activities. 
DTFx emits spans for: + +- Starting orchestrations +- Running orchestrations +- Starting and running activities +- Sub-orchestrations +- Timers +- External events + +## Supported Protocols + +DTFx supports trace context propagation using standard protocols: + +| Protocol | Description | +| -------- | ----------- | +| **W3C TraceContext** | W3C standard for distributed tracing (default) | +| **HTTP Correlation Protocol** | Legacy Application Insights protocol | + +## OpenTelemetry Setup + +### Installation + +```bash +dotnet add package OpenTelemetry +dotnet add package OpenTelemetry.Exporter.Console # Or your preferred exporter +``` + +### Configuration + +Add the `DurableTask.Core` source to the OpenTelemetry trace builder: + +```csharp +using OpenTelemetry; +using OpenTelemetry.Trace; + +var tracerProvider = Sdk.CreateTracerProviderBuilder() + .AddSource("DurableTask.Core") + .AddConsoleExporter() // Or your preferred exporter + .Build(); +``` + +### Full Example + +```csharp +using OpenTelemetry; +using OpenTelemetry.Trace; +using DurableTask.Core; +using DurableTask.AzureStorage; + +// Configure OpenTelemetry +using var tracerProvider = Sdk.CreateTracerProviderBuilder() + .AddSource("DurableTask.Core") + .AddConsoleExporter() + .Build(); + +// Create logger factory +using ILoggerFactory loggerFactory = LoggerFactory.Create(builder => +{ + builder.AddConsole(); + builder.SetMinimumLevel(LogLevel.Information); +}); + +// Set up DTFx +var settings = new AzureStorageOrchestrationServiceSettings +{ + TaskHubName = "MyTaskHub", + StorageAccountClientProvider = new StorageAccountClientProvider(connectionString), + LoggerFactory = loggerFactory +}; + +var service = new AzureStorageOrchestrationService(settings); +await service.CreateIfNotExistsAsync(); + +var worker = new TaskHubWorker(service, loggerFactory); +worker.AddTaskOrchestrations(typeof(MyOrchestration)); +worker.AddTaskActivities(typeof(MyActivity)); + +await worker.StartAsync(); + +var client = new 
TaskHubClient(service, loggerFactory: loggerFactory); +var instance = await client.CreateOrchestrationInstanceAsync( + typeof(MyOrchestration), + "input"); + +await client.WaitForOrchestrationAsync(instance, TimeSpan.FromMinutes(1)); +await worker.StopAsync(); +``` + +### Exporting to Azure Monitor + +```csharp +using Azure.Monitor.OpenTelemetry.Exporter; + +var tracerProvider = Sdk.CreateTracerProviderBuilder() + .AddSource("DurableTask.Core") + .AddAzureMonitorTraceExporter(o => + { + o.ConnectionString = "InstrumentationKey=..."; + }) + .Build(); +``` + +### Exporting to Jaeger + +```csharp +using OpenTelemetry.Exporter; + +var tracerProvider = Sdk.CreateTracerProviderBuilder() + .AddSource("DurableTask.Core") + .AddJaegerExporter(o => + { + o.AgentHost = "localhost"; + o.AgentPort = 6831; + }) + .Build(); +``` + +## Application Insights Setup + +For Application Insights integration, use the dedicated telemetry module. + +### Application Insights Installation + +```bash +dotnet add package Microsoft.Azure.DurableTask.ApplicationInsights +dotnet add package Microsoft.ApplicationInsights +``` + +### Application Insights Configuration + +```csharp +using Microsoft.ApplicationInsights.Extensibility; +using Microsoft.Azure.DurableTask.ApplicationInsights; +using Microsoft.Extensions.DependencyInjection; + +var services = new ServiceCollection(); + +// Add Application Insights +services.AddApplicationInsightsTelemetryWorkerService(options => +{ + options.ConnectionString = "InstrumentationKey=..."; +}); + +// Add DurableTask telemetry module +services.TryAddEnumerable( + ServiceDescriptor.Singleton()); + +var serviceProvider = services.BuildServiceProvider(); +``` + +### ASP.NET Core Integration + +```csharp +// In Program.cs or Startup.cs +builder.Services.AddApplicationInsightsTelemetry(); +builder.Services.TryAddEnumerable( + ServiceDescriptor.Singleton()); +``` + +## Span Reference + +### Orchestration Spans + +| Span Name | Kind | Description | +| --------- | 
---- | ----------- | +| `create_orchestration:{name}` | Producer | Starting an orchestration from client | +| `orchestration:{name}` | Server | Running an orchestration in worker | +| `orchestration:{name}` | Client | Starting a sub-orchestration | + +### Activity Spans + +| Span Name | Kind | Description | +| --------- | ---- | ----------- | +| `activity:{name}` | Client | Starting an activity from orchestration | +| `activity:{name}` | Server | Running an activity in worker | + +### Other Spans + +| Span Name | Kind | Description | +| --------- | ---- | ----------- | +| `timer` | Internal | Durable timer | +| `event:{name}` | Producer | Sending an external event | + +## Attributes + +DTFx spans include these attributes: + +| Attribute | Type | Description | +| --------- | ---- | ----------- | +| `durabletask.type` | string | Type: "orchestration", "activity", "timer", "event" | +| `durabletask.task.name` | string | Name of the task | +| `durabletask.task.version` | string | Version of the task (if specified) | +| `durabletask.task.instance_id` | string | Orchestration instance ID | +| `durabletask.task.execution_id` | string | Execution ID | +| `durabletask.task.task_id` | int | Task index within orchestration | +| `durabletask.task.result` | string | Result: "Succeeded", "Failed", "Terminated" | + +## Trace Correlation + +Traces are automatically correlated across: + +- Parent orchestration → Sub-orchestration +- Orchestration → Activity +- Client → Orchestration + +### Example Trace Hierarchy + +```text +create_orchestration:OrderOrchestration (Producer) +└── orchestration:OrderOrchestration (Server) + ├── activity:ValidateOrder (Client) + │ └── activity:ValidateOrder (Server) + ├── activity:ProcessPayment (Client) + │ └── activity:ProcessPayment (Server) + └── orchestration:ShippingOrchestration (Client) + └── orchestration:ShippingOrchestration (Server) + └── activity:CreateShipment (Client) + └── activity:CreateShipment (Server) +``` + +## Samples + +See the 
sample projects for complete working examples: + +- [OpenTelemetry Sample](../../samples/DistributedTraceSample/OpenTelemetry) — Modern ActivitySource-based tracing with OpenTelemetry +- [Application Insights Sample](../../samples/DistributedTraceSample/ApplicationInsights) — Modern ActivitySource-based tracing with Application Insights +- [Correlation Sample](../../samples/Correlation.Samples) — Legacy CorrelationSettings-based tracing (Azure Storage only) + +## Legacy Correlation (Azure Storage Only) + +The Azure Storage provider includes a legacy correlation system using `CorrelationSettings`. This approach predates the modern `ActivitySource` API and is maintained for backward compatibility. + +### Enabling Legacy Correlation + +```csharp +using DurableTask.Core.Settings; + +// Enable legacy distributed tracing +CorrelationSettings.Current.EnableDistributedTracing = true; +CorrelationSettings.Current.Protocol = Protocol.W3CTraceContext; // or Protocol.HttpCorrelationProtocol +``` + +### Setting Up Telemetry + +The legacy system requires manual setup of `CorrelationTraceClient`: + +```csharp +using DurableTask.Core; +using Microsoft.ApplicationInsights; + +// Set up telemetry callbacks +CorrelationTraceClient.SetUp( + (TraceContextBase requestTraceContext) => + { + requestTraceContext.Stop(); + var requestTelemetry = requestTraceContext.CreateRequestTelemetry(); + telemetryClient.TrackRequest(requestTelemetry); + }, + (TraceContextBase dependencyTraceContext) => + { + dependencyTraceContext.Stop(); + var dependencyTelemetry = dependencyTraceContext.CreateDependencyTelemetry(); + telemetryClient.TrackDependency(dependencyTelemetry); + }, + (Exception e) => + { + telemetryClient.TrackException(e); + } +); +``` + +> [!NOTE] +> The modern `ActivitySource` approach (OpenTelemetry/DurableTelemetryModule) is recommended for new projects. The legacy `CorrelationSettings` system only works with the Azure Storage provider. 
+ +## Next Steps + +- [Logging](logging.md) — Structured logging in DTFx +- [Application Insights](application-insights.md) — Full AI integration +- [Semantic Conventions](traces/semantic-conventions.md) — Detailed span specification diff --git a/docs/telemetry/logging.md b/docs/telemetry/logging.md new file mode 100644 index 000000000..467729675 --- /dev/null +++ b/docs/telemetry/logging.md @@ -0,0 +1,356 @@ +# Logging + +The Durable Task Framework provides structured logging for observability and debugging. This guide covers logging configuration and best practices. + +## Log Sources + +DTFx emits logs from these sources: + +| Source | Description | +| ------ | ----------- | +| `DurableTask.Core` | Core framework operations | +| `DurableTask.AzureStorage` | Azure Storage provider | +| `DurableTask.ServiceBus` | Service Bus provider | +| `DurableTask.AzureManagedBackend` | Durable Task Scheduler | + +## Configuring Logging + +### With Microsoft.Extensions.Logging + +```csharp +using Microsoft.Extensions.Logging; + +var loggerFactory = LoggerFactory.Create(builder => +{ + builder + .SetMinimumLevel(LogLevel.Information) + .AddConsole() + .AddFilter("DurableTask.Core", LogLevel.Debug); +}); + +// Configure provider-specific logging (e.g., Azure Storage) +var settings = new AzureStorageOrchestrationServiceSettings +{ + TaskHubName = "MyTaskHub", + StorageAccountClientProvider = new StorageAccountClientProvider( + "mystorageaccount", + new DefaultAzureCredential()), + LoggerFactory = loggerFactory, // Provider logs +}; +var service = new AzureStorageOrchestrationService(settings); + +// Pass to worker and client +var worker = new TaskHubWorker(service, loggerFactory); +var client = new TaskHubClient(service, loggerFactory: loggerFactory); +``` + +> [!NOTE] +> Pass the `ILoggerFactory` to all three locations (provider settings, worker, and client) for complete log coverage. 
Provider-specific logs include message delivery times, partition operations, and other backend details useful for debugging. + +### With Serilog + +```csharp +using Serilog; + +Log.Logger = new LoggerConfiguration() + .MinimumLevel.Information() + .MinimumLevel.Override("DurableTask.Core", Serilog.Events.LogEventLevel.Debug) + .WriteTo.Console() + .CreateLogger(); + +var loggerFactory = new LoggerFactory().AddSerilog(); +var worker = new TaskHubWorker(service, loggerFactory); +``` + +### ASP.NET Core Integration + +```csharp +// In Program.cs +builder.Logging.AddFilter("DurableTask.Core", LogLevel.Debug); +``` + +## Log Events + +### Orchestration Events + +| Event ID | Level | Description | +| -------- | ----- | ----------- | +| 40 | Information | Scheduling orchestration | +| 43 | Information | Waiting for orchestration | +| 49 | Information | Orchestration completed | +| 51 | Information | Executing orchestration logic | +| 52 | Information | Orchestration executed (scheduled operations) | + +### Activity Events + +| Event ID | Level | Description | +| -------- | ----- | ----------- | +| 46 | Information | Scheduling activity | +| 60 | Information | Starting activity | +| 61 | Information | Activity completed | + +### Worker Events + +| Event ID | Level | Description | +| -------- | ----- | ----------- | +| 10 | Information | Worker starting | +| 11 | Information | Worker started | +| 12 | Information | Worker stopping | +| 13 | Information | Worker stopped | + +### Example Log Output + +```text +info: DurableTask.Core[10] Durable task hub worker is starting +info: DurableTask.Core[40] Scheduling orchestration 'MyOrchestration' with instance ID = 'abc123' +info: DurableTask.Core[51] abc123: Executing 'MyOrchestration' orchestration logic +info: DurableTask.Core[46] abc123: Scheduling activity [MyActivity#0] +info: DurableTask.Core[60] abc123: Starting task activity [MyActivity#0] +info: DurableTask.Core[61] abc123: Task activity [MyActivity#0] completed 
successfully +info: DurableTask.Core[49] abc123: Orchestration completed with status 'Completed' +``` + +## Logging in Orchestrations + +### Using IsReplaying + +Avoid duplicate logs during replay. Note that DTFx orchestrations do not support constructor-based dependency injection. Use a static logger or pass a logger factory through your object creator: + +```csharp +public class MyOrchestration : TaskOrchestration +{ + // Use a static logger or configure via ObjectCreator + private static readonly ILogger Logger = LoggerFactory + .Create(builder => builder.AddConsole()) + .CreateLogger(); + + public override async Task RunTask( + OrchestrationContext context, + string input) + { + // Only log during actual execution, not replay + if (!context.IsReplaying) + { + Logger.LogInformation( + "Processing orchestration {InstanceId} with input {Input}", + context.OrchestrationInstance.InstanceId, + input); + } + + var result = await context.ScheduleTask(typeof(MyActivity), input); + + if (!context.IsReplaying) + { + Logger.LogInformation( + "Orchestration {InstanceId} completed with result {Result}", + context.OrchestrationInstance.InstanceId, + result); + } + + return result; + } +} +``` + +### Structured Logging Best Practices + +Include relevant context in log messages: + +```csharp +// ✅ Good - structured with context +_logger.LogInformation( + "Processing order {OrderId} for customer {CustomerId} in orchestration {InstanceId}", + input.OrderId, + input.CustomerId, + context.OrchestrationInstance.InstanceId); + +// ❌ Bad - string concatenation, no structure +_logger.LogInformation( + "Processing order " + input.OrderId + " for customer " + input.CustomerId); +``` + +## Logging in Activities + +Activities don't have replay concerns, so log freely. Like orchestrations, DTFx activities do not support constructor-based dependency injection. 
Use a static logger or configure via a custom `ObjectCreator`: + +```csharp +public class MyActivity : AsyncTaskActivity +{ + // Use a static logger or configure via ObjectCreator + private static readonly ILogger Logger = LoggerFactory + .Create(builder => builder.AddConsole()) + .CreateLogger(); + + protected override async Task ExecuteAsync( + TaskContext context, + string input) + { + Logger.LogInformation( + "Starting activity for orchestration {InstanceId}", + context.OrchestrationInstance.InstanceId); + + try + { + var result = await DoWorkAsync(input); + + Logger.LogInformation( + "Activity completed for orchestration {InstanceId}", + context.OrchestrationInstance.InstanceId); + + return result; + } + catch (Exception ex) + { + Logger.LogError(ex, + "Activity failed for orchestration {InstanceId}", + context.OrchestrationInstance.InstanceId); + throw; + } + } + + private Task DoWorkAsync(string input) => Task.FromResult(input); +} +``` + +## Log Correlation + +### Correlation IDs + +Include correlation IDs for end-to-end tracing: + +```csharp +public override async Task RunTask( + OrchestrationContext context, + OrderInput input) +{ + using (Logger.BeginScope(new Dictionary + { + ["InstanceId"] = context.OrchestrationInstance.InstanceId, + ["OrderId"] = input.OrderId, + ["CorrelationId"] = input.CorrelationId + })) + { + if (!context.IsReplaying) + { + Logger.LogInformation("Starting order processing"); + } + + // ... orchestration logic + } +} +``` + +### Distributed Tracing Integration + +For trace correlation, see [Distributed Tracing](distributed-tracing.md). 
+ +## Log Levels + +Recommended log level configuration: + +| Environment | DurableTask.Core | Provider | +| ----------- | ---------------- | -------- | +| Development | Debug | Debug | +| Testing | Debug | Information | +| Production | Information | Warning | + +### Configuration Example + +```json +{ + "Logging": { + "LogLevel": { + "Default": "Information", + "DurableTask.Core": "Information", + "DurableTask.AzureStorage": "Warning" + } + } +} +``` + +## Filtering Noisy Logs + +Some operations generate many logs. Filter as needed: + +```csharp +builder.Logging + .AddFilter("DurableTask.Core", LogLevel.Information) + // Reduce noise from Azure Storage provider + .AddFilter("DurableTask.AzureStorage", LogLevel.Warning); +``` + +## Diagnostic Logging + +For troubleshooting, enable debug logging: + +```csharp +builder.Logging + .SetMinimumLevel(LogLevel.Debug) + .AddFilter("DurableTask", LogLevel.Debug); +``` + +This reveals: + +- Message processing details +- Partition lease operations +- History loading/saving +- Timer scheduling + +## Logging with Middleware + +For cross-cutting logging concerns without modifying orchestration or activity code, use [middleware](../advanced/middleware.md). 
This approach lets you intercept all executions in one place: + +```csharp +var worker = new TaskHubWorker(orchestrationService, loggerFactory); + +// Add orchestration logging middleware +worker.AddOrchestrationDispatcherMiddleware(async (context, next) => +{ + var instance = context.GetProperty(); + var runtimeState = context.GetProperty(); + + logger.LogInformation("Orchestration {Name} ({InstanceId}) starting", + runtimeState?.Name, instance?.InstanceId); + + var stopwatch = Stopwatch.StartNew(); + try + { + await next(); + logger.LogInformation("Orchestration {Name} ({InstanceId}) completed in {ElapsedMs}ms", + runtimeState?.Name, instance?.InstanceId, stopwatch.ElapsedMilliseconds); + } + catch (Exception ex) + { + logger.LogError(ex, "Orchestration {Name} ({InstanceId}) failed", + runtimeState?.Name, instance?.InstanceId); + throw; + } +}); + +// Add activity logging middleware +worker.AddActivityDispatcherMiddleware(async (context, next) => +{ + var scheduledEvent = context.GetProperty(); + var instance = context.GetProperty(); + + logger.LogInformation("Activity {ActivityName} starting for {InstanceId}", + scheduledEvent?.Name, instance?.InstanceId); + + await next(); +}); +``` + +See [Middleware](../advanced/middleware.md) for complete examples. + +## Event Source Logging + +In addition to `ILogger`, DTFx also emits logs via [Event Source](https://docs.microsoft.com/dotnet/api/system.diagnostics.tracing.eventsource), which is used by platforms like Azure Functions and Azure App Service for automatic telemetry collection. Event Source logging is always enabled and captures additional correlation details. + +For advanced Event Source configuration, including provider GUIDs and structured logging details, see the [source documentation](https://github.com/Azure/durabletask/blob/main/src/DurableTask.Core/Logging/README.md). 
+ +## Next Steps + +- [Distributed Tracing](distributed-tracing.md) — OpenTelemetry integration +- [Application Insights](application-insights.md) — Full AI integration +- [Middleware](../advanced/middleware.md) — Cross-cutting concerns including logging +- [Error Handling](../features/error-handling.md) — Logging errors diff --git a/eng/ci/official-build.yml b/eng/ci/official-build.yml index 9f39bb7a1..23eab48ef 100644 --- a/eng/ci/official-build.yml +++ b/eng/ci/official-build.yml @@ -13,7 +13,7 @@ schedules: # Build nightly to catch any new CVEs and report SDL often. # We are also required to generated CodeQL reports weekly, so this # helps us meet that. -- cron: "0 0 * * *" +- cron: "0 5 * * *" displayName: Nightly Build branches: include: @@ -38,6 +38,9 @@ extends: name: 1es-pool-azfunc image: 1es-windows-2022 os: windows + ${{ if eq( variables['Build.Reason'], 'Schedule' ) }}: + demands: + - Priority -equals Low stages: - stage: BuildAndSign diff --git a/eng/ci/public-build.yml b/eng/ci/public-build.yml index e3102a23d..95b1518a3 100644 --- a/eng/ci/public-build.yml +++ b/eng/ci/public-build.yml @@ -11,7 +11,7 @@ trigger: # Run nightly to catch new CVEs and to report SDL often. 
schedules: - - cron: "0 0 * * *" + - cron: "0 5 * * *" displayName: Nightly Run branches: include: @@ -41,6 +41,9 @@ extends: name: 1es-pool-azfunc-public image: 1es-windows-2022 os: windows + ${{ if eq( variables['Build.Reason'], 'Schedule' ) }}: + demands: + - Priority -equals Low sdl: codeql: diff --git a/eng/templates/build-steps.yml b/eng/templates/build-steps.yml index 746a1dba7..cb6300182 100644 --- a/eng/templates/build-steps.yml +++ b/eng/templates/build-steps.yml @@ -25,10 +25,10 @@ steps: version: '3.1.x' - task: UseDotNet@2 - displayName: 'Use the .NET 6 SDK' + displayName: 'Use the .NET 8 SDK' inputs: packageType: 'sdk' - version: '6.0.x' + version: '8.0.x' - task: DotNetCoreCLI@2 displayName: 'Restore nuget dependencies' diff --git a/eng/templates/test.yml b/eng/templates/test.yml index eed9f610f..f4fc29a7f 100644 --- a/eng/templates/test.yml +++ b/eng/templates/test.yml @@ -24,7 +24,7 @@ steps: npm install -g azurite mkdir azurite1 echo "azurite installed" - azurite --silent --location azurite1 --debug azurite1\debug.txt --queuePort 10001 & + azurite --silent --location azurite1 --debug azurite1\debug.txt --queuePort 10001 --skipApiVersionCheck & echo "azurite started" sleep 5 displayName: 'Install and Run Azurite' @@ -36,7 +36,7 @@ steps: testAssemblyVer2: | $(System.DefaultWorkingDirectory)/${{ parameters.testAssembly }} testFiltercriteria: 'TestCategory!=DisabledInCI' - vsTestVersion: 17.0 + vsTestVersion: '17.0' distributionBatchType: basedOnExecutionTime platform: 'any cpu' configuration: 'Debug' diff --git a/samples/Correlation.Samples/Correlation.Samples.csproj b/samples/Correlation.Samples/Correlation.Samples.csproj index d3c206a21..81757ac8b 100644 --- a/samples/Correlation.Samples/Correlation.Samples.csproj +++ b/samples/Correlation.Samples/Correlation.Samples.csproj @@ -2,7 +2,7 @@ Exe - net6.0 + net8.0 @@ -19,6 +19,8 @@ + + Always diff --git a/samples/Correlation.Samples/Readme.md b/samples/Correlation.Samples/Readme.md index 
f2920c9ae..e01372ac6 100644 --- a/samples/Correlation.Samples/Readme.md +++ b/samples/Correlation.Samples/Readme.md @@ -1,22 +1,42 @@ -# Distributed Tracing for Durable Task +# Correlation Samples -Distributed Tracing for Durable Task is a feature for enabling correlation propagation among orchestrations and activities. -The key features of Distributed Tracing for Durable Task are: +This sample demonstrates legacy distributed tracing using the `CorrelationSettings` API with the Azure Storage provider. -- **End to End Tracing with Application Insights**: Support Complex orchestration scenario. Multi-Layered Sub Orchestration, Fan-out Fan-in, retry, Timer, and more. -- **Support Protocol**: [W3C TraceContext](https://w3c.github.io/trace-context/) and [Http Correlation Protocol](https://github.com/dotnet/corefx/blob/master/src/System.Diagnostics.DiagnosticSource/src/HttpCorrelationProtocol.md) -- **Suppress Distributed Tracing**: No breaking change for the current implementation +> [!NOTE] +> For comprehensive distributed tracing documentation, including the modern `ActivitySource`-based approach, see the [Distributed Tracing Guide](../../docs/telemetry/distributed-tracing.md). -Currently, we support [DurableTask.AzureStorage](https://w3c.github.io/trace-context/). 
+## Overview + +This sample shows the legacy correlation approach using: + +- `CorrelationSettings.Current.EnableDistributedTracing` — Enable tracing +- `CorrelationTraceClient.SetUp()` — Manual telemetry callbacks +- Application Insights for trace visualization ![Overview](docs/images/overview.png) -# Getting Started +## Supported Scenarios + +The samples demonstrate tracing across various orchestration patterns: + +- Simple orchestrations (`HelloOrchestrator`) +- Fan-out/fan-in (`FanOutFanInOrchestrator`) +- Sub-orchestrations (`SubOrchestratorOrchestration`) +- Retry scenarios (`RetryOrchestration`, `MultiLayeredOrchestrationWithRetry`) +- Continue-as-new (`ContinueAsNewOrchestration`) +- Terminated orchestrations (`TerminatedOrchestration`) + +## Getting Started + +See [docs/getting-started.md](docs/getting-started.md) for setup instructions. + +## Provider Implementation -If you want to try Distributed Tracing with DurableTask.AzureStorage, you can find a document with a Handful of examples. +If you're implementing distributed tracing for a custom provider, see [docs/overview.md](docs/overview.md) for the architecture and extension points. - - [Intro](docs/getting-started.md) +## Modern Alternative -# Developing Provider +For new projects, consider using the modern `ActivitySource`-based approach with OpenTelemetry: -If you want to implement Distributed Tracing for other DurableTask providers, Read [Develop Distributed Tracing](docs/overview.md). 
\ No newline at end of file +- [OpenTelemetry Sample](../DistributedTraceSample/OpenTelemetry) +- [Application Insights Sample](../DistributedTraceSample/ApplicationInsights) diff --git a/samples/DistributedTraceSample/ApplicationInsights/ApplicationInsightsSample.csproj b/samples/DistributedTraceSample/ApplicationInsights/ApplicationInsightsSample.csproj index efbc910c6..160690471 100644 --- a/samples/DistributedTraceSample/ApplicationInsights/ApplicationInsightsSample.csproj +++ b/samples/DistributedTraceSample/ApplicationInsights/ApplicationInsightsSample.csproj @@ -2,7 +2,7 @@ Exe - net6.0 + net8.0 enable d4d9b2e3-fb2a-4de6-9747-3d6d3b639d1a dummy-value diff --git a/samples/DistributedTraceSample/ApplicationInsights/README.md b/samples/DistributedTraceSample/ApplicationInsights/README.md new file mode 100644 index 000000000..cf358239a --- /dev/null +++ b/samples/DistributedTraceSample/ApplicationInsights/README.md @@ -0,0 +1,57 @@ +# Application Insights Sample + +This sample demonstrates direct integration with Azure Application Insights for distributed tracing in Durable Task applications. + +## Prerequisites + +- .NET 8.0 SDK or later +- Azure Storage Emulator (Azurite) or Azure Storage account +- Azure Application Insights resource + +## Configuration + +1. Create an Application Insights resource in the Azure Portal + +2. Configure the connection string in `appsettings.json`: + + ```json + { + "ApplicationInsights": { + "ConnectionString": "InstrumentationKey=..." + } + } + ``` + + Or set the environment variable: + + ```text + APPLICATIONINSIGHTS_CONNECTION_STRING=InstrumentationKey=... + ``` + +## Code Setup + +```csharp +services.AddApplicationInsightsTelemetryWorkerService(); +services.TryAddEnumerable( + ServiceDescriptor.Singleton()); +``` + +The `FilterOutStorageTelemetryProcessor` is included to reduce noise from Azure Storage operations in your telemetry. + +## Running the Sample + +```bash +dotnet run +``` + +## Viewing Traces + +1. 
Navigate to your Application Insights resource in the Azure Portal +2. Go to **Transaction Search** +3. Click on an entry to view the end-to-end transaction +4. A Gantt chart will show the visual representation of the trace and spans + +## Additional Resources + +- [Application Insights Documentation](../../../docs/telemetry/application-insights.md) +- [Distributed Tracing Guide](../../../docs/telemetry/distributed-tracing.md) diff --git a/samples/DistributedTraceSample/OpenTelemetry/OpenTelemetrySample.csproj b/samples/DistributedTraceSample/OpenTelemetry/OpenTelemetrySample.csproj index 970d3b7fc..3f6ac2af1 100644 --- a/samples/DistributedTraceSample/OpenTelemetry/OpenTelemetrySample.csproj +++ b/samples/DistributedTraceSample/OpenTelemetry/OpenTelemetrySample.csproj @@ -2,7 +2,7 @@ Exe - net6.0 + net8.0 enable diff --git a/samples/DistributedTraceSample/README.md b/samples/DistributedTraceSample/README.md new file mode 100644 index 000000000..f74316198 --- /dev/null +++ b/samples/DistributedTraceSample/README.md @@ -0,0 +1,56 @@ +# Distributed Trace Samples + +This directory contains samples demonstrating telemetry integration with different distributed tracing providers for Durable Task applications. + +## Overview + +Distributed tracing allows you to monitor and debug orchestrations across your entire application stack. These samples show how to configure various telemetry exporters. + +## Samples + +| Sample | Description | +| ------ | ----------- | +| [OpenTelemetry](OpenTelemetry/) | Integration with OpenTelemetry for vendor-neutral distributed tracing | +| [ApplicationInsights](ApplicationInsights/) | Integration with Azure Application Insights | + +## OpenTelemetry Sample + +The [OpenTelemetry sample](OpenTelemetry/) demonstrates how to configure distributed tracing with multiple exporters including Console, Application Insights, and Zipkin. 
+ +```csharp +using var tracerProvider = Sdk.CreateTracerProviderBuilder() + .SetResourceBuilder(ResourceBuilder.CreateDefault().AddService("MySample")) + .AddSource("DurableTask.Core") + .AddConsoleExporter() + .AddZipkinExporter() + .AddAzureMonitorTraceExporter(options => + { + options.ConnectionString = Environment.GetEnvironmentVariable("AZURE_MONITOR_CONNECTION_STRING"); + }) + .Build(); +``` + +See the [OpenTelemetry README](OpenTelemetry/README.md) for detailed setup instructions. + +## Application Insights Sample + +The [Application Insights sample](ApplicationInsights/) demonstrates direct integration with Azure Application Insights without OpenTelemetry. + +```csharp +services.AddApplicationInsightsTelemetryWorkerService(); +services.TryAddEnumerable( + ServiceDescriptor.Singleton()); +``` + +## Prerequisites + +- .NET 8.0 SDK or later +- Azure Storage Emulator (Azurite) or Azure Storage account +- (Optional) Application Insights resource +- (Optional) Zipkin instance for OpenTelemetry sample + +## Additional Resources + +- [Distributed Tracing Guide](../../docs/telemetry/distributed-tracing.md) +- [Application Insights Documentation](../../docs/telemetry/application-insights.md) +- [OpenTelemetry Documentation](https://opentelemetry.io/) diff --git a/samples/DurableTask.Samples/DurableTask.Samples.csproj b/samples/DurableTask.Samples/DurableTask.Samples.csproj index 2782a3274..6a5c66ba5 100644 --- a/samples/DurableTask.Samples/DurableTask.Samples.csproj +++ b/samples/DurableTask.Samples/DurableTask.Samples.csproj @@ -4,7 +4,7 @@ true Exe - net462 + net48 diff --git a/samples/DurableTask.Samples/README.md b/samples/DurableTask.Samples/README.md new file mode 100644 index 000000000..9da5433ce --- /dev/null +++ b/samples/DurableTask.Samples/README.md @@ -0,0 +1,203 @@ +# DurableTask.Samples + +This project contains core sample orchestrations demonstrating fundamental patterns of the Durable Task Framework using the Azure Storage backend. 
+ +## Prerequisites + +- .NET Framework 4.8 or later +- Azure Storage Emulator (Azurite) or Azure Storage account + +## Configuration + +Configure the connection string in `App.config`: + +```xml + + + + +``` + +For Azure Storage, replace `UseDevelopmentStorage=true` with your connection string (if not using the emulator): + +```text +DefaultEndpointsProtocol=https;AccountName=...;AccountKey=... +``` + +## Running the Samples + +### 1. Create the Task Hub (first time only) + +```bash +DurableTask.Samples.exe -c +``` + +### 2. Start an Orchestration + +```bash +DurableTask.Samples.exe -s [-p ] +``` + +The worker automatically starts and waits for the orchestration to complete. + +## Available Samples + +### Greetings + +A simple "Hello World" orchestration that calls greeting activities. + +```csharp +public class GreetingsOrchestration : TaskOrchestration +{ + public override async Task RunTask(OrchestrationContext context, string input) + { + string greeting = await context.ScheduleTask(typeof(GetUserTask)); + string result = await context.ScheduleTask(typeof(SendGreetingTask), greeting); + return result; + } +} +``` + +**Run:** `DurableTask.Samples.exe -s Greetings` + +### Greetings2 + +Demonstrates parameterized orchestrations with a configurable number of greetings. + +**Run:** `DurableTask.Samples.exe -s Greetings2 -p 5` + +### Cron + +An eternal orchestration that runs on a schedule using `CreateTimer` and `ContinueAsNew`. 
+ +```csharp +public class CronOrchestration : TaskOrchestration +{ + public override async Task RunTask(OrchestrationContext context, string schedule) + { + // Execute the scheduled task + await context.ScheduleTask(typeof(CronTask)); + + // Wait until next scheduled time + DateTime nextRun = CalculateNextRun(context.CurrentUtcDateTime, schedule); + await context.CreateTimer(nextRun, true); + + // Continue as new instance + context.ContinueAsNew(schedule); + return "Completed cycle"; + } +} +``` + +**Run:** `DurableTask.Samples.exe -s Cron -p "0 12 * * *"` + +### AverageCalculator + +Fan-out/fan-in pattern that distributes computation across multiple activities. + +```csharp +public class AverageCalculatorOrchestration : TaskOrchestration +{ + public override async Task RunTask(OrchestrationContext context, int[] numbers) + { + // Fan-out: process chunks in parallel + var tasks = new List>(); + foreach (var chunk in numbers.Chunk(10)) + { + tasks.Add(context.ScheduleTask(typeof(ComputeSumTask), chunk)); + } + + // Fan-in: aggregate results + int[] sums = await Task.WhenAll(tasks); + return sums.Sum() / (double)numbers.Length; + } +} +``` + +**Run:** `DurableTask.Samples.exe -s Average -p "1 50 10"` + +Parameters: ` ` + +### ErrorHandling + +Demonstrates retry policies and exception handling patterns. + +```csharp +public override async Task RunTask(OrchestrationContext context, string input) +{ + var retryOptions = new RetryOptions( + firstRetryInterval: TimeSpan.FromSeconds(5), + maxNumberOfAttempts: 3); + + try + { + return await context.ScheduleWithRetry( + typeof(UnreliableActivity), + retryOptions, + input); + } + catch (TaskFailedException ex) + { + // Handle permanent failure + return $"Failed after retries: {ex.Message}"; + } +} +``` + +**Run:** `DurableTask.Samples.exe -s ErrorHandling` + +### Signal + +Demonstrates external events and human interaction patterns. 
+ +```csharp +public override async Task RunTask(OrchestrationContext context, ApprovalRequest input) +{ + // Send notification + await context.ScheduleTask(typeof(SendApprovalRequest), input); + + // Wait for external event + var approval = await context.WaitForExternalEvent("ApprovalResult"); + + if (approval.IsApproved) + { + await context.ScheduleTask(typeof(ProcessApproval), input); + return "Approved and processed"; + } + + return "Rejected"; +} +``` + +**Run:** `DurableTask.Samples.exe -s Signal` + +To raise an event to a running instance: + +```bash +DurableTask.Samples.exe -n -i -p +``` + +### SumOfSquares + +Another fan-out/fan-in example computing sum of squares from a JSON input file. + +**Run:** `DurableTask.Samples.exe -s SumOfSquares` + +## Command Line Options + +| Option | Description | +| ------ | ----------- | +| `-c` | Create the task hub (required on first run) | +| `-s ` | Start the specified orchestration | +| `-p ` | Parameters to pass to the orchestration | +| `-i ` | Instance ID (auto-generated if not specified) | +| `-n ` | Event name for raising events | +| `-w` | Skip the worker (useful when worker runs separately) | + +## Additional Resources + +- [Getting Started Guide](../../docs/getting-started/quickstart.md) +- [Orchestrations](../../docs/concepts/orchestrations.md) +- [Activities](../../docs/concepts/activities.md) +- [Error Handling](../../docs/features/error-handling.md) +- [Timers](../../docs/features/timers.md) diff --git a/samples/ManagedIdentitySample/DTFx.AzureStorage v1.x/ManagedIdentity.AzStorageV1.csproj b/samples/ManagedIdentitySample/DTFx.AzureStorage v1.x/ManagedIdentity.AzStorageV1.csproj index 51ce684b2..cdb974eb9 100644 --- a/samples/ManagedIdentitySample/DTFx.AzureStorage v1.x/ManagedIdentity.AzStorageV1.csproj +++ b/samples/ManagedIdentitySample/DTFx.AzureStorage v1.x/ManagedIdentity.AzStorageV1.csproj @@ -4,7 +4,7 @@ Latest enable Exe - net6.0 + net8.0 diff --git a/samples/ManagedIdentitySample/README.md 
b/samples/ManagedIdentitySample/README.md new file mode 100644 index 000000000..2d07dc5d0 --- /dev/null +++ b/samples/ManagedIdentitySample/README.md @@ -0,0 +1,65 @@ +# Managed Identity Sample + +This directory contains samples demonstrating how to use Azure Managed Identity for authentication with Azure Storage in Durable Task applications. + +## Overview + +Managed Identity provides a more secure alternative to connection strings by eliminating the need to store credentials. These samples show how to configure identity-based connections for both v1.x and v2.x versions of the Azure Storage provider. + +## Samples + +| Sample | Description | +| ------ | ----------- | +| [DTFx.AzureStorage v1.x](DTFx.AzureStorage%20v1.x/) | Legacy WindowsAzure.Storage SDK with Managed Identity | +| [DTFx.AzureStorage v2.x](DTFx.AzureStorage%20v2.x/) | Modern Azure.Storage.* SDK with TokenCredential | + +## Prerequisites + +Before running these samples, you must: + +1. **Create an Azure Storage account** or reuse an existing one + +2. **Create your identity** in the Azure Portal. Detailed instructions can be found in the [Microsoft Entra documentation](https://learn.microsoft.com/entra/identity-platform/quickstart-register-app?tabs=certificate) + +3. **Assign Role-based Access Controls (RBAC)** to the identity with [these instructions](https://learn.microsoft.com/azure/role-based-access-control/role-assignments-portal-managed-identity#Overview): + - Storage Queue Data Contributor + - Storage Blob Data Contributor + - Storage Table Data Contributor + +4. **Configure the identity** in your app's configuration + +5. **Set the storage account name** in your configuration. 
The account name can be replaced with individual service URIs (BlobServiceUri, TableServiceUri, QueueServiceUri) + +## Code Examples + +### DTFx.AzureStorage v1.x + +```csharp +var credential = new DefaultAzureCredential(); +var settings = new AzureStorageOrchestrationServiceSettings +{ + StorageAccountClientProvider = new ManagedIdentityStorageAccountClientProvider( + storageAccountName, + credential) +}; +``` + +> [!NOTE] +> Identity-based connection is **not supported** with .NET Framework 4.x when using DurableTask.AzureStorage v1.x + +### DTFx.AzureStorage v2.x + +```csharp +var credential = new DefaultAzureCredential(); +var settings = new AzureStorageOrchestrationServiceSettings +{ + StorageAccountClientProvider = new StorageAccountClientProvider( + new Uri($"https://{storageAccountName}.blob.core.windows.net"), + credential) +}; +``` + +## Additional Resources + +- [Azure Storage Provider Documentation](../../docs/providers/azure-storage.md) +- [Azure Managed Identity Overview](https://learn.microsoft.com/azure/active-directory/managed-identities-azure-resources/overview) diff --git a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj index 6321d410e..56d052fba 100644 --- a/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj +++ b/src/DurableTask.ApplicationInsights/DurableTask.ApplicationInsights.csproj @@ -11,7 +11,7 @@ 0 - 6 + 10 0 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureServiceFabric/DurableTask.AzureServiceFabric.csproj b/src/DurableTask.AzureServiceFabric/DurableTask.AzureServiceFabric.csproj index 5ed626ae5..770052ae9 100644 --- a/src/DurableTask.AzureServiceFabric/DurableTask.AzureServiceFabric.csproj +++ b/src/DurableTask.AzureServiceFabric/DurableTask.AzureServiceFabric.csproj @@ -2,7 +2,7 @@ - net462;net472 + net48;net472 true 
Microsoft.Azure.DurableTask.AzureServiceFabric true @@ -16,7 +16,7 @@ $(NoWarn);NU5104 - + diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs index 9fdee5f8f..9ad814c43 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationService.cs @@ -478,6 +478,15 @@ public async Task StopAsync(bool isForced) { this.shutdownSource.Cancel(); await this.statsLoop; + + if (isForced) + { + // When forced, immediately remove all active sessions so that + // partition draining completes without waiting for sessions to + // finish their idle timeout or in-flight work. + this.orchestrationSessionManager.AbortAllSessions(); + } + await this.appLeaseManager.StopAsync(); this.isStarted = false; } @@ -793,7 +802,12 @@ async Task LockNextTaskOrchestrationWorkItemAsync(boo TraceContext = currentRequestTraceContext, }; - if (!this.IsExecutableInstance(session.RuntimeState, orchestrationWorkItem.NewMessages, settings.AllowReplayingTerminalInstances, out string warningMessage)) + string warningMessage = await this.IsExecutableInstanceAsync( + session.RuntimeState, + orchestrationWorkItem.NewMessages, + settings.AllowReplayingTerminalInstances, + cancellationToken); + if (!string.IsNullOrEmpty(warningMessage)) { // If all messages belong to the same execution ID, then all of them need to be discarded. 
// However, it's also possible to have messages for *any* execution ID batched together with messages @@ -1049,7 +1063,11 @@ internal static void TraceMessageReceived(AzureStorageOrchestrationServiceSettin data.Episode.GetValueOrDefault(-1)); } - bool IsExecutableInstance(OrchestrationRuntimeState runtimeState, IList newMessages, bool allowReplayingTerminalInstances, out string message) + async Task IsExecutableInstanceAsync( + OrchestrationRuntimeState runtimeState, + IList newMessages, + bool allowReplayingTerminalInstances, + CancellationToken cancellationToken) { if (runtimeState.ExecutionStartedEvent == null && !newMessages.Any(msg => msg.Event is ExecutionStartedEvent)) { @@ -1057,11 +1075,20 @@ bool IsExecutableInstance(OrchestrationRuntimeState runtimeState, IList msg.Event is ExecutionTerminatedEvent); + if (executionTerminatedEventMessage is not null) + { + var executionTerminatedEvent = (ExecutionTerminatedEvent)executionTerminatedEventMessage.Event; + await this.trackingStore.UpdateStatusForTerminationAsync( + instanceId, + executionTerminatedEvent); + return $"Instance is {OrchestrationStatus.Terminated}"; + } + // A non-zero event count usually happens when an instance's history is overwritten by a // new instance or by a ContinueAsNew. When history is overwritten by new instances, we // overwrite the old history with new history (with a new execution ID), but this is done @@ -1069,23 +1096,33 @@ bool IsExecutableInstance(OrchestrationRuntimeState runtimeState, IList SendTaskOrchestrationMessageInternalAsync( + internal Task SendTaskOrchestrationMessageInternalAsync( OrchestrationInstance sourceInstance, ControlQueue controlQueue, TaskMessage message) @@ -1909,7 +1946,7 @@ public async Task> GetOrchestrationStateAsync(string i /// /// Instance ID of the orchestration to terminate. /// The user-friendly reason for terminating. 
- public Task ForceTerminateTaskOrchestrationAsync(string instanceId, string reason) + public async Task ForceTerminateTaskOrchestrationAsync(string instanceId, string reason) { var taskMessage = new TaskMessage { @@ -1917,7 +1954,7 @@ public Task ForceTerminateTaskOrchestrationAsync(string instanceId, string reaso Event = new ExecutionTerminatedEvent(-1, reason) }; - return SendTaskOrchestrationMessageAsync(taskMessage); + await SendTaskOrchestrationMessageAsync(taskMessage); } /// @@ -2122,6 +2159,16 @@ public Task DownloadBlobAsync(string blobUri) return cachedQueue; } + /// + /// Whether to use separate work item queues for entities and orchestrators. + /// Wraps the value of AzureStorageOrchestrationServiceSettings.UseSeparateQueueForEntityWorkItems. + /// + public bool UseSeparateQueuesForEntityWorkItems + { + get => this.settings.UseSeparateQueueForEntityWorkItems; + set => this.settings.UseSeparateQueueForEntityWorkItems = value; + } + /// /// Disposes of the current object. /// diff --git a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs index 48e653868..e614da92f 100644 --- a/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs +++ b/src/DurableTask.AzureStorage/AzureStorageOrchestrationServiceSettings.cs @@ -294,5 +294,22 @@ internal LogHelper Logger /// The default is . /// public QueueClientMessageEncoding QueueClientMessageEncoding { get; set; } = QueueClientMessageEncoding.UTF8; + + /// + /// When true, an etag is used when attempting to make instance table updates upon completing an orchestration work item. + /// + /// + /// By default, etags are only used when updating the history table upon completing an orchestration work item. This can lead + /// to subtle race conditions where the instance table is incorrectly updated. Consider the following scenario: + /// 1. 
Worker A completes an orchestration work item, sends outgoing messages, updates the history table, then stalls. + /// 2. Worker B picks up the next and final orchestration work item for the same instance and completes it, updating the history + /// table and instance table with the new terminal status. + /// 3. Worker A resumes and overrides the terminal status in the instance table with an incorrect non-terminal status. + /// This will leave the instance status of the orchestration permanently as "Running" even though it has actually completed. + /// To prevent such scenarios, enabling this setting will ensure that instance table updates also use etags. This would prevent + /// worker A's update in step 3 from completing. Enabling this setting will also introduce a performance overhead since the instance + /// table must now be read before processing every orchestration work item to obtain the latest etag. + /// + public bool UseInstanceTableEtag { get; set; } = false; } } diff --git a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj index b65a306fd..b68d09603 100644 --- a/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj +++ b/src/DurableTask.AzureStorage/DurableTask.AzureStorage.csproj @@ -21,8 +21,8 @@ 2 - 5 - 0 + 8 + 3 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.AzureStorage/MessageData.cs b/src/DurableTask.AzureStorage/MessageData.cs index d89e7c686..4715043c7 100644 --- a/src/DurableTask.AzureStorage/MessageData.cs +++ b/src/DurableTask.AzureStorage/MessageData.cs @@ -111,6 +111,8 @@ internal void Update(UpdateReceipt receipt) { this.OriginalQueueMessage = this.OriginalQueueMessage.Update(receipt); } + + internal object MessageMetadata { get; set; } } /// diff --git a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs index 719694e97..ae3e355d6 
100644 --- a/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs +++ b/src/DurableTask.AzureStorage/Messaging/OrchestrationSession.cs @@ -17,6 +17,7 @@ namespace DurableTask.AzureStorage.Messaging using System.Collections.Generic; using System.IO; using System.Linq; + using System.Threading; using System.Threading.Tasks; using Azure; using DurableTask.Core; @@ -26,6 +27,7 @@ namespace DurableTask.AzureStorage.Messaging sealed class OrchestrationSession : SessionBase, IOrchestrationSession { readonly TimeSpan idleTimeout; + readonly CancellationToken shutdownToken; readonly AsyncAutoResetEvent messagesAvailableEvent; readonly MessageCollection nextMessageBatch; @@ -37,18 +39,20 @@ public OrchestrationSession( ControlQueue controlQueue, List initialMessageBatch, OrchestrationRuntimeState runtimeState, - ETag? eTag, + OrchestrationETags eTags, DateTime lastCheckpointTime, object trackingStoreContext, TimeSpan idleTimeout, + CancellationToken shutdownToken, Guid traceActivityId) : base(settings, storageAccountName, orchestrationInstance, traceActivityId) { this.idleTimeout = idleTimeout; + this.shutdownToken = shutdownToken; this.ControlQueue = controlQueue ?? throw new ArgumentNullException(nameof(controlQueue)); this.CurrentMessageBatch = initialMessageBatch ?? throw new ArgumentNullException(nameof(initialMessageBatch)); this.RuntimeState = runtimeState ?? throw new ArgumentNullException(nameof(runtimeState)); - this.ETag = eTag; + this.ETags = eTags; this.LastCheckpointTime = lastCheckpointTime; this.TrackingStoreContext = trackingStoreContext; @@ -66,7 +70,7 @@ public OrchestrationSession( public OrchestrationRuntimeState RuntimeState { get; private set; } - public ETag? 
ETag { get; set; } + public OrchestrationETags ETags { get; set; } public DateTime LastCheckpointTime { get; } @@ -98,9 +102,9 @@ public void AddOrReplaceMessages(IEnumerable messages) public async Task> FetchNewOrchestrationMessagesAsync( TaskOrchestrationWorkItem workItem) { - if (!await this.messagesAvailableEvent.WaitAsync(this.idleTimeout)) + if (!await this.messagesAvailableEvent.WaitAsync(this.idleTimeout, this.shutdownToken)) { - return null; // timed-out + return null; // timed-out or shutting down } this.StartNewLogicalTraceScope(); @@ -160,18 +164,6 @@ public bool IsOutOfOrderMessage(MessageData message) return false; } - if (this.LastCheckpointTime > message.TaskMessage.Event.Timestamp) - { - // LastCheckpointTime represents the time at which the most recent history checkpoint completed. - // The checkpoint is written to the history table only *after* all queue messages are sent. - // A message is out of order when its timestamp *preceeds* the most recent checkpoint timestamp. - // In this case, we see that the checkpoint came *after* the message, so there is no out-of-order - // concern. Note that this logic only applies for messages sent by orchestrations to themselves. - // The next check considers the other cases (activities, sub-orchestrations, etc.). - // Orchestration checkpoint time information was added only after v1.6.4. - return false; - } - if (Utils.TryGetTaskScheduledId(message.TaskMessage.Event, out int taskScheduledId)) { // This message is a response to a task. 
Search the history to make sure that we've recorded the fact that @@ -189,7 +181,7 @@ public bool IsOutOfOrderMessage(MessageData message) var requestId = ((EventRaisedEvent)message.TaskMessage.Event).Name; if (requestId != null) { - HistoryEvent mostRecentTaskEvent = this.RuntimeState.Events.FirstOrDefault(e => e.EventType == EventType.EventSent && FindRequestId(((EventSentEvent)e).Input)?.ToString() == requestId); + HistoryEvent mostRecentTaskEvent = this.RuntimeState.Events.LastOrDefault(e => e.EventType == EventType.EventSent && FindRequestId(((EventSentEvent)e).Input)?.ToString() == requestId); if (mostRecentTaskEvent != null) { return false; diff --git a/src/DurableTask.AzureStorage/Messaging/TaskHubQueue.cs b/src/DurableTask.AzureStorage/Messaging/TaskHubQueue.cs index 371b00818..221789566 100644 --- a/src/DurableTask.AzureStorage/Messaging/TaskHubQueue.cs +++ b/src/DurableTask.AzureStorage/Messaging/TaskHubQueue.cs @@ -345,7 +345,6 @@ await this.storageQueue.UpdateMessageAsync( public virtual async Task DeleteMessageAsync(MessageData message, SessionBase? session = null) { - QueueMessage queueMessage = message.OriginalQueueMessage; TaskMessage taskMessage = message.TaskMessage; bool haveRetried = false; @@ -356,16 +355,16 @@ public virtual async Task DeleteMessageAsync(MessageData message, SessionBase? 
s this.settings.TaskHubName, taskMessage.Event.EventType.ToString(), Utils.GetTaskEventId(taskMessage.Event), - queueMessage.MessageId, + message.OriginalQueueMessage.MessageId, taskMessage.OrchestrationInstance.InstanceId, taskMessage.OrchestrationInstance.ExecutionId, this.storageQueue.Name, message.SequenceNumber, - queueMessage.PopReceipt); + message.OriginalQueueMessage.PopReceipt); try { - await this.storageQueue.DeleteMessageAsync(queueMessage, session?.TraceActivityId); + await this.storageQueue.DeleteMessageAsync(message.OriginalQueueMessage, session?.TraceActivityId); } catch (Exception e) { diff --git a/src/DurableTask.AzureStorage/OrchestrationETags.cs b/src/DurableTask.AzureStorage/OrchestrationETags.cs new file mode 100644 index 000000000..6c304b393 --- /dev/null +++ b/src/DurableTask.AzureStorage/OrchestrationETags.cs @@ -0,0 +1,24 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.AzureStorage +{ + using Azure; + + class OrchestrationETags + { + internal ETag? InstanceETag { get; set; } + + internal ETag? 
HistoryETag { get; set; } + } +} diff --git a/src/DurableTask.AzureStorage/OrchestrationSessionManager.cs b/src/DurableTask.AzureStorage/OrchestrationSessionManager.cs index 51a6e07ba..abf7a58b2 100644 --- a/src/DurableTask.AzureStorage/OrchestrationSessionManager.cs +++ b/src/DurableTask.AzureStorage/OrchestrationSessionManager.cs @@ -43,6 +43,8 @@ class OrchestrationSessionManager : IDisposable readonly ITrackingStore trackingStore; readonly DispatchQueue fetchRuntimeStateQueue; + CancellationToken shutdownToken; + public OrchestrationSessionManager( string queueAccountName, AzureStorageOrchestrationServiceSettings settings, @@ -61,6 +63,8 @@ public OrchestrationSessionManager( public void AddQueue(string partitionId, ControlQueue controlQueue, CancellationToken cancellationToken) { + this.shutdownToken = cancellationToken; + if (this.ownedControlQueues.TryAdd(partitionId, controlQueue)) { _ = Task.Run(() => this.DequeueLoop(partitionId, controlQueue, cancellationToken)); @@ -271,21 +275,25 @@ async Task> DedupeExecutionStartedMessagesAsync( // Terminology: // "Local" -> the instance ID info comes from the local copy of the message we're examining // "Remote" -> the instance ID info comes from the Instances table that we're querying - IAsyncEnumerable instances = this.trackingStore.GetStateAsync(instanceIds, cancellationToken); - IDictionary remoteOrchestrationsById = - await instances.ToDictionaryAsync(o => o.OrchestrationInstance.InstanceId, cancellationToken); + IAsyncEnumerable instances = this.trackingStore.FetchInstanceStatusAsync(instanceIds, cancellationToken); + IDictionary remoteOrchestrationsById = + await instances.ToDictionaryAsync(o => o.State.OrchestrationInstance.InstanceId, cancellationToken); foreach (MessageData message in executionStartedMessages) { OrchestrationInstance localInstance = message.TaskMessage.OrchestrationInstance; var expectedGeneration = ((ExecutionStartedEvent)message.TaskMessage.Event).Generation; - if 
(remoteOrchestrationsById.TryGetValue(localInstance.InstanceId, out OrchestrationState remoteInstance) && - (remoteInstance.OrchestrationInstance.ExecutionId == null || string.Equals(localInstance.ExecutionId, remoteInstance.OrchestrationInstance.ExecutionId, StringComparison.OrdinalIgnoreCase))) + if (remoteOrchestrationsById.TryGetValue(localInstance.InstanceId, out InstanceStatus remoteInstance) && + (remoteInstance.State.OrchestrationInstance.ExecutionId == null || string.Equals(localInstance.ExecutionId, remoteInstance.State.OrchestrationInstance.ExecutionId, StringComparison.OrdinalIgnoreCase))) { // Happy path: The message matches the table status. Alternatively, if the table doesn't have an ExecutionId field (older clients, pre-v1.8.5), // then we have no way of knowing if it's a duplicate. Either way, allow it to run. + if (this.settings.UseInstanceTableEtag) + { + message.MessageMetadata = remoteInstance.ETag; + } } - else if (expectedGeneration == remoteInstance?.Generation && this.IsScheduledAfterInstanceUpdate(message, remoteInstance)) + else if (expectedGeneration == remoteInstance?.State.Generation && this.IsScheduledAfterInstanceUpdate(message, remoteInstance?.State)) { // The message was scheduled after the Instances table was updated with the orchestration info. // We know almost certainly that this is a redundant message and can be safely discarded because @@ -476,6 +484,10 @@ internal void AddMessageToPendingOrchestration( if (targetBatch == null) { targetBatch = new PendingMessageBatch(controlQueue, instanceId, executionId); + if (this.settings.UseInstanceTableEtag && data.MessageMetadata is ETag instanceEtag) + { + targetBatch.ETags.InstanceETag = instanceEtag; + } node = this.pendingOrchestrationMessageBatches.AddLast(targetBatch); // Before the batch of messages can be processed, we need to download the latest execution state. 
@@ -519,9 +531,20 @@ async Task ScheduleOrchestrationStatePrefetch( cancellationToken); batch.OrchestrationState = new OrchestrationRuntimeState(history.Events); - batch.ETag = history.ETag; + batch.ETags.HistoryETag = history.ETag; batch.LastCheckpointTime = history.LastCheckpointTime; batch.TrackingStoreContext = history.TrackingStoreContext; + + // Try to get the instance ETag from the tracking store if it wasn't already provided + if (this.settings.UseInstanceTableEtag && batch.ETags.InstanceETag == null) + { + InstanceStatus? instanceStatus = await this.trackingStore.FetchInstanceStatusAsync( + batch.OrchestrationInstanceId, + cancellationToken); + // The instance could not exist in the case that these messages are for the first execution of a suborchestration, + // or an entity-started orchestration, for example + batch.ETags.InstanceETag = instanceStatus?.ETag; + } } if (this.settings.UseSeparateQueueForEntityWorkItems @@ -590,10 +613,11 @@ async Task ScheduleOrchestrationStatePrefetch( nextBatch.ControlQueue, nextBatch.Messages, nextBatch.OrchestrationState, - nextBatch.ETag, + nextBatch.ETags, nextBatch.LastCheckpointTime, nextBatch.TrackingStoreContext, this.settings.ExtendedSessionIdleTimeout, + this.shutdownToken, traceActivityId); this.activeOrchestrationSessions.Add(instance.InstanceId, session); @@ -637,6 +661,19 @@ async Task ScheduleOrchestrationStatePrefetch( return null; } + /// + /// Immediately removes all active sessions, causing + /// to return false for all partitions. This unblocks so that + /// a forced shutdown can complete without waiting for sessions to drain naturally. + /// + public void AbortAllSessions() + { + lock (this.messageAndSessionLock) + { + this.activeOrchestrationSessions.Clear(); + } + } + public bool TryGetExistingSession(string instanceId, out OrchestrationSession session) { lock (this.messageAndSessionLock) @@ -737,8 +774,10 @@ public OrchestrationRuntimeState? OrchestrationState } } - public ETag? 
ETag { get; set; } + public OrchestrationETags ETags { get; } = new OrchestrationETags(); + public DateTime LastCheckpointTime { get; set; } + public object? TrackingStoreContext { get; set; } } } diff --git a/src/DurableTask.AzureStorage/Storage/Table.cs b/src/DurableTask.AzureStorage/Storage/Table.cs index e47b49529..7d726e1d7 100644 --- a/src/DurableTask.AzureStorage/Storage/Table.cs +++ b/src/DurableTask.AzureStorage/Storage/Table.cs @@ -86,16 +86,18 @@ public async Task DeleteEntityAsync(T tableEntity, ETag ifMatch = default, Ca this.stats.TableEntitiesWritten.Increment(); } - public async Task InsertEntityAsync(T tableEntity, CancellationToken cancellationToken = default) where T : ITableEntity + public async Task InsertEntityAsync(T tableEntity, CancellationToken cancellationToken = default) where T : ITableEntity { - await this.tableClient.AddEntityAsync(tableEntity, cancellationToken).DecorateFailure(); + Response result = await this.tableClient.AddEntityAsync(tableEntity, cancellationToken).DecorateFailure(); this.stats.TableEntitiesWritten.Increment(); + return result; } - public async Task MergeEntityAsync(T tableEntity, ETag ifMatch, CancellationToken cancellationToken = default) where T : ITableEntity + public async Task MergeEntityAsync(T tableEntity, ETag ifMatch, CancellationToken cancellationToken = default) where T : ITableEntity { - await this.tableClient.UpdateEntityAsync(tableEntity, ifMatch, TableUpdateMode.Merge, cancellationToken).DecorateFailure(); + Response result = await this.tableClient.UpdateEntityAsync(tableEntity, ifMatch, TableUpdateMode.Merge, cancellationToken).DecorateFailure(); this.stats.TableEntitiesWritten.Increment(); + return result; } public async Task InsertOrMergeEntityAsync(T tableEntity, CancellationToken cancellationToken = default) where T : ITableEntity diff --git a/src/DurableTask.AzureStorage/Tracking/AzureTableTrackingStore.cs b/src/DurableTask.AzureStorage/Tracking/AzureTableTrackingStore.cs index 
17824a511..f1e67a0bf 100644 --- a/src/DurableTask.AzureStorage/Tracking/AzureTableTrackingStore.cs +++ b/src/DurableTask.AzureStorage/Tracking/AzureTableTrackingStore.cs @@ -510,19 +510,19 @@ async Task ConvertFromAsync(OrchestrationInstanceStatus orch } /// - public override async IAsyncEnumerable GetStateAsync(IEnumerable instanceIds, [EnumeratorCancellation] CancellationToken cancellationToken = default) + public override async IAsyncEnumerable FetchInstanceStatusAsync(IEnumerable instanceIds, [EnumeratorCancellation] CancellationToken cancellationToken = default) { if (instanceIds == null) { yield break; } - IEnumerable> instanceQueries = instanceIds.Select(instance => this.GetStateAsync(instance, allExecutions: true, fetchInput: false, cancellationToken).SingleOrDefaultAsync().AsTask()); - foreach (OrchestrationState state in await Task.WhenAll(instanceQueries)) + IEnumerable> instanceQueries = instanceIds.Select(instance => this.FetchInstanceStatusAsync(instance, cancellationToken)); + foreach (InstanceStatus status in await Task.WhenAll(instanceQueries)) { - if (state != null) + if (status != null) { - yield return state; + yield return status; } } } @@ -795,6 +795,39 @@ public override async Task UpdateStatusForRewindAsync(string instanceId, Cancell stopwatch.ElapsedMilliseconds); } + /// + public override async Task UpdateStatusForTerminationAsync( + string instanceId, + ExecutionTerminatedEvent executionTerminatedEvent, + CancellationToken cancellationToken = default) + { + string sanitizedInstanceId = KeySanitation.EscapePartitionKey(instanceId); + TableEntity instanceEntity = new TableEntity(sanitizedInstanceId, "") + { + ["RuntimeStatus"] = OrchestrationStatus.Terminated.ToString("G"), + ["LastUpdatedTime"] = executionTerminatedEvent.Timestamp, + ["CompletedTime"] = DateTime.UtcNow, + // In the case of terminating an orchestration, the termination reason becomes the orchestration's output. 
+ [OutputProperty] = executionTerminatedEvent.Input, + }; + + // Setting addBlobPropertyName to false ensures that the blob URL is saved as the "Output" of the instance entity, which is the expected behavior + // for large orchestration outputs. + await this.CompressLargeMessageAsync(instanceEntity, listOfBlobs: null, cancellationToken: cancellationToken, addBlobPropertyName: false); + + Stopwatch stopwatch = Stopwatch.StartNew(); + await this.InstancesTable.MergeEntityAsync(instanceEntity, ETag.All, cancellationToken); + + this.settings.Logger.InstanceStatusUpdate( + this.storageAccountName, + this.taskHubName, + instanceId, + string.Empty, + OrchestrationStatus.Terminated, + episode: 0, + stopwatch.ElapsedMilliseconds); + } + /// public override Task StartAsync(CancellationToken cancellationToken = default) @@ -806,12 +839,12 @@ public override Task StartAsync(CancellationToken cancellationToken = default) } /// - public override async Task UpdateStateAsync( + public override async Task UpdateStateAsync( OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, - ETag? eTagValue, + OrchestrationETags eTags, object trackingStoreContext, CancellationToken cancellationToken = default) { @@ -835,6 +868,7 @@ public override Task StartAsync(CancellationToken cancellationToken = default) ["CustomStatus"] = newRuntimeState.Status ?? "null", ["ExecutionId"] = executionId, ["LastUpdatedTime"] = newEvents.Last().Timestamp, + ["TaskHubName"] = this.settings.TaskHubName, }; // check if we are replacing a previous execution with blobs; those will be deleted from the store after the update. 
This could occur in a ContinueAsNew scenario @@ -881,6 +915,8 @@ public override Task StartAsync(CancellationToken cancellationToken = default) instanceEntity["Version"] = executionStartedEvent.Version; instanceEntity["CreatedTime"] = executionStartedEvent.Timestamp; instanceEntity["RuntimeStatus"] = OrchestrationStatus.Running.ToString(); + instanceEntity["Tags"] = TagsSerializer.Serialize(executionStartedEvent.Tags); + instanceEntity["Generation"] = executionStartedEvent.Generation; if (executionStartedEvent.ScheduledStartTime.HasValue) { instanceEntity["ScheduledStartTime"] = executionStartedEvent.ScheduledStartTime; @@ -955,7 +991,7 @@ public override Task StartAsync(CancellationToken cancellationToken = default) // Table storage only supports inserts of up to 100 entities at a time or 4 MB at a time. if (historyEventBatch.Count == 99 || estimatedBytes > 3 * 1024 * 1024 /* 3 MB */) { - eTagValue = await this.UploadHistoryBatch( + eTags.HistoryETag = await this.UploadHistoryBatch( instanceId, sanitizedInstanceId, executionId, @@ -964,7 +1000,7 @@ public override Task StartAsync(CancellationToken cancellationToken = default) allEvents.Count, episodeNumber, estimatedBytes, - eTagValue, + eTags.HistoryETag, isFinalBatch: isFinalEvent, cancellationToken: cancellationToken); @@ -978,7 +1014,7 @@ public override Task StartAsync(CancellationToken cancellationToken = default) // First persistence step is to commit history to the history table. Messages must come after. 
if (historyEventBatch.Count > 0) { - eTagValue = await this.UploadHistoryBatch( + eTags.HistoryETag = await this.UploadHistoryBatch( instanceId, sanitizedInstanceId, executionId, @@ -987,22 +1023,12 @@ public override Task StartAsync(CancellationToken cancellationToken = default) allEvents.Count, episodeNumber, estimatedBytes, - eTagValue, + eTags.HistoryETag, isFinalBatch: true, cancellationToken: cancellationToken); } - Stopwatch orchestrationInstanceUpdateStopwatch = Stopwatch.StartNew(); - await this.InstancesTable.InsertOrMergeEntityAsync(instanceEntity); - - this.settings.Logger.InstanceStatusUpdate( - this.storageAccountName, - this.taskHubName, - instanceId, - executionId, - runtimeStatus, - episodeNumber, - orchestrationInstanceUpdateStopwatch.ElapsedMilliseconds); + eTags.InstanceETag = await this.UpdateInstanceTableAsync(instanceEntity, eTags.InstanceETag, instanceId, executionId, runtimeStatus, episodeNumber); // finally, delete orphaned blobs from the previous execution history. // We had to wait until the new history has committed to make sure the blobs are no longer necessary. 
@@ -1015,8 +1041,119 @@ public override Task StartAsync(CancellationToken cancellationToken = default) } await Task.WhenAll(tasks); } + } + + public override async Task UpdateInstanceStatusForCompletedOrchestrationAsync( + string instanceId, + string executionId, + OrchestrationRuntimeState runtimeState, + bool instanceEntityExists, + CancellationToken cancellationToken = default) + { + if (runtimeState.OrchestrationStatus != OrchestrationStatus.Completed && + runtimeState.OrchestrationStatus != OrchestrationStatus.Canceled && + runtimeState.OrchestrationStatus != OrchestrationStatus.Failed && + runtimeState.OrchestrationStatus != OrchestrationStatus.Terminated) + { + return; + } + + string sanitizedInstanceId = KeySanitation.EscapePartitionKey(instanceId); + ExecutionStartedEvent executionStartedEvent = runtimeState.ExecutionStartedEvent; + + // We need to set all of the fields of the instance entity in the case that it was never created for the orchestration. + // This can be the case for a suborchestration that completed in one execution, for example. + var instanceEntity = new TableEntity(sanitizedInstanceId, string.Empty) + { + ["Name"] = runtimeState.Name, + ["Version"] = runtimeState.Version, + ["CreatedTime"] = executionStartedEvent.Timestamp, + // TODO: Translating null to "null" is a temporary workaround. We should prioritize + // https://github.com/Azure/durabletask/issues/477 so that this is no longer necessary. + ["CustomStatus"] = runtimeState.Status ?? 
"null", + ["ExecutionId"] = executionId, + ["LastUpdatedTime"] = runtimeState.Events.Last().Timestamp, + ["RuntimeStatus"] = runtimeState.OrchestrationStatus.ToString(), + ["CompletedTime"] = runtimeState.CompletedTime, + ["Tags"] = TagsSerializer.Serialize(executionStartedEvent.Tags), + ["TaskHubName"] = this.settings.TaskHubName, + }; + if (runtimeState.ExecutionStartedEvent.ScheduledStartTime.HasValue) + { + instanceEntity["ScheduledStartTime"] = executionStartedEvent.ScheduledStartTime; + } + + static TableEntity GetSingleEntityFromHistoryTableResults(IReadOnlyList entities, string dataType) + { + try + { + TableEntity singleEntity = entities.SingleOrDefault(); - return eTagValue; + return singleEntity ?? throw new DurableTaskStorageException($"The history table query to determine the blob storage URL " + + $"for the large orchestration {dataType} returned no rows. Unable to extract the URL from these results."); + } + catch (InvalidOperationException) + { + throw new DurableTaskStorageException($"The history table query to determine the blob storage URL for the large orchestration " + + $"{dataType} returned more than one row, when exactly one row is expected. " + + $"Unable to extract the URL from these results."); + } + } + + // Set the output. + // In the case that the output is too large and is stored in blob storage, extract the blob name from the ExecutionCompleted history entity. 
+ if (this.ExceedsMaxTablePropertySize(runtimeState.Output)) + { + string filter = $"{nameof(ITableEntity.PartitionKey)} eq '{KeySanitation.EscapePartitionKey(instanceId)}'" + + $" and {nameof(OrchestrationInstance.ExecutionId)} eq '{executionId}'" + + $" and {nameof(HistoryEvent.EventType)} eq '{nameof(EventType.ExecutionCompleted)}'"; + TableEntity executionCompletedEntity = GetSingleEntityFromHistoryTableResults(await this.QueryHistoryAsync(filter, instanceId, cancellationToken), "output"); + this.SetInstancesTablePropertyFromHistoryProperty( + executionCompletedEntity, + instanceEntity, + historyPropertyName: nameof(runtimeState.ExecutionCompletedEvent.Result), + instancePropertyName: OutputProperty, + data: runtimeState.Output); + } + else + { + instanceEntity[OutputProperty] = runtimeState.Output; + } + + // If the input has not been set by a previous execution, set the input. + if (!instanceEntityExists) + { + // In the case that the input is too large and is stored in blob storage, extract the blob name from the ExecutionStarted history entity. 
+ if (this.ExceedsMaxTablePropertySize(runtimeState.Input)) + { + string filter = $"{nameof(ITableEntity.PartitionKey)} eq '{KeySanitation.EscapePartitionKey(instanceId)}'" + + $" and {nameof(OrchestrationInstance.ExecutionId)} eq '{executionId}'" + + $" and {nameof(HistoryEvent.EventType)} eq '{nameof(EventType.ExecutionStarted)}'"; + TableEntity executionStartedEntity = GetSingleEntityFromHistoryTableResults(await this.QueryHistoryAsync(filter, instanceId, cancellationToken), "input"); + this.SetInstancesTablePropertyFromHistoryProperty( + executionStartedEntity, + instanceEntity, + historyPropertyName: nameof(executionStartedEvent.Input), + instancePropertyName: InputProperty, + data: executionStartedEvent.Input); + } + else + { + instanceEntity[InputProperty] = runtimeState.Input; + } + } + + Stopwatch orchestrationInstanceUpdateStopwatch = Stopwatch.StartNew(); + await this.InstancesTable.InsertOrMergeEntityAsync(instanceEntity); + + this.settings.Logger.InstanceStatusUpdate( + this.storageAccountName, + this.taskHubName, + instanceId, + executionId, + runtimeState.OrchestrationStatus, + Utils.GetEpisodeNumber(runtimeState), + orchestrationInstanceUpdateStopwatch.ElapsedMilliseconds); } static int GetEstimatedByteCount(TableEntity entity) @@ -1081,7 +1218,7 @@ void SetInstancesTablePropertyFromHistoryProperty( } } - async Task CompressLargeMessageAsync(TableEntity entity, List listOfBlobs, CancellationToken cancellationToken) + async Task CompressLargeMessageAsync(TableEntity entity, List listOfBlobs, CancellationToken cancellationToken, bool addBlobPropertyName = true) { foreach (string propertyName in VariableSizeEntityProperties) { @@ -1096,9 +1233,16 @@ property is string stringProperty && // Clear out the original property value and create a new "*BlobName"-suffixed property. // The runtime will look for the new "*BlobName"-suffixed column to know if a property is stored in a blob. 
- string blobPropertyName = GetBlobPropertyName(propertyName); - entity.Add(blobPropertyName, blobName); - entity[propertyName] = string.Empty; + if (addBlobPropertyName) + { + string blobPropertyName = GetBlobPropertyName(propertyName); + entity.Add(blobPropertyName, blobName); + entity[propertyName] = string.Empty; + } + else + { + entity[propertyName] = this.messageManager.GetBlobUrl(blobName); + } // if necessary, keep track of all the blobs associated with this execution listOfBlobs?.Add(blobName); @@ -1146,6 +1290,12 @@ static string GetBlobName(TableEntity entity, string property) // EventType. Use a hardcoded value to record the orchestration input. eventType = "Input"; } + else if (property == "Output") + { + // This message is used to terminate an orchestration with no history, so it does not have a + // corresponding EventType. Use a hardcoded value to record the orchestration output. + eventType = "Output"; + } else if (property == "Tags") { eventType = "Tags"; @@ -1203,7 +1353,7 @@ static string GetBlobName(TableEntity entity, string property) } catch (DurableTaskStorageException ex) { - // Handle the case where the the history has already been updated by another caller. + // Handle the case where the history has already been updated by another caller. // Common case: the resulting code is 'PreconditionFailed', which means "eTagValue" no longer matches the one stored, and TableTransactionActionType is "Update". // Edge case: the resulting code is 'Conflict'. This is the case when eTagValue is null, and the TableTransactionActionType is "Add", // in which case the exception indicates that the table entity we are trying to "add" already exists. @@ -1262,6 +1412,62 @@ bool ExceedsMaxTablePropertySize(string data) return false; } + async Task UpdateInstanceTableAsync(TableEntity instanceEntity, ETag? 
eTag, string instanceId, string executionId, OrchestrationStatus runtimeStatus, int episodeNumber) + { + var orchestrationInstanceUpdateStopwatch = Stopwatch.StartNew(); + + ETag? newEtag = null; + + if (!this.settings.UseInstanceTableEtag) + { + await this.InstancesTable.InsertOrMergeEntityAsync(instanceEntity); + } + else + { + try + { + Response result = await (eTag == null + ? this.InstancesTable.InsertEntityAsync(instanceEntity) + : this.InstancesTable.MergeEntityAsync(instanceEntity, eTag.Value)); + newEtag = result.Headers.ETag; + } + catch (DurableTaskStorageException ex) + { + // Handle the case where the instance table has already been updated by another caller. + // Common case: the resulting code is 'PreconditionFailed', which means we are trying to update an existing instance entity and "eTag" no longer matches the one stored. + // Edge case: the resulting code is 'Conflict'. This is the case when eTag is null, and we are trying to insert a new instance entity, in which case the exception + // indicates that the table entity we are trying to "add" already exists. + if (ex.HttpStatusCode == (int)HttpStatusCode.Conflict || ex.HttpStatusCode == (int)HttpStatusCode.PreconditionFailed) + { + this.settings.Logger.SplitBrainDetected( + this.storageAccountName, + this.taskHubName, + instanceId, + executionId, + newEventCount: 0, + totalEventCount: 1, + "InstanceEntity", + orchestrationInstanceUpdateStopwatch.ElapsedMilliseconds, + eTag is null ? 
string.Empty : eTag.ToString()); + } + + throw; + } + } + + this.settings.Logger.InstanceStatusUpdate( + this.storageAccountName, + this.taskHubName, + instanceId, + executionId, + runtimeStatus, + episodeNumber, + orchestrationInstanceUpdateStopwatch.ElapsedMilliseconds); + + return newEtag; + + } + class TrackingStoreContext { public List Blobs { get; set; } = new List(); diff --git a/src/DurableTask.AzureStorage/Tracking/ITrackingStore.cs b/src/DurableTask.AzureStorage/Tracking/ITrackingStore.cs index db6338bfb..a1fc52e9f 100644 --- a/src/DurableTask.AzureStorage/Tracking/ITrackingStore.cs +++ b/src/DurableTask.AzureStorage/Tracking/ITrackingStore.cs @@ -72,10 +72,10 @@ interface ITrackingStore /// The RuntimeState for an olderExecution /// InstanceId for the Orchestration Update /// ExecutionId for the Orchestration Update - /// The ETag value to use for safe updates + /// The ETag value for the instance and history tables to use for safe updates /// Additional context for the execution that is maintained by the tracking store. /// The token to monitor for cancellation requests. The default value is . - Task UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, ETag? eTag, object trackingStoreContext, CancellationToken cancellationToken = default); + Task UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, OrchestrationETags eTags, object trackingStoreContext, CancellationToken cancellationToken = default); /// /// Get The Orchestration State for the Latest or All Executions @@ -103,6 +103,18 @@ interface ITrackingStore /// Returns the instance status or null if none was found. 
Task FetchInstanceStatusAsync(string instanceId, CancellationToken cancellationToken = default); + /// + /// Updates the instance status of the specified orchestration instance to match that of for a completed orchestration. + /// This method is meant to be called in the case that there is an inconsistency between the instance and history table due to a failure during a call to + /// for a completing orchestration. If the orchestration is not in a terminal state, the method will immediately return and do nothing. + /// + /// The ID of the orchestration. + /// The execution ID of the orchestration. + /// The runtime state of the orchestration. + /// Whether the instance entity already exists in the instance store. + /// The token to monitor for cancellation requests. The default value is . + Task UpdateInstanceStatusForCompletedOrchestrationAsync(string instanceId, string executionId, OrchestrationRuntimeState runtimeState, bool instanceEntityExists, CancellationToken cancellationToken = default); + /// /// Get The Orchestration State for querying all orchestration instances /// @@ -115,7 +127,7 @@ interface ITrackingStore /// /// The list of instances to query for. /// The token to monitor for cancellation requests. The default value is . - IAsyncEnumerable GetStateAsync(IEnumerable instanceIds, CancellationToken cancellationToken = default); + IAsyncEnumerable FetchInstanceStatusAsync(IEnumerable instanceIds, CancellationToken cancellationToken = default); /// /// Get The Orchestration State for querying orchestration instances by the condition @@ -152,6 +164,14 @@ interface ITrackingStore /// The token to monitor for cancellation requests. The default value is . Task UpdateStatusForRewindAsync(string instanceId, CancellationToken cancellationToken = default); + /// + /// Used to update the instance status to "Terminated" when a pending orchestration is terminated. + /// + /// The instance being terminated + /// The termination history event. 
+ /// The token to monitor for cancellation requests. The default value is . + Task UpdateStatusForTerminationAsync(string instanceId, ExecutionTerminatedEvent executionTerminatedEvent, CancellationToken cancellationToken = default); + /// /// Purge The History and state which is older than thresholdDateTimeUtc based on the timestamp type specified by timeRangeFilterType /// diff --git a/src/DurableTask.AzureStorage/Tracking/InstanceStoreBackedTrackingStore.cs b/src/DurableTask.AzureStorage/Tracking/InstanceStoreBackedTrackingStore.cs index 4b816aacd..f719c92dd 100644 --- a/src/DurableTask.AzureStorage/Tracking/InstanceStoreBackedTrackingStore.cs +++ b/src/DurableTask.AzureStorage/Tracking/InstanceStoreBackedTrackingStore.cs @@ -136,20 +136,20 @@ public override Task StartAsync(CancellationToken cancellationToken = default) } /// - public override async Task UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, ETag? eTag, object executionData, CancellationToken cancellationToken = default) + public override async Task UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, OrchestrationETags eTags, object executionData, CancellationToken cancellationToken = default) { //In case there is a runtime state for an older execution/iteration as well that needs to be committed, commit it. 
//This may be the case if a ContinueAsNew was executed on the orchestration if (newRuntimeState != oldRuntimeState) { - eTag = await UpdateStateAsync(oldRuntimeState, instanceId, oldRuntimeState.OrchestrationInstance.ExecutionId, eTag, cancellationToken); + await UpdateStateAsync(oldRuntimeState, instanceId, oldRuntimeState.OrchestrationInstance.ExecutionId, eTags, cancellationToken); } - return await UpdateStateAsync(newRuntimeState, instanceId, executionId, eTag, cancellationToken); + await UpdateStateAsync(newRuntimeState, instanceId, executionId, eTags, cancellationToken); } /// - private async Task UpdateStateAsync(OrchestrationRuntimeState runtimeState, string instanceId, string executionId, ETag? eTag, CancellationToken cancellationToken = default) + private async Task UpdateStateAsync(OrchestrationRuntimeState runtimeState, string instanceId, string executionId, OrchestrationETags eTags, CancellationToken cancellationToken = default) { int oldEventsCount = (runtimeState.Events.Count - runtimeState.NewEvents.Count); await instanceStore.WriteEntitiesAsync(runtimeState.NewEvents.Select((x, i) => @@ -173,8 +173,45 @@ await instanceStore.WriteEntitiesAsync(new InstanceEntityBase[] SequenceNumber = runtimeState.Events.Count } }); + } + + public override async Task UpdateStatusForTerminationAsync( + string instanceId, + ExecutionTerminatedEvent executionTerminatedEvent, + CancellationToken cancellationToken = default) + { + // Get the most recent execution and update its status to terminated + IEnumerable instanceEntity = await this.instanceStore.GetOrchestrationStateAsync(instanceId, allInstances: false); + instanceEntity.Single().State.OrchestrationStatus = OrchestrationStatus.Terminated; + instanceEntity.Single().State.LastUpdatedTime = executionTerminatedEvent.Timestamp; + instanceEntity.Single().State.CompletedTime = DateTime.UtcNow; + instanceEntity.Single().State.Output = executionTerminatedEvent.Input; + await 
this.instanceStore.WriteEntitiesAsync(instanceEntity); + } + + public override async Task UpdateInstanceStatusForCompletedOrchestrationAsync( + string instanceId, + string executionId, + OrchestrationRuntimeState runtimeState, + bool instanceEntityExists, + CancellationToken cancellationToken = default) + { + if (runtimeState.OrchestrationStatus != OrchestrationStatus.Completed && + runtimeState.OrchestrationStatus != OrchestrationStatus.Canceled && + runtimeState.OrchestrationStatus != OrchestrationStatus.Failed && + runtimeState.OrchestrationStatus != OrchestrationStatus.Terminated) + { + return; + } - return null; + await instanceStore.WriteEntitiesAsync(new InstanceEntityBase[] + { + new OrchestrationStateInstanceEntity() + { + State = Core.Common.Utils.BuildOrchestrationState(runtimeState), + SequenceNumber = runtimeState.Events.Count + } + }); } } } diff --git a/src/DurableTask.AzureStorage/Tracking/TrackingStoreBase.cs b/src/DurableTask.AzureStorage/Tracking/TrackingStoreBase.cs index 95465461f..d02a729c0 100644 --- a/src/DurableTask.AzureStorage/Tracking/TrackingStoreBase.cs +++ b/src/DurableTask.AzureStorage/Tracking/TrackingStoreBase.cs @@ -60,7 +60,7 @@ public virtual IAsyncEnumerable GetStateAsync(CancellationTo } /// - public virtual IAsyncEnumerable GetStateAsync(IEnumerable instanceIds, CancellationToken cancellationToken = default) + public virtual IAsyncEnumerable FetchInstanceStatusAsync(IEnumerable instanceIds, CancellationToken cancellationToken = default) { throw new NotSupportedException(); } @@ -100,10 +100,16 @@ public virtual Task UpdateStatusForRewindAsync(string instanceId, CancellationTo throw new NotSupportedException(); } + /// + public abstract Task UpdateStatusForTerminationAsync(string instanceId, ExecutionTerminatedEvent executionTerminatedEvent, CancellationToken cancellationToken = default); + /// public abstract Task StartAsync(CancellationToken cancellationToken = default); /// - public abstract Task 
UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, ETag? eTag, object executionData, CancellationToken cancellationToken = default); + public abstract Task UpdateStateAsync(OrchestrationRuntimeState newRuntimeState, OrchestrationRuntimeState oldRuntimeState, string instanceId, string executionId, OrchestrationETags eTags, object executionData, CancellationToken cancellationToken = default); + + /// + public abstract Task UpdateInstanceStatusForCompletedOrchestrationAsync(string instanceId, string executionId, OrchestrationRuntimeState runtimeState, bool instanceEntityExists, CancellationToken cancellationToken = default); } } diff --git a/src/DurableTask.Core/Command/OrchestrationCompleteOrchestratorAction.cs b/src/DurableTask.Core/Command/OrchestrationCompleteOrchestratorAction.cs index 56551b13e..54abd5225 100644 --- a/src/DurableTask.Core/Command/OrchestrationCompleteOrchestratorAction.cs +++ b/src/DurableTask.Core/Command/OrchestrationCompleteOrchestratorAction.cs @@ -56,5 +56,10 @@ public class OrchestrationCompleteOrchestratorAction : OrchestratorAction /// Gets a list of events that should be carried over when continuing an orchestration as new. /// public IList CarryoverEvents { get; } = new List(); + + /// + /// Gets a collection of tags associated with the completion action. + /// + public IDictionary Tags { get; } = new Dictionary(); } } \ No newline at end of file diff --git a/src/DurableTask.Core/Command/OrchestratorActionType.cs b/src/DurableTask.Core/Command/OrchestratorActionType.cs index 7e9cbdc69..34d256aaa 100644 --- a/src/DurableTask.Core/Command/OrchestratorActionType.cs +++ b/src/DurableTask.Core/Command/OrchestratorActionType.cs @@ -42,5 +42,10 @@ public enum OrchestratorActionType /// The orchestrator completed. /// OrchestrationComplete, + + /// + /// The orchestration was rewound. 
+ /// + RewindOrchestration, } } \ No newline at end of file diff --git a/src/DurableTask.Core/Command/RewindOrchestrationAction.cs b/src/DurableTask.Core/Command/RewindOrchestrationAction.cs new file mode 100644 index 000000000..6913726c7 --- /dev/null +++ b/src/DurableTask.Core/Command/RewindOrchestrationAction.cs @@ -0,0 +1,25 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.Core.Command +{ + + /// + /// Orchestrator action for rewinding orchestrations. 
+ /// + public class RewindOrchestrationAction : OrchestratorAction + { + /// + public override OrchestratorActionType OrchestratorActionType => OrchestratorActionType.RewindOrchestration; + } +} \ No newline at end of file diff --git a/src/DurableTask.Core/DurableTask.Core.csproj b/src/DurableTask.Core/DurableTask.Core.csproj index 802fe8097..73af3f4d0 100644 --- a/src/DurableTask.Core/DurableTask.Core.csproj +++ b/src/DurableTask.Core/DurableTask.Core.csproj @@ -17,8 +17,8 @@ 3 - 4 - 0 + 7 + 1 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 diff --git a/src/DurableTask.Core/Entities/OrchestrationEntityContext.cs b/src/DurableTask.Core/Entities/OrchestrationEntityContext.cs index d50015bf1..323cba441 100644 --- a/src/DurableTask.Core/Entities/OrchestrationEntityContext.cs +++ b/src/DurableTask.Core/Entities/OrchestrationEntityContext.cs @@ -187,7 +187,7 @@ public bool ValidateAcquireTransition(out string? errorMessage) /// public void RecoverLockAfterCall(string targetInstanceId) { - if (this.IsInsideCriticalSection) + if (this.IsInsideCriticalSection && !this.lockAcquisitionPending) { var lockToUse = EntityId.FromString(targetInstanceId); this.availableLocks!.Add(lockToUse); diff --git a/src/DurableTask.Core/ExceptionPropertiesProviderExtensions.cs b/src/DurableTask.Core/ExceptionPropertiesProviderExtensions.cs new file mode 100644 index 000000000..08b2d9692 --- /dev/null +++ b/src/DurableTask.Core/ExceptionPropertiesProviderExtensions.cs @@ -0,0 +1,46 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.Core +{ + using System; + using System.Collections.Generic; + using DurableTask.Core.Exceptions; + + /// + /// Extension methods for . + /// + public static class ExceptionPropertiesProviderExtensions + { + /// + /// Extracts properties of the exception specified at provider. + /// + public static IDictionary? ExtractProperties(this IExceptionPropertiesProvider? provider, Exception exception) + { + if (exception is OrchestrationException orchestrationException && + orchestrationException.FailureDetails?.Properties != null) + { + return orchestrationException.FailureDetails.Properties; + } + + if (provider == null) + { + return null; + } + + return provider.GetExceptionProperties(exception); + } + } +} + + diff --git a/src/DurableTask.Core/FailureDetails.cs b/src/DurableTask.Core/FailureDetails.cs index 5dfd02ab9..c4ad18895 100644 --- a/src/DurableTask.Core/FailureDetails.cs +++ b/src/DurableTask.Core/FailureDetails.cs @@ -38,14 +38,29 @@ public class FailureDetails : IEquatable /// The exception stack trace. /// The inner cause of the failure. /// Whether the failure is non-retriable. + /// Additional properties associated with the failure. [JsonConstructor] - public FailureDetails(string errorType, string errorMessage, string? stackTrace, FailureDetails? innerFailure, bool isNonRetriable) + public FailureDetails(string errorType, string errorMessage, string? stackTrace, FailureDetails? 
innerFailure, bool isNonRetriable, IDictionary? properties = null) { this.ErrorType = errorType; this.ErrorMessage = errorMessage; this.StackTrace = stackTrace; this.InnerFailure = innerFailure; this.IsNonRetriable = isNonRetriable; + this.Properties = properties; + } + + /// + /// Initializes a new instance of the class. + /// + /// The name of the error, which is expected to be the namespace-qualified name of the exception type. + /// The message associated with the error, which is expected to be the exception's property. + /// The exception stack trace. + /// The inner cause of the failure. + /// Whether the failure is non-retriable. + public FailureDetails(string errorType, string errorMessage, string? stackTrace, FailureDetails? innerFailure, bool isNonRetriable) + : this(errorType, errorMessage, stackTrace, innerFailure, isNonRetriable, properties:null) + { } /// @@ -54,7 +69,7 @@ public FailureDetails(string errorType, string errorMessage, string? stackTrace, /// The exception used to generate the failure details. /// The inner cause of the failure. public FailureDetails(Exception e, FailureDetails innerFailure) - : this(e.GetType().FullName, GetErrorMessage(e), e.StackTrace, innerFailure, false) + : this(e, innerFailure, properties: null) { } @@ -63,7 +78,28 @@ public FailureDetails(Exception e, FailureDetails innerFailure) /// /// The exception used to generate the failure details. public FailureDetails(Exception e) - : this(e.GetType().FullName, GetErrorMessage(e), e.StackTrace, FromException(e.InnerException), false) + : this(e, properties: null) + { + } + + /// + /// Initializes a new instance of the class from an exception object. + /// + /// The exception used to generate the failure details. + /// The exception properties to include in failure details. + public FailureDetails(Exception e, IDictionary? 
properties) + : this(e.GetType().FullName, GetErrorMessage(e), e.StackTrace, FromException(e.InnerException), false, properties) + { + } + + /// + /// Initializes a new instance of the class from an exception object. + /// + /// The exception used to generate the failure details. + /// The inner cause of the failure. + /// The exception properties to include in failure details. + public FailureDetails(Exception e, FailureDetails innerFailure, IDictionary? properties) + : this(e.GetType().FullName, GetErrorMessage(e), e.StackTrace, innerFailure, false, properties) { } @@ -74,6 +110,7 @@ public FailureDetails() { this.ErrorType = "None"; this.ErrorMessage = string.Empty; + this.Properties = null; } /// @@ -85,6 +122,16 @@ protected FailureDetails(SerializationInfo info, StreamingContext context) this.ErrorMessage = info.GetString(nameof(this.ErrorMessage)); this.StackTrace = info.GetString(nameof(this.StackTrace)); this.InnerFailure = (FailureDetails)info.GetValue(nameof(this.InnerFailure), typeof(FailureDetails)); + // Handle backward compatibility for Properties property - defaults to null + try + { + this.Properties = (IDictionary?)info.GetValue(nameof(this.Properties), typeof(IDictionary)); + } + catch (SerializationException) + { + // Default to null for backward compatibility + this.Properties = null; + } } /// @@ -112,6 +159,11 @@ protected FailureDetails(SerializationInfo info, StreamingContext context) /// public bool IsNonRetriable { get; } + /// + /// Gets additional properties associated with the failure. + /// + public IDictionary? Properties { get; } + /// /// Gets a debug-friendly description of the failure information. /// @@ -145,18 +197,13 @@ public bool IsCausedBy() where T : Exception { // This last check works for exception types defined in any loaded assembly (e.g. NuGet packages, etc.). // This is a fallback that should rarely be needed except in obscure cases. 
- List matchingExceptionTypes = AppDomain.CurrentDomain.GetAssemblies() + var matchingExceptionTypes = AppDomain.CurrentDomain.GetAssemblies() .Select(a => a.GetType(this.ErrorType, throwOnError: false)) - .Where(t => t is not null) - .ToList(); - if (matchingExceptionTypes.Count == 1) - { - exceptionType = matchingExceptionTypes[0]; - } - else if (matchingExceptionTypes.Count > 1) - { - throw new AmbiguousMatchException($"Multiple exception types with the name '{this.ErrorType}' were found."); - } + .Where(t => t is not null); + + // Previously, this logic would only return true if matchingExceptionTypes found only one assembly with a type matching ErrorType. + // Now, it will return true if any matching assembly has a type that is assignable to T. + return matchingExceptionTypes.Any(matchType => typeof(T).IsAssignableFrom(matchType)); } return exceptionType != null && typeof(T).IsAssignableFrom(exceptionType); @@ -204,7 +251,13 @@ static string GetErrorMessage(Exception e) static FailureDetails? FromException(Exception? e) { - return e == null ? null : new FailureDetails(e); + return FromException(e, properties : null); } + + static FailureDetails? FromException(Exception? e, IDictionary? properties) + { + return e == null ? null : new FailureDetails(e, properties : properties); + } + } } diff --git a/src/DurableTask.Core/History/EventType.cs b/src/DurableTask.Core/History/EventType.cs index f9412d0f2..52c44ad11 100644 --- a/src/DurableTask.Core/History/EventType.cs +++ b/src/DurableTask.Core/History/EventType.cs @@ -125,5 +125,10 @@ public enum EventType /// Orchestration was resumed event /// ExecutionResumed, + + /// + /// Orchestration was rewound event. 
+ /// + ExecutionRewound, } } \ No newline at end of file diff --git a/src/DurableTask.Core/History/ExecutionRewoundEvent.cs b/src/DurableTask.Core/History/ExecutionRewoundEvent.cs new file mode 100644 index 000000000..c838e9eb2 --- /dev/null +++ b/src/DurableTask.Core/History/ExecutionRewoundEvent.cs @@ -0,0 +1,71 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.Core.History +{ + using DurableTask.Core.Tracing; + using System.Runtime.Serialization; + + /// + /// Generic History event + /// + [DataContract] + public class ExecutionRewoundEvent : HistoryEvent, ISupportsDurableTraceContext + { + /// + /// Creates a new ExecutionRewoundEvent with the supplied event id and empty reason. + /// + /// The integer event id + public ExecutionRewoundEvent(int eventId) : base(eventId) { } + + /// + /// Creates a new ExecutionRewoundEvent with the supplied event id and reason. + /// + /// The integer event id + /// The reason for the rewind event + public ExecutionRewoundEvent(int eventId, string? reason) + : base(eventId) + { + this.Reason = reason; + } + + /// + /// Gets the event type + /// + public override EventType EventType => EventType.ExecutionRewound; + + /// + /// Gets or sets the reason for the rewind event. 
+ /// + [DataMember] + public string? Reason { get; set; } + + /// + /// Gets or sets the parent execution id of the rewound suborchestration. + /// + [DataMember] + public string? ParentExecutionId { get; set; } + + /// + /// Gets or sets the instance ID of the rewound orchestration. + /// + [DataMember] + public string? InstanceId { get; set; } + + /// + /// Gets or sets the parent trace context of the rewound suborchestration. + /// + [DataMember] + public DistributedTraceContext? ParentTraceContext { get; set; } + } +} \ No newline at end of file diff --git a/src/DurableTask.Core/History/ExecutionStartedEvent.cs b/src/DurableTask.Core/History/ExecutionStartedEvent.cs index 6ae593359..59c6b8202 100644 --- a/src/DurableTask.Core/History/ExecutionStartedEvent.cs +++ b/src/DurableTask.Core/History/ExecutionStartedEvent.cs @@ -48,6 +48,31 @@ internal ExecutionStartedEvent() { } + /// + /// Creates a new ExecutionStartedEvent with the same fields as . + /// A deep copy is performed on all non-base class fields. + /// + internal ExecutionStartedEvent(ExecutionStartedEvent other) + { + // Copy base class fields + EventId = other.EventId; + Timestamp = other.Timestamp; + ExtensionData = other.ExtensionData; + IsPlayed = other.IsPlayed; + + // Deep copy all other fields + OrchestrationInstance = other.OrchestrationInstance?.Clone(); + ParentInstance = other.ParentInstance?.Clone(); + ParentTraceContext = other.ParentTraceContext?.Clone(); + Input = other.Input; + Name = other.Name; + Version = other.Version; + Tags = other.Tags != null ? 
new Dictionary(other.Tags) : null; + Correlation = other.Correlation; + ScheduledStartTime = other.ScheduledStartTime; + Generation = other.Generation; + } + /// /// Gets the event type /// diff --git a/src/DurableTask.Core/History/SubOrchestrationInstanceCreatedEvent.cs b/src/DurableTask.Core/History/SubOrchestrationInstanceCreatedEvent.cs index e611eb99e..1931d2d8d 100644 --- a/src/DurableTask.Core/History/SubOrchestrationInstanceCreatedEvent.cs +++ b/src/DurableTask.Core/History/SubOrchestrationInstanceCreatedEvent.cs @@ -13,6 +13,7 @@ namespace DurableTask.Core.History { + using System.Collections.Generic; using System.Runtime.Serialization; /// @@ -30,6 +31,26 @@ public SubOrchestrationInstanceCreatedEvent(int eventId) { } + /// + /// Creates a new SubOrchestrationInstanceCreatedEvent with the same fields as the supplied event. + /// + internal SubOrchestrationInstanceCreatedEvent(SubOrchestrationInstanceCreatedEvent other) + { + // Copy base class fields + EventId = other.EventId; + Timestamp = other.Timestamp; + ExtensionData = other.ExtensionData; + IsPlayed = other.IsPlayed; + + // Copy all other fields + Name = other.Name; + Version = other.Version; + InstanceId = other.InstanceId; + Input = other.Input; + ClientSpanId = other.ClientSpanId; + Tags = other.Tags; + } + /// /// Gets the event type /// @@ -64,5 +85,11 @@ public SubOrchestrationInstanceCreatedEvent(int eventId) /// [DataMember] public string ClientSpanId { get; set; } + + /// + /// Gets or sets a dictionary of tags of string, string + /// + [DataMember] + public IDictionary Tags { get; set; } } } \ No newline at end of file diff --git a/src/DurableTask.Core/IExceptionPropertiesProvider.cs b/src/DurableTask.Core/IExceptionPropertiesProvider.cs new file mode 100644 index 000000000..e40bc1c3e --- /dev/null +++ b/src/DurableTask.Core/IExceptionPropertiesProvider.cs @@ -0,0 +1,33 @@ +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the 
Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.Core +{ + using System; + using System.Collections.Generic; + + /// + /// Interface for providing custom properties from exceptions that will be included in FailureDetails. + /// This interface is intended for implementation by the durabletask-dotnet layer, which will + /// convert customer implementations to this interface and register them with DurableTask.Core. + /// + public interface IExceptionPropertiesProvider + { + /// + /// Extracts custom properties from an exception. + /// + /// The exception to extract properties from. + /// A dictionary of custom properties to include in the FailureDetails, or null if no properties should be added. + IDictionary? 
GetExceptionProperties(Exception exception); + } +} diff --git a/src/DurableTask.Core/Logging/EventIds.cs b/src/DurableTask.Core/Logging/EventIds.cs index f7386eb99..049af43f3 100644 --- a/src/DurableTask.Core/Logging/EventIds.cs +++ b/src/DurableTask.Core/Logging/EventIds.cs @@ -31,6 +31,7 @@ static class EventIds public const int ProcessWorkItemStarting = 27; public const int ProcessWorkItemCompleted = 28; public const int ProcessWorkItemFailed = 29; + public const int DispatcherLoopFailed = 30; public const int SchedulingOrchestration = 40; public const int RaisingEvent = 41; @@ -69,5 +70,7 @@ static class EventIds public const int RenewOrchestrationWorkItemFailed = 72; public const int OrchestrationDebugTrace = 73; + + public const int OrchestrationCompletedWithWarning = 74; } } diff --git a/src/DurableTask.Core/Logging/LogEvents.cs b/src/DurableTask.Core/Logging/LogEvents.cs index fd626d3be..e5fe2ee33 100644 --- a/src/DurableTask.Core/Logging/LogEvents.cs +++ b/src/DurableTask.Core/Logging/LogEvents.cs @@ -158,6 +158,37 @@ void IEventSourceEvent.WriteEventSource() => StructuredEventSource.Log.DispatcherStopped(this.Dispatcher, Utils.AppName, Utils.PackageVersion); } + internal class DispatcherLoopFailed : StructuredLogEvent, IEventSourceEvent + { + public DispatcherLoopFailed(WorkItemDispatcherContext context, Exception exception) + { + this.Dispatcher = context.GetDisplayName(); + this.Details = exception.ToString(); + } + + [StructuredLogField] + public string Dispatcher { get; } + + [StructuredLogField] + public string Details { get; } + + public override EventId EventId => new EventId( + EventIds.DispatcherLoopFailed, + nameof(EventIds.DispatcherLoopFailed)); + + public override LogLevel Level => LogLevel.Error; + + protected override string CreateLogMessage() => + $"{this.Dispatcher}: Unhandled exception in dispatch loop. Will retry after backoff. 
Details: {this.Details}"; + + void IEventSourceEvent.WriteEventSource() => + StructuredEventSource.Log.DispatcherLoopFailed( + this.Dispatcher, + this.Details, + Utils.AppName, + Utils.PackageVersion); + } + internal class DispatchersStopping : StructuredLogEvent, IEventSourceEvent { public DispatchersStopping(string name, string id, int concurrentWorkItemCount, int activeFetchers) @@ -1098,6 +1129,54 @@ void IEventSourceEvent.WriteEventSource() => } #nullable disable + /// + /// Log event representing a warning associated with an orchestration completing. + /// + internal class OrchestrationCompletedWithWarning : StructuredLogEvent, IEventSourceEvent + { + + public OrchestrationCompletedWithWarning( + OrchestrationInstance instance, + string orchestrationStatus, + string warningMessage) + { + this.InstanceId = instance.InstanceId; + this.ExecutionId = instance.ExecutionId; + this.RuntimeStatus = orchestrationStatus; + this.Details = warningMessage; + } + + [StructuredLogField] + public string InstanceId { get; } + + [StructuredLogField] + public string ExecutionId { get; } + + [StructuredLogField] + public string RuntimeStatus { get; } + + [StructuredLogField] + public string Details { get; } + + public override EventId EventId => new EventId( + EventIds.OrchestrationCompletedWithWarning, + nameof(EventIds.OrchestrationCompletedWithWarning)); + + public override LogLevel Level => LogLevel.Warning; + + protected override string CreateLogMessage() => + $"{this.InstanceId}: Orchestration completed with warning: {this.Details}"; + + void IEventSourceEvent.WriteEventSource() => + StructuredEventSource.Log.OrchestrationCompletedWithWarning( + this.InstanceId, + this.ExecutionId, + this.RuntimeStatus, + this.Details, + Utils.AppName, + Utils.PackageVersion); + } + /// /// Log event representing an orchestration aborted event, which can happen if the host is shutting down. 
/// diff --git a/src/DurableTask.Core/Logging/LogHelper.cs b/src/DurableTask.Core/Logging/LogHelper.cs index 109c14c42..6efbe0cfe 100644 --- a/src/DurableTask.Core/Logging/LogHelper.cs +++ b/src/DurableTask.Core/Logging/LogHelper.cs @@ -107,6 +107,21 @@ internal void DispatcherStopped(WorkItemDispatcherContext context) } } + /// + /// Logs that a work item dispatch loop encountered an unhandled exception. + /// + /// The context of the dispatcher that failed. + /// The unhandled exception. + internal void DispatcherLoopFailed(WorkItemDispatcherContext context, Exception exception) + { + if (this.IsStructuredLoggingEnabled) + { + this.WriteStructuredLog( + new LogEvents.DispatcherLoopFailed(context, exception), + exception); + } + } + /// /// Logs that the work item dispatcher is watching for individual dispatch loops to finish stopping. /// @@ -484,6 +499,23 @@ internal void OrchestrationCompleted( } } + /// + /// Logs a warning associated with an orchestration completing. + /// + /// The orchestration instance of the orchestration. + /// The status of the completed orchestration. + /// The warning message to log. + internal void OrchestrationCompletedWithWarning( + OrchestrationInstance instance, + OrchestrationStatus orchestrationStatus, + string warningMessage) + { + if (this.IsStructuredLoggingEnabled) + { + this.WriteStructuredLog(new LogEvents.OrchestrationCompletedWithWarning(instance, orchestrationStatus.ToString(), warningMessage)); + } + } + /// /// Logs that an orchestration execution was aborted. 
/// diff --git a/src/DurableTask.Core/Logging/StructuredEventSource.cs b/src/DurableTask.Core/Logging/StructuredEventSource.cs index 162a226f2..129bac158 100644 --- a/src/DurableTask.Core/Logging/StructuredEventSource.cs +++ b/src/DurableTask.Core/Logging/StructuredEventSource.cs @@ -102,6 +102,15 @@ internal void DispatcherStopped(string Dispatcher, string AppName, string Extens } } + [Event(EventIds.DispatcherLoopFailed, Level = EventLevel.Error, Version = 1)] + internal void DispatcherLoopFailed(string Dispatcher, string Details, string AppName, string ExtensionVersion) + { + if (this.IsEnabled(EventLevel.Error)) + { + this.WriteEvent(EventIds.DispatcherLoopFailed, Dispatcher, Details, AppName, ExtensionVersion); + } + } + [Event(EventIds.DispatchersStopping, Level = EventLevel.Verbose, Version = 1)] internal void DispatchersStopping( string Dispatcher, @@ -579,6 +588,29 @@ internal void OrchestrationCompleted( } } + [Event(EventIds.OrchestrationCompletedWithWarning, Level = EventLevel.Warning, Version = 1)] + internal void OrchestrationCompletedWithWarning( + string InstanceId, + string ExecutionId, + string RuntimeStatus, + string Details, + string AppName, + string ExtensionVersion) + { + if (this.IsEnabled(EventLevel.Warning)) + { + // TODO: Use WriteEventCore for better performance + this.WriteEvent( + EventIds.OrchestrationCompletedWithWarning, + InstanceId, + ExecutionId, + RuntimeStatus, + Details, + AppName, + ExtensionVersion); + } + } + [Event(EventIds.OrchestrationAborted, Level = EventLevel.Warning, Version = 1)] internal void OrchestrationAborted( string InstanceId, diff --git a/src/DurableTask.Core/OrchestrationContext.cs b/src/DurableTask.Core/OrchestrationContext.cs index 97c6f6f6d..63179f485 100644 --- a/src/DurableTask.Core/OrchestrationContext.cs +++ b/src/DurableTask.Core/OrchestrationContext.cs @@ -73,6 +73,11 @@ public abstract class OrchestrationContext /// internal ErrorPropagationMode ErrorPropagationMode { get; set; } + /// + /// Gets 
or sets the exception properties provider that extracts custom properties from exceptions + /// + internal IExceptionPropertiesProvider ExceptionPropertiesProvider { get;set; } + /// /// Information about backend entity support, or null if the configured backend does not support entities. /// @@ -273,6 +278,25 @@ public virtual Task CreateSubOrchestrationInstanceWithRetry(string name, s return retryInterceptor.Invoke(); } + /// + /// Create a sub-orchestration of the specified name and version. Also retry on failure as per supplied policy. + /// + /// Return Type of the TaskOrchestration.RunTask method + /// Name of the orchestration as specified by the ObjectCreator + /// Version of the orchestration as specified by the ObjectCreator + /// Instance Id of the sub-orchestration + /// Retry policy + /// Input for the TaskOrchestration.RunTask method + /// Dictionary of key/value tags associated with this instance + /// Task that represents the execution of the specified sub-orchestration + public virtual Task CreateSubOrchestrationInstanceWithRetry(string name, string version, string instanceId, + RetryOptions retryOptions, object input, IDictionary tags) + { + Task RetryCall() => CreateSubOrchestrationInstance(name, version, instanceId, input, tags); + var retryInterceptor = new RetryInterceptor(this, retryOptions, RetryCall); + return retryInterceptor.Invoke(); + } + /// /// Schedule a TaskActivity by type. 
/// diff --git a/src/DurableTask.Core/OrchestrationRuntimeState.cs b/src/DurableTask.Core/OrchestrationRuntimeState.cs index 3c10fe693..ab7d76044 100644 --- a/src/DurableTask.Core/OrchestrationRuntimeState.cs +++ b/src/DurableTask.Core/OrchestrationRuntimeState.cs @@ -389,6 +389,7 @@ HistoryEvent GenerateAbridgedEvent(HistoryEvent evt) Version = subOrchestrationInstanceCreatedEvent.Version, Input = "[..snipped..]", ClientSpanId = subOrchestrationInstanceCreatedEvent.ClientSpanId, + Tags = subOrchestrationInstanceCreatedEvent.Tags, }; } else if (evt is SubOrchestrationInstanceCompletedEvent subOrchestrationInstanceCompletedEvent) diff --git a/src/DurableTask.Core/OrchestrationStatus.cs b/src/DurableTask.Core/OrchestrationStatus.cs index 098aecec1..1e26747d8 100644 --- a/src/DurableTask.Core/OrchestrationStatus.cs +++ b/src/DurableTask.Core/OrchestrationStatus.cs @@ -56,6 +56,6 @@ public enum OrchestrationStatus /// /// Orchestration state of suspended /// - Suspended, + Suspended } } \ No newline at end of file diff --git a/src/DurableTask.Core/OrchestrationTags.cs b/src/DurableTask.Core/OrchestrationTags.cs index 00ff3a2b1..c8fba33a1 100644 --- a/src/DurableTask.Core/OrchestrationTags.cs +++ b/src/DurableTask.Core/OrchestrationTags.cs @@ -56,6 +56,11 @@ public static class OrchestrationTags /// public const string CreateTraceForNewOrchestration = "MS_CreateTrace"; + /// + /// The warning logged when an orchestration completes, if any. 
+ /// + public const string CompleteOrchestrationLogWarning = "MS_CompleteOrchestrationLogWarning"; + /// /// Check whether the given tags contain the fire and forget tag /// diff --git a/src/DurableTask.Core/ReflectionBasedTaskActivity.cs b/src/DurableTask.Core/ReflectionBasedTaskActivity.cs index b935f8c1c..0ecd717b4 100644 --- a/src/DurableTask.Core/ReflectionBasedTaskActivity.cs +++ b/src/DurableTask.Core/ReflectionBasedTaskActivity.cs @@ -140,7 +140,8 @@ public override async Task RunAsync(TaskContext context, string input) } else { - failureDetails = new FailureDetails(exception); + var props = context.ExceptionPropertiesProvider.ExtractProperties(exception); + failureDetails = new FailureDetails(exception, props); } throw new TaskFailureException(exception.Message, exception, details) diff --git a/src/DurableTask.Core/TaskActivity.cs b/src/DurableTask.Core/TaskActivity.cs index b05f020eb..f872c9bda 100644 --- a/src/DurableTask.Core/TaskActivity.cs +++ b/src/DurableTask.Core/TaskActivity.cs @@ -13,12 +13,12 @@ namespace DurableTask.Core { - using System; - using System.Threading.Tasks; using DurableTask.Core.Common; using DurableTask.Core.Exceptions; using DurableTask.Core.Serializing; using Newtonsoft.Json.Linq; + using System; + using System.Threading.Tasks; /// /// Base class for TaskActivity. @@ -142,7 +142,16 @@ public override async Task RunAsync(TaskContext context, string input) } else { - failureDetails = new FailureDetails(e); + if(context != null) + { + var props = context.ExceptionPropertiesProvider.ExtractProperties(e); + failureDetails = new FailureDetails(e, props); + } + else + { + // Handle case for TaskContext is null. 
+ failureDetails = new FailureDetails(e); + } } throw new TaskFailureException(e.Message, e, details) diff --git a/src/DurableTask.Core/TaskActivityDispatcher.cs b/src/DurableTask.Core/TaskActivityDispatcher.cs index bdafffdd5..8f4c24dca 100644 --- a/src/DurableTask.Core/TaskActivityDispatcher.cs +++ b/src/DurableTask.Core/TaskActivityDispatcher.cs @@ -36,19 +36,31 @@ public sealed class TaskActivityDispatcher readonly DispatchMiddlewarePipeline dispatchPipeline; readonly LogHelper logHelper; readonly ErrorPropagationMode errorPropagationMode; + readonly IExceptionPropertiesProvider? exceptionPropertiesProvider; + /// + /// Initializes a new instance of the class with an exception properties provider. + /// + /// The orchestration service implementation + /// The object manager for activities + /// The dispatch middleware pipeline + /// The log helper + /// The error propagation mode + /// The exception properties provider for extracting custom properties from exceptions internal TaskActivityDispatcher( IOrchestrationService orchestrationService, INameVersionObjectManager objectManager, DispatchMiddlewarePipeline dispatchPipeline, LogHelper logHelper, - ErrorPropagationMode errorPropagationMode) + ErrorPropagationMode errorPropagationMode, + IExceptionPropertiesProvider? exceptionPropertiesProvider) { this.orchestrationService = orchestrationService ?? throw new ArgumentNullException(nameof(orchestrationService)); this.objectManager = objectManager ?? throw new ArgumentNullException(nameof(objectManager)); this.dispatchPipeline = dispatchPipeline ?? 
throw new ArgumentNullException(nameof(dispatchPipeline)); this.logHelper = logHelper; this.errorPropagationMode = errorPropagationMode; + this.exceptionPropertiesProvider = exceptionPropertiesProvider; this.dispatcher = new WorkItemDispatcher( "TaskActivityDispatcher", @@ -190,6 +202,7 @@ await this.dispatchPipeline.RunAsync(dispatchContext, async _ => scheduledEvent.Version, scheduledEvent.EventId); context.ErrorPropagationMode = this.errorPropagationMode; + context.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; HistoryEvent? responseEvent; @@ -254,6 +267,12 @@ await this.dispatchPipeline.RunAsync(dispatchContext, async _ => eventToRespond = new TaskCompletedEvent(-1, scheduledEvent.EventId, null); } + if (traceActivity != null && eventToRespond is TaskCompletedEvent) + { + // Ensure successful executions don't preserve a prior error status from custom instrumentation. + traceActivity.SetStatus(ActivityStatusCode.OK, "Completed"); + } + var responseTaskMessage = new TaskMessage { Event = eventToRespond, diff --git a/src/DurableTask.Core/TaskContext.cs b/src/DurableTask.Core/TaskContext.cs index d8152976c..4fe4322f0 100644 --- a/src/DurableTask.Core/TaskContext.cs +++ b/src/DurableTask.Core/TaskContext.cs @@ -62,5 +62,10 @@ public TaskContext(OrchestrationInstance orchestrationInstance, string name, str /// Gets or sets a value indicating how to propagate unhandled exception metadata. /// internal ErrorPropagationMode ErrorPropagationMode { get; set; } + + /// + /// Gets or sets the provider used to extract custom properties from exceptions, or null if none is configured. + /// + public IExceptionPropertiesProvider? 
ExceptionPropertiesProvider { get; set; } } -} \ No newline at end of file +} diff --git a/src/DurableTask.Core/TaskEntityDispatcher.cs b/src/DurableTask.Core/TaskEntityDispatcher.cs index a91ae97e2..f63048a1a 100644 --- a/src/DurableTask.Core/TaskEntityDispatcher.cs +++ b/src/DurableTask.Core/TaskEntityDispatcher.cs @@ -42,19 +42,31 @@ public class TaskEntityDispatcher readonly LogHelper logHelper; readonly ErrorPropagationMode errorPropagationMode; readonly TaskOrchestrationDispatcher.NonBlockingCountdownLock concurrentSessionLock; + readonly IExceptionPropertiesProvider exceptionPropertiesProvider; + /// + /// Initializes a new instance of the class with an exception properties provider. + /// + /// The orchestration service implementation + /// The object manager for entities + /// The dispatch middleware pipeline + /// The log helper + /// The error propagation mode + /// The exception properties provider for extracting custom properties from exceptions internal TaskEntityDispatcher( IOrchestrationService orchestrationService, INameVersionObjectManager entityObjectManager, DispatchMiddlewarePipeline entityDispatchPipeline, LogHelper logHelper, - ErrorPropagationMode errorPropagationMode) + ErrorPropagationMode errorPropagationMode, + IExceptionPropertiesProvider exceptionPropertiesProvider) { this.objectManager = entityObjectManager ?? throw new ArgumentNullException(nameof(entityObjectManager)); this.orchestrationService = orchestrationService ?? throw new ArgumentNullException(nameof(orchestrationService)); this.dispatchPipeline = entityDispatchPipeline ?? throw new ArgumentNullException(nameof(entityDispatchPipeline)); this.logHelper = logHelper ?? 
throw new ArgumentNullException(nameof(logHelper)); this.errorPropagationMode = errorPropagationMode; + this.exceptionPropertiesProvider = exceptionPropertiesProvider; this.entityOrchestrationService = (orchestrationService as IEntityOrchestrationService)!; this.entityBackendProperties = entityOrchestrationService.EntityBackendProperties; @@ -119,13 +131,13 @@ async Task OnProcessWorkItemSessionAsync(TaskOrchestrationWorkItem workItem) if (workItem.Session == null) { // Legacy behavior - await this.OnProcessWorkItemAsync(workItem); + await this.OnProcessWorkItemAsync(workItem, null); return; } - var isExtendedSession = false; - + var concurrencyLockAcquired = false; var processCount = 0; + SchedulerState schedulerState = null; try { while (true) @@ -133,23 +145,34 @@ async Task OnProcessWorkItemSessionAsync(TaskOrchestrationWorkItem workItem) // While the work item contains messages that need to be processed, execute them. if (workItem.NewMessages?.Count > 0) { - bool isCompletedOrInterrupted = await this.OnProcessWorkItemAsync(workItem); - if (isCompletedOrInterrupted) + // We only need to acquire the lock on the first execution within the extended session + if (!concurrencyLockAcquired) + { + concurrencyLockAcquired = this.concurrentSessionLock.Acquire(); + } + workItem.IsExtendedSession = concurrencyLockAcquired; + // Regardless of whether or not we acquired the concurrent session lock, we will make sure to execute this work item. + // If we failed to acquire it, we will end the extended session after this execution. + schedulerState = await this.OnProcessWorkItemAsync(workItem, schedulerState); + + // The entity has been deleted, so we end the extended session. + if (this.EntityIsDeleted(schedulerState)) { break; } + // When extended sessions are enabled, the handler caches the entity state after the first execution of the extended session, so there + // is no need to retain a reference to it here. 
+ // We set the local reference to null so that the entity state can be garbage collected while we wait for more messages to arrive. + schedulerState.EntityState = null; + processCount++; } - // Fetches beyond the first require getting an extended session lock, used to prevent starvation. - if (processCount > 0 && !isExtendedSession) + // If we failed to acquire the concurrent session lock, we will end the extended session after the execution of the first work item + if (processCount > 0 && !concurrencyLockAcquired) { - isExtendedSession = this.concurrentSessionLock.Acquire(); - if (!isExtendedSession) - { - break; - } + break; } Stopwatch timer = Stopwatch.StartNew(); @@ -167,7 +190,7 @@ async Task OnProcessWorkItemSessionAsync(TaskOrchestrationWorkItem workItem) } finally { - if (isExtendedSession) + if (concurrencyLockAcquired) { this.concurrentSessionLock.Release(); } @@ -196,7 +219,9 @@ internal class WorkItemEffects /// Method to process a new work item /// /// The work item to process - protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem workItem) + /// If extended sessions are enabled, the scheduler state that is being cached across executions. + /// If they are not enabled, or if this is the first execution from within an extended session, this parameter is null. 
+ private async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem workItem, SchedulerState schedulerState) { OrchestrationRuntimeState originalOrchestrationRuntimeState = workItem.OrchestrationRuntimeState; @@ -233,19 +258,20 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work } else { + bool firstExecutionIfExtendedSession = schedulerState == null; // we start with processing all the requests and figuring out which ones to execute now // results can depend on whether the entity is locked, what the maximum batch size is, // and whether the messages arrived out of order this.DetermineWork(workItem.OrchestrationRuntimeState, - out SchedulerState schedulerState, + ref schedulerState, out Work workToDoNow); if (workToDoNow.OperationCount > 0) { // execute the user-defined operations on this entity, via the middleware - var result = await this.ExecuteViaMiddlewareAsync(workToDoNow, runtimeState.OrchestrationInstance, schedulerState.EntityState); + var result = await this.ExecuteViaMiddlewareAsync(workToDoNow, runtimeState.OrchestrationInstance, schedulerState.EntityState, workItem.IsExtendedSession, firstExecutionIfExtendedSession); var operationResults = result.Results!; // if we encountered an error, record it as the result of the operations @@ -405,7 +431,7 @@ await this.orchestrationService.CompleteTaskOrchestrationWorkItemAsync( workItem.OrchestrationRuntimeState = runtimeState; } - return true; + return schedulerState; } void ProcessLockRequest(WorkItemEffects effects, SchedulerState schedulerState, RequestMessage request) @@ -433,7 +459,7 @@ void ProcessLockRequest(WorkItemEffects effects, SchedulerState schedulerState, string SerializeSchedulerStateForNextExecution(SchedulerState schedulerState) { - if (this.entityBackendProperties.SupportsImplicitEntityDeletion && schedulerState.IsEmpty && !schedulerState.Suspended) + if (this.EntityIsDeleted(schedulerState)) { // this entity scheduler is idle and the entity is deleted, so the 
instance and history can be removed from storage // we convey this to the durability provider by issuing a continue-as-new with null input @@ -448,10 +474,11 @@ string SerializeSchedulerStateForNextExecution(SchedulerState schedulerState) #region Preprocess to determine work - void DetermineWork(OrchestrationRuntimeState runtimeState, out SchedulerState schedulerState, out Work batch) + void DetermineWork(OrchestrationRuntimeState runtimeState, ref SchedulerState schedulerState, out Work batch) { string instanceId = runtimeState.OrchestrationInstance.InstanceId; - schedulerState = new SchedulerState(); + bool deserializeState = schedulerState == null; + schedulerState ??= new(); batch = new Work(); Queue lockHolderMessages = null; @@ -462,8 +489,9 @@ void DetermineWork(OrchestrationRuntimeState runtimeState, out SchedulerState sc { case EventType.ExecutionStarted: - - if (runtimeState.Input != null) + // Only attempt to deserialize the scheduler state if we don't already have it in memory. + // This occurs on the first execution within an extended session, or when extended sessions are disabled. 
+ if (runtimeState.Input != null && deserializeState) { try { @@ -612,6 +640,11 @@ void DetermineWork(OrchestrationRuntimeState runtimeState, out SchedulerState sc } } + bool EntityIsDeleted(SchedulerState schedulerState) + { + return schedulerState != null && this.entityBackendProperties.SupportsImplicitEntityDeletion && schedulerState.IsEmpty && !schedulerState.Suspended; + } + class Work { List operationBatch; // a (possibly empty) sequence of operations to be executed on the entity @@ -919,7 +952,7 @@ internal void ProcessSendStartMessage(WorkItemEffects effects, OrchestrationRunt #endregion - async Task ExecuteViaMiddlewareAsync(Work workToDoNow, OrchestrationInstance instance, string serializedEntityState) + async Task ExecuteViaMiddlewareAsync(Work workToDoNow, OrchestrationInstance instance, string serializedEntityState, bool isExtendedSession, bool includeEntityState) { var (operations, traceActivities) = workToDoNow.GetOperationRequestsAndTraceActivities(instance.InstanceId); // the request object that will be passed to the worker @@ -942,6 +975,7 @@ async Task ExecuteViaMiddlewareAsync(Work workToDoNow, Orches var dispatchContext = new DispatchMiddlewareContext(); dispatchContext.SetProperty(request); + dispatchContext.SetProperty(new WorkItemMetadata(isExtendedSession, includeEntityState)); await this.dispatchPipeline.RunAsync(dispatchContext, async _ => { diff --git a/src/DurableTask.Core/TaskHubWorker.cs b/src/DurableTask.Core/TaskHubWorker.cs index 629453645..65dfbb47e 100644 --- a/src/DurableTask.Core/TaskHubWorker.cs +++ b/src/DurableTask.Core/TaskHubWorker.cs @@ -247,6 +247,12 @@ public TaskHubWorker( /// public ErrorPropagationMode ErrorPropagationMode { get; set; } + /// + /// Gets or sets the exception properties provider that extracts custom properties from exceptions + /// when creating FailureDetails objects. 
+ /// + public IExceptionPropertiesProvider ExceptionPropertiesProvider { get; set; } + /// /// Adds a middleware delegate to the orchestration dispatch pipeline. /// @@ -296,13 +302,15 @@ public async Task StartAsync() this.orchestrationDispatchPipeline, this.logHelper, this.ErrorPropagationMode, - this.versioningSettings); + this.versioningSettings, + this.ExceptionPropertiesProvider); this.activityDispatcher = new TaskActivityDispatcher( this.orchestrationService, this.activityManager, this.activityDispatchPipeline, this.logHelper, - this.ErrorPropagationMode); + this.ErrorPropagationMode, + this.ExceptionPropertiesProvider); if (this.dispatchEntitiesSeparately) { @@ -311,7 +319,8 @@ public async Task StartAsync() this.entityManager, this.entityDispatchPipeline, this.logHelper, - this.ErrorPropagationMode); + this.ErrorPropagationMode, + this.ExceptionPropertiesProvider); } await this.orchestrationService.StartAsync(); diff --git a/src/DurableTask.Core/TaskOrchestration.cs b/src/DurableTask.Core/TaskOrchestration.cs index c198c0855..d449d3584 100644 --- a/src/DurableTask.Core/TaskOrchestration.cs +++ b/src/DurableTask.Core/TaskOrchestration.cs @@ -105,7 +105,8 @@ public override async Task Execute(OrchestrationContext context, string } else { - failureDetails = new FailureDetails(e); + var props = context.ExceptionPropertiesProvider.ExtractProperties(e); + failureDetails = new FailureDetails(e, props); } throw new OrchestrationFailureException(e.Message, details) diff --git a/src/DurableTask.Core/TaskOrchestrationContext.cs b/src/DurableTask.Core/TaskOrchestrationContext.cs index b12cb1b08..4972e6fcd 100644 --- a/src/DurableTask.Core/TaskOrchestrationContext.cs +++ b/src/DurableTask.Core/TaskOrchestrationContext.cs @@ -51,7 +51,8 @@ public TaskOrchestrationContext( OrchestrationInstance orchestrationInstance, TaskScheduler taskScheduler, TaskOrchestrationEntityParameters entityParameters = null, - ErrorPropagationMode errorPropagationMode = 
ErrorPropagationMode.SerializeExceptions) + ErrorPropagationMode errorPropagationMode = ErrorPropagationMode.SerializeExceptions, + IExceptionPropertiesProvider exceptionPropertiesProvider = null) { Utils.UnusedParameter(taskScheduler); @@ -66,6 +67,7 @@ public TaskOrchestrationContext( ErrorPropagationMode = errorPropagationMode; this.eventsWhileSuspended = new Queue(); this.suspendedActionsMap = new SortedDictionary(); + this.ExceptionPropertiesProvider = exceptionPropertiesProvider; } public IEnumerable OrchestratorActions => this.orchestratorActionsMap.Values; @@ -684,7 +686,7 @@ public void FailOrchestration(Exception failure, OrchestrationRuntimeState runti { if (this.ErrorPropagationMode == ErrorPropagationMode.UseFailureDetails) { - failureDetails = new FailureDetails(failure); + failureDetails = new FailureDetails(failure, this.ExceptionPropertiesProvider.ExtractProperties(failure)); } else { @@ -719,6 +721,12 @@ public void CompleteOrchestration(string result, string details, OrchestrationSt completedOrchestratorAction.Details = details; completedOrchestratorAction.OrchestrationStatus = orchestrationStatus; completedOrchestratorAction.FailureDetails = failureDetails; + + if (this.continueAsNew != null && orchestrationStatus == OrchestrationStatus.Failed) + { + completedOrchestratorAction.Tags[OrchestrationTags.CompleteOrchestrationLogWarning] = + "Continue as new called for a failed orchestration, orchestration will complete."; + } } completedOrchestratorAction.Id = id; diff --git a/src/DurableTask.Core/TaskOrchestrationDispatcher.cs b/src/DurableTask.Core/TaskOrchestrationDispatcher.cs index 44adbbbd7..c85536793 100644 --- a/src/DurableTask.Core/TaskOrchestrationDispatcher.cs +++ b/src/DurableTask.Core/TaskOrchestrationDispatcher.cs @@ -49,14 +49,26 @@ public class TaskOrchestrationDispatcher readonly EntityBackendProperties? entityBackendProperties; readonly TaskOrchestrationEntityParameters? entityParameters; readonly VersioningSettings? 
versioningSettings; + readonly IExceptionPropertiesProvider? exceptionPropertiesProvider; + /// + /// Initializes a new instance of the class with an exception properties provider. + /// + /// The orchestration service implementation + /// The object manager for orchestrations + /// The dispatch middleware pipeline + /// The log helper + /// The error propagation mode + /// The versioning settings + /// The exception properties provider for extracting custom properties from exceptions internal TaskOrchestrationDispatcher( IOrchestrationService orchestrationService, INameVersionObjectManager objectManager, DispatchMiddlewarePipeline dispatchPipeline, LogHelper logHelper, ErrorPropagationMode errorPropagationMode, - VersioningSettings versioningSettings) + VersioningSettings versioningSettings, + IExceptionPropertiesProvider? exceptionPropertiesProvider) { this.objectManager = objectManager ?? throw new ArgumentNullException(nameof(objectManager)); this.orchestrationService = orchestrationService ?? throw new ArgumentNullException(nameof(orchestrationService)); @@ -67,6 +79,7 @@ internal TaskOrchestrationDispatcher( this.entityBackendProperties = this.entityOrchestrationService?.EntityBackendProperties; this.entityParameters = TaskOrchestrationEntityParameters.FromEntityBackendProperties(this.entityBackendProperties); this.versioningSettings = versioningSettings; + this.exceptionPropertiesProvider = exceptionPropertiesProvider; this.dispatcher = new WorkItemDispatcher( "TaskOrchestrationDispatcher", @@ -305,14 +318,13 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work var timerMessages = new List(); var orchestratorMessages = new List(); var isCompleted = false; - var continuedAsNew = false; var isInterrupted = false; + var isRewinding = false; // correlation CorrelationTraceClient.Propagate(() => CorrelationTraceContext.Current = workItem.TraceContext); ExecutionStartedEvent? continueAsNewExecutionStarted = null; - TaskMessage? 
continuedAsNewMessage = null; IList? carryOverEvents = null; string? carryOverStatus = null; @@ -329,6 +341,21 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work ExecutionStartedEvent startEvent = runtimeState.ExecutionStartedEvent ?? workItem.NewMessages.Select(msg => msg.Event).OfType().FirstOrDefault(); + ExecutionRewoundEvent rewindEvent = + workItem.NewMessages.Select(msg => msg.Event).OfType().LastOrDefault(); + + if (rewindEvent is not null && runtimeState.OrchestrationStatus != OrchestrationStatus.Running) + { + isRewinding = true; + if (rewindEvent.ParentTraceContext != null) + { + startEvent.ParentTraceContext = rewindEvent.ParentTraceContext; + } + // We set these to null here so that a new Activity is created to represent the execution of the rewound orchestration. + startEvent.ParentTraceContext.SpanId = null; + startEvent.ParentTraceContext.Id = null; + startEvent.ParentTraceContext.ActivityStartTime = null; + } Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution(startEvent); OrchestrationState? instanceState = null; @@ -360,10 +387,11 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work } else { + bool continuedAsNew; do { continuedAsNew = false; - continuedAsNewMessage = null; + TaskMessage? continuedAsNewMessage = null; IReadOnlyList decisions = new List(); bool versioningFailed = false; @@ -414,15 +442,25 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work if (!versioningFailed) { - if (workItem.Cursor == null) + // In this case we skip the orchestration's execution since all tasks have been completed and it is in a terminal state. + // Instead we "rewind" its execution by removing all failed tasks (see ProcessRewindOrchestrationDecision). + // Upon receiving the next work item for the rewound orchestration, the failed tasks will be re-executed. 
+ if (isRewinding) { - workItem.Cursor = await this.ExecuteOrchestrationAsync(runtimeState, workItem); + decisions = new List { new RewindOrchestrationAction() }; } else { - await this.ResumeOrchestrationAsync(workItem); + if (workItem.Cursor == null) + { + workItem.Cursor = await this.ExecuteOrchestrationAsync(runtimeState, workItem); + } + else + { + await this.ResumeOrchestrationAsync(workItem); + } + decisions = workItem.Cursor.LatestDecisions.ToList(); } - decisions = workItem.Cursor.LatestDecisions.ToList(); } this.logHelper.OrchestrationExecuted( @@ -506,6 +544,19 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work isCompleted = !continuedAsNew; break; + case OrchestratorActionType.RewindOrchestration: + this.ProcessRewindOrchestrationDecision( + runtimeState, + out List subOrchestrationRewindMessages, + out OrchestrationRuntimeState newRuntimeState); + orchestratorMessages.AddRange(subOrchestrationRewindMessages); + workItem.OrchestrationRuntimeState = newRuntimeState; + runtimeState = newRuntimeState; + // Setting this to true here will end an extended session if it is in progress. + // We don't want to save the state across executions, since we essentially manually modify + // the orchestration history here and so that stored by the extended session is incorrect. + isRewinding = true; + break; default: throw TraceHelper.TraceExceptionInstance( TraceEventType.Error, @@ -655,10 +706,10 @@ protected async Task OnProcessWorkItemAsync(TaskOrchestrationWorkItem work await this.orchestrationService.CompleteTaskOrchestrationWorkItemAsync( workItem, runtimeState, - continuedAsNew ? null : messagesToSend, + messagesToSend, orchestratorMessages, - continuedAsNew ? 
null : timerMessages, - continuedAsNewMessage, + timerMessages, + continuedAsNewMessage: null, instanceState); if (workItem.RestoreOriginalRuntimeStateDuringCompletion) @@ -666,7 +717,7 @@ await this.orchestrationService.CompleteTaskOrchestrationWorkItemAsync( workItem.OrchestrationRuntimeState = runtimeState; } - return isCompleted || continuedAsNew || isInterrupted; + return isCompleted || isInterrupted || isRewinding; } static OrchestrationExecutionContext GetOrchestrationExecutionContext(OrchestrationRuntimeState runtimeState) @@ -728,7 +779,7 @@ async Task ExecuteOrchestrationAsync(Orchestration dispatchContext.SetProperty(workItem); dispatchContext.SetProperty(GetOrchestrationExecutionContext(runtimeState)); dispatchContext.SetProperty(this.entityParameters); - dispatchContext.SetProperty(new WorkItemMetadata { IsExtendedSession = workItem.IsExtendedSession, IncludePastEvents = true }); + dispatchContext.SetProperty(new WorkItemMetadata(workItem.IsExtendedSession, includeState: true)); TaskOrchestrationExecutor? 
executor = null; @@ -757,7 +808,8 @@ await this.dispatchPipeline.RunAsync(dispatchContext, _ => taskOrchestration, this.orchestrationService.EventBehaviourForContinueAsNew, this.entityParameters, - this.errorPropagationMode); + this.errorPropagationMode, + this.exceptionPropertiesProvider); OrchestratorExecutionResult resultFromOrchestrator = executor.Execute(); dispatchContext.SetProperty(resultFromOrchestrator); @@ -780,7 +832,7 @@ async Task ResumeOrchestrationAsync(TaskOrchestrationWorkItem workItem) dispatchContext.SetProperty(cursor.TaskOrchestration); dispatchContext.SetProperty(cursor.RuntimeState); dispatchContext.SetProperty(workItem); - dispatchContext.SetProperty(new WorkItemMetadata { IsExtendedSession = true, IncludePastEvents = false }); + dispatchContext.SetProperty(new WorkItemMetadata(isExtendedSession: true, includeState: false)); cursor.LatestDecisions = Enumerable.Empty(); await this.dispatchPipeline.RunAsync(dispatchContext, _ => @@ -841,6 +893,22 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt return false; } + if (message.Event.EventType == EventType.ExecutionRewound + && workItem.OrchestrationRuntimeState.OrchestrationStatus != OrchestrationStatus.Running + && workItem.NewMessages.Count > 1) + { + foreach (TaskMessage droppedMessage in workItem.NewMessages) + { + if (droppedMessage.Event.EventType != EventType.ExecutionRewound) + { + logHelper.DroppingOrchestrationMessage(workItem, droppedMessage, "Multiple messages sent to an instance " + + "that is attempting to rewind from a terminal state. 
The only message that can be sent in " + + "this case is the rewind request."); + } + } + return false; + } + logHelper.ProcessingOrchestrationMessage(workItem, message); TraceHelper.TraceInstance( TraceEventType.Information, @@ -921,7 +989,12 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt TraceHelper.EmitTraceActivityForTaskFailed(workItem.OrchestrationRuntimeState.OrchestrationInstance, taskScheduledEvent, taskFailedEvent, errorPropagationMode); } - workItem.OrchestrationRuntimeState.AddEvent(message.Event); + // In this case, the ExecutionRewoundEvent has already been added to the history and is just sent as a way to trigger the failed deepest suborchestrations to rerun. + // We do not redundantly add it to the history in this situation. + if (!(message.Event is ExecutionRewoundEvent executionRewoundEvent && workItem.OrchestrationRuntimeState.OrchestrationStatus == OrchestrationStatus.Running)) + { + workItem.OrchestrationRuntimeState.AddEvent(message.Event); + } } return true; @@ -950,6 +1023,10 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt runtimeState.AddEvent(executionCompletedEvent); + if (completeOrchestratorAction.Tags.TryGetValue(OrchestrationTags.CompleteOrchestrationLogWarning, out string warningMessage)) + { + this.logHelper.OrchestrationCompletedWithWarning(runtimeState.OrchestrationInstance!, completeOrchestratorAction.OrchestrationStatus, warningMessage); + } this.logHelper.OrchestrationCompleted(runtimeState, completeOrchestratorAction); TraceHelper.TraceInstance( runtimeState.OrchestrationStatus == OrchestrationStatus.Failed ? 
TraceEventType.Warning : TraceEventType.Information, @@ -965,6 +1042,8 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt runtimeState.OrchestrationInstance!, () => Utils.EscapeJson(JsonDataConverter.Default.Serialize(runtimeState.GetOrchestrationRuntimeStateDump(), true))); + SetOrchestrationActivityStatus(completeOrchestratorAction); + // Check to see if we need to start a new execution if (completeOrchestratorAction.OrchestrationStatus == OrchestrationStatus.ContinuedAsNew) { @@ -1012,12 +1091,6 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt subOrchestrationFailedEvent.FailureDetails = completeOrchestratorAction.FailureDetails; taskMessage.Event = subOrchestrationFailedEvent; - - if (completeOrchestratorAction.OrchestrationStatus == OrchestrationStatus.Failed) - { - DistributedTraceActivity.Current?.SetStatus( - ActivityStatusCode.Error, completeOrchestratorAction.Result); - } } ResetDistributedTraceActivity(runtimeState); @@ -1029,12 +1102,6 @@ internal static bool ReconcileMessagesWithState(TaskOrchestrationWorkItem workIt } } - if (completeOrchestratorAction.OrchestrationStatus == OrchestrationStatus.Failed) - { - DistributedTraceActivity.Current?.SetStatus( - ActivityStatusCode.Error, completeOrchestratorAction.Result); - } - ResetDistributedTraceActivity(runtimeState); return null; @@ -1047,6 +1114,34 @@ private void ResetDistributedTraceActivity(OrchestrationRuntimeState runtimeStat DistributedTraceActivity.Current = null; } + private static void SetOrchestrationActivityStatus(OrchestrationCompleteOrchestratorAction completeOrchestratorAction) + { + if (DistributedTraceActivity.Current == null) + { + return; + } + + string failureDescription = completeOrchestratorAction.FailureDetails?.ErrorMessage + ?? completeOrchestratorAction.Result + ?? 
completeOrchestratorAction.OrchestrationStatus.ToString(); + + switch (completeOrchestratorAction.OrchestrationStatus) + { + case OrchestrationStatus.Completed: + DistributedTraceActivity.Current.SetStatus(ActivityStatusCode.OK, OrchestrationStatus.Completed.ToString()); + break; + case OrchestrationStatus.ContinuedAsNew: + DistributedTraceActivity.Current.SetStatus(ActivityStatusCode.OK, OrchestrationStatus.ContinuedAsNew.ToString()); + break; + case OrchestrationStatus.Failed: + DistributedTraceActivity.Current.SetStatus(ActivityStatusCode.Error, failureDescription); + break; + case OrchestrationStatus.Terminated: + DistributedTraceActivity.Current.SetStatus(ActivityStatusCode.Error, OrchestrationStatus.Terminated.ToString()); + break; + } + } + TaskMessage ProcessScheduleTaskDecision( ScheduleTaskOrchestratorAction scheduleTaskOrchestratorAction, OrchestrationRuntimeState runtimeState, @@ -1143,11 +1238,14 @@ TaskMessage ProcessCreateSubOrchestrationInstanceDecision( bool includeParameters, Activity? 
parentTraceActivity) { + IDictionary mergedTags = OrchestrationTags.MergeTags(createSubOrchestrationAction.Tags, runtimeState.Tags); + var historyEvent = new SubOrchestrationInstanceCreatedEvent(createSubOrchestrationAction.Id) { Name = createSubOrchestrationAction.Name, Version = createSubOrchestrationAction.Version, - InstanceId = createSubOrchestrationAction.InstanceId + InstanceId = createSubOrchestrationAction.InstanceId, + Tags = mergedTags, }; if (includeParameters) { @@ -1160,7 +1258,6 @@ TaskMessage ProcessCreateSubOrchestrationInstanceDecision( var startedEvent = new ExecutionStartedEvent(-1, createSubOrchestrationAction.Input) { - Tags = OrchestrationTags.MergeTags(createSubOrchestrationAction.Tags, runtimeState.Tags), OrchestrationInstance = new OrchestrationInstance { InstanceId = createSubOrchestrationAction.InstanceId, @@ -1174,7 +1271,8 @@ TaskMessage ProcessCreateSubOrchestrationInstanceDecision( TaskScheduleId = createSubOrchestrationAction.Id }, Name = createSubOrchestrationAction.Name, - Version = createSubOrchestrationAction.Version + Version = createSubOrchestrationAction.Version, + Tags = mergedTags, }; // If a parent trace context was provided via the CreateSubOrchestrationAction.Tags, we will use this as the parent trace context of the suborchestration execution Activity rather than Activity.Current.Context. @@ -1243,6 +1341,130 @@ TaskMessage ProcessSendEventDecision( }; } + void ProcessRewindOrchestrationDecision( + OrchestrationRuntimeState runtimeState, + out List subOrchestrationRewindMessages, + out OrchestrationRuntimeState newRuntimeState) + { + + /* WARNING!!!: + * If any changes are made to how this method modifies the orchestration's history, then corresponding changes *must* + * be made in the backend implementations that rely on this method for executing a rewind. 
+ */ + + HashSet failedTaskIds = new(); + subOrchestrationRewindMessages = new(); + + newRuntimeState = new() + { + Status = runtimeState.Status + }; + + // Determine the task IDs of the failed tasks and suborchestrations + foreach (var evt in runtimeState.Events) + { + if (evt is TaskFailedEvent taskFailedEvent) + { + failedTaskIds.Add(taskFailedEvent.TaskScheduledId); + } + else if (evt is SubOrchestrationInstanceFailedEvent subOrchestrationInstanceFailedEvent) + { + failedTaskIds.Add(subOrchestrationInstanceFailedEvent.TaskScheduledId); + } + } + + ExecutionRewoundEvent executionRewoundEvent = (runtimeState.NewEvents.Last(e => e is ExecutionRewoundEvent) as ExecutionRewoundEvent)!; + string newExecutionId = Guid.NewGuid().ToString("N"); + + // Copy the existing history, removing the failed task/suborchestration events and generating rewind events for each of the failed suborchestrations. + foreach (var evt in runtimeState.Events) + { + // Do not add the TaskScheduledEvents for the failed tasks so that they get rescheduled, and do not add any of + // the failed task/suborchestration/execution events to the new history. 
+ if (!(evt is TaskScheduledEvent taskScheduledEvent && failedTaskIds.Contains(taskScheduledEvent.EventId)) + && evt is not TaskFailedEvent + && evt is not SubOrchestrationInstanceFailedEvent + && evt is not ExecutionCompletedEvent) + { + HistoryEvent eventToAdd = evt; + + if (evt is ExecutionStartedEvent executionStartedEvent) + { + // Copy all information from the old ExecutionStartedEvent except for the ExecutionId, since we create a new one + var newExecutionStartedEvent = new ExecutionStartedEvent(executionStartedEvent); + newExecutionStartedEvent.OrchestrationInstance.ExecutionId = newExecutionId; + + // If this is a suborchestration, we also need to update the ParentInstance's ExecutionId to match the new ExecutionId of the rewinding parent orchestration + if (!string.IsNullOrEmpty(executionRewoundEvent.ParentExecutionId)) + { + newExecutionStartedEvent.ParentInstance.OrchestrationInstance.ExecutionId = executionRewoundEvent.ParentExecutionId; + } + eventToAdd = newExecutionStartedEvent; + } + + // For each of the failed suborchestrations, generate a rewind event + else if (evt is SubOrchestrationInstanceCreatedEvent subOrchestrationInstanceCreatedEvent + && failedTaskIds.Contains(subOrchestrationInstanceCreatedEvent.EventId)) + { + var childExecutionRewoundEvent = new ExecutionRewoundEvent(-1, executionRewoundEvent!.Reason) + { + ParentExecutionId = newExecutionId, + InstanceId = subOrchestrationInstanceCreatedEvent.InstanceId + }; + + if (runtimeState.ExecutionStartedEvent.TryGetParentTraceContext(out ActivityContext parentTraceContext)) + { + // We set a new client span ID here so that the execution of the rewound suborchestration is not tied to the + // old parent. 
+ var newClientSpanId = ActivitySpanId.CreateRandom(); + var newSubOrchestrationInstanceCreatedEvent = new SubOrchestrationInstanceCreatedEvent(subOrchestrationInstanceCreatedEvent) + { + ClientSpanId = newClientSpanId.ToString() + }; + eventToAdd = newSubOrchestrationInstanceCreatedEvent; + + ActivityContext childActivityContext = new( + parentTraceContext.TraceId, + newClientSpanId, + parentTraceContext.TraceFlags, + parentTraceContext.TraceState); + childExecutionRewoundEvent.SetParentTraceContext(childActivityContext); + } + + subOrchestrationRewindMessages.Add + ( + new TaskMessage + { + Event = childExecutionRewoundEvent, + OrchestrationInstance = new OrchestrationInstance + { + InstanceId = subOrchestrationInstanceCreatedEvent.InstanceId + }, + } + ); + } + + // Finally, add the event to the new history + newRuntimeState.AddEvent(eventToAdd); + } + } + + // If this is a "terminal leaf" with no suborchestrations, we need to add an outbound message to it to force it to rerun. + // This will trigger the orchestration to rerun with the altered history, so it will only rerun the failed tasks. + // Once it finishes, it will send a completion message to its parent orchestration, which will trigger the parents to rerun as well. + if (subOrchestrationRewindMessages.Count == 0) + { + subOrchestrationRewindMessages.Add( + new TaskMessage + { + // This is a "dummy event" that will not be added to the history and is used just to trigger the rerun. 
+ Event = new ExecutionRewoundEvent(-1, string.Empty), + OrchestrationInstance = newRuntimeState.OrchestrationInstance, + } + ); + } + } + internal class NonBlockingCountdownLock { int available; diff --git a/src/DurableTask.Core/TaskOrchestrationExecutor.cs b/src/DurableTask.Core/TaskOrchestrationExecutor.cs index e3dc6fc49..540851e50 100644 --- a/src/DurableTask.Core/TaskOrchestrationExecutor.cs +++ b/src/DurableTask.Core/TaskOrchestrationExecutor.cs @@ -35,6 +35,7 @@ public class TaskOrchestrationExecutor readonly OrchestrationRuntimeState orchestrationRuntimeState; readonly TaskOrchestration taskOrchestration; readonly bool skipCarryOverEvents; + readonly IExceptionPropertiesProvider? exceptionPropertiesProvider; Task? result; /// @@ -51,16 +52,8 @@ public TaskOrchestrationExecutor( BehaviorOnContinueAsNew eventBehaviourForContinueAsNew, TaskOrchestrationEntityParameters? entityParameters, ErrorPropagationMode errorPropagationMode = ErrorPropagationMode.SerializeExceptions) + : this(orchestrationRuntimeState, taskOrchestration, eventBehaviourForContinueAsNew, entityParameters, errorPropagationMode, null) { - this.decisionScheduler = new SynchronousTaskScheduler(); - this.context = new TaskOrchestrationContext( - orchestrationRuntimeState.OrchestrationInstance, - this.decisionScheduler, - entityParameters, - errorPropagationMode); - this.orchestrationRuntimeState = orchestrationRuntimeState; - this.taskOrchestration = taskOrchestration; - this.skipCarryOverEvents = eventBehaviourForContinueAsNew == BehaviorOnContinueAsNew.Ignore; } /// @@ -76,8 +69,38 @@ public TaskOrchestrationExecutor( TaskOrchestration taskOrchestration, BehaviorOnContinueAsNew eventBehaviourForContinueAsNew, ErrorPropagationMode errorPropagationMode = ErrorPropagationMode.SerializeExceptions) - : this(orchestrationRuntimeState, taskOrchestration, eventBehaviourForContinueAsNew, entityParameters: null, errorPropagationMode) + : this(orchestrationRuntimeState, taskOrchestration, 
eventBehaviourForContinueAsNew, entityParameters: null, errorPropagationMode, null) + { + } + + /// + /// Initializes a new instance of the class. + /// + /// + /// + /// + /// + /// + /// + public TaskOrchestrationExecutor( + OrchestrationRuntimeState orchestrationRuntimeState, + TaskOrchestration taskOrchestration, + BehaviorOnContinueAsNew eventBehaviourForContinueAsNew, + TaskOrchestrationEntityParameters? entityParameters, + ErrorPropagationMode errorPropagationMode, + IExceptionPropertiesProvider? exceptionPropertiesProvider) { + this.decisionScheduler = new SynchronousTaskScheduler(); + this.context = new TaskOrchestrationContext( + orchestrationRuntimeState.OrchestrationInstance, + this.decisionScheduler, + entityParameters, + errorPropagationMode, + exceptionPropertiesProvider); + this.orchestrationRuntimeState = orchestrationRuntimeState; + this.taskOrchestration = taskOrchestration; + this.skipCarryOverEvents = eventBehaviourForContinueAsNew == BehaviorOnContinueAsNew.Ignore; + this.exceptionPropertiesProvider = exceptionPropertiesProvider; } /// diff --git a/src/DurableTask.Core/Tracing/DistributedTraceContext.cs b/src/DurableTask.Core/Tracing/DistributedTraceContext.cs index b69b1f5f6..612caa386 100644 --- a/src/DurableTask.Core/Tracing/DistributedTraceContext.cs +++ b/src/DurableTask.Core/Tracing/DistributedTraceContext.cs @@ -80,5 +80,15 @@ public string? TraceState /// [DataMember] public DateTimeOffset? 
ActivityStartTime { get; set; } + + internal DistributedTraceContext Clone() + { + return new DistributedTraceContext(this.TraceParent, this.TraceState) + { + Id = Id, + SpanId = SpanId, + ActivityStartTime = ActivityStartTime + }; + } } } diff --git a/src/DurableTask.Core/Tracing/TraceHelper.cs b/src/DurableTask.Core/Tracing/TraceHelper.cs index 5f80784cd..9c45b2889 100644 --- a/src/DurableTask.Core/Tracing/TraceHelper.cs +++ b/src/DurableTask.Core/Tracing/TraceHelper.cs @@ -602,7 +602,13 @@ internal static void EndActivitiesForProcessingEntityInvocation(List t { if (result.ErrorMessage != null || result.FailureDetails != null) { - activity.SetTag(Schema.Task.ErrorMessage, result.ErrorMessage ?? result.FailureDetails!.ErrorMessage); + string errorDetails = result.ErrorMessage ?? result.FailureDetails!.ErrorMessage; + activity.SetTag(Schema.Task.ErrorMessage, errorDetails); + activity.SetStatus(ActivityStatusCode.Error, errorDetails); + } + else + { + activity.SetStatus(ActivityStatusCode.OK, "Completed"); } if (result.StartTimeUtc is DateTime startTime) { @@ -630,6 +636,7 @@ internal static void EndActivitiesForProcessingEntityInvocation(List t if (activity != null) { activity.SetTag(Schema.Task.ErrorMessage, errorMessage); + activity.SetStatus(ActivityStatusCode.Error, errorMessage); activity.Dispose(); } } diff --git a/src/DurableTask.Core/WorkItemDispatcher.cs b/src/DurableTask.Core/WorkItemDispatcher.cs index 3829578d8..1a64065e5 100644 --- a/src/DurableTask.Core/WorkItemDispatcher.cs +++ b/src/DurableTask.Core/WorkItemDispatcher.cs @@ -142,7 +142,14 @@ public async Task StartAsync() // We just want this to Run we intentionally don't wait #pragma warning disable 4014 - Task.Run(() => this.DispatchAsync(context)); + Task.Run(() => this.DispatchAsync(context)).ContinueWith(t => + { + TraceHelper.TraceException( + TraceEventType.Critical, + "WorkItemDispatcherDispatch-FatalTermination", + t.Exception, + $"Dispatch loop for '{this.name}' terminated fatally!"); + 
}, TaskContinuationOptions.OnlyOnFaulted); #pragma warning restore 4014 } } @@ -224,128 +231,167 @@ async Task DispatchAsync(WorkItemDispatcherContext context) bool logThrottle = true; while (this.isStarted) { - if (!await this.concurrencyLock.WaitAsync(TimeSpan.FromSeconds(5))) + var semaphoreAcquired = false; + var scheduledWorkItem = false; + try { - if (logThrottle) + if (!await this.concurrencyLock.WaitAsync(TimeSpan.FromSeconds(5))) { - // This can happen frequently under heavy load. - // To avoid log spam, we log just once until we can proceed. - this.LogHelper.FetchingThrottled( - context, - this.concurrentWorkItemCount, - this.MaxConcurrentWorkItems); - TraceHelper.Trace( - TraceEventType.Warning, - "WorkItemDispatcherDispatch-MaxOperations", - this.GetFormattedLog(dispatcherId, $"Max concurrent operations ({this.concurrentWorkItemCount}) are already in progress. Still waiting for next accept.")); - - logThrottle = false; + if (logThrottle) + { + // This can happen frequently under heavy load. + // To avoid log spam, we log just once until we can proceed. + this.LogHelper.FetchingThrottled( + context, + this.concurrentWorkItemCount, + this.MaxConcurrentWorkItems); + TraceHelper.Trace( + TraceEventType.Warning, + "WorkItemDispatcherDispatch-MaxOperations", + this.GetFormattedLog(dispatcherId, $"Max concurrent operations ({this.concurrentWorkItemCount}) are already in progress. 
Still waiting for next accept.")); + + logThrottle = false; + } + + continue; } - continue; - } + semaphoreAcquired = true; + logThrottle = true; - logThrottle = true; + var delaySecs = 0; + T workItem = default(T); + try + { + Interlocked.Increment(ref this.activeFetchers); + this.LogHelper.FetchWorkItemStarting(context, DefaultReceiveTimeout, this.concurrentWorkItemCount, this.MaxConcurrentWorkItems); + TraceHelper.Trace( + TraceEventType.Verbose, + "WorkItemDispatcherDispatch-StartFetch", + this.GetFormattedLog(dispatcherId, $"Starting fetch with timeout of {DefaultReceiveTimeout} ({this.concurrentWorkItemCount}/{this.MaxConcurrentWorkItems} max)")); - var delaySecs = 0; - T workItem = default(T); - try - { - Interlocked.Increment(ref this.activeFetchers); - this.LogHelper.FetchWorkItemStarting(context, DefaultReceiveTimeout, this.concurrentWorkItemCount, this.MaxConcurrentWorkItems); - TraceHelper.Trace( - TraceEventType.Verbose, - "WorkItemDispatcherDispatch-StartFetch", - this.GetFormattedLog(dispatcherId, $"Starting fetch with timeout of {DefaultReceiveTimeout} ({this.concurrentWorkItemCount}/{this.MaxConcurrentWorkItems} max)")); + Stopwatch timer = Stopwatch.StartNew(); + workItem = await this.FetchWorkItem(DefaultReceiveTimeout, this.shutdownCancellationTokenSource.Token); - Stopwatch timer = Stopwatch.StartNew(); - workItem = await this.FetchWorkItem(DefaultReceiveTimeout, this.shutdownCancellationTokenSource.Token); + if (!IsNull(workItem)) + { + string workItemId = this.workItemIdentifier(workItem); + this.LogHelper.FetchWorkItemCompleted( + context, + workItemId, + timer.Elapsed, + this.concurrentWorkItemCount, + this.MaxConcurrentWorkItems); + } - if (!IsNull(workItem)) + TraceHelper.Trace( + TraceEventType.Verbose, + "WorkItemDispatcherDispatch-EndFetch", + this.GetFormattedLog(dispatcherId, $"After fetch ({timer.ElapsedMilliseconds} ms) ({this.concurrentWorkItemCount}/{this.MaxConcurrentWorkItems} max)")); + } + catch (TimeoutException) { - string 
workItemId = this.workItemIdentifier(workItem); - this.LogHelper.FetchWorkItemCompleted( - context, - workItemId, - timer.Elapsed, - this.concurrentWorkItemCount, - this.MaxConcurrentWorkItems); + delaySecs = 0; } - - TraceHelper.Trace( - TraceEventType.Verbose, - "WorkItemDispatcherDispatch-EndFetch", - this.GetFormattedLog(dispatcherId, $"After fetch ({timer.ElapsedMilliseconds} ms) ({this.concurrentWorkItemCount}/{this.MaxConcurrentWorkItems} max)")); - } - catch (TimeoutException) - { - delaySecs = 0; - } - catch (TaskCanceledException exception) - { - TraceHelper.Trace( - TraceEventType.Information, - "WorkItemDispatcherDispatch-TaskCanceledException", - this.GetFormattedLog(dispatcherId, $"TaskCanceledException while fetching workItem, should be harmless: {exception.Message}")); - delaySecs = this.GetDelayInSecondsAfterOnFetchException(exception); - } - catch (Exception exception) - { - if (!this.isStarted) + catch (TaskCanceledException exception) { TraceHelper.Trace( - TraceEventType.Information, - "WorkItemDispatcherDispatch-HarmlessException", - this.GetFormattedLog(dispatcherId, $"Harmless exception while fetching workItem after Stop(): {exception.Message}")); + TraceEventType.Information, + "WorkItemDispatcherDispatch-TaskCanceledException", + this.GetFormattedLog(dispatcherId, $"TaskCanceledException while fetching workItem, should be harmless: {exception.Message}")); + delaySecs = this.GetDelayInSecondsAfterOnFetchException(exception); } - else + catch (Exception exception) { - this.LogHelper.FetchWorkItemFailure(context, exception); - // TODO : dump full node context here - TraceHelper.TraceException( - TraceEventType.Warning, - "WorkItemDispatcherDispatch-Exception", - exception, - this.GetFormattedLog(dispatcherId, $"Exception while fetching workItem: {exception.Message}")); - delaySecs = this.GetDelayInSecondsAfterOnFetchException(exception); + if (!this.isStarted) + { + TraceHelper.Trace( + TraceEventType.Information, + 
"WorkItemDispatcherDispatch-HarmlessException", + this.GetFormattedLog(dispatcherId, $"Harmless exception while fetching workItem after Stop(): {exception.Message}")); + } + else + { + this.LogHelper.FetchWorkItemFailure(context, exception); + // TODO : dump full node context here + TraceHelper.TraceException( + TraceEventType.Warning, + "WorkItemDispatcherDispatch-Exception", + exception, + this.GetFormattedLog(dispatcherId, $"Exception while fetching workItem: {exception.Message}")); + delaySecs = this.GetDelayInSecondsAfterOnFetchException(exception); + } + } + finally + { + Interlocked.Decrement(ref this.activeFetchers); } - } - finally - { - Interlocked.Decrement(ref this.activeFetchers); - } - var scheduledWorkItem = false; - if (!IsNull(workItem)) - { - if (!this.isStarted) + if (!IsNull(workItem)) { - if (this.SafeReleaseWorkItem != null) + if (!this.isStarted) { - await this.SafeReleaseWorkItem(workItem); + if (this.SafeReleaseWorkItem != null) + { + await this.SafeReleaseWorkItem(workItem); + } + } + else + { + Interlocked.Increment(ref this.concurrentWorkItemCount); + // We just want this to Run we intentionally don't wait + #pragma warning disable 4014 + Task.Run(() => this.ProcessWorkItemAsync(context, workItem)); + #pragma warning restore 4014 + + scheduledWorkItem = true; } } - else + + delaySecs = Math.Max(this.delayOverrideSecs, delaySecs); + if (delaySecs > 0) { - Interlocked.Increment(ref this.concurrentWorkItemCount); - // We just want this to Run we intentionally don't wait - #pragma warning disable 4014 - Task.Run(() => this.ProcessWorkItemAsync(context, workItem)); - #pragma warning restore 4014 + await Task.Delay(TimeSpan.FromSeconds(delaySecs)); + } - scheduledWorkItem = true; + if (!scheduledWorkItem) + { + this.concurrencyLock.Release(); } } - - delaySecs = Math.Max(this.delayOverrideSecs, delaySecs); - if (delaySecs > 0) + catch (Exception exception) when (!Utils.IsFatal(exception)) { - await Task.Delay(TimeSpan.FromSeconds(delaySecs)); 
- } + // Catch-all for any unhandled exception in the dispatch loop body. + // Without this, the dispatch loop would silently terminate because + // DispatchAsync runs as a fire-and-forget Task.Run. + this.LogHelper.DispatcherLoopFailed(context, exception); + TraceHelper.TraceException( + TraceEventType.Error, + "WorkItemDispatcherDispatch-UnhandledException", + exception, + this.GetFormattedLog(dispatcherId, + $"Unhandled exception in dispatch loop. Will retry after backoff.")); + + // Release the semaphore if we acquired it but never handed it off + // to ProcessWorkItemAsync, to avoid permanently reducing concurrency. + if (semaphoreAcquired && !scheduledWorkItem) + { + try { this.concurrencyLock.Release(); } catch { /* best effort */ } + } - if (!scheduledWorkItem) - { - this.concurrencyLock.Release(); + try + { + await Task.Delay(TimeSpan.FromSeconds(BackOffIntervalOnInvalidOperationSecs), this.shutdownCancellationTokenSource.Token); + } + catch (OperationCanceledException) + { + // Shutdown requested during backoff; exit promptly. + } + catch (ObjectDisposedException) + { + // CancellationTokenSource was disposed (e.g., Dispose called + // shortly after StopAsync); treat as shutdown. + } } } diff --git a/src/DurableTask.Core/WorkItemMetadata.cs b/src/DurableTask.Core/WorkItemMetadata.cs index ae3de4651..c2e908f48 100644 --- a/src/DurableTask.Core/WorkItemMetadata.cs +++ b/src/DurableTask.Core/WorkItemMetadata.cs @@ -1,19 +1,39 @@ -namespace DurableTask.Core +// ---------------------------------------------------------------------------------- +// Copyright Microsoft Corporation +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ---------------------------------------------------------------------------------- +#nullable enable +namespace DurableTask.Core { /// /// A class representing metadata information about a work item. /// public class WorkItemMetadata { + internal WorkItemMetadata(bool isExtendedSession, bool includeState) + { + this.IsExtendedSession = isExtendedSession; + this.IncludeState = includeState; + } + /// - /// Gets or sets whether or not the execution of the work item is within an extended session. + /// Gets whether or not the execution of the work item is within an extended session. /// - public bool IsExtendedSession { get; set; } + public bool IsExtendedSession { get; private set; } /// - /// Gets or sets whether or not to include past events in the orchestration history when executing the work item via middleware. - /// This assumes that the middleware is able to handle extended sessions and does not require history for replays. + /// Gets whether or not to include instance state when executing the work item via middleware. + /// When false, this assumes that the middleware is able to handle extended sessions and has already cached + /// the instance state from a previous execution, so it does not need to be included again. 
/// - public bool IncludePastEvents { get; set; } + public bool IncludeState { get; private set; } } } diff --git a/src/DurableTask.Emulator/DurableTask.Emulator.csproj b/src/DurableTask.Emulator/DurableTask.Emulator.csproj index b89e40bcc..6b4d3aa1c 100644 --- a/src/DurableTask.Emulator/DurableTask.Emulator.csproj +++ b/src/DurableTask.Emulator/DurableTask.Emulator.csproj @@ -2,7 +2,7 @@ - netstandard2.0;net462 + netstandard2.0;net48 Microsoft.Azure.DurableTask.Emulator NU5125;NU5048 diff --git a/src/DurableTask.ServiceBus/DurableTask.ServiceBus.csproj b/src/DurableTask.ServiceBus/DurableTask.ServiceBus.csproj index 36038afaa..0152c0860 100644 --- a/src/DurableTask.ServiceBus/DurableTask.ServiceBus.csproj +++ b/src/DurableTask.ServiceBus/DurableTask.ServiceBus.csproj @@ -2,7 +2,7 @@ - netstandard2.0;net462 + netstandard2.0;net48 Microsoft.Azure.DurableTask.ServiceBus AnyCPU;x64 @@ -11,7 +11,7 @@ 4 0 - 2 + 3 $(MajorVersion).$(MinorVersion).$(PatchVersion) $(VersionPrefix).0 @@ -24,7 +24,7 @@ $(VersionPrefix) - + diff --git a/src/DurableTask.ServiceBus/ServiceBusOrchestrationService.cs b/src/DurableTask.ServiceBus/ServiceBusOrchestrationService.cs index 3d49bb749..30bcb81c2 100644 --- a/src/DurableTask.ServiceBus/ServiceBusOrchestrationService.cs +++ b/src/DurableTask.ServiceBus/ServiceBusOrchestrationService.cs @@ -242,8 +242,8 @@ public ServiceBusOrchestrationService( public async Task StartAsync() { this.cancellationTokenSource = new CancellationTokenSource(); - this.orchestrationSessions = new ConcurrentDictionary(); - this.orchestrationMessages = new ConcurrentDictionary(); + this.orchestrationSessions = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); + this.orchestrationMessages = new ConcurrentDictionary(StringComparer.OrdinalIgnoreCase); this.orchestratorSender = new MessageSender(this.serviceBusConnection, this.orchestratorEntityName, this.workerEntityName); this.workerSender = new MessageSender(this.serviceBusConnection, this.workerEntityName, 
this.orchestratorEntityName); @@ -1580,13 +1580,31 @@ void LogSentMessages(IMessageSession session, string messageType, IList $"{m.Message.MessageId} <{m.Action?.Event.EventId.ToString()}>"))}")); + string.Join(",", messages.Select(m => + { + string scheduledTime = m.Message.ScheduledEnqueueTimeUtc > DateTime.MinValue + ? $" scheduledAt:{m.Message.ScheduledEnqueueTimeUtc:o}" + : ""; + string targetSession = !string.IsNullOrEmpty(m.Message.SessionId) + ? $" targetSession:{m.Message.SessionId}" + : ""; + return $"{m.Message.MessageId} <{m.Action?.Event.EventId.ToString()}>{scheduledTime}{targetSession}"; + }))}")); } async Task GetSessionStateAsync(IMessageSession session, IOrchestrationServiceBlobStore orchestrationServiceBlobStore) { byte[] state = await session.GetStateAsync(); + if (state == null || state.Length == 0) + { + TraceHelper.TraceSession( + TraceEventType.Information, + "ServiceBusOrchestrationService-GetSessionState-EmptyState", + session.SessionId, + $"Session '{session.SessionId}' has null or empty state ({state?.Length ?? 0} bytes)."); + } + using (Stream rawSessionStream = state != null ? 
new MemoryStream(state) : null) { this.ServiceStats.OrchestrationDispatcherStats.SessionGets.Increment(); @@ -1615,6 +1633,32 @@ async Task TrySetSessionStateAsync( newOrchestrationRuntimeState.ExecutionStartedEvent == null || newOrchestrationRuntimeState.OrchestrationStatus != OrchestrationStatus.Running) { + string reason; + TraceEventType traceLevel; + + if (newOrchestrationRuntimeState == null) + { + reason = "newOrchestrationRuntimeState is null"; + traceLevel = TraceEventType.Warning; + } + else if (newOrchestrationRuntimeState.ExecutionStartedEvent == null) + { + reason = "ExecutionStartedEvent is null (possible ghost session with empty state)"; + traceLevel = TraceEventType.Warning; + } + else + { + reason = $"OrchestrationStatus is {newOrchestrationRuntimeState.OrchestrationStatus}"; + traceLevel = TraceEventType.Information; + } + + TraceHelper.TraceSession( + traceLevel, + "ServiceBusOrchestrationService-TrySetSessionState-DeletingState", + workItem.InstanceId, + $"Setting session state to null. Reason: {reason}. 
" + + $"Session: '{session.SessionId}', InstanceId: '{workItem.InstanceId}'"); + await session.SetStateAsync(null); return true; } diff --git a/test/DurableTask.AzureServiceFabric.Integration.Tests/DurableTask.AzureServiceFabric.Integration.Tests.csproj b/test/DurableTask.AzureServiceFabric.Integration.Tests/DurableTask.AzureServiceFabric.Integration.Tests.csproj index 203744464..09d30327f 100644 --- a/test/DurableTask.AzureServiceFabric.Integration.Tests/DurableTask.AzureServiceFabric.Integration.Tests.csproj +++ b/test/DurableTask.AzureServiceFabric.Integration.Tests/DurableTask.AzureServiceFabric.Integration.Tests.csproj @@ -2,7 +2,7 @@ - net462 + net48 true AnyCPU;x64 diff --git a/test/DurableTask.AzureServiceFabric.Tests/DurableTask.AzureServiceFabric.Tests.csproj b/test/DurableTask.AzureServiceFabric.Tests/DurableTask.AzureServiceFabric.Tests.csproj index c74247869..e409082a0 100644 --- a/test/DurableTask.AzureServiceFabric.Tests/DurableTask.AzureServiceFabric.Tests.csproj +++ b/test/DurableTask.AzureServiceFabric.Tests/DurableTask.AzureServiceFabric.Tests.csproj @@ -2,7 +2,7 @@ - net462 + net48 AnyCPU;x64 diff --git a/test/DurableTask.AzureStorage.Tests/AzureStorageScaleTests.cs b/test/DurableTask.AzureStorage.Tests/AzureStorageScaleTests.cs index 64b9eb22b..7d7ac8a78 100644 --- a/test/DurableTask.AzureStorage.Tests/AzureStorageScaleTests.cs +++ b/test/DurableTask.AzureStorage.Tests/AzureStorageScaleTests.cs @@ -145,7 +145,7 @@ async Task EnsureTaskHubAsync( try { - Assert.IsTrue(trackingStore.ExistsAsync().Result, $"Tracking Store was not created."); + Assert.IsTrue(await trackingStore.ExistsAsync(), $"Tracking Store was not created."); } catch (NotSupportedException) { } @@ -182,7 +182,7 @@ async Task EnsureTaskHubAsync( try { - Assert.IsFalse(trackingStore.ExistsAsync().Result, $"Tracking Store was not deleted."); + Assert.IsFalse(await trackingStore.ExistsAsync(), $"Tracking Store was not deleted."); } catch (NotSupportedException) { } @@ -195,7 +195,7 
@@ async Task EnsureTaskHubAsync( private async Task EnsureLeasesMatchControlQueue(string directoryReference, BlobContainerClient taskHubContainer, ControlQueue[] controlQueues) { - BlobItem[] leaseBlobs = await taskHubContainer.GetBlobsAsync(prefix: directoryReference).ToArrayAsync(); + BlobItem[] leaseBlobs = await taskHubContainer.GetBlobsAsync(traits: BlobTraits.None, states: BlobStates.None, prefix: directoryReference, cancellationToken: default).ToArrayAsync(); Assert.AreEqual(controlQueues.Length, leaseBlobs.Length, "Expected to see the same number of control queues and lease blobs."); foreach (BlobItem blobItem in leaseBlobs) { @@ -322,9 +322,12 @@ public async Task MultiWorkerLeaseMovement(PartitionManagerType partitionManager Assert.IsTrue( service.OwnedControlQueues.All(q => ownedLeases.Any(l => l.Name.Contains(q.Name))), "Mismatch between queue assignment and lease ownership."); - Assert.IsTrue( - service.OwnedControlQueues.All(q => q.InnerQueue.ExistsAsync().GetAwaiter().GetResult()), - $"One or more control queues owned by {service.WorkerId} do not exist"); + foreach (var q in service.OwnedControlQueues) + { + Assert.IsTrue( + await q.InnerQueue.ExistsAsync(), + $"Control queue {q.Name} owned by {service.WorkerId} does not exist"); + } } Assert.AreEqual(totalLeaseCount, allQueueNames.Count, "Unexpected number of queues!"); @@ -505,133 +508,142 @@ await TestHelpers.WaitFor( [TestMethod] public async Task MultipleWorkersAttemptingToCompleteSameWorkItem() { - var orchestrationInstance = new OrchestrationInstance - { - InstanceId = "instance_id", - ExecutionId = "execution_id", - }; - - ExecutionStartedEvent startedEvent = new(-1, string.Empty) + AzureStorageOrchestrationService service1 = null; + AzureStorageOrchestrationService service2 = null; + try { - Name = "orchestration", - Version = string.Empty, - OrchestrationInstance = orchestrationInstance, - ScheduledStartTime = DateTime.UtcNow, - }; - - // Create worker 1, wait for it to acquire the lease. 
- // Make sure to set a small control queue visibility timeout so that worker 2 can reacquire the work item quickly once worker 1 loses the lease. - var service1 = await this.EnsureTaskHubAsync( - nameof(MultipleWorkersAttemptingToCompleteSameWorkItem), - testDeletion: false, - deleteBeforeCreate: true, - partitionCount: 1, - workerId: "1", - controlQueueVisibilityTimeout: TimeSpan.FromSeconds(1) - ); - await service1.StartAsync(); - await TestHelpers.WaitFor( - condition: () => service1.OwnedControlQueues.Any(), - timeout: TimeSpan.FromSeconds(30)); - ControlQueue controlQueue = service1.OwnedControlQueues.Single(); + var orchestrationInstance = new OrchestrationInstance + { + InstanceId = "instance_id", + ExecutionId = "execution_id", + }; - // Create the orchestration and get the first work item and start "working" on it - await service1.CreateTaskOrchestrationAsync( - new TaskMessage() + ExecutionStartedEvent startedEvent = new(-1, string.Empty) { + Name = "orchestration", + Version = string.Empty, OrchestrationInstance = orchestrationInstance, - Event = startedEvent - }); - var workItem1 = await service1.LockNextTaskOrchestrationWorkItemAsync( - TimeSpan.FromMinutes(5), - CancellationToken.None); - var runtimeState = workItem1.OrchestrationRuntimeState; - runtimeState.AddEvent(new OrchestratorStartedEvent(-1)); - runtimeState.AddEvent(startedEvent); - runtimeState.AddEvent(new TaskScheduledEvent(0, "task")); - runtimeState.AddEvent(new OrchestratorCompletedEvent(-1)); - - // Now lose the lease - BlobPartitionLease lease = await service1.ListBlobLeasesAsync().SingleAsync(); - await service1.OnOwnershipLeaseReleasedAsync(lease, CloseReason.LeaseLost); - await TestHelpers.WaitFor( - condition: () => !service1.OwnedControlQueues.Any(), - timeout: TimeSpan.FromSeconds(30)); - - // Create worker 2, wait for it to now acquire the lease - var service2 = await this.EnsureTaskHubAsync( - nameof(MultipleWorkersAttemptingToCompleteSameWorkItem), - testDeletion: false, - 
deleteBeforeCreate: false, - workerId: "2", - partitionCount: 1, - controlQueueVisibilityTimeout: TimeSpan.FromSeconds(1) - ); - await service2.StartAsync(); - await service2.OnOwnershipLeaseAquiredAsync(lease); - await TestHelpers.WaitFor( - condition: () => service2.OwnedControlQueues.Any(), - timeout: TimeSpan.FromSeconds(60)); - - // Have worker 2 dequeue the same work item and start "working" on it - var workItem2 = await service2.LockNextTaskOrchestrationWorkItemAsync( - TimeSpan.FromMinutes(5), - CancellationToken.None); - workItem2.OrchestrationRuntimeState = runtimeState; - - // Worker 2 completes the work item - await service2.CompleteTaskOrchestrationWorkItemAsync(workItem2, runtimeState, new List(), new List(), new List(), null, null); - // Now worker 1 will attempt to complete the same work item. Since this is the first attempt to complete a work item and add a history for the orchestration (by worker 1), - // there is no etag stored for the OrchestrationSession, and so the a "conflict" exception will be thrown as worker 2 already created a history for the orchestration. 
- SessionAbortedException exception = await Assert.ThrowsExceptionAsync(async () => - await service1.CompleteTaskOrchestrationWorkItemAsync(workItem1, runtimeState, new List(), new List(), new List(), null, null) - ); - Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException)); - DurableTaskStorageException dtse = (DurableTaskStorageException)exception.InnerException; - Assert.AreEqual((int)HttpStatusCode.Conflict, dtse.HttpStatusCode); - await service1.ReleaseTaskOrchestrationWorkItemAsync(workItem1); - await service2.ReleaseTaskOrchestrationWorkItemAsync(workItem2); - - // Now simulate a task completing for the orchestration - var taskCompletedEvent = new TaskCompletedEvent(-1, 0, string.Empty); - await service2.SendTaskOrchestrationMessageAsync(new TaskMessage { Event = taskCompletedEvent, OrchestrationInstance = orchestrationInstance }); - // Worker 2 gets the next work item related to this task completion and starts "working" on it - workItem2 = await service2.LockNextTaskOrchestrationWorkItemAsync( - TimeSpan.FromMinutes(5), - CancellationToken.None); - runtimeState = workItem2.OrchestrationRuntimeState; - runtimeState.AddEvent(new OrchestratorStartedEvent(-1)); - runtimeState.AddEvent(taskCompletedEvent); - runtimeState.AddEvent(new ExecutionCompletedEvent(1, string.Empty, OrchestrationStatus.Completed)); - runtimeState.AddEvent(new OrchestratorCompletedEvent(-1)); - - // Now force worker 2 to lose the lease and have worker 1 acquire it - lease = await service2.ListBlobLeasesAsync().SingleAsync(); - await service2.OnOwnershipLeaseReleasedAsync(lease, CloseReason.LeaseLost); - await TestHelpers.WaitFor( - condition: () => !service2.OwnedControlQueues.Any(), - timeout: TimeSpan.FromSeconds(30)); - await service1.OnOwnershipLeaseAquiredAsync(lease); - await TestHelpers.WaitFor( - condition: () => service1.OwnedControlQueues.Any(), - timeout: TimeSpan.FromSeconds(60)); + ScheduledStartTime = DateTime.UtcNow, + }; - // Worker 1 also 
acquires the work item and starts "working" on it - workItem1 = await service1.LockNextTaskOrchestrationWorkItemAsync( - TimeSpan.FromMinutes(5), - CancellationToken.None); - workItem1.OrchestrationRuntimeState = runtimeState; - - // Worker 1 completes the work item - await service1.CompleteTaskOrchestrationWorkItemAsync(workItem1, runtimeState, new List(), new List(), new List(), null, null); - // Now worker 2 attempts to complete the same work item. Since this is not the first work item for the orchestration, now an etag exists for the OrchestrationSession, and the exception - // that is thrown will be "precondition failed" as the Etag is stale after worker 1 completed the work item. - exception = await Assert.ThrowsExceptionAsync(async () => - await service2.CompleteTaskOrchestrationWorkItemAsync(workItem2, runtimeState, new List(), new List(), new List(), null, null) - ); - Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException)); - dtse = (DurableTaskStorageException)exception.InnerException; - Assert.AreEqual((int)HttpStatusCode.PreconditionFailed, dtse.HttpStatusCode); + // Create worker 1, wait for it to acquire the lease. + // Make sure to set a small control queue visibility timeout so that worker 2 can reacquire the work item quickly once worker 1 loses the lease. 
+ service1 = await this.EnsureTaskHubAsync( + nameof(MultipleWorkersAttemptingToCompleteSameWorkItem), + testDeletion: false, + deleteBeforeCreate: true, + partitionCount: 1, + workerId: "1", + controlQueueVisibilityTimeout: TimeSpan.FromSeconds(1) + ); + await service1.StartAsync(); + await TestHelpers.WaitFor( + condition: () => service1.OwnedControlQueues.Any(), + timeout: TimeSpan.FromSeconds(30)); + + // Create the orchestration and get the first work item and start "working" on it + await service1.CreateTaskOrchestrationAsync( + new TaskMessage() + { + OrchestrationInstance = orchestrationInstance, + Event = startedEvent + }); + var workItem1 = await service1.LockNextTaskOrchestrationWorkItemAsync( + TimeSpan.FromMinutes(5), + CancellationToken.None); + var runtimeState = workItem1.OrchestrationRuntimeState; + runtimeState.AddEvent(new OrchestratorStartedEvent(-1)); + runtimeState.AddEvent(startedEvent); + runtimeState.AddEvent(new TaskScheduledEvent(0, "task")); + runtimeState.AddEvent(new OrchestratorCompletedEvent(-1)); + + // Now lose the lease + BlobPartitionLease lease = await service1.ListBlobLeasesAsync().SingleAsync(); + await service1.OnOwnershipLeaseReleasedAsync(lease, CloseReason.LeaseLost); + await TestHelpers.WaitFor( + condition: () => !service1.OwnedControlQueues.Any(), + timeout: TimeSpan.FromSeconds(30)); + + // Create worker 2, wait for it to now acquire the lease + service2 = await this.EnsureTaskHubAsync( + nameof(MultipleWorkersAttemptingToCompleteSameWorkItem), + testDeletion: false, + deleteBeforeCreate: false, + workerId: "2", + partitionCount: 1, + controlQueueVisibilityTimeout: TimeSpan.FromSeconds(1) + ); + await service2.StartAsync(); + await service2.OnOwnershipLeaseAquiredAsync(lease); + await TestHelpers.WaitFor( + condition: () => service2.OwnedControlQueues.Any(), + timeout: TimeSpan.FromSeconds(60)); + + // Have worker 2 dequeue the same work item and start "working" on it + var workItem2 = await 
service2.LockNextTaskOrchestrationWorkItemAsync( + TimeSpan.FromMinutes(5), + CancellationToken.None); + workItem2.OrchestrationRuntimeState = runtimeState; + + // Worker 2 completes the work item + await service2.CompleteTaskOrchestrationWorkItemAsync(workItem2, runtimeState, new List(), new List(), new List(), null, null); + // Now worker 1 will attempt to complete the same work item. Since this is the first attempt to complete a work item and add a history for the orchestration (by worker 1), + // there is no etag stored for the OrchestrationSession, and so the a "conflict" exception will be thrown as worker 2 already created a history for the orchestration. + SessionAbortedException exception = await Assert.ThrowsExceptionAsync(async () => + await service1.CompleteTaskOrchestrationWorkItemAsync(workItem1, runtimeState, new List(), new List(), new List(), null, null) + ); + Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException)); + DurableTaskStorageException dtse = (DurableTaskStorageException)exception.InnerException; + Assert.AreEqual((int)HttpStatusCode.Conflict, dtse.HttpStatusCode); + await service1.ReleaseTaskOrchestrationWorkItemAsync(workItem1); + await service2.ReleaseTaskOrchestrationWorkItemAsync(workItem2); + + // Now simulate a task completing for the orchestration + var taskCompletedEvent = new TaskCompletedEvent(-1, 0, string.Empty); + await service2.SendTaskOrchestrationMessageAsync(new TaskMessage { Event = taskCompletedEvent, OrchestrationInstance = orchestrationInstance }); + // Worker 2 gets the next work item related to this task completion and starts "working" on it + workItem2 = await service2.LockNextTaskOrchestrationWorkItemAsync( + TimeSpan.FromMinutes(5), + CancellationToken.None); + runtimeState = workItem2.OrchestrationRuntimeState; + runtimeState.AddEvent(new OrchestratorStartedEvent(-1)); + runtimeState.AddEvent(taskCompletedEvent); + runtimeState.AddEvent(new ExecutionCompletedEvent(1, string.Empty, 
OrchestrationStatus.Completed)); + runtimeState.AddEvent(new OrchestratorCompletedEvent(-1)); + + // Now force worker 2 to lose the lease and have worker 1 acquire it + lease = await service2.ListBlobLeasesAsync().SingleAsync(); + await service2.OnOwnershipLeaseReleasedAsync(lease, CloseReason.LeaseLost); + await TestHelpers.WaitFor( + condition: () => !service2.OwnedControlQueues.Any(), + timeout: TimeSpan.FromSeconds(30)); + await service1.OnOwnershipLeaseAquiredAsync(lease); + await TestHelpers.WaitFor( + condition: () => service1.OwnedControlQueues.Any(), + timeout: TimeSpan.FromSeconds(60)); + + // Worker 1 also acquires the work item and starts "working" on it + workItem1 = await service1.LockNextTaskOrchestrationWorkItemAsync( + TimeSpan.FromMinutes(5), + CancellationToken.None); + workItem1.OrchestrationRuntimeState = runtimeState; + + // Worker 1 completes the work item + await service1.CompleteTaskOrchestrationWorkItemAsync(workItem1, runtimeState, new List(), new List(), new List(), null, null); + // Now worker 2 attempts to complete the same work item. Since this is not the first work item for the orchestration, now an etag exists for the OrchestrationSession, and the exception + // that is thrown will be "precondition failed" as the Etag is stale after worker 1 completed the work item. 
+ exception = await Assert.ThrowsExceptionAsync(async () => + await service2.CompleteTaskOrchestrationWorkItemAsync(workItem2, runtimeState, new List(), new List(), new List(), null, null) + ); + Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException)); + dtse = (DurableTaskStorageException)exception.InnerException; + Assert.AreEqual((int)HttpStatusCode.PreconditionFailed, dtse.HttpStatusCode); + } + finally + { + await service1?.StopAsync(isForced: true); + await service2?.StopAsync(isForced: true); + } } [TestMethod] diff --git a/test/DurableTask.AzureStorage.Tests/AzureStorageScenarioTests.cs b/test/DurableTask.AzureStorage.Tests/AzureStorageScenarioTests.cs index aee05e945..d7a8fccaa 100644 --- a/test/DurableTask.AzureStorage.Tests/AzureStorageScenarioTests.cs +++ b/test/DurableTask.AzureStorage.Tests/AzureStorageScenarioTests.cs @@ -13,15 +13,6 @@ namespace DurableTask.AzureStorage.Tests { - using System; - using System.Collections.Generic; - using System.Diagnostics; - using System.IO; - using System.Linq; - using System.Runtime.Serialization; - using System.Text; - using System.Threading; - using System.Threading.Tasks; using Azure.Data.Tables; using Azure.Storage.Blobs; using Azure.Storage.Blobs.Models; @@ -36,7 +27,17 @@ namespace DurableTask.AzureStorage.Tests using Moq; using Newtonsoft.Json; using Newtonsoft.Json.Linq; -#if !NET462 + using System; + using System.Collections.Generic; + using System.Diagnostics; + using System.IO; + using System.Linq; + using System.Net; + using System.Runtime.Serialization; + using System.Text; + using System.Threading; + using System.Threading.Tasks; +#if !NET48 using OpenTelemetry; using OpenTelemetry.Trace; #endif @@ -556,7 +557,7 @@ private async Task GetBlobCount(string containerName, string directoryName) var containerClient = client.GetBlobContainerClient(containerName); await containerClient.CreateIfNotExistsAsync(); - return await containerClient.GetBlobsAsync(traits: 
BlobTraits.Metadata, prefix: directoryName).CountAsync(); + return await containerClient.GetBlobsAsync(traits: BlobTraits.Metadata, states: BlobStates.None, prefix: directoryName, cancellationToken: default).CountAsync(); } @@ -884,7 +885,7 @@ public async Task TerminateOrchestration(bool enableExtendedSessions) var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Counter), 0); // Need to wait for the instance to start before we can terminate it. - // TODO: This requirement may not be ideal and should be revisited. + // TerminatePendingOrchestration tests terminating a pending orchestration. await client.WaitForStartupAsync(TimeSpan.FromSeconds(10)); await client.TerminateAsync("sayōnara"); @@ -965,6 +966,50 @@ public async Task TerminateSuspendedOrchestration(bool enableExtendedSessions) } } + /// + /// Test that a pending orchestration can be terminated (including tests with a large termination reason that will need to be + /// stored in blob storage). + /// + [DataTestMethod] + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TerminatePendingOrchestration(bool enableExtendedSessions, bool largeTerminationReason) + { + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) + { + await host.StartAsync(); + // Schedule a start time to ensure that the orchestration is in a Pending state when we attempt to terminate. + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Counter), 0, startAt: DateTime.UtcNow.AddMinutes(1)); + await client.WaitForStatusChange(TimeSpan.FromSeconds(5), OrchestrationStatus.Pending); + + string message = largeTerminationReason ? 
this.GenerateMediumRandomStringPayload().ToString() : "terminate"; + await client.TerminateAsync(message); + + var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); + + if (largeTerminationReason) + { + int blobCount = await this.GetBlobCount("test-largemessages", client.InstanceId); + Assert.IsTrue(blobCount > 0); + } + + // Confirm the pending orchestration was terminated. + Assert.AreEqual(OrchestrationStatus.Terminated, status?.OrchestrationStatus); + Assert.AreEqual(message, status?.Output); + + // Now sleep for a minute and confirm that the orchestration does not start after its scheduled time. + Thread.Sleep(TimeSpan.FromMinutes(1)); + + status = await client.GetStatusAsync(); + Assert.AreEqual(OrchestrationStatus.Terminated, status?.OrchestrationStatus); + Assert.AreEqual(message, status?.Output); + + await host.StopAsync(); + } + } + /// /// End-to-end test which validates the Rewind functionality on more than one orchestration. /// @@ -1618,7 +1663,7 @@ public async Task LargeMessage_WithEscapedInstanceId_CanBeStoredAndFetchedSucces // Use an instanceId that contains special characters which must be escaped in URIs string id = "test|123:with white spcae"; - var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Echo), input:message, instanceId: id); + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Echo), input: message, instanceId: id); var status = await client.WaitForCompletionAsync(TimeSpan.FromMinutes(2)); Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus); @@ -2191,7 +2236,7 @@ private static async Task ValidateLargeMessageBlobUrlAsync(string taskHubName, s BlobContainerClient container = serviceClient.GetBlobContainerClient(containerName); Assert.IsTrue(await container.ExistsAsync(), $"Blob container {containerName} is expected to exist."); BlobItem blob = await container - .GetBlobsByHierarchyAsync(BlobTraits.Metadata, prefix: sanitizedInstanceId) + 
.GetBlobsByHierarchyAsync(traits: BlobTraits.Metadata, states: BlobStates.None, delimiter: null, prefix: sanitizedInstanceId, cancellationToken: default) .Where(x => x.IsBlob && x.Blob.Name == sanitizedInstanceId + "/" + blobName) .Select(x => x.Blob) .SingleOrDefaultAsync(); @@ -2367,6 +2412,7 @@ await Task.WhenAll( [DataRow(false, true, false)] [DataRow(false, false, true)] [DataRow(false, false, false)] + [Ignore("Skipping since this functionality has since changed, see TestWorkerFailingDuringCompleteWorkItemCall")] public async Task TestAllowReplayingTerminalInstances(bool enableExtendedSessions, bool sendTerminateEvent, bool allowReplayingTerminalInstances) { using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost( @@ -2441,179 +2487,1267 @@ public async Task TestAllowReplayingTerminalInstances(bool enableExtendedSession } } - [TestMethod] - [DataRow(VersioningSettings.VersionMatchStrategy.Strict)] - [DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder)] - [DataRow(VersioningSettings.VersionMatchStrategy.None)] - public async Task OrchestrationFailsWithVersionMismatch(VersioningSettings.VersionMatchStrategy matchStrategy) + /// + /// Confirm that if a worker fails after committing the new history but before updating the instance state in a call to + /// for an orchestration that has + /// reached a terminal state, then storage is brought to consistent state by the call to + /// . + /// Since we cannot simulate a worker failure at this precise point, instead what is done by this test is that we + /// let an orchestration run to completion, and then manually change the instance table state back to "Running". + /// We then send an event to the orchestration, which triggers a call to lock the next task work item, at which point + /// the inconsistent state in storage for the terminal instance is recognized, the instance state is updated, and the work item discarded. 
+ /// Note that this test does not confirm that orphaned blobs are deleted by the call to lock the next orchestration work item + /// in the case of a terminal orchestration with inconsistent state in storage. This is because there is no easy way to mock/inject + /// the tracking store context object that is part of the orchestration session state which keeps track of the blobs. + /// + /// + [DataTestMethod] + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TestWorkerFailingDuringCompleteWorkItemCallCompletedOrchestration(bool enableExtendedSessions, bool terminate) { - using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: new VersioningSettings - { - Version = "1", - MatchStrategy = matchStrategy, - FailureStrategy = VersioningSettings.VersionFailureStrategy.Fail - })) + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) { await host.StartAsync(); - var client = await host.StartOrchestrationAsync(typeof(Orchestrations.SayHelloInline), "World", tags: new Dictionary(), version: "2"); - var status = await client.WaitForCompletionAsync(StandardTimeout); + // Run simple orchestrator to completion, this will help us obtain a valid terminal history for the orchestrator + string input = "hello!"; + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Echo), input, tags: new Dictionary { { "key", "value" } }); + var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); + Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus); + string executionId = status.OrchestrationInstance.ExecutionId; - if (matchStrategy == VersioningSettings.VersionMatchStrategy.None) + // Simulate having an "out of date" Instance table, by setting it's runtime status to "Running". + // This simulates the scenario where the History table was updated, but not the Instance table. 
+ var instanceId = client.InstanceId; + AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings( + enableExtendedSessions); + AzureStorageClient azureStorageClient = new AzureStorageClient(settings); + + Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName); + TableEntity entity = new TableEntity(instanceId, "") { - Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus); + ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"), + ["Output"] = "null", + }; + await instanceTable.MergeEntityAsync(entity, Azure.ETag.All); + + // Assert that the status in the Instance table reads "Running" + IList state = await client.GetStateAsync(instanceId); + OrchestrationStatus forcedStatus = state.First().OrchestrationStatus; + Assert.AreEqual(OrchestrationStatus.Running, forcedStatus); + + // The type of event sent should not matter - the event itself should be discarded, and the instance table updated + // to reflect the status in the history table. 
+ if (terminate) + { + await client.TerminateAsync("testing"); } else { - Assert.AreEqual(OrchestrationStatus.Failed, status?.OrchestrationStatus); + await client.RaiseEventAsync("Foo", "Bar"); } + await Task.Delay(TimeSpan.FromSeconds(30)); - await host.StopAsync(); - } - } + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); - [TestMethod] - [DataRow(VersioningSettings.VersionMatchStrategy.Strict, "1.0.0")] - [DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder, "1.0.0")] - [DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder, "0.9.0")] - [DataRow(VersioningSettings.VersionMatchStrategy.None, "1.0.0")] - public async Task OrchestrationSucceedsWithVersion(VersioningSettings.VersionMatchStrategy matchStrategy, string orchestrationVersion) - { - using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: new VersioningSettings - { - Version = "1.0.0", - MatchStrategy = matchStrategy, - FailureStrategy = VersioningSettings.VersionFailureStrategy.Fail - })) - { - await host.StartAsync(); + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Completed, status.OrchestrationStatus); + Assert.AreEqual(input, JToken.Parse(status.Output).ToString()); + Assert.AreEqual(input, JToken.Parse(status.Input).ToString()); - var client = await host.StartOrchestrationAsync(typeof(Orchestrations.SayHelloInline), "World", tags: new Dictionary(), version: orchestrationVersion); - var status = await client.WaitForCompletionAsync(StandardTimeout); + // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again + await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All); - Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus); + if (terminate) + { + await client.TerminateAsync("testing"); 
+ } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Completed, status.OrchestrationStatus); + Assert.AreEqual(input, JToken.Parse(status.Output).ToString()); + Assert.AreEqual(input, JToken.Parse(status.Input).ToString()); + Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.Echo))); + Assert.IsTrue(status.Tags.Contains(new KeyValuePair("key", "value"))); + Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId); await host.StopAsync(); } } - [TestMethod] - public async Task OrchestrationRejectsWithVersionMismatch() + /// + /// Same as but for a failed orchestration. + /// + /// + [DataTestMethod] + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TestWorkerFailingDuringCompleteWorkItemCallFailedOrchestration(bool enableExtendedSessions, bool terminate) { - using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: new VersioningSettings - { - Version = "1", - MatchStrategy = VersioningSettings.VersionMatchStrategy.Strict, - FailureStrategy = VersioningSettings.VersionFailureStrategy.Reject - })) + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) { await host.StartAsync(); - var client = await host.StartOrchestrationAsync(typeof(Orchestrations.SayHelloInline), "World", tags: new Dictionary(), version: "2"); - // We intend for this to timeout as the work should be getting rejected. 
+ string failureReason = "Failure!"; + var client = await host.StartOrchestrationAsync( + typeof(Orchestrations.ThrowException), + input: failureReason); var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); - Assert.IsNull(status); + Assert.AreEqual(OrchestrationStatus.Failed, status?.OrchestrationStatus); + string executionId = status.OrchestrationInstance.ExecutionId; - // We should either be pending (recently rejected) or running (to be rejected). - status = await client.GetStatusAsync(); - Assert.IsTrue(OrchestrationStatus.Running == status?.OrchestrationStatus || OrchestrationStatus.Pending == status?.OrchestrationStatus); + // Simulate having an "out of date" Instance table, by setting it's runtime status to "Running". + // This simulates the scenario where the History table was updated, but not the Instance table. + var instanceId = client.InstanceId; + AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings( + enableExtendedSessions); + AzureStorageClient azureStorageClient = new AzureStorageClient(settings); - var history = await client.GetOrchestrationHistoryAsync(client.InstanceId); - Assert.AreEqual(0, history.Count, "A rejected orchestration should have no history as it should never have been started."); + Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName); + + TableEntity entity = new TableEntity(instanceId, "") + { + ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"), + ["Output"] = "null", + }; + await instanceTable.MergeEntityAsync(entity, Azure.ETag.All); + + // Assert that the status in the Instance table reads "Running" + IList state = await client.GetStateAsync(instanceId); + OrchestrationStatus forcedStatus = state.First().OrchestrationStatus; + Assert.AreEqual(OrchestrationStatus.Running, forcedStatus); + + // The type of event sent should not matter - the event itself should be discarded, and the 
instance table updated + // to reflect the status in the history table. + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Failed, status.OrchestrationStatus); + Assert.AreEqual(failureReason, status.Output); + Assert.AreEqual(failureReason, JToken.Parse(status.Input).ToString()); + + // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again + await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All); + + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Failed, status.OrchestrationStatus); + Assert.AreEqual(failureReason, status.Output); + Assert.AreEqual(failureReason, JToken.Parse(status.Input).ToString()); + Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.ThrowException))); + Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId); await host.StopAsync(); } } -#if !NET462 /// - /// End-to-end test which validates a simple orchestrator function that calls an activity function - /// and checks the OpenTelemetry trace information + /// Same as but for a terminated orchestration. 
/// [DataTestMethod] - [DataRow(true)] - [DataRow(false)] - public async Task OpenTelemetry_SayHelloWithActivity(bool enableExtendedSessions) + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TestWorkerFailingDuringCompleteWorkItemCallTerminatedOrchestration(bool enableExtendedSessions, bool terminate) { - var processor = new Mock>(); - using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) { - using (Sdk.CreateTracerProviderBuilder() - .AddSource("DurableTask.Core") - .AddProcessor(processor.Object) - .Build()) + await host.StartAsync(); + + // Using the counter orchestration because it will wait indefinitely for input. + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Counter), 0); + await client.WaitForStartupAsync(TimeSpan.FromSeconds(10)); + // Terminate the orchestration + string reason = "terminate"; + await client.TerminateAsync(reason); + var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); + Assert.AreEqual(OrchestrationStatus.Terminated, status?.OrchestrationStatus); + string executionId = status.OrchestrationInstance.ExecutionId; + + // Simulate having an "out of date" Instance table, by setting it's runtime status to "Running". + // This simulates the scenario where the History table was updated, but not the Instance table. 
+ var instanceId = client.InstanceId; + AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings( + enableExtendedSessions); + AzureStorageClient azureStorageClient = new AzureStorageClient(settings); + + Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName); + TableEntity entity = new TableEntity(instanceId, "") { - await host.StartAsync(); + ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"), + ["Output"] = "null", + }; + await instanceTable.MergeEntityAsync(entity, Azure.ETag.All); - var client = await host.StartOrchestrationAsync(typeof(Orchestrations.SayHelloWithActivity), "World"); - var status = await client.WaitForCompletionAsync(StandardTimeout); + // Assert that the status in the Instance table reads "Running" + IList state = await client.GetStateAsync(instanceId); + OrchestrationStatus forcedStatus = state.First().OrchestrationStatus; + Assert.AreEqual(OrchestrationStatus.Running, forcedStatus); - await host.StopAsync(); + // The type of event sent should not matter - the event itself should be discarded, and the instance table updated + // to reflect the status in the history table. + if (terminate) + { + await client.TerminateAsync("testing"); } - } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); - // (1) Explanation about indexes: - // The orchestration Activity's start at Invocation[1] and each action logs - // two activities - (Processor.OnStart(Activity) and Processor.OnEnd(Activity) - // The Activity for orchestration execution is "started" (with the same Id, SpanId, etc.) - // upon every replay of the orchestration so will have an OnStart invocation for each such replay, - // but an OnEnd at the end of orchestration execution. - // The first OnEnd invocation is at index 2, so we start from there. 
+ // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); - // (2) Additional invocations: - // processor.Invocations[0] - processor.SetParentProvider(TracerProviderSdk) - // processor.Invocations[10] - processor.OnShutdown() - // processor.Invocations[11] - processor.Dispose(true) + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Terminated, status.OrchestrationStatus); + Assert.AreEqual(reason, status.Output); + Assert.AreEqual(0, int.Parse(status.Input)); - // Create orchestration Activity - Activity activity2 = (Activity)processor.Invocations[2].Arguments[0]; - // Task execution Activity - Activity activity5 = (Activity)processor.Invocations[5].Arguments[0]; - // Task completed Activity - Activity activity8 = (Activity)processor.Invocations[8].Arguments[0]; - // Orchestration execution Activity - Activity activity9 = (Activity)processor.Invocations[9].Arguments[0]; + // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again + await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All); - // Checking total number activities - Assert.AreEqual(12, processor.Invocations.Count); + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); - // Checking tag values - string activity2TypeValue = activity2.Tags.First(k => (k.Key).Equals("durabletask.type" )).Value; - string activity5TypeValue = activity5.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity8TypeValue = activity8.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity9TypeValue = activity9.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - - ActivityKind activity2Kind = activity2.Kind; - ActivityKind activity5Kind = 
activity5.Kind; - ActivityKind activity8Kind = activity8.Kind; - ActivityKind activity9Kind = activity9.Kind; - - Assert.AreEqual("orchestration", activity2TypeValue); - Assert.AreEqual("activity", activity5TypeValue); - Assert.AreEqual("activity", activity8TypeValue); - Assert.AreEqual("orchestration", activity9TypeValue); - Assert.AreEqual(ActivityKind.Producer, activity2Kind); - Assert.AreEqual(ActivityKind.Server, activity5Kind); - Assert.AreEqual(ActivityKind.Client, activity8Kind); - Assert.AreEqual(ActivityKind.Server, activity9Kind); + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); - // Checking span ID correlation between parent and child - Assert.AreEqual(activity2.SpanId, activity9.ParentSpanId); - Assert.AreEqual(activity8.SpanId, activity5.ParentSpanId); - Assert.AreEqual(activity9.SpanId, activity8.ParentSpanId); + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Terminated, status.OrchestrationStatus); + Assert.AreEqual(reason, status.Output); + Assert.AreEqual(0, int.Parse(status.Input)); + Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.Counter))); + Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId); - // Checking trace ID values - Assert.AreEqual(activity2.TraceId.ToString(), activity5.TraceId.ToString(), activity8.TraceId.ToString(), activity9.TraceId.ToString()); + await host.StopAsync(); + } } /// - /// End-to-end test which validates a simple orchestrator function that waits for an external event - /// raised through the RaiseEvent API and checks the OpenTelemetry trace information + /// Same as but for an orchestration with large input + /// and output, which will need to be stored in blob storage. 
/// [DataTestMethod] - [DataRow(true)] - [DataRow(false)] + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TestWorkerFailingDuringCompleteWorkItemCallLargeInputOutput(bool enableExtendedSessions, bool terminate) + { + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) + { + await host.StartAsync(); + + string message = this.GenerateMediumRandomStringPayload().ToString(); + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Echo), message); + var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); + Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus); + string executionId = status.OrchestrationInstance.ExecutionId; + + var instanceId = client.InstanceId; + int blobCount = await this.GetBlobCount("test-largemessages", instanceId); + Assert.IsTrue(blobCount > 0); + + // Simulate having an "out of date" Instance table, by setting it's runtime status to "Running". + // This simulates the scenario where the History table was updated, but not the Instance table. 
+ AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings( + enableExtendedSessions); + AzureStorageClient azureStorageClient = new AzureStorageClient(settings); + + Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName); + TableEntity entity = new TableEntity(instanceId, "") + { + ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"), + ["Output"] = "null", + }; + await instanceTable.MergeEntityAsync(entity, Azure.ETag.All); + + // Assert that the status in the Instance table reads "Running" + IList state = await client.GetStateAsync(instanceId); + OrchestrationStatus forcedStatus = state.First().OrchestrationStatus; + Assert.AreEqual(OrchestrationStatus.Running, forcedStatus); + + // The type of event sent should not matter - the event itself should be discarded, and the instance table updated + // to reflect the status in the history table. + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Completed, status.OrchestrationStatus); + Assert.AreEqual(message, JToken.Parse(status.Output).ToString()); + Assert.AreEqual(message, JToken.Parse(status.Input).ToString()); + + // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again + await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All); + + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, 
forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Completed, status.OrchestrationStatus); + Assert.AreEqual(message, JToken.Parse(status.Output).ToString()); + Assert.AreEqual(message, JToken.Parse(status.Input).ToString()); + Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.Echo))); + Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId); + + await host.StopAsync(); + } + } + + /// + /// Same as but for a large termination reason that + /// will need to be stored in blob storage. + /// + [DataTestMethod] + [DataRow(true, true)] + [DataRow(false, true)] + [DataRow(true, false)] + [DataRow(false, false)] + public async Task TestWorkerFailingDuringCompleteWorkItemCallLargeTerminationReason(bool enableExtendedSessions, bool terminate) + { + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) + { + await host.StartAsync(); + + string message = this.GenerateMediumRandomStringPayload().ToString(); + // Using the counter orchestration because it will wait indefinitely for input. + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.Counter), 0); + await client.WaitForStartupAsync(TimeSpan.FromSeconds(10)); + // Terminate the orchestration + await client.TerminateAsync(message); + var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10)); + Assert.AreEqual(OrchestrationStatus.Terminated, status?.OrchestrationStatus); + string executionId = status.OrchestrationInstance.ExecutionId; + + var instanceId = client.InstanceId; + int blobCount = await this.GetBlobCount("test-largemessages", instanceId); + Assert.IsTrue(blobCount > 0); + + // Simulate having an "out of date" Instance table, by setting it's runtime status to "Running". 
+ // This simulates the scenario where the History table was updated, but not the Instance table. + AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings( + enableExtendedSessions); + AzureStorageClient azureStorageClient = new AzureStorageClient(settings); + + Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName); + TableEntity entity = new TableEntity(instanceId, "") + { + ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"), + ["Output"] = "null", + }; + await instanceTable.MergeEntityAsync(entity, Azure.ETag.All); + + // Assert that the status in the Instance table reads "Running" + IList state = await client.GetStateAsync(instanceId); + OrchestrationStatus forcedStatus = state.First().OrchestrationStatus; + Assert.AreEqual(OrchestrationStatus.Running, forcedStatus); + + // The type of event sent should not matter - the event itself should be discarded, and the instance table updated + // to reflect the status in the history table. 
+ if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Terminated, status.OrchestrationStatus); + Assert.AreEqual(message, status.Output); + Assert.AreEqual(0, int.Parse(status.Input)); + + // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again + await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All); + + if (terminate) + { + await client.TerminateAsync("testing"); + } + else + { + await client.RaiseEventAsync("Foo", "Bar"); + } + await Task.Delay(TimeSpan.FromSeconds(30)); + + // A replay should have occurred, forcing the instance table to be updated with a terminal status + state = await client.GetStateAsync(instanceId); + Assert.AreEqual(1, state.Count); + + status = state.First(); + Assert.AreEqual(OrchestrationStatus.Terminated, status.OrchestrationStatus); + Assert.AreEqual(message, status.Output); + Assert.AreEqual(0, int.Parse(status.Input)); + Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.Counter))); + Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId); + + await host.StopAsync(); + } + } + + /// + /// Same as but for a large exception message that will need + /// to be stored in blob storage. 
///
[DataTestMethod]
[DataRow(true, true)]
[DataRow(false, true)]
[DataRow(true, false)]
[DataRow(false, false)]
public async Task TestWorkerFailingDuringCompleteWorkItemCallLargeFailureReason(bool enableExtendedSessions, bool terminate)
{
    using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions))
    {
        await host.StartAsync();

        // The orchestration is started with this payload as its input; the assertions below expect
        // the same payload to come back as the orchestration output.
        string message = this.GenerateMediumRandomStringPayload().ToString();
        var client = await host.StartOrchestrationAsync(
            typeof(Orchestrations.ThrowException),
            input: message);

        // Sends a message to the (already failed) instance and then waits a fixed 30 seconds
        // so the worker has time to pick the message up and replay the instance.
        async Task SendMessageAndWaitForReplayAsync()
        {
            if (terminate)
            {
                await client.TerminateAsync("testing");
            }
            else
            {
                await client.RaiseEventAsync("Foo", "Bar");
            }

            await Task.Delay(TimeSpan.FromSeconds(30));
        }

        var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10));
        Assert.AreEqual(OrchestrationStatus.Failed, status?.OrchestrationStatus);
        string executionId = status.OrchestrationInstance.ExecutionId;

        // The payload must have been offloaded to blob storage for this test to be meaningful.
        var instanceId = client.InstanceId;
        int blobCount = await this.GetBlobCount("test-largemessages", instanceId);
        Assert.IsTrue(blobCount > 0);

        // Simulate having an "out of date" Instance table, by setting its runtime status to "Running".
        // This simulates the scenario where the History table was updated, but not the Instance table.
        AzureStorageOrchestrationServiceSettings settings = TestHelpers.GetTestAzureStorageOrchestrationServiceSettings(
            enableExtendedSessions);
        AzureStorageClient azureStorageClient = new AzureStorageClient(settings);

        Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName);
        TableEntity entity = new TableEntity(instanceId, "")
        {
            ["RuntimeStatus"] = OrchestrationStatus.Running.ToString("G"),
            ["Output"] = "null",
        };
        await instanceTable.MergeEntityAsync(entity, Azure.ETag.All);

        // Assert that the status in the Instance table reads "Running"
        IList<OrchestrationState> state = await client.GetStateAsync(instanceId);
        OrchestrationStatus forcedStatus = state.First().OrchestrationStatus;
        Assert.AreEqual(OrchestrationStatus.Running, forcedStatus);

        // The type of event sent should not matter - the event itself should be discarded, and the instance table updated
        // to reflect the status in the history table.
        await SendMessageAndWaitForReplayAsync();

        // A replay should have occurred, forcing the instance table to be updated with a terminal status
        state = await client.GetStateAsync(instanceId);
        Assert.AreEqual(1, state.Count);

        status = state.First();
        Assert.AreEqual(OrchestrationStatus.Failed, status.OrchestrationStatus);
        Assert.AreEqual(message, status.Output);
        Assert.AreEqual(message, JToken.Parse(status.Input).ToString());

        // Now simulate there being no instance entity (which can be the case for suborchestrations that complete in one execution), and try again
        await instanceTable.DeleteEntityAsync(entity, Azure.ETag.All);

        await SendMessageAndWaitForReplayAsync();

        // A replay should have occurred, forcing the instance table to be updated with a terminal status
        state = await client.GetStateAsync(instanceId);
        Assert.AreEqual(1, state.Count);

        status = state.First();
        Assert.AreEqual(OrchestrationStatus.Failed, status.OrchestrationStatus);
        Assert.AreEqual(message, status.Output);
        Assert.AreEqual(message, JToken.Parse(status.Input).ToString());

        // Since the instance entity was deleted, the name and execution ID must also have been restored.
        Assert.IsTrue(status.Name.Contains(nameof(Orchestrations.ThrowException)));
        Assert.AreEqual(executionId, status.OrchestrationInstance.ExecutionId);

        await host.StopAsync();
    }
}

/// <summary>
/// Starts an orchestration with version "2" on a worker configured for version "1" with the
/// <c>Fail</c> failure strategy. The orchestration is expected to fail unless the match
/// strategy is <c>None</c>, in which case it is expected to complete.
/// </summary>
/// <param name="matchStrategy">The version match strategy the worker is configured with.</param>
[TestMethod]
[DataRow(VersioningSettings.VersionMatchStrategy.Strict)]
[DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder)]
[DataRow(VersioningSettings.VersionMatchStrategy.None)]
public async Task OrchestrationFailsWithVersionMismatch(VersioningSettings.VersionMatchStrategy matchStrategy)
{
    var versioningSettings = new VersioningSettings
    {
        Version = "1",
        MatchStrategy = matchStrategy,
        FailureStrategy = VersioningSettings.VersionFailureStrategy.Fail
    };

    using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: versioningSettings))
    {
        await host.StartAsync();

        var client = await host.StartOrchestrationAsync(
            typeof(Orchestrations.SayHelloInline),
            "World",
            tags: new Dictionary<string, string>(),
            version: "2");
        var status = await client.WaitForCompletionAsync(StandardTimeout);

        if (matchStrategy == VersioningSettings.VersionMatchStrategy.None)
        {
            Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus);
        }
        else
        {
            Assert.AreEqual(OrchestrationStatus.Failed, status?.OrchestrationStatus);
        }

        await host.StopAsync();
    }
}

/// <summary>
/// Starts an orchestration whose version is expected to be accepted by a worker configured
/// for version "1.0.0" under the given match strategy, and asserts that it completes.
/// </summary>
/// <param name="matchStrategy">The version match strategy the worker is configured with.</param>
/// <param name="orchestrationVersion">The version the orchestration is started with.</param>
[TestMethod]
[DataRow(VersioningSettings.VersionMatchStrategy.Strict, "1.0.0")]
[DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder, "1.0.0")]
[DataRow(VersioningSettings.VersionMatchStrategy.CurrentOrOlder, "0.9.0")]
[DataRow(VersioningSettings.VersionMatchStrategy.None, "1.0.0")]
public async Task OrchestrationSucceedsWithVersion(VersioningSettings.VersionMatchStrategy matchStrategy, string orchestrationVersion)
{
    var versioningSettings = new VersioningSettings
    {
        Version = "1.0.0",
        MatchStrategy = matchStrategy,
        FailureStrategy = VersioningSettings.VersionFailureStrategy.Fail
    };

    using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: versioningSettings))
    {
        await host.StartAsync();

        var client = await host.StartOrchestrationAsync(
            typeof(Orchestrations.SayHelloInline),
            "World",
            tags: new Dictionary<string, string>(),
            version: orchestrationVersion);
        var status = await client.WaitForCompletionAsync(StandardTimeout);

        Assert.AreEqual(OrchestrationStatus.Completed, status?.OrchestrationStatus);

        await host.StopAsync();
    }
}

/// <summary>
/// Starts an orchestration with version "2" on a worker configured for version "1" with strict
/// matching and the <c>Reject</c> failure strategy. The orchestration is expected to never
/// complete, to remain Pending or Running, and to have no history.
/// </summary>
[TestMethod]
public async Task OrchestrationRejectsWithVersionMismatch()
{
    var versioningSettings = new VersioningSettings
    {
        Version = "1",
        MatchStrategy = VersioningSettings.VersionMatchStrategy.Strict,
        FailureStrategy = VersioningSettings.VersionFailureStrategy.Reject
    };

    using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(false, versioningSettings: versioningSettings))
    {
        await host.StartAsync();

        var client = await host.StartOrchestrationAsync(
            typeof(Orchestrations.SayHelloInline),
            "World",
            tags: new Dictionary<string, string>(),
            version: "2");

        // We intend for this to timeout as the work should be getting rejected.
        var status = await client.WaitForCompletionAsync(TimeSpan.FromSeconds(10));
        Assert.IsNull(status);

        // We should either be pending (recently rejected) or running (to be rejected).
        status = await client.GetStatusAsync();
        Assert.IsTrue(OrchestrationStatus.Running == status?.OrchestrationStatus || OrchestrationStatus.Pending == status?.OrchestrationStatus);

        var history = await client.GetOrchestrationHistoryAsync(client.InstanceId);
        Assert.AreEqual(0, history.Count, "A rejected orchestration should have no history as it should never have been started.");

        await host.StopAsync();
    }
}

///
/// Confirm that:
/// 1. If <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/> is true, and a worker attempts to update the instance table with a stale
/// etag upon completing a work item, a SessionAbortedException is thrown which wraps the inner DurableTaskStorageException, which has the correct status code
/// (precondition failed).
/// The specific scenario tested is if the worker stalled after updating the history table but before updating the instance table. When it attempts to update
/// the instance table with a stale etag, it will fail.
/// 2. If <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/> is false for the above scenario, then the call to update the instance table
/// will go through, and the instance table will be updated with a "stale" status.
///
///
/// Since it is impossible to force stalling, we simulate the above scenario by manually updating the instance table before the worker
/// attempts to complete the work item. The history table update will go through, but the instance table update will fail since "another worker
/// has since updated" the instance table.
///
/// <param name="useInstanceEtag">The value to use for <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/></param>
///
[DataTestMethod]
[DataRow(true)]
[DataRow(false)]
public async Task WorkerAttemptingToUpdateInstanceTableAfterStalling(bool useInstanceEtag)
{
    AzureStorageOrchestrationService service = null;
    try
    {
        var orchestrationInstance = new OrchestrationInstance
        {
            InstanceId = "instance_id",
            ExecutionId = "execution_id",
        };

        ExecutionStartedEvent startedEvent = new(-1, string.Empty)
        {
            Name = "orchestration",
            Version = string.Empty,
            OrchestrationInstance = orchestrationInstance,
            ScheduledStartTime = DateTime.UtcNow,
        };

        var settings = new AzureStorageOrchestrationServiceSettings
        {
            PartitionCount = 1,
            StorageAccountClientProvider = new StorageAccountClientProvider(TestHelpers.GetTestStorageAccountConnectionString()),
            TaskHubName = TestHelpers.GetTestTaskHubName(),
            ExtendedSessionsEnabled = false,
            UseInstanceTableEtag = useInstanceEtag
        };

        service = new AzureStorageOrchestrationService(settings);
        await service.CreateAsync();
        await service.StartAsync();

        // Create the orchestration and get the first work item and start "working" on it
        await service.CreateTaskOrchestrationAsync(
            new TaskMessage()
            {
                OrchestrationInstance = orchestrationInstance,
                Event = startedEvent
            });
        var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(5),
            CancellationToken.None);
        var runtimeState = workItem.OrchestrationRuntimeState;
        runtimeState.AddEvent(new OrchestratorStartedEvent(-1));
        runtimeState.AddEvent(startedEvent);
        runtimeState.AddEvent(new TaskScheduledEvent(0));
        runtimeState.AddEvent(new OrchestratorCompletedEvent(-1));

        AzureStorageClient azureStorageClient = new AzureStorageClient(settings);

        // Now manually update the instance to have status "Completed".
        // ETag.All makes this merge unconditional, so it plays the role of "another worker" touching the row.
        Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName);
        TableEntity entity = new(orchestrationInstance.InstanceId, "")
        {
            ["RuntimeStatus"] = OrchestrationStatus.Completed.ToString("G"),
        };
        await instanceTable.MergeEntityAsync(entity, Azure.ETag.All);

        if (useInstanceEtag)
        {
            // Confirm an exception is thrown due to the etag mismatch for the instance table when the worker attempts to complete the work item
            SessionAbortedException exception = await Assert.ThrowsExceptionAsync<SessionAbortedException>(async () =>
                await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null)
            );
            Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException));
            DurableTaskStorageException dtse = (DurableTaskStorageException)exception.InnerException;
            Assert.AreEqual((int)HttpStatusCode.PreconditionFailed, dtse.HttpStatusCode);
        }
        else
        {
            await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null);

            var queryCondition = new OrchestrationInstanceStatusQueryCondition
            {
                InstanceId = orchestrationInstance.InstanceId,
                FetchInput = false,
            };

            ODataCondition odata = queryCondition.ToOData();
            OrchestrationInstanceStatus instanceTableEntity = await instanceTable
                .ExecuteQueryAsync<OrchestrationInstanceStatus>(odata.Filter, 1, odata.Select, CancellationToken.None)
                .FirstOrDefaultAsync();

            // Confirm the instance table was updated with a "stale" status
            Assert.IsNotNull(instanceTableEntity);
            Assert.AreEqual(OrchestrationStatus.Running.ToString(), instanceTableEntity.RuntimeStatus);
        }
    }
    finally
    {
        // "await service?.StopAsync(...)" would await a null Task (NullReferenceException) if the
        // service was never constructed, hiding the exception that actually failed the test.
        if (service != null)
        {
            await service.StopAsync(isForced: true);
        }
    }
}

///
/// Confirm that:
/// 1. If <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/> is true, and a worker attempts to update the instance table with a stale
/// etag upon completing a work item for a suborchestration, a SessionAbortedException is thrown which wraps the inner DurableTaskStorageException, which has
/// the correct status code (conflict).
/// The specific scenario tested is if the worker stalled after updating the history table but before updating the instance table for the first work item
/// for a suborchestration. When it attempts to insert a new entity into the instance table for the suborchestration (since for a suborchestration,
/// the instance entity is only created upon completion of the first work item), it will fail.
/// 2. If <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/> is false for the above scenario, then the call to update the instance table
/// will go through, and the instance table will be updated with a "stale" status.
///
///
/// Since it is impossible to force stalling, we simulate the above scenario by manually updating the instance table before the worker
/// attempts to complete the work item. The history table update will go through, but the instance table update will fail since "another worker
/// has since updated" the instance table.
///
/// <param name="useInstanceEtag">The value to use for <see cref="AzureStorageOrchestrationServiceSettings.UseInstanceTableEtag"/></param>
///
[DataTestMethod]
[DataRow(true)]
[DataRow(false)]
public async Task WorkerAttemptingToUpdateInstanceTableAfterStallingForSubOrchestration(bool useInstanceEtag)
{
    const string subInstanceId = "sub_instance_id";

    AzureStorageOrchestrationService service = null;
    try
    {
        var orchestrationInstance = new OrchestrationInstance
        {
            InstanceId = "instance_id",
            ExecutionId = "execution_id",
        };

        ExecutionStartedEvent startedEvent = new(-1, string.Empty)
        {
            Name = "orchestration",
            Version = string.Empty,
            OrchestrationInstance = orchestrationInstance,
            ScheduledStartTime = DateTime.UtcNow,
        };

        var settings = new AzureStorageOrchestrationServiceSettings
        {
            PartitionCount = 1,
            StorageAccountClientProvider = new StorageAccountClientProvider(TestHelpers.GetTestStorageAccountConnectionString()),
            TaskHubName = TestHelpers.GetTestTaskHubName(),
            ExtendedSessionsEnabled = false,
            UseInstanceTableEtag = useInstanceEtag
        };

        service = new AzureStorageOrchestrationService(settings);
        await service.CreateAsync();
        await service.StartAsync();

        // Create the orchestration and get the first work item and start "working" on it
        await service.CreateTaskOrchestrationAsync(
            new TaskMessage()
            {
                OrchestrationInstance = orchestrationInstance,
                Event = startedEvent
            });
        var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(5),
            CancellationToken.None);
        var runtimeState = workItem.OrchestrationRuntimeState;
        runtimeState.AddEvent(new OrchestratorStartedEvent(-1));
        runtimeState.AddEvent(startedEvent);
        runtimeState.AddEvent(new SubOrchestrationInstanceCreatedEvent(0)
        {
            Name = "suborchestration",
            InstanceId = subInstanceId
        });
        runtimeState.AddEvent(new OrchestratorCompletedEvent(-1));

        // Create the task message to start the suborchestration
        var subOrchestrationExecutionStartedEvent = new ExecutionStartedEvent(-1, string.Empty)
        {
            OrchestrationInstance = new OrchestrationInstance
            {
                InstanceId = subInstanceId,
                ExecutionId = Guid.NewGuid().ToString("N")
            },
            ParentInstance = new ParentInstance
            {
                OrchestrationInstance = runtimeState.OrchestrationInstance,
                Name = runtimeState.Name,
                Version = runtimeState.Version,
                TaskScheduleId = 0,
            },
            Name = "suborchestration"
        };
        List<TaskMessage> orchestratorMessages =
            new() {
                new TaskMessage()
                {
                    OrchestrationInstance = subOrchestrationExecutionStartedEvent.OrchestrationInstance,
                    Event = subOrchestrationExecutionStartedEvent,
                }
            };

        // Complete the first work item, which will send the execution started message for the suborchestration
        await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), orchestratorMessages, new List<TaskMessage>(), null, null);

        // Now get the work item for the suborchestration and "work" on it
        workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(5),
            CancellationToken.None);
        runtimeState = workItem.OrchestrationRuntimeState;
        runtimeState.AddEvent(new OrchestratorStartedEvent(-1));
        runtimeState.AddEvent(subOrchestrationExecutionStartedEvent);
        runtimeState.AddEvent(new TaskScheduledEvent(0));
        runtimeState.AddEvent(new OrchestratorCompletedEvent(-1));

        AzureStorageClient azureStorageClient = new(settings);
        Table instanceTable = azureStorageClient.GetTableReference(azureStorageClient.Settings.InstanceTableName);

        // Now manually update the suborchestration to have status "Completed"
        TableEntity entity = new(subInstanceId, "")
        {
            ["RuntimeStatus"] = OrchestrationStatus.Completed.ToString("G"),
        };
        await instanceTable.InsertEntityAsync(entity);

        if (useInstanceEtag)
        {
            // Confirm an exception is thrown because the worker attempts to insert a new entity for the suborchestration into the instance table
            // when one already exists
            SessionAbortedException exception = await Assert.ThrowsExceptionAsync<SessionAbortedException>(async () =>
                await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null)
            );
            Assert.IsInstanceOfType(exception.InnerException, typeof(DurableTaskStorageException));
            DurableTaskStorageException dtse = (DurableTaskStorageException)exception.InnerException;
            Assert.AreEqual((int)HttpStatusCode.Conflict, dtse.HttpStatusCode);
        }
        else
        {
            await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null);

            var queryCondition = new OrchestrationInstanceStatusQueryCondition
            {
                InstanceId = subInstanceId,
                FetchInput = false,
            };

            ODataCondition odata = queryCondition.ToOData();
            OrchestrationInstanceStatus instanceTableEntity = await instanceTable
                .ExecuteQueryAsync<OrchestrationInstanceStatus>(odata.Filter, 1, odata.Select, CancellationToken.None)
                .FirstOrDefaultAsync();

            // Confirm the instance table was updated with a "stale" status
            Assert.IsNotNull(instanceTableEntity);
            Assert.AreEqual(OrchestrationStatus.Running.ToString(), instanceTableEntity.RuntimeStatus);
        }
    }
    finally
    {
        // Guard explicitly: "await service?.StopAsync(...)" awaits a null Task when service is null.
        if (service != null)
        {
            await service.StopAsync(isForced: true);
        }
    }
}

/// <summary>
/// Sends a task-completed message addressed to an instance that was never created and verifies that
/// no work item is produced for it on any of six lock attempts, after which the control queue is empty.
/// </summary>
/// <param name="extendedSessionsEnabled">The value to use for <see cref="AzureStorageOrchestrationServiceSettings.ExtendedSessionsEnabled"/></param>
[DataTestMethod]
[DataRow(true)]
[DataRow(false)]
public async Task WorkerAttemptingToDequeueMessageForNonExistentInstance(bool extendedSessionsEnabled)
{
    AzureStorageOrchestrationService service = null;
    try
    {
        var settings = new AzureStorageOrchestrationServiceSettings
        {
            PartitionCount = 1,
            StorageAccountClientProvider = new StorageAccountClientProvider(TestHelpers.GetTestStorageAccountConnectionString()),
            TaskHubName = TestHelpers.GetTestTaskHubName(),
            ExtendedSessionsEnabled = extendedSessionsEnabled,
        };

        service = new AzureStorageOrchestrationService(settings);
        await service.CreateAsync();
        await service.StartAsync();

        await service.SendTaskOrchestrationMessageAsync(
            new TaskMessage
            {
                OrchestrationInstance = new OrchestrationInstance
                {
                    InstanceId = "instance_id",
                    ExecutionId = "execution_id",
                },
                Event = new TaskCompletedEvent(-1, 0, string.Empty)
                {
                    // The event is stamped one minute in the past.
                    Timestamp = DateTime.UtcNow - TimeSpan.FromMinutes(1),
                }
            });

        for (int attempt = 0; attempt < 6; attempt++)
        {
            var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
                TimeSpan.FromMinutes(1),
                CancellationToken.None);
            Assert.IsNull(workItem);
        }

        // On the last attempt, the message should have been deleted since we have exceeded the maximum abandonment count
        // for a message to a nonexistent instance (5)
        Assert.IsNull(await service.OwnedControlQueues.Single().InnerQueue.PeekMessageAsync());
    }
    finally
    {
        // Guard explicitly: "await service?.StopAsync(...)" awaits a null Task when service is null.
        if (service != null)
        {
            await service.StopAsync(isForced: true);
        }
    }
}

/// <summary>
/// Verifies that a task-completed message whose task scheduled ID has no matching
/// <c>TaskScheduledEvent</c> in the instance history yields no work item, while a message for a
/// task ID that was scheduled does yield one.
/// </summary>
/// <param name="extendedSessionsEnabled">The value to use for <see cref="AzureStorageOrchestrationServiceSettings.ExtendedSessionsEnabled"/></param>
/// <param name="addTaskScheduledEvent">Whether a <c>TaskScheduledEvent</c> with ID 0 is written to the history.</param>
[DataTestMethod]
[DataRow(true, true)]
[DataRow(false, true)]
[DataRow(true, false)]
[DataRow(false, false)]
public async Task WorkerAttemptingToDequeueMessageWithNoTaskScheduledInHistory(bool extendedSessionsEnabled, bool addTaskScheduledEvent)
{
    AzureStorageOrchestrationService service = null;
    try
    {
        var orchestrationInstance = new OrchestrationInstance
        {
            InstanceId = Guid.NewGuid().ToString(),
            ExecutionId = Guid.NewGuid().ToString(),
        };

        ExecutionStartedEvent startedEvent = new(-1, string.Empty)
        {
            Name = "orchestration",
            Version = string.Empty,
            OrchestrationInstance = orchestrationInstance,
            ScheduledStartTime = DateTime.UtcNow,
        };

        var settings = new AzureStorageOrchestrationServiceSettings
        {
            PartitionCount = 1,
            StorageAccountClientProvider = new StorageAccountClientProvider(TestHelpers.GetTestStorageAccountConnectionString()),
            TaskHubName = TestHelpers.GetTestTaskHubName(),
            ExtendedSessionsEnabled = extendedSessionsEnabled
        };

        service = new AzureStorageOrchestrationService(settings);
        await service.CreateAsync();
        await service.StartAsync();

        // Create the orchestration and get the first work item and start "working" on it
        await service.CreateTaskOrchestrationAsync(
            new TaskMessage()
            {
                OrchestrationInstance = orchestrationInstance,
                Event = startedEvent
            });
        var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(5),
            CancellationToken.None);
        var runtimeState = workItem.OrchestrationRuntimeState;
        runtimeState.AddEvent(new OrchestratorStartedEvent(-1));
        runtimeState.AddEvent(startedEvent);
        if (addTaskScheduledEvent)
        {
            runtimeState.AddEvent(new TaskScheduledEvent(0));
        }
        runtimeState.AddEvent(new OrchestratorCompletedEvent(-1));

        await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null);

        // Necessary to force a new work item to be generated for the next message
        await service.ReleaseTaskOrchestrationWorkItemAsync(workItem);

        // Send a task completed for a different task scheduled ID, message should be abandoned
        await service.SendTaskOrchestrationMessageAsync(
            new TaskMessage
            {
                OrchestrationInstance = orchestrationInstance,
                Event = new TaskCompletedEvent(-1, 1, string.Empty)
            });
        workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(1),
            CancellationToken.None);
        Assert.IsNull(workItem);

        if (addTaskScheduledEvent)
        {
            // Send a task completed for the same task scheduled ID, this should work
            await service.SendTaskOrchestrationMessageAsync(
                new TaskMessage
                {
                    OrchestrationInstance = orchestrationInstance,
                    Event = new TaskCompletedEvent(-1, 0, string.Empty)
                });
            workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
                TimeSpan.FromMinutes(1),
                CancellationToken.None);
            Assert.IsNotNull(workItem);
        }
    }
    finally
    {
        // Guard explicitly: "await service?.StopAsync(...)" awaits a null Task when service is null.
        if (service != null)
        {
            await service.StopAsync(isForced: true);
        }
    }
}

/// <summary>
/// Verifies that an event-raised message sent from an entity-style source instance whose name does not
/// match any request ID recorded by an <c>EventSentEvent</c> in the history yields no work item,
/// while an event-raised message named after the recorded request ID does yield one.
/// </summary>
/// <param name="extendedSessionsEnabled">The value to use for <see cref="AzureStorageOrchestrationServiceSettings.ExtendedSessionsEnabled"/></param>
/// <param name="addEventSentEvent">Whether an <c>EventSentEvent</c> carrying the request ID is written to the history.</param>
[DataTestMethod]
[DataRow(true, true)]
[DataRow(false, true)]
[DataRow(true, false)]
[DataRow(false, false)]
public async Task WorkerAttemptingToDequeueMessageWithNoEventSentInHistory(bool extendedSessionsEnabled, bool addEventSentEvent)
{
    AzureStorageOrchestrationService service = null;
    try
    {
        var orchestrationInstance = new OrchestrationInstance
        {
            InstanceId = Guid.NewGuid().ToString(),
            ExecutionId = Guid.NewGuid().ToString(),
        };

        ExecutionStartedEvent startedEvent = new(-1, string.Empty)
        {
            Name = "orchestration",
            Version = string.Empty,
            OrchestrationInstance = orchestrationInstance,
            ScheduledStartTime = DateTime.UtcNow,
        };

        var settings = new AzureStorageOrchestrationServiceSettings
        {
            PartitionCount = 1,
            StorageAccountClientProvider = new StorageAccountClientProvider(TestHelpers.GetTestStorageAccountConnectionString()),
            TaskHubName = TestHelpers.GetTestTaskHubName(),
            ExtendedSessionsEnabled = extendedSessionsEnabled
        };

        service = new AzureStorageOrchestrationService(settings);
        await service.CreateAsync();
        await service.StartAsync();

        // Create the orchestration and get the first work item and start "working" on it
        await service.CreateTaskOrchestrationAsync(
            new TaskMessage()
            {
                OrchestrationInstance = orchestrationInstance,
                Event = startedEvent
            });
        var workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(5),
            CancellationToken.None);
        var runtimeState = workItem.OrchestrationRuntimeState;
        runtimeState.AddEvent(new OrchestratorStartedEvent(-1));
        runtimeState.AddEvent(startedEvent);
        string requestId = Guid.NewGuid().ToString();
        if (addEventSentEvent)
        {
            runtimeState.AddEvent(new EventSentEvent(-1)
            {
                Input = $"{{ \"id\": \"{requestId}\" }}"
            });
        }
        runtimeState.AddEvent(new OrchestratorCompletedEvent(-1));

        await service.CompleteTaskOrchestrationWorkItemAsync(workItem, runtimeState, new List<TaskMessage>(), new List<TaskMessage>(), new List<TaskMessage>(), null, null);

        // Necessary to force a new work item to be generated for the next message
        await service.ReleaseTaskOrchestrationWorkItemAsync(workItem);

        // Send an event raised for a different request ID, message should be abandoned
        await service.SendTaskOrchestrationMessageInternalAsync(
            sourceInstance: new OrchestrationInstance()
            {
                InstanceId = "@test@myEntity"
            },
            controlQueue: service.OwnedControlQueues.Single(),
            new TaskMessage
            {
                OrchestrationInstance = orchestrationInstance,
                Event = new EventRaisedEvent(-1, string.Empty)
                {
                    Name = Guid.NewGuid().ToString()
                }
            });
        workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
            TimeSpan.FromMinutes(1),
            CancellationToken.None);
        Assert.IsNull(workItem);

        if (addEventSentEvent)
        {
            // Send an event raised for the same request ID, this should work
            await service.SendTaskOrchestrationMessageAsync(
                new TaskMessage
                {
                    OrchestrationInstance = orchestrationInstance,
                    Event = new EventRaisedEvent(-1, string.Empty)
                    {
                        Name = requestId
                    }
                });
            workItem = await service.LockNextTaskOrchestrationWorkItemAsync(
                TimeSpan.FromMinutes(1),
                CancellationToken.None);
            Assert.IsNotNull(workItem);
        }
    }
    finally
    {
        // Guard explicitly: "await service?.StopAsync(...)" awaits a null Task when service is null.
        if (service != null)
        {
            await service.StopAsync(isForced: true);
        }
    }
}

#if !NET48
///
/// End-to-end test which validates a simple orchestrator function that calls an activity function
/// and checks the
OpenTelemetry trace information + /// + [DataTestMethod] + [DataRow(true)] + [DataRow(false)] + public async Task OpenTelemetry_SayHelloWithActivity(bool enableExtendedSessions) + { + var processor = new Mock>(); + + using (TestOrchestrationHost host = TestHelpers.GetTestOrchestrationHost(enableExtendedSessions)) + { + using (Sdk.CreateTracerProviderBuilder() + .AddSource("DurableTask.Core") + .AddProcessor(processor.Object) + .Build()) + { + await host.StartAsync(); + + var client = await host.StartOrchestrationAsync(typeof(Orchestrations.SayHelloWithActivity), "World"); + var status = await client.WaitForCompletionAsync(StandardTimeout); + + await host.StopAsync(); + } + } + + // Collect only the OnEnd activities from the processor invocations. + // Other invocations (SetParentProvider, OnStart, OnShutdown, OnForceFlush, Dispose) + // vary across OpenTelemetry SDK versions, so we filter by method name. + var endedActivities = processor.Invocations + .Where(i => i.Method.Name == "OnEnd") + .Select(i => (Activity)i.Arguments[0]) + .ToList(); + + Assert.AreEqual(4, endedActivities.Count); + + // Create orchestration Activity + Activity createOrchestration = endedActivities[0]; + // Task execution Activity + Activity taskExecution = endedActivities[1]; + // Task completed Activity + Activity taskCompleted = endedActivities[2]; + // Orchestration execution Activity + Activity orchestrationExecution = endedActivities[3]; + + // Checking tag values + string createOrchestrationTypeValue = createOrchestration.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string taskExecutionTypeValue = taskExecution.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string taskCompletedTypeValue = taskCompleted.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string orchestrationExecutionTypeValue = orchestrationExecution.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + + Assert.AreEqual("orchestration", createOrchestrationTypeValue); + 
Assert.AreEqual("activity", taskExecutionTypeValue); + Assert.AreEqual("activity", taskCompletedTypeValue); + Assert.AreEqual("orchestration", orchestrationExecutionTypeValue); + Assert.AreEqual(ActivityKind.Producer, createOrchestration.Kind); + Assert.AreEqual(ActivityKind.Server, taskExecution.Kind); + Assert.AreEqual(ActivityKind.Client, taskCompleted.Kind); + Assert.AreEqual(ActivityKind.Server, orchestrationExecution.Kind); + + // Checking span ID correlation between parent and child + Assert.AreEqual(createOrchestration.SpanId, orchestrationExecution.ParentSpanId); + Assert.AreEqual(taskCompleted.SpanId, taskExecution.ParentSpanId); + Assert.AreEqual(orchestrationExecution.SpanId, taskCompleted.ParentSpanId); + + // Checking trace ID values + Assert.AreEqual(createOrchestration.TraceId.ToString(), taskExecution.TraceId.ToString(), taskCompleted.TraceId.ToString(), orchestrationExecution.TraceId.ToString()); + } + + /// + /// End-to-end test which validates a simple orchestrator function that waits for an external event + /// raised through the RaiseEvent API and checks the OpenTelemetry trace information + /// + [DataTestMethod] + [DataRow(true)] + [DataRow(false)] public async Task OpenTelemetry_ExternalEvent_RaiseEvent(bool enableExtendedSessions) { var processor = new Mock>(); @@ -2642,52 +3776,40 @@ public async Task OpenTelemetry_ExternalEvent_RaiseEvent(bool enableExtendedSess } } - // (1) Explanation about indexes: - // The orchestration Activity's start at Invocation[1] and each action logs - // two activities - (Processor.OnStart(Activity) and Processor.OnEnd(Activity) - // The Activity for orchestration execution is "started" (with the same Id, SpanId, etc.) - // upon every replay of the orchestration so will have an OnStart invocation for each such replay, - // but an OnEnd at the end of orchestration execution. - // The first OnEnd invocation is at index 2, so we start from there. 
+ // Collect only the OnEnd activities from the processor invocations. + var endedActivities = processor.Invocations + .Where(i => i.Method.Name == "OnEnd") + .Select(i => (Activity)i.Arguments[0]) + .ToList(); - // (2) Additional invocations: - // processor.Invocations[0] - processor.SetParentProvider(TracerProviderSdk) - // processor.Invocations[8] - processor.OnShutdown() - // processor.Invocations[9] - processor.Dispose(true) + Assert.AreEqual(3, endedActivities.Count); // Create orchestration Activity - Activity activity2 = (Activity)processor.Invocations[2].Arguments[0]; + Activity createOrchestration = endedActivities[0]; // External event Activity - Activity activity5 = (Activity)processor.Invocations[5].Arguments[0]; + Activity externalEvent = endedActivities[1]; // Orchestration execution Activity - Activity activity7 = (Activity)processor.Invocations[7].Arguments[0]; - - // Checking total number activities - Assert.AreEqual(10, processor.Invocations.Count); + Activity orchestrationExecution = endedActivities[2]; // Checking tag values - string activity2TypeValue = activity2.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity5TypeValue = activity5.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity7TypeValue = activity7.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity5TargetInstanceIdValue = activity5.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; - - ActivityKind activity2Kind = activity2.Kind; - ActivityKind activity5Kind = activity5.Kind; - ActivityKind activity7Kind = activity7.Kind; - - Assert.AreEqual("orchestration", activity2TypeValue); - Assert.AreEqual("event", activity5TypeValue); - Assert.AreEqual("orchestration", activity7TypeValue); - Assert.AreEqual(instanceId, activity5TargetInstanceIdValue); - Assert.AreEqual(ActivityKind.Producer, activity2Kind); - Assert.AreEqual(ActivityKind.Producer, activity5Kind); - 
Assert.AreEqual(ActivityKind.Server, activity7Kind); + string createOrchestrationTypeValue = createOrchestration.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string externalEventTypeValue = externalEvent.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string orchestrationExecutionTypeValue = orchestrationExecution.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string externalEventTargetInstanceIdValue = externalEvent.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; + + Assert.AreEqual("orchestration", createOrchestrationTypeValue); + Assert.AreEqual("event", externalEventTypeValue); + Assert.AreEqual("orchestration", orchestrationExecutionTypeValue); + Assert.AreEqual(instanceId, externalEventTargetInstanceIdValue); + Assert.AreEqual(ActivityKind.Producer, createOrchestration.Kind); + Assert.AreEqual(ActivityKind.Producer, externalEvent.Kind); + Assert.AreEqual(ActivityKind.Server, orchestrationExecution.Kind); // Checking span ID correlation between parent and child - Assert.AreEqual(activity2.SpanId, activity7.ParentSpanId); + Assert.AreEqual(createOrchestration.SpanId, orchestrationExecution.ParentSpanId); // Checking trace ID values (the external event from the client is its own trace) - Assert.AreEqual(activity2.TraceId.ToString(), activity7.TraceId.ToString()); + Assert.AreEqual(createOrchestration.TraceId.ToString(), orchestrationExecution.TraceId.ToString()); } /// @@ -2721,51 +3843,39 @@ public async Task OpenTelemetry_TimerFired(bool enableExtendedSessions) } } - // (1) Explanation about indexes: - // The orchestration Activity's start at Invocation[1] and each action logs - // two activities - (Processor.OnStart(Activity) and Processor.OnEnd(Activity) - // The Activity for orchestration execution is "started" (with the same Id, SpanId, etc.) 
- // upon every replay of the orchestration so will have an OnStart invocation for each such replay, - // but an OnEnd at the end of orchestration execution. - // The first OnEnd invocation is at index 2, so we start from there. + // Collect only the OnEnd activities from the processor invocations. + var endedActivities = processor.Invocations + .Where(i => i.Method.Name == "OnEnd") + .Select(i => (Activity)i.Arguments[0]) + .ToList(); - // (2) Additional invocations: - // processor.Invocations[0] - processor.SetParentProvider(TracerProviderSdk) - // processor.Invocations[8] - processor.OnShutdown() - // processor.Invocations[9] - processor.Dispose(true) + Assert.AreEqual(3, endedActivities.Count); // Create orchestration Activity - Activity activity2 = (Activity)processor.Invocations[2].Arguments[0]; + Activity createOrchestration = endedActivities[0]; // Timer fired Activity - Activity activity6 = (Activity)processor.Invocations[6].Arguments[0]; + Activity timerFired = endedActivities[1]; // Orchestration execution Activity - Activity activity7 = (Activity)processor.Invocations[7].Arguments[0]; - - // Checking total number activities - Assert.AreEqual(10, processor.Invocations.Count); + Activity orchestrationExecution = endedActivities[2]; // Checking tag values - string activity2TypeValue = activity2.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity6TypeValue = activity6.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity7TypeValue = activity7.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string createOrchestrationTypeValue = createOrchestration.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string timerFiredTypeValue = timerFired.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string orchestrationExecutionTypeValue = orchestrationExecution.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - ActivityKind activity2Kind = activity2.Kind; - ActivityKind 
activity6Kind = activity6.Kind; - ActivityKind activity7Kind = activity7.Kind; - - Assert.AreEqual("orchestration", activity2TypeValue); - Assert.AreEqual("timer", activity6TypeValue); - Assert.AreEqual("orchestration", activity7TypeValue); - Assert.AreEqual(ActivityKind.Producer, activity2Kind); - Assert.AreEqual(ActivityKind.Internal, activity6Kind); - Assert.AreEqual(ActivityKind.Server, activity7Kind); + Assert.AreEqual("orchestration", createOrchestrationTypeValue); + Assert.AreEqual("timer", timerFiredTypeValue); + Assert.AreEqual("orchestration", orchestrationExecutionTypeValue); + Assert.AreEqual(ActivityKind.Producer, createOrchestration.Kind); + Assert.AreEqual(ActivityKind.Internal, timerFired.Kind); + Assert.AreEqual(ActivityKind.Server, orchestrationExecution.Kind); // Checking span ID correlation between parent and child - Assert.AreEqual(activity2.SpanId, activity7.ParentSpanId); - Assert.AreEqual(activity7.SpanId, activity6.ParentSpanId); + Assert.AreEqual(createOrchestration.SpanId, orchestrationExecution.ParentSpanId); + Assert.AreEqual(orchestrationExecution.SpanId, timerFired.ParentSpanId); // Checking trace ID values - Assert.AreEqual(activity2.TraceId.ToString(), activity6.TraceId.ToString(), activity7.TraceId.ToString()); + Assert.AreEqual(createOrchestration.TraceId.ToString(), timerFired.TraceId.ToString(), orchestrationExecution.TraceId.ToString()); } /// @@ -2800,66 +3910,52 @@ public async Task OpenTelemetry_ExternalEvent_SendEvent(bool enableExtendedSessi } } - // (1) Explanation about indexes: - // The orchestration Activity's start at Invocation[1] and each action logs - // two activities - (Processor.OnStart(Activity) and Processor.OnEnd(Activity) - // The Activity for orchestration execution is "started" (with the same Id, SpanId, etc.) - // upon every replay of the orchestration so will have an OnStart invocation for each such replay, - // but an OnEnd at the end of orchestration execution. 
- // The first OnEnd invocation is at index 2, so we start from there. + // Collect only the OnEnd activities from the processor invocations. + var endedActivities = processor.Invocations + .Where(i => i.Method.Name == "OnEnd") + .Select(i => (Activity)i.Arguments[0]) + .ToList(); - // (2) Additional invocations: - // processor.Invocations[0] - processor.SetParentProvider(TracerProviderSdk) - // processor.Invocations[10] - processor.OnShutdown() - // processor.Invocations[11] - processor.Dispose(true) + Assert.AreEqual(4, endedActivities.Count); - var invocations = processor.Invocations; // Create orchestration (AutoStartOrchestration) Activity - Activity activity2 = (Activity)processor.Invocations[2].Arguments[0]; + Activity createOrchestration = endedActivities[0]; // Send event to AutoStartOrchestration.Responder Activity - Activity activity5 = (Activity)processor.Invocations[5].Arguments[0]; + Activity sendEvent = endedActivities[1]; // Send event from AutoStartOrchestration.Responder back to AutoStartOrchestration Activity - Activity activity7 = (Activity)processor.Invocations[7].Arguments[0]; + Activity sendEventBack = endedActivities[2]; // Orchestration execution Activity - Activity activity9 = (Activity)processor.Invocations[9].Arguments[0]; - - // Checking total number activities - Assert.AreEqual(12, processor.Invocations.Count); + Activity orchestrationExecution = endedActivities[3]; // Checking tag values - string activity2TypeValue = activity2.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity5TypeValue = activity5.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity7TypeValue = activity7.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity9TypeValue = activity9.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; - string activity5InstanceIdValue = activity5.Tags.First(k => (k.Key).Equals("durabletask.task.instance_id")).Value; - string activity5TargetInstanceIdValue = 
activity5.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; - string activity7InstanceIdValue = activity7.Tags.First(k => (k.Key).Equals("durabletask.task.instance_id")).Value; - string activity7TargetInstanceIdValue = activity7.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; - - ActivityKind activity2Kind = activity2.Kind; - ActivityKind activity5Kind = activity5.Kind; - ActivityKind activity7Kind = activity7.Kind; - ActivityKind activity9Kind = activity9.Kind; - - Assert.AreEqual("orchestration", activity2TypeValue); - Assert.AreEqual("event", activity5TypeValue); - Assert.AreEqual("event", activity7TypeValue); - Assert.AreEqual("orchestration", activity9TypeValue); - Assert.AreEqual(instanceId, activity5InstanceIdValue); - Assert.AreEqual(responderId, activity5TargetInstanceIdValue); - Assert.AreEqual(responderId, activity7InstanceIdValue); - Assert.AreEqual(instanceId, activity7TargetInstanceIdValue); - Assert.AreEqual(ActivityKind.Producer, activity2Kind); - Assert.AreEqual(ActivityKind.Producer, activity5Kind); - Assert.AreEqual(ActivityKind.Producer, activity7Kind); - Assert.AreEqual(ActivityKind.Server, activity9Kind); + string createOrchestrationTypeValue = createOrchestration.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string sendEventTypeValue = sendEvent.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string sendEventBackTypeValue = sendEventBack.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string orchestrationExecutionTypeValue = orchestrationExecution.Tags.First(k => (k.Key).Equals("durabletask.type")).Value; + string sendEventInstanceIdValue = sendEvent.Tags.First(k => (k.Key).Equals("durabletask.task.instance_id")).Value; + string sendEventTargetInstanceIdValue = sendEvent.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; + string sendEventBackInstanceIdValue = sendEventBack.Tags.First(k => 
(k.Key).Equals("durabletask.task.instance_id")).Value; + string sendEventBackTargetInstanceIdValue = sendEventBack.Tags.First(k => (k.Key).Equals("durabletask.event.target_instance_id")).Value; + + Assert.AreEqual("orchestration", createOrchestrationTypeValue); + Assert.AreEqual("event", sendEventTypeValue); + Assert.AreEqual("event", sendEventBackTypeValue); + Assert.AreEqual("orchestration", orchestrationExecutionTypeValue); + Assert.AreEqual(instanceId, sendEventInstanceIdValue); + Assert.AreEqual(responderId, sendEventTargetInstanceIdValue); + Assert.AreEqual(responderId, sendEventBackInstanceIdValue); + Assert.AreEqual(instanceId, sendEventBackTargetInstanceIdValue); + Assert.AreEqual(ActivityKind.Producer, createOrchestration.Kind); + Assert.AreEqual(ActivityKind.Producer, sendEvent.Kind); + Assert.AreEqual(ActivityKind.Producer, sendEventBack.Kind); + Assert.AreEqual(ActivityKind.Server, orchestrationExecution.Kind); // Checking span ID correlation between parent and child - Assert.AreEqual(activity2.SpanId, activity9.ParentSpanId); - Assert.AreEqual(activity9.SpanId, activity5.ParentSpanId); + Assert.AreEqual(createOrchestration.SpanId, orchestrationExecution.ParentSpanId); + Assert.AreEqual(orchestrationExecution.SpanId, sendEvent.ParentSpanId); // Checking trace ID values - Assert.AreEqual(activity2.TraceId.ToString(), activity5.TraceId.ToString(), activity9.TraceId.ToString()); + Assert.AreEqual(createOrchestration.TraceId.ToString(), sendEvent.TraceId.ToString(), orchestrationExecution.TraceId.ToString()); } #endif diff --git a/test/DurableTask.AzureStorage.Tests/DurableTask.AzureStorage.Tests.csproj b/test/DurableTask.AzureStorage.Tests/DurableTask.AzureStorage.Tests.csproj index 16b2e8449..c1b403fe6 100644 --- a/test/DurableTask.AzureStorage.Tests/DurableTask.AzureStorage.Tests.csproj +++ b/test/DurableTask.AzureStorage.Tests/DurableTask.AzureStorage.Tests.csproj @@ -2,14 +2,14 @@ - net6.0;net462 + net8.0;net48 - + - +