|
| 1 | +// Licensed to the .NET Foundation under one or more agreements. |
| 2 | +// The .NET Foundation licenses this file to you under the MIT license. |
| 3 | + |
| 4 | +using System; |
| 5 | +using System.Collections.Generic; |
| 6 | +using System.IO; |
| 7 | +using System.Threading; |
| 8 | +using System.Threading.Tasks; |
| 9 | +using Microsoft.Shared.Diagnostics; |
| 10 | +using ModelContextProtocol.Client; |
| 11 | +using ModelContextProtocol.Protocol; |
| 12 | + |
| 13 | +namespace Microsoft.Extensions.DataIngestion; |
| 14 | + |
/// <summary>
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> MCP server.
/// </summary>
/// <remarks>
/// The whole document is buffered in memory, encoded as a base64 <c>data:</c> URI and sent to the
/// server's <c>convert_to_markdown</c> tool. A new MCP transport and client are created for every
/// conversion and disposed once the tool call completes.
/// </remarks>
public class MarkItDownMcpReader : IngestionDocumentReader
{
    /// <summary>Name of the MCP tool invoked to perform the conversion.</summary>
    private const string ConvertToolName = "convert_to_markdown";

    /// <summary>Media type placed in the data URI when the caller does not supply one.</summary>
    private const string DefaultMediaType = "application/octet-stream";

#if !NET
    /// <summary>Buffer size passed to the cancellable <c>Stream.CopyToAsync</c> overload on targets without the two-argument cancellable overload.</summary>
    private const int CopyBufferSize = 81920;
#endif

    private readonly Uri _mcpServerUri;
    private readonly McpClientOptions? _options;

    /// <summary>
    /// Initializes a new instance of the <see cref="MarkItDownMcpReader"/> class.
    /// </summary>
    /// <param name="mcpServerUri">The URI of the MarkItDown MCP server (e.g., http://localhost:3001/mcp).</param>
    /// <param name="options">Optional MCP client options for configuring the connection.</param>
    public MarkItDownMcpReader(Uri mcpServerUri, McpClientOptions? options = null)
    {
        _mcpServerUri = Throw.IfNull(mcpServerUri);
        _options = options;
    }

    /// <inheritdoc/>
    /// <exception cref="FileNotFoundException">The file described by <paramref name="source"/> does not exist.</exception>
    /// <exception cref="InvalidOperationException">The MCP server reported a tool error or returned no text content.</exception>
    public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        if (!source.Exists)
        {
            throw new FileNotFoundException("The specified file does not exist.", source.FullName);
        }

        // Load the entire file; the server receives it inline as a base64 data URI.
#if NET
        byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false);
#else
        byte[] fileBytes;
        using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous))
        {
            using MemoryStream ms = new();

            // Use the overload that accepts a token so that cancellation is honored on this target too.
            await fs.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
            fileBytes = ms.ToArray();
        }
#endif

        return await ReadCoreAsync(fileBytes, identifier, mediaType, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc/>
    /// <exception cref="InvalidOperationException">The MCP server reported a tool error or returned no text content.</exception>
    public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // Drain the stream into memory; the server receives it inline as a base64 data URI.
        using MemoryStream ms = new();
#if NET
        await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false);
#else
        // Use the overload that accepts a token so that cancellation is honored on this target too.
        await source.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
#endif

        return await ReadCoreAsync(ms.ToArray(), identifier, mediaType, cancellationToken).ConfigureAwait(false);
    }

#pragma warning disable S3995 // URI return values should not be strings
    /// <summary>
    /// Builds a <c>data:</c> URI that embeds <paramref name="fileBytes"/> as base64.
    /// </summary>
    /// <param name="fileBytes">The raw document content.</param>
    /// <param name="mediaType">The media type to advertise; <c>application/octet-stream</c> is used when null or empty.</param>
    /// <returns>A string of the form <c>data:{mediaType};base64,{content}</c>.</returns>
    private static string CreateDataUri(byte[] fileBytes, string? mediaType)
#pragma warning restore S3995 // URI return values should not be strings
    {
        string base64Content = Convert.ToBase64String(fileBytes);
        string mimeType = string.IsNullOrEmpty(mediaType) ? DefaultMediaType : mediaType!;
        return $"data:{mimeType};base64,{base64Content}";
    }

    /// <summary>
    /// Shared tail of both <c>ReadAsync</c> overloads: encodes the content, converts it to Markdown
    /// through the MCP server and parses the Markdown into a document.
    /// </summary>
    /// <param name="content">The raw document content.</param>
    /// <param name="identifier">The identifier handed to the Markdown parser.</param>
    /// <param name="mediaType">The media type of <paramref name="content"/>, if known.</param>
    /// <param name="cancellationToken">Token used to cancel the MCP round trip.</param>
    /// <returns>The document produced by parsing the Markdown returned by the server.</returns>
    private async Task<IngestionDocument> ReadCoreAsync(byte[] content, string identifier, string? mediaType, CancellationToken cancellationToken)
    {
        string dataUri = CreateDataUri(content, mediaType);
        string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

        return MarkdownParser.Parse(markdown, identifier);
    }

    /// <summary>
    /// Calls the <c>convert_to_markdown</c> tool on the configured MCP server.
    /// </summary>
    /// <param name="dataUri">The document, encoded as a data URI, passed to the tool as its <c>uri</c> argument.</param>
    /// <param name="cancellationToken">Token used to cancel client creation and the tool call.</param>
    /// <returns>The text of the first text content block in the tool result.</returns>
    /// <exception cref="InvalidOperationException">
    /// The tool result is flagged as an error, or it contains no text content block.
    /// </exception>
    private async Task<string> ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken)
    {
        // Create HTTP client transport for MCP
        HttpClientTransport transport = new(new HttpClientTransportOptions
        {
            Endpoint = _mcpServerUri
        });

        await using (transport.ConfigureAwait(false))
        {
            // Create MCP client
            McpClient client = await McpClient.CreateAsync(transport, _options, loggerFactory: null, cancellationToken).ConfigureAwait(false);

            await using (client.ConfigureAwait(false))
            {
                // Build parameters for convert_to_markdown tool
                Dictionary<string, object?> parameters = new()
                {
                    ["uri"] = dataUri
                };

                // Call the convert_to_markdown tool
                var result = await client.CallToolAsync(ConvertToolName, parameters, cancellationToken: cancellationToken).ConfigureAwait(false);

                // Extract the first text block from the result.
                // The result is expected to be in the format: { "content": [{ "type": "text", "text": "markdown content" }] }
                string? text = null;
                if (result.Content != null)
                {
                    foreach (var content in result.Content)
                    {
                        if (content.Type == "text" && content is TextContentBlock textBlock)
                        {
                            text = textBlock.Text;
                            break;
                        }
                    }
                }

                // A tool-level failure carries its message in the content; surface it as an
                // exception instead of letting the error text be parsed as the document.
                if (result.IsError == true)
                {
                    throw new InvalidOperationException(
                        string.IsNullOrEmpty(text)
                            ? "Failed to convert document to markdown: the MCP server reported a tool error."
                            : $"Failed to convert document to markdown: {text}");
                }

                if (text != null)
                {
                    return text;
                }
            }
        }

        throw new InvalidOperationException("Failed to convert document to markdown: unexpected response format from MCP server.");
    }
}
0 commit comments