Skip to content

Commit a8aad1a

Browse files
Copilot and adamsitnik
authored
Add MarkItDownMcpReader for MCP server support (#7025)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Adam Sitnik <adam.sitnik@gmail.com>
1 parent c3e0c73 commit a8aad1a

File tree

6 files changed

+246
-6
lines changed

6 files changed

+246
-6
lines changed

eng/packages/General.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
<PackageVersion Include="Microsoft.Extensions.VectorData.Abstractions" Version="$(MicrosoftExtensionsVectorDataAbstractionsVersion)" />
1919
<PackageVersion Include="Microsoft.IO.RecyclableMemoryStream" Version="3.0.0" />
2020
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="$(MicrosoftMLTokenizersVersion)" />
21+
<PackageVersion Include="ModelContextProtocol.Core" Version="0.4.0-preview.3" />
2122
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
2223
<PackageVersion Include="OllamaSharp" Version="5.1.9" />
2324
<PackageVersion Include="OpenAI" Version="2.6.0" />
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.Collections.Generic;
6+
using System.IO;
7+
using System.Threading;
8+
using System.Threading.Tasks;
9+
using Microsoft.Shared.Diagnostics;
10+
using ModelContextProtocol.Client;
11+
using ModelContextProtocol.Protocol;
12+
13+
namespace Microsoft.Extensions.DataIngestion;
14+
15+
/// <summary>
16+
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> MCP server.
17+
/// </summary>
18+
public class MarkItDownMcpReader : IngestionDocumentReader
{
    // Name of the MCP tool invoked to perform the conversion.
    private const string ConvertToolName = "convert_to_markdown";

#if !NET
    // Stream.CopyToAsync has no (Stream, CancellationToken) overload on these targets, so the
    // buffer size has to be passed explicitly in order to also pass a cancellation token.
    private const int CopyBufferSize = 81920;
#endif

    private readonly Uri _mcpServerUri;
    private readonly McpClientOptions? _options;

    /// <summary>
    /// Initializes a new instance of the <see cref="MarkItDownMcpReader"/> class.
    /// </summary>
    /// <param name="mcpServerUri">The URI of the MarkItDown MCP server (e.g., http://localhost:3001/mcp).</param>
    /// <param name="options">Optional MCP client options for configuring the connection.</param>
    /// <exception cref="ArgumentNullException"><paramref name="mcpServerUri"/> is <see langword="null"/>.</exception>
    public MarkItDownMcpReader(Uri mcpServerUri, McpClientOptions? options = null)
    {
        _mcpServerUri = Throw.IfNull(mcpServerUri);
        _options = options;
    }

    /// <inheritdoc/>
    public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        if (!source.Exists)
        {
            throw new FileNotFoundException("The specified file does not exist.", source.FullName);
        }

        // The whole file is loaded into memory because it is sent to the server as a base64 data URI.
#if NET
        byte[] fileBytes = await File.ReadAllBytesAsync(source.FullName, cancellationToken).ConfigureAwait(false);
#else
        byte[] fileBytes;
        using (FileStream fs = new(source.FullName, FileMode.Open, FileAccess.Read, FileShare.Read, 1, FileOptions.Asynchronous))
        {
            using MemoryStream ms = new();

            // Pass the token so that cancellation is honored on these targets as well.
            await fs.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
            fileBytes = ms.ToArray();
        }
#endif
        string dataUri = CreateDataUri(fileBytes, mediaType);

        string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

        return MarkdownParser.Parse(markdown, identifier);
    }

    /// <inheritdoc/>
    public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // The stream is buffered in memory because it is sent to the server as a base64 data URI.
        using MemoryStream ms = new();
#if NET
        await source.CopyToAsync(ms, cancellationToken).ConfigureAwait(false);
#else
        // Pass the token so that cancellation is honored on these targets as well.
        await source.CopyToAsync(ms, CopyBufferSize, cancellationToken).ConfigureAwait(false);
#endif
        byte[] fileBytes = ms.ToArray();
        string dataUri = CreateDataUri(fileBytes, mediaType);

        string markdown = await ConvertToMarkdownAsync(dataUri, cancellationToken).ConfigureAwait(false);

        return MarkdownParser.Parse(markdown, identifier);
    }

    /// <summary>
    /// Builds a <c>data:</c> URI that embeds <paramref name="fileBytes"/> as base64.
    /// </summary>
    /// <param name="fileBytes">The raw document content.</param>
    /// <param name="mediaType">The media type to advertise; <c>application/octet-stream</c> is used when it is null or empty.</param>
    /// <returns>A string of the form <c>data:{mediaType};base64,{content}</c>.</returns>
#pragma warning disable S3995 // URI return values should not be strings
    private static string CreateDataUri(byte[] fileBytes, string? mediaType)
#pragma warning restore S3995 // URI return values should not be strings
    {
        string base64Content = Convert.ToBase64String(fileBytes);
        string mimeType = string.IsNullOrEmpty(mediaType) ? "application/octet-stream" : mediaType!;
        return $"data:{mimeType};base64,{base64Content}";
    }

    /// <summary>
    /// Returns the first text block found in <paramref name="content"/>, or <see langword="null"/> when there is none.
    /// </summary>
    private static TextContentBlock? FindFirstTextBlock(IEnumerable<ContentBlock>? content)
    {
        if (content is null)
        {
            return null;
        }

        foreach (ContentBlock block in content)
        {
            if (block.Type == "text" && block is TextContentBlock textBlock)
            {
                return textBlock;
            }
        }

        return null;
    }

    /// <summary>
    /// Connects to the MCP server, invokes the <c>convert_to_markdown</c> tool with the given data URI
    /// and returns the text of the first text block in the response.
    /// </summary>
    /// <param name="dataUri">The document encoded as a <c>data:</c> URI.</param>
    /// <param name="cancellationToken">Token used to cancel the connection and the tool call.</param>
    /// <returns>The markdown produced by the server.</returns>
    /// <exception cref="InvalidOperationException">
    /// The server flagged the tool call as failed, or the response contains no text block.
    /// </exception>
    private async Task<string> ConvertToMarkdownAsync(string dataUri, CancellationToken cancellationToken)
    {
        // A new transport and client are created for every conversion and disposed afterwards.
        HttpClientTransport transport = new(new HttpClientTransportOptions
        {
            Endpoint = _mcpServerUri
        });

        await using (transport.ConfigureAwait(false))
        {
            McpClient client = await McpClient.CreateAsync(transport, _options, loggerFactory: null, cancellationToken).ConfigureAwait(false);

            await using (client.ConfigureAwait(false))
            {
                Dictionary<string, object?> parameters = new()
                {
                    ["uri"] = dataUri
                };

                var result = await client.CallToolAsync(ConvertToolName, parameters, cancellationToken: cancellationToken).ConfigureAwait(false);

                // The result is expected to be in the format: { "content": [{ "type": "text", "text": "markdown content" }] }
                TextContentBlock? firstText = FindFirstTextBlock(result.Content);

                // A failed tool call carries its error message in the content as well;
                // it must not be returned as if it were the converted document.
                if (result.IsError == true)
                {
                    string message = firstText is null
                        ? "Failed to convert document to markdown: the MCP server reported an error."
                        : $"Failed to convert document to markdown: the MCP server reported an error: {firstText.Text}";

                    throw new InvalidOperationException(message);
                }

                if (firstText is not null)
                {
                    return firstText.Text;
                }
            }
        }

        throw new InvalidOperationException("Failed to convert document to markdown: unexpected response format from MCP server.");
    }
}

src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/Microsoft.Extensions.DataIngestion.MarkItDown.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
<ItemGroup>
2424
<PackageReference Include="Markdig.Signed" />
25+
<PackageReference Include="ModelContextProtocol.Core" />
2526
</ItemGroup>
2627

2728
</Project>

src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ Or directly in the C# project file:
2020

2121
## Usage Examples
2222

23-
### Creating a MarkItDownReader for Data Ingestion
23+
### Creating a MarkItDownReader for Data Ingestion (Local Process)
24+
25+
Use `MarkItDownReader` to convert documents using the MarkItDown executable installed locally:
2426

2527
```csharp
2628
using Microsoft.Extensions.DataIngestion;
@@ -31,6 +33,59 @@ IngestionDocumentReader reader =
3133
using IngestionPipeline<string> pipeline = new(reader, CreateChunker(), CreateWriter());
3234
```
3335

36+
### Creating a MarkItDownMcpReader for Data Ingestion (MCP Server)
37+
38+
Use `MarkItDownMcpReader` to convert documents using a MarkItDown MCP server:
39+
40+
```csharp
41+
using Microsoft.Extensions.DataIngestion;
42+
43+
// Connect to a MarkItDown MCP server (e.g., running in Docker)
44+
IngestionDocumentReader reader =
45+
new MarkItDownMcpReader(new Uri("http://localhost:3001/mcp"));
46+
47+
using IngestionPipeline<string> pipeline = new(reader, CreateChunker(), CreateWriter());
48+
```
49+
50+
The MarkItDown MCP server can be run using Docker:
51+
52+
```bash
53+
docker run -p 3001:3001 mcp/markitdown --http --host 0.0.0.0 --port 3001
54+
```
55+
56+
Or installed via pip:
57+
58+
```bash
59+
pip install markitdown-mcp
60+
markitdown-mcp --http --host 0.0.0.0 --port 3001
61+
```
62+
63+
### Integrating with Aspire
64+
65+
Aspire can be used for seamless integration with [MarkItDown MCP](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp). Sample AppHost logic:
66+
67+
```csharp
68+
var builder = DistributedApplication.CreateBuilder(args);
69+
70+
var markitdown = builder.AddContainer("markitdown", "mcp/markitdown")
71+
.WithArgs("--http", "--host", "0.0.0.0", "--port", "3001")
72+
.WithHttpEndpoint(targetPort: 3001, name: "http");
73+
74+
var webApp = builder.AddProject("name");
75+
76+
webApp.WithEnvironment("MARKITDOWN_MCP_URL", markitdown.GetEndpoint("http"));
77+
78+
builder.Build().Run();
79+
```
80+
81+
Sample Ingestion Service:
82+
83+
```csharp
84+
string url = $"{Environment.GetEnvironmentVariable("MARKITDOWN_MCP_URL")}/mcp";
85+
86+
IngestionDocumentReader reader = new MarkItDownMcpReader(new Uri(url));
87+
```
88+
3489
## Feedback & Contributing
3590

3691
We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions).

test/Libraries/Microsoft.Extensions.DataIngestion.Tests/Microsoft.Extensions.DataIngestion.Tests.csproj

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,6 @@
2727
<PackageReference Include="OpenTelemetry.Exporter.InMemory" />
2828
</ItemGroup>
2929

30-
<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETFramework'">
31-
<!-- Workaround https://github.com/microsoft/semantic-kernel/issues/13316 -->
32-
<PackageReference Include="Microsoft.Bcl.AsyncInterfaces" VersionOverride="$(MicrosoftBclAsyncInterfacesVersion)" />
33-
</ItemGroup>
34-
3530
<ItemGroup>
3631
<Compile Include="..\Microsoft.Extensions.AI.Abstractions.Tests\TestChatClient.cs" />
3732
<Compile Include="..\Microsoft.Extensions.AI.Abstractions.Tests\TestEmbeddingGenerator.cs" />
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
4+
using System;
5+
using System.IO;
6+
using System.Threading.Tasks;
7+
using Xunit;
8+
9+
namespace Microsoft.Extensions.DataIngestion.Readers.Tests;
10+
11+
public class MarkItDownMcpReaderTests
{
    // Only argument validation is covered here: none of these tests reach the network,
    // so nothing needs to be listening at the reader's URI.
    //
    // NOTE: End-to-end tests against a live MarkItDown MCP server are intentionally absent because
    // the test setup does not provide one. Run those in a separate environment with a real server.

    [Fact]
    public void Constructor_ThrowsWhenMcpServerUriIsNull()
    {
        Assert.Throws<ArgumentNullException>("mcpServerUri", () => new MarkItDownMcpReader(null!));
    }

    [Fact]
    public async Task ReadAsync_ThrowsWhenIdentifierIsNull()
    {
        MarkItDownMcpReader reader = CreateReader();
        FileInfo file = new("fileName.txt");

        // FileInfo overload: null and empty identifiers are rejected.
        await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => reader.ReadAsync(file, identifier: null!));
        await Assert.ThrowsAsync<ArgumentException>("identifier", () => reader.ReadAsync(file, identifier: string.Empty));

        // Stream overload: same expectations.
        using MemoryStream emptyStream = new();
        await Assert.ThrowsAsync<ArgumentNullException>("identifier", () => reader.ReadAsync(emptyStream, identifier: null!, mediaType: "some"));
        await Assert.ThrowsAsync<ArgumentException>("identifier", () => reader.ReadAsync(emptyStream, identifier: string.Empty, mediaType: "some"));
    }

    [Fact]
    public async Task ReadAsync_ThrowsWhenSourceIsNull()
    {
        MarkItDownMcpReader reader = CreateReader();

        await Assert.ThrowsAsync<ArgumentNullException>("source", () => reader.ReadAsync(null!, "identifier"));
        await Assert.ThrowsAsync<ArgumentNullException>("source", () => reader.ReadAsync((Stream)null!, "identifier", "mediaType"));
    }

    [Fact]
    public async Task ReadAsync_ThrowsWhenFileDoesNotExist()
    {
        MarkItDownMcpReader reader = CreateReader();

        // A random name under the temp directory, which is never created.
        string missingPath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());
        FileInfo missingFile = new(missingPath);

        await Assert.ThrowsAsync<FileNotFoundException>(() => reader.ReadAsync(missingFile, "identifier"));
    }

    // Creates the reader used by the ReadAsync tests.
    private static MarkItDownMcpReader CreateReader()
        => new(new Uri("http://localhost:3001/sse"));
}

0 commit comments

Comments
 (0)