Skip to content

Test for irregular whitespaces #1262

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
May 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions src/Elastic.Markdown/Myst/Linters/WhiteSpaceNormalizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Buffers;
using Elastic.Markdown.Diagnostics;
using Markdig;
using Markdig.Helpers;
using Markdig.Parsers;
using Markdig.Parsers.Inlines;
using Markdig.Renderers;
using Markdig.Renderers.Html;
using Markdig.Renderers.Html.Inlines;
using Markdig.Syntax.Inlines;

namespace Elastic.Markdown.Myst.Linters;

/// <summary>
/// Fluent registration helper for the irregular-whitespace linter extension.
/// </summary>
public static class WhiteSpaceNormalizerBuilderExtensions
{
    /// <summary>
    /// Registers <see cref="WhiteSpaceNormalizerBuilderExtension"/> on the pipeline builder
    /// unless an extension of that type is already present.
    /// </summary>
    /// <param name="pipeline">The pipeline builder to register the extension on.</param>
    /// <returns>The same <paramref name="pipeline"/> instance, so calls can be chained.</returns>
    public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
    {
        var registeredExtensions = pipeline.Extensions;
        registeredExtensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();

        return pipeline;
    }
}

/// <summary>
/// Markdig extension that wires the irregular-whitespace parser and its renderer into a pipeline.
/// </summary>
public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
{
    /// <summary>
    /// Inserts a <see cref="WhiteSpaceNormalizerParser"/> into the inline parsers,
    /// positioned before the <see cref="EmphasisInlineParser"/>.
    /// </summary>
    /// <param name="pipeline">The pipeline builder whose inline parsers are extended.</param>
    public void Setup(MarkdownPipelineBuilder pipeline)
    {
        var whiteSpaceParser = new WhiteSpaceNormalizerParser();
        _ = pipeline.InlineParsers.InsertBefore<EmphasisInlineParser>(whiteSpaceParser);
    }

    /// <summary>
    /// Inserts a <see cref="WhiteSpaceNormalizerRenderer"/> into the renderer's object renderers,
    /// positioned after the <see cref="EmphasisInlineRenderer"/>.
    /// </summary>
    /// <param name="pipeline">The built pipeline (not used here).</param>
    /// <param name="renderer">The renderer whose object renderers are extended.</param>
    public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer)
    {
        var whiteSpaceRenderer = new WhiteSpaceNormalizerRenderer();
        _ = renderer.ObjectRenderers.InsertAfter<EmphasisInlineRenderer>(whiteSpaceRenderer);
    }
}

/// <summary>
/// Inline parser that is triggered by a single irregular whitespace character. It replaces the
/// character with an <see cref="IrregularWhiteSpace"/> inline and emits a hint diagnostic that
/// names the offending code point.
/// </summary>
public class WhiteSpaceNormalizerParser : InlineParser
{
    // Collection of irregular whitespace characters that may impair Markdown rendering
    private static readonly char[] IrregularWhitespaceChars =
    [
        '\u000B', // Line Tabulation (\v) - <VT>
        '\u000C', // Form Feed (\f) - <FF>
        '\u00A0', // No-Break Space - <NBSP>
        '\u0085', // Next Line
        '\u1680', // Ogham Space Mark
        '\u180E', // Mongolian Vowel Separator - <MVS>
        '\ufeff', // Zero Width No-Break Space - <BOM>
        '\u2000', // En Quad
        '\u2001', // Em Quad
        '\u2002', // En Space - <ENSP>
        '\u2003', // Em Space - <EMSP>
        '\u2004', // Three-Per-Em
        '\u2005', // Four-Per-Em
        '\u2006', // Six-Per-Em
        '\u2007', // Figure Space
        '\u2008', // Punctuation Space - <PUNCSP>
        '\u2009', // Thin Space
        '\u200A', // Hair Space
        '\u200B', // Zero Width Space - <ZWSP>
        '\u2028', // Line Separator
        '\u2029', // Paragraph Separator
        '\u202F', // Narrow No-Break Space
        '\u205F', // Medium Mathematical Space
        '\u3000' // Ideographic Space
    ];

    // Pre-built lookup used by Match to test membership of a single character.
    private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);

    /// <summary>
    /// Creates the parser and registers every irregular whitespace character as an opening character.
    /// </summary>
    public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;

    /// <summary>
    /// Consumes one irregular whitespace character at the current slice position.
    /// </summary>
    /// <param name="processor">The inline processor that receives the new inline and the hint.</param>
    /// <param name="slice">The text slice; advanced by one character on a successful match.</param>
    /// <returns>
    /// <c>true</c> when the current character is an irregular whitespace character and was consumed;
    /// otherwise <c>false</c> and the slice is left untouched.
    /// </returns>
    public override bool Match(InlineProcessor processor, ref StringSlice slice)
    {
        // Look at the current character directly instead of slicing a one-element span.
        var c = slice.CurrentChar;
        if (!WhiteSpaceSearchValues.Contains(c))
            return false;

        // Create a fresh node per occurrence: a syntax node can only have one parent, so a shared
        // singleton cannot represent several occurrences (or several documents) at once.
        var start = processor.GetSourcePosition(slice.Start, out var line, out var column);
        var inline = new IrregularWhiteSpace { Line = line, Column = column };
        inline.Span.Start = start;
        inline.Span.End = start;
        processor.Inline = inline;

        // NOTE(review): presumably EmitHint derives the reported location from the inline's
        // Line/Column - confirm against the diagnostics extension.
        var charName = GetCharacterName(c);
        processor.EmitHint(inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");

        slice.SkipChar();
        return true;
    }

    // Helper to get a friendly name for the whitespace character
    private static string GetCharacterName(char c) => c switch
    {
        '\u000B' => "Line Tabulation (VT)",
        '\u000C' => "Form Feed (FF)",
        '\u00A0' => "No-Break Space (NBSP)",
        '\u0085' => "Next Line",
        '\u1680' => "Ogham Space Mark",
        '\u180E' => "Mongolian Vowel Separator (MVS)",
        '\ufeff' => "Zero Width No-Break Space (BOM)",
        '\u2000' => "En Quad",
        '\u2001' => "Em Quad",
        '\u2002' => "En Space (ENSP)",
        '\u2003' => "Em Space (EMSP)",
        '\u2004' => "Three-Per-Em",
        '\u2005' => "Four-Per-Em",
        '\u2006' => "Six-Per-Em",
        '\u2007' => "Figure Space",
        '\u2008' => "Punctuation Space (PUNCSP)",
        '\u2009' => "Thin Space",
        '\u200A' => "Hair Space",
        '\u200B' => "Zero Width Space (ZWSP)",
        '\u2028' => "Line Separator",
        '\u2029' => "Paragraph Separator",
        '\u202F' => "Narrow No-Break Space",
        '\u205F' => "Medium Mathematical Space",
        '\u3000' => "Ideographic Space",
        _ => "Unknown"
    };
}

/// <summary>
/// Leaf inline node that stands in for one irregular whitespace character in the parsed document.
/// </summary>
public class IrregularWhiteSpace : LeafInline
{
    /// <summary>
    /// A single shared, statically allocated node.
    /// </summary>
    public static readonly IrregularWhiteSpace Instance = new IrregularWhiteSpace();
}

/// <summary>
/// HTML renderer for <see cref="IrregularWhiteSpace"/> nodes: each node is written as one regular space.
/// </summary>
public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
{
    /// <summary>
    /// Writes a single U+0020 space character in place of the irregular whitespace node.
    /// </summary>
    /// <param name="renderer">The HTML renderer to write to.</param>
    /// <param name="obj">The node being rendered (its content is not inspected).</param>
    protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj)
    {
        const char regularSpace = ' ';
        _ = renderer.Write(regularSpace);
    }
}
21 changes: 13 additions & 8 deletions src/Elastic.Markdown/Myst/MarkdownParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,22 @@
// See the LICENSE file in the project root for more information

using System.IO.Abstractions;
using System.Text.RegularExpressions;

using Cysharp.IO;

using Elastic.Documentation.Diagnostics;
using Elastic.Markdown.Myst.CodeBlocks;
using Elastic.Markdown.Myst.Comments;
using Elastic.Markdown.Myst.Directives;
using Elastic.Markdown.Myst.FrontMatter;
using Elastic.Markdown.Myst.InlineParsers;
using Elastic.Markdown.Myst.InlineParsers.Substitution;
using Elastic.Markdown.Myst.Linters;
using Elastic.Markdown.Myst.Renderers;
using Elastic.Markdown.Myst.Roles;
using Elastic.Markdown.Myst.Roles.AppliesTo;

using Markdig;
using Markdig.Extensions.EmphasisExtras;
using Markdig.Parsers;
Expand Down Expand Up @@ -92,20 +98,18 @@ private static async Task<MarkdownDocument> ParseAsync(
MarkdownPipeline pipeline,
Cancel ctx)
{
string inputMarkdown;
if (path.FileSystem is FileSystem)
{
//real IO optimize through UTF8 stream reader.
await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
}
else
{
var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
}
inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);

var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
return markdownDocument;
}

// ReSharper disable once InconsistentNaming
Expand Down Expand Up @@ -156,6 +160,7 @@ public MarkdownPipeline Pipeline
.UseEnhancedCodeBlocks()
.UseHtmxLinkInlineRenderer()
.DisableHtml()
.UseWhiteSpaceNormalizer()
.UseHardBreaks();
_ = builder.BlockParsers.TryRemove<IndentedCodeBlockParser>();
_pipelineCached = builder.Build();
Expand Down
17 changes: 16 additions & 1 deletion tests/authoring/Framework/ErrorCollectorAssertions.fs
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions =
| Some e ->
let message = e.Message
test <@ message.Contains(expected) @>
| None -> failwithf "Expected errors but no errors were logged"
| None -> failwithf "Expected warnings but no warnings were logged"

[<DebuggerStepThrough>]
/// Asserts that at least one hint was collected and that the message of the first
/// hint-severity diagnostic contains `expected`.
let hasHint (expected: string) (actual: Lazy<GeneratorResults>) =
    let results = actual.Value
    results.Context.Collector.Hints |> shouldBeGreaterThan 0
    // Only the first diagnostic with Severity.Hint is inspected.
    let firstHint =
        results.Context.Collector.Diagnostics
        |> Seq.tryFind (fun d -> d.Severity = Severity.Hint)
    match firstHint with
    | None -> failwith "Expected hints but no hints were logged"
    | Some hint ->
        let message = hint.Message
        test <@ message.Contains(expected) @>
4 changes: 3 additions & 1 deletion tests/authoring/Framework/TestValues.fs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ type TestDiagnosticsOutput() =
match diagnostic.Severity with
| Severity.Error ->
output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})")
| _ ->
| Severity.Warning ->
output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})")
| _ ->
output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})")
| _ -> ()


Expand Down
1 change: 1 addition & 0 deletions tests/authoring/Inline/Comments.fs
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ not a comment
[<Fact>]
let ``validate HTML: commented line should not be emitted`` () =
markdown |> convertsToHtml """<p>not a comment</p>"""

23 changes: 23 additions & 0 deletions tests/authoring/Linters/WhiteSpaceNormalizers.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

module ``linters``.``white space normalizers``

open Xunit
open authoring


// Tests for the irregular-whitespace linter: one fixture document, checked for both
// the rendered HTML and the emitted hint diagnostic.
type ``white space detection`` () =

// Fixture: the text "not a" and "space" joined by a U+000B (Line Tabulation) character.
static let markdown = Setup.Markdown $"""
not a{'\u000B'}space
"""

// The rendered paragraph is expected to contain a regular space where U+000B was.
[<Fact>]
let ``validate HTML: should not contain bad space character`` () =
markdown |> convertsToHtml """<p>not a space</p>"""

// A hint naming the code point and its friendly name is expected for the fixture.
[<Fact>]
let ``emits a hint when a bad space is used`` () =
markdown |> hasHint "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering."
4 changes: 4 additions & 0 deletions tests/authoring/authoring.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,8 @@
<Compile Include="Directives\IncludeBlocks.fs" />
</ItemGroup>

<ItemGroup>
<Compile Include="Linters\WhiteSpaceNormalizers.fs" />
</ItemGroup>

</Project>
Loading