elastic · theletterf · May 19, 2025 · May 16, 2025 · May 16, 2025 · May 16, 2025
@@ -0,0 +1,127 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Buffers;
+using Elastic.Markdown.Diagnostics;
+using Markdig;
+using Markdig.Helpers;
+using Markdig.Parsers;
+using Markdig.Parsers.Inlines;
+using Markdig.Renderers;
+using Markdig.Renderers.Html;
+using Markdig.Renderers.Html.Inlines;
+using Markdig.Syntax.Inlines;
+
+namespace Elastic.Markdown.Myst.Linters;
+
+/// <summary>
+/// Pipeline-builder extension methods for enabling the irregular whitespace linter.
+/// </summary>
+public static class WhiteSpaceNormalizerBuilderExtensions
+{
+	/// <summary>
+	/// Adds <see cref="WhiteSpaceNormalizerBuilderExtension"/> to the builder's extension list
+	/// unless an extension of that type is already present.
+	/// </summary>
+	/// <param name="pipeline">The pipeline builder to configure.</param>
+	/// <returns>The same <paramref name="pipeline"/> instance, so calls can be chained.</returns>
+	public static MarkdownPipelineBuilder UseWhiteSpaceNormalizer(this MarkdownPipelineBuilder pipeline)
+	{
+		var extensions = pipeline.Extensions;
+		extensions.AddIfNotAlready<WhiteSpaceNormalizerBuilderExtension>();
+
+		return pipeline;
+	}
+}
+
+/// <summary>
+/// Markdig extension that registers <see cref="WhiteSpaceNormalizerParser"/> on the pipeline and
+/// <see cref="WhiteSpaceNormalizerRenderer"/> on HTML renderers.
+/// </summary>
+public class WhiteSpaceNormalizerBuilderExtension : IMarkdownExtension
+{
+	/// <summary>
+	/// Registers the whitespace parser ahead of <see cref="EmphasisInlineParser"/>.
+	/// </summary>
+	/// <param name="pipeline">The pipeline builder whose inline parsers are extended.</param>
+	public void Setup(MarkdownPipelineBuilder pipeline)
+	{
+		// Registering twice would make every irregular character go through two parser instances.
+		if (pipeline.InlineParsers.Contains<WhiteSpaceNormalizerParser>())
+			return;
+
+		var parser = new WhiteSpaceNormalizerParser();
+		// InsertBefore reports false when the anchor parser is absent (e.g. it was removed from the
+		// pipeline); append instead so the linter is not silently dropped.
+		if (!pipeline.InlineParsers.InsertBefore<EmphasisInlineParser>(parser))
+			pipeline.InlineParsers.Add(parser);
+	}
+
+	/// <summary>
+	/// Registers the whitespace renderer after <see cref="EmphasisInlineRenderer"/>.
+	/// </summary>
+	/// <param name="pipeline">The built pipeline (not used here).</param>
+	/// <param name="renderer">The renderer being configured; only HTML renderers are extended.</param>
+	public void Setup(MarkdownPipeline pipeline, IMarkdownRenderer renderer)
+	{
+		// WhiteSpaceNormalizerRenderer is an HtmlObjectRenderer, so it only belongs on an HtmlRenderer.
+		if (renderer is not HtmlRenderer)
+			return;
+
+		if (renderer.ObjectRenderers.Contains<WhiteSpaceNormalizerRenderer>())
+			return;
+
+		var whiteSpaceRenderer = new WhiteSpaceNormalizerRenderer();
+		// Same fallback as for the parser: never lose the renderer because the anchor is missing.
+		if (!renderer.ObjectRenderers.InsertAfter<EmphasisInlineRenderer>(whiteSpaceRenderer))
+			renderer.ObjectRenderers.Add(whiteSpaceRenderer);
+	}
+}
+
+/// <summary>
+/// Inline parser that matches a single irregular whitespace character, replaces it with an
+/// <see cref="IrregularWhiteSpace"/> inline and emits a hint naming the offending character.
+/// </summary>
+public class WhiteSpaceNormalizerParser : InlineParser
+{
+	// Collection of irregular whitespace characters that may impair Markdown rendering
+	private static readonly char[] IrregularWhitespaceChars =
+	[
+		'\u000B', // Line Tabulation (\v) - <VT>
+		'\u000C', // Form Feed (\f) - <FF>
+		'\u00A0', // No-Break Space - <NBSP>
+		'\u0085', // Next Line
+		'\u1680', // Ogham Space Mark
+		'\u180E', // Mongolian Vowel Separator - <MVS>
+		'\ufeff', // Zero Width No-Break Space - <BOM>
+		'\u2000', // En Quad
+		'\u2001', // Em Quad
+		'\u2002', // En Space - <ENSP>
+		'\u2003', // Em Space - <EMSP>
+		'\u2004', // Three-Per-Em
+		'\u2005', // Four-Per-Em
+		'\u2006', // Six-Per-Em
+		'\u2007', // Figure Space
+		'\u2008', // Punctuation Space - <PUNCSP>
+		'\u2009', // Thin Space
+		'\u200A', // Hair Space
+		'\u200B', // Zero Width Space - <ZWSP>
+		'\u2028', // Line Separator
+		'\u2029', // Paragraph Separator
+		'\u202F', // Narrow No-Break Space
+		'\u205F', // Medium Mathematical Space
+		'\u3000'  // Ideographic Space
+	];
+
+	// Same character set as above, pre-built for fast membership checks in Match.
+	private static readonly SearchValues<char> WhiteSpaceSearchValues = SearchValues.Create(IrregularWhitespaceChars);
+
+	/// <summary>
+	/// Creates the parser and declares every irregular whitespace character as an opening character.
+	/// </summary>
+	public WhiteSpaceNormalizerParser() => OpeningCharacters = IrregularWhitespaceChars;
+
+	/// <summary>
+	/// Consumes one irregular whitespace character at the start of <paramref name="slice"/>.
+	/// </summary>
+	/// <param name="processor">The inline processor that receives the new inline and the hint.</param>
+	/// <param name="slice">The text being parsed; advanced by one character on a match.</param>
+	/// <returns>
+	/// <c>true</c> when the current character is an irregular whitespace character and was consumed;
+	/// otherwise <c>false</c>, leaving <paramref name="slice"/> untouched.
+	/// </returns>
+	public override bool Match(InlineProcessor processor, ref StringSlice slice)
+	{
+		var c = slice.CurrentChar;
+		if (!WhiteSpaceSearchValues.Contains(c))
+			return false;
+
+		// Create a fresh node for every match. An inline node belongs to exactly one position in one
+		// document tree, so a shared instance cannot represent several occurrences (or documents).
+		var position = processor.GetSourcePosition(slice.Start, out var line, out var column);
+		processor.Inline = new IrregularWhiteSpace
+		{
+			Span = new global::Markdig.Syntax.SourceSpan(position, position),
+			Line = line,
+			Column = column
+		};
+
+		var charName = GetCharacterName(c);
+
+		// NOTE(review): EmitHint presumably derives the reported location from the inline's
+		// Line/Column set above - confirm against its implementation.
+		processor.EmitHint(processor.Inline, 1, $"Irregular whitespace character detected: U+{(int)c:X4} ({charName}). This may impair Markdown rendering.");
+
+		slice.SkipChar();
+		return true;
+	}
+
+	// Helper to get a friendly name for the whitespace character
+	private static string GetCharacterName(char c) => c switch
+	{
+		'\u000B' => "Line Tabulation (VT)",
+		'\u000C' => "Form Feed (FF)",
+		'\u00A0' => "No-Break Space (NBSP)",
+		'\u0085' => "Next Line",
+		'\u1680' => "Ogham Space Mark",
+		'\u180E' => "Mongolian Vowel Separator (MVS)",
+		'\ufeff' => "Zero Width No-Break Space (BOM)",
+		'\u2000' => "En Quad",
+		'\u2001' => "Em Quad",
+		'\u2002' => "En Space (ENSP)",
+		'\u2003' => "Em Space (EMSP)",
+		'\u2004' => "Three-Per-Em",
+		'\u2005' => "Four-Per-Em",
+		'\u2006' => "Six-Per-Em",
+		'\u2007' => "Figure Space",
+		'\u2008' => "Punctuation Space (PUNCSP)",
+		'\u2009' => "Thin Space",
+		'\u200A' => "Hair Space",
+		'\u200B' => "Zero Width Space (ZWSP)",
+		'\u2028' => "Line Separator",
+		'\u2029' => "Paragraph Separator",
+		'\u202F' => "Narrow No-Break Space",
+		'\u205F' => "Medium Mathematical Space",
+		'\u3000' => "Ideographic Space",
+		_ => "Unknown"
+	};
+}
+
+/// <summary>
+/// Marker inline standing in for one irregular whitespace character; it declares no members of
+/// its own beyond the shared <see cref="Instance"/>.
+/// </summary>
+public class IrregularWhiteSpace : LeafInline
+{
+	/// <summary>A pre-built, process-wide instance.</summary>
+	// NOTE(review): inline nodes are normally attached to a single parent in a single document;
+	// confirm that handing out one shared instance is safe wherever this field is used.
+	public static readonly IrregularWhiteSpace Instance = new();
+}
+
+/// <summary>
+/// HTML renderer for <see cref="IrregularWhiteSpace"/>: writes a single regular space in its place.
+/// </summary>
+public class WhiteSpaceNormalizerRenderer : HtmlObjectRenderer<IrregularWhiteSpace>
+{
+	/// <summary>Writes one ASCII space to the output, regardless of the inline's content.</summary>
+	/// <param name="renderer">The HTML renderer receiving the output.</param>
+	/// <param name="obj">The inline being rendered (not inspected).</param>
+	protected override void Write(HtmlRenderer renderer, IrregularWhiteSpace obj)
+	{
+		const char regularSpace = ' ';
+		renderer.Write(regularSpace);
+	}
+}
@@ -3,16 +3,22 @@
 // See the LICENSE file in the project root for more information
 
 using System.IO.Abstractions;
+using System.Text.RegularExpressions;
+
 using Cysharp.IO;
+
+using Elastic.Documentation.Diagnostics;
 using Elastic.Markdown.Myst.CodeBlocks;
 using Elastic.Markdown.Myst.Comments;
 using Elastic.Markdown.Myst.Directives;
 using Elastic.Markdown.Myst.FrontMatter;
 using Elastic.Markdown.Myst.InlineParsers;
 using Elastic.Markdown.Myst.InlineParsers.Substitution;
+using Elastic.Markdown.Myst.Linters;
 using Elastic.Markdown.Myst.Renderers;
 using Elastic.Markdown.Myst.Roles;
 using Elastic.Markdown.Myst.Roles.AppliesTo;
+
 using Markdig;
 using Markdig.Extensions.EmphasisExtras;
 using Markdig.Parsers;
@@ -92,20 +98,18 @@ private static async Task<MarkdownDocument> ParseAsync(
 		MarkdownPipeline pipeline,
 		Cancel ctx)
 	{
+		string inputMarkdown;
 		if (path.FileSystem is FileSystem)
 		{
 			//real IO optimize through UTF8 stream reader.
 			await using var streamReader = new Utf8StreamReader(path.FullName, fileOpenMode: FileOpenMode.Throughput);
-			var inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
-			var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
-			return markdownDocument;
+			inputMarkdown = await streamReader.AsTextReader().ReadToEndAsync(ctx);
 		}
 		else
-		{
-			var inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
-			var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
-			return markdownDocument;
-		}
+			inputMarkdown = await path.FileSystem.File.ReadAllTextAsync(path.FullName, ctx);
+
+		var markdownDocument = Markdig.Markdown.Parse(inputMarkdown, pipeline, context);
+		return markdownDocument;
 	}
 
 	// ReSharper disable once InconsistentNaming
@@ -156,6 +160,7 @@ public MarkdownPipeline Pipeline
 				.UseEnhancedCodeBlocks()
 				.UseHtmxLinkInlineRenderer()
 				.DisableHtml()
+				.UseWhiteSpaceNormalizer()
 				.UseHardBreaks();
 			_ = builder.BlockParsers.TryRemove<IndentedCodeBlockParser>();
 			_pipelineCached = builder.Build();

@@ -54,4 +54,19 @@ module DiagnosticsCollectorAssertions =
         | Some e ->
             let message = e.Message
             test <@ message.Contains(expected) @>
-        | None -> failwithf "Expected errors but no errors were logged"
+        | None -> failwithf "Expected warnings but no warnings were logged"
+
+    /// Asserts that the collector counted at least one hint and that the message of the first
+    /// hint-severity diagnostic contains `expected`.
+    [<DebuggerStepThrough>]
+    let hasHint (expected: string) (actual: Lazy<GeneratorResults>) =
+        let results = actual.Value
+        let collector = results.Context.Collector
+        collector.Hints |> shouldBeGreaterThan 0
+        // Only the first diagnostic with hint severity is inspected
+        let firstHint =
+            collector.Diagnostics
+            |> Seq.tryFind (fun d -> d.Severity = Severity.Hint)
+        match firstHint with
+        | None -> failwithf "Expected hints but no hints were logged"
+        | Some hint ->
+            let message = hint.Message
+            test <@ message.Contains(expected) @>
@@ -26,8 +26,10 @@ type TestDiagnosticsOutput() =
                 match diagnostic.Severity with
                 | Severity.Error ->
                     output.WriteLine($"Error: {diagnostic.Message} ({diagnostic.File}:{line})")
-                | _ ->
+                | Severity.Warning ->
                     output.WriteLine($"Warn : {diagnostic.Message} ({diagnostic.File}:{line})")
+                | _ ->
+                    output.WriteLine($"Hint : {diagnostic.Message} ({diagnostic.File}:{line})")
             | _ -> ()
 
 

@@ -17,3 +17,4 @@ not a comment
     [<Fact>]
     let ``validate HTML: commented line should not be emitted`` () =
         markdown |> convertsToHtml """<p>not a comment</p>"""
+
@@ -0,0 +1,23 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+module ``linters``.``white space normalizers``
+
+open Xunit
+open authoring
+
+
+type ``white space detection`` () =
+
+    // Fixture: one paragraph whose last two words are separated by U+000B instead of a space
+    static let markdown = Setup.Markdown $"""
+not a{'\u000B'}space
+"""
+
+    // The irregular character must come out as a regular space in the HTML
+    [<Fact>]
+    let ``validate HTML: should not contain bad space character`` () =
+        convertsToHtml "<p>not a space</p>" markdown
+
+    // The linter must report the character's code point and friendly name
+    [<Fact>]
+    let ``emits a hint when a bad space is used`` () =
+        hasHint "Irregular whitespace character detected: U+000B (Line Tabulation (VT)). This may impair Markdown rendering." markdown
@@ -58,4 +58,8 @@
     <Compile Include="Directives\IncludeBlocks.fs" />
   </ItemGroup>
 
+  <ItemGroup>
+    <Compile Include="Linters\WhiteSpaceNormalizers.fs" />
+  </ItemGroup>
+
 </Project>
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,3 +17,4 @@ not a comment
		[<Fact>]
		let ``validate HTML: commented line should not be emitted`` () =
		markdown \|> convertsToHtml """<p>not a comment</p>"""