Site text sitemap support and test improvements

drmathias · Aug 28, 2023 · ab58b84 · ab58b84
1 parent c83390e
commit ab58b84
Show file tree

Hide file tree

Showing 20 changed files with 159,527 additions and 8,918 deletions.
diff --git a/.gitignore b/.gitignore
@@ -475,3 +475,6 @@ $RECYCLE.BIN/
 
 # Windows shortcuts
 *.lnk
+
+# Custom directory
+exclude/
diff --git a/README.md b/README.md
@@ -49,12 +49,11 @@ There is also the possibility to extend this library to support protocols other
 | Sitemap entries | ✔️ | |
 | Host directive | ✔️ | |
 | Crawl-delay directive | ✔️ | |
-| Sitemaps XML format | ✔️ | |
 | RSS 2.0 feeds | ❌ | 0.8 |
 | Atom 0.3/1.0 feeds | ❌ | 0.8 |
-| Simple text sitemaps | ❌ | 0.5 |
-| Memory management (500 KiB parsing limit) | ✔️ | |
-| Caching support | ❌ | 0.3 |
+| Sitemaps XML format | ✔️ | |
+| Simple text sitemaps | ✔️ | |
+| Memory management | ✔️ | |
 
 # Usage
 

diff --git a/src/Robots.Txt.Parser/Http/RobotWebClient.cs b/src/Robots.Txt.Parser/Http/RobotWebClient.cs
@@ -1,6 +1,7 @@
 using System;
 using System.Collections.Generic;
 using System.Net.Http;
+using System.Net.Mime;
 using System.Threading;
 using System.Threading.Tasks;
 
@@ -86,11 +87,17 @@ the 500-599 range.
         foreach (var uri in uris)
         {
             var request = new HttpRequestMessage(HttpMethod.Get, uri);
-            request.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml,*/*");
+            request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");
             var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
             if (!response.IsSuccessStatusCode) return null;
             using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);
-            var parsedSitemap = await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken);
+
+            var parsedSitemap = response.Content.Headers.ContentType?.MediaType switch
+            {
+                MediaTypeNames.Text.Plain => await SimpleTextSitemapParser.ReadFromStreamAsync(stream, cancellationToken),
+                MediaTypeNames.Text.Xml or MediaTypeNames.Application.Xml or _
+                    => await SitemapParser.ReadFromStreamAsync(stream, modifiedSince, cancellationToken)
+            };
 
             if (parsedSitemap is null)
             {

diff --git a/src/Robots.Txt.Parser/ProductToken.cs b/src/Robots.Txt.Parser/ProductToken.cs
@@ -20,6 +20,7 @@ public partial class ProductToken : IEquatable<string>, IEquatable<ProductToken>
     /// </summary>
     /// <param name="value">Raw product token value</param>
     /// <returns><see cref="ProductToken"/> that identifies a robot rule group</returns>
+    /// <exception cref="ArgumentOutOfRangeException">Product token is formatted incorrectly</exception>
     public static ProductToken Parse(string value)
     {
         if (value != Wildcard._value && !ValidationPattern.IsMatch(value))

diff --git a/src/Robots.Txt.Parser/RobotsTxtException.cs b/src/Robots.Txt.Parser/RobotsTxtException.cs
@@ -0,0 +1,38 @@
+using System;
+using System.Runtime.Serialization;
+
+namespace Robots.Txt.Parser;
+
+/// <summary>
+/// Exception raised when parsing a robots.txt file fails
+/// </summary>
+public class RobotsTxtException : Exception
+{
+    /// <summary>
+    /// Creates an exception with the default message
+    /// </summary>
+    internal RobotsTxtException() : base() { }
+
+    /// <summary>
+    /// Creates an exception with a description of the parsing failure
+    /// </summary>
+    /// <param name="message">Description of the failure</param>
+    internal RobotsTxtException(string? message)
+        : base(message) { }
+
+    /// <summary>
+    /// Creates an exception that wraps the underlying cause of the parsing failure
+    /// </summary>
+    /// <param name="message">Description of the failure</param>
+    /// <param name="innerException">Exception that triggered this one</param>
+    internal RobotsTxtException(string? message, Exception? innerException)
+        : base(message, innerException) { }
+
+    /// <summary>
+    /// Serialization constructor; forwards to the base <see cref="Exception"/> constructor
+    /// </summary>
+    /// <param name="info">Serialized object data</param>
+    /// <param name="context">Source or destination of the serialized stream</param>
+    protected RobotsTxtException(SerializationInfo info, StreamingContext context)
+        : base(info, context) { }
+}
diff --git a/src/Robots.Txt.Parser/RobotsTxtParser.cs b/src/Robots.Txt.Parser/RobotsTxtParser.cs
@@ -38,6 +38,7 @@ public RobotsTxtParser(IRobotClient robotClient)
     /// <param name="stream">The input stream</param>
     /// <param name="cancellationToken">Cancellation token</param>
     /// <returns>Parsed <see cref="IRobotsTxt"/></returns>
+    /// <exception cref="RobotsTxtException">Raised when there is an error parsing the robots.txt file</exception>
     public async Task<IRobotsTxt> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
     {
         string? line;
@@ -53,71 +54,78 @@ Crawlers MUST use case-insensitive matching to find the group that matches the p
         var userAgentRules = new Dictionary<ProductToken, HashSet<UrlRule>>();
         var userAgentCrawlDirectives = new Dictionary<ProductToken, int>();
 
-        /*
-          The file MUST be UTF-8 encoded
-        */
-        using var streamReader = new StreamReader(stream, Encoding.UTF8);
-        while ((line = await streamReader.ReadLineAsync(cancellationToken)) is not null)
+        try
         {
-            if (stream.Position > ByteCount500KiB) throw new OutOfMemoryException("Reached parsing limit");
+            /*
+              The file MUST be UTF-8 encoded
+            */
+            using var streamReader = new StreamReader(stream, Encoding.UTF8);
+            while ((line = await streamReader.ReadLineAsync(cancellationToken)) is not null && !cancellationToken.IsCancellationRequested)
+            {
+                if (stream.Position > ByteCount500KiB) throw new RobotsTxtException("Reached parsing limit");
 
-            if (line.StartsWith('#')) continue;
+                if (line.StartsWith('#')) continue;
 
-            if (line.StartsWith(UserAgentDirective, StringComparison.InvariantCultureIgnoreCase))
-            {
-                if (!previousLineWasUserAgent) currentUserAgents.Clear();
-                var currentUserAgent = GetValueOfDirective(line, UserAgentDirective);
-                if (ProductToken.TryParse(currentUserAgent, out var productToken))
+                if (line.StartsWith(UserAgentDirective, StringComparison.InvariantCultureIgnoreCase))
                 {
-                    currentUserAgents.Add(productToken);
-                    userAgentRules.TryAdd(productToken, new HashSet<UrlRule>());
-                    previousLineWasUserAgent = true;
+                    if (!previousLineWasUserAgent) currentUserAgents.Clear();
+                    var currentUserAgent = GetValueOfDirective(line, UserAgentDirective);
+                    if (ProductToken.TryParse(currentUserAgent, out var productToken))
+                    {
+                        currentUserAgents.Add(productToken);
+                        userAgentRules.TryAdd(productToken, new HashSet<UrlRule>());
+                        previousLineWasUserAgent = true;
+                    }
+                    continue;
                 }
-                continue;
-            }
 
-            if (currentUserAgents.Count == 0)
-            {
-                if (line.StartsWith(SitemapDirective, StringComparison.InvariantCultureIgnoreCase))
-                {
-                    var sitemapValue = GetValueOfDirective(line, SitemapDirective);
-                    if (Uri.TryCreate(sitemapValue, UriKind.Absolute, out var sitemapAddress)) sitemaps.Add(sitemapAddress);
-                }
-                else if (host is null && line.StartsWith(HostDirective, StringComparison.InvariantCultureIgnoreCase))
-                {
-                    var hostValue = GetValueOfDirective(line, HostDirective);
-                    if (Uri.IsWellFormedUriString(hostValue, UriKind.Absolute)
-                        && Uri.TryCreate(hostValue, UriKind.Absolute, out var uri)) hostValue = uri.Host;
-                    var hostNameType = Uri.CheckHostName(hostValue);
-                    if (hostNameType != UriHostNameType.Unknown && hostNameType != UriHostNameType.Basic) host = hostValue;
-                }
-            }
-            else
-            {
-                if (line.StartsWith(DisallowDirective, StringComparison.InvariantCultureIgnoreCase))
+                if (currentUserAgents.Count == 0)
                 {
-                    var disallowValue = GetValueOfDirective(line, DisallowDirective);
-                    foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Disallow, disallowValue));
-                }
-                else if (line.StartsWith(AllowDirective, StringComparison.InvariantCultureIgnoreCase))
-                {
-                    var allowedValue = GetValueOfDirective(line, AllowDirective);
-                    foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Allow, allowedValue));
+                    if (line.StartsWith(SitemapDirective, StringComparison.InvariantCultureIgnoreCase))
+                    {
+                        var sitemapValue = GetValueOfDirective(line, SitemapDirective);
+                        if (Uri.TryCreate(sitemapValue, UriKind.Absolute, out var sitemapAddress)) sitemaps.Add(sitemapAddress);
+                    }
+                    else if (host is null && line.StartsWith(HostDirective, StringComparison.InvariantCultureIgnoreCase))
+                    {
+                        var hostValue = GetValueOfDirective(line, HostDirective);
+                        if (Uri.IsWellFormedUriString(hostValue, UriKind.Absolute)
+                            && Uri.TryCreate(hostValue, UriKind.Absolute, out var uri)) hostValue = uri.Host;
+                        var hostNameType = Uri.CheckHostName(hostValue);
+                        if (hostNameType != UriHostNameType.Unknown && hostNameType != UriHostNameType.Basic) host = hostValue;
+                    }
                 }
-                else if (line.StartsWith(CrawlDelayDirective, StringComparison.InvariantCultureIgnoreCase))
+                else
                 {
-                    var crawlDelayValue = GetValueOfDirective(line, CrawlDelayDirective);
-                    if (int.TryParse(crawlDelayValue, out var parsedCrawlDelay))
+                    if (line.StartsWith(DisallowDirective, StringComparison.InvariantCultureIgnoreCase))
+                    {
+                        var disallowValue = GetValueOfDirective(line, DisallowDirective);
+                        foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Disallow, disallowValue));
+                    }
+                    else if (line.StartsWith(AllowDirective, StringComparison.InvariantCultureIgnoreCase))
+                    {
+                        var allowedValue = GetValueOfDirective(line, AllowDirective);
+                        foreach (var userAgent in currentUserAgents) userAgentRules[userAgent].Add(new UrlRule(RuleType.Allow, allowedValue));
+                    }
+                    else if (line.StartsWith(CrawlDelayDirective, StringComparison.InvariantCultureIgnoreCase))
                     {
-                        foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, parsedCrawlDelay);
+                        var crawlDelayValue = GetValueOfDirective(line, CrawlDelayDirective);
+                        if (int.TryParse(crawlDelayValue, out var parsedCrawlDelay))
+                        {
+                            foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, parsedCrawlDelay);
+                        }
                     }
                 }
+
+                previousLineWasUserAgent = false;
             }
 
-            previousLineWasUserAgent = false;
+            return new RobotsTxt(_robotClient, userAgentRules, userAgentCrawlDirectives, host, sitemaps);
+        }
+        catch (Exception e) when (e is not RobotsTxtException)
+        {
+            throw new RobotsTxtException("Unable to parse robots.txt", e);
         }
-
-        return new RobotsTxt(_robotClient, userAgentRules, userAgentCrawlDirectives, host, sitemaps);
     }
 
     private static string GetValueOfDirective(string line, string directive)

diff --git a/src/Robots.Txt.Parser/SimpleTextSitemapParser.cs b/src/Robots.Txt.Parser/SimpleTextSitemapParser.cs
@@ -0,0 +1,81 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+
+namespace Robots.Txt.Parser;
+
+/// <summary>
+/// Parses a <see cref="Sitemap"/> TXT document
+/// </summary>
+public static class SimpleTextSitemapParser
+{
+    private const int MaxLines = 50000;
+    private const int ByteCount50MiB = 52_428_800;
+
+    /// <summary>
+    /// Parses a <see cref="Sitemap"/> from a <see cref="Stream"/>
+    /// </summary>
+    /// <remarks>
+    /// Blank lines are skipped. If cancellation is observed between lines, reading stops and the
+    /// URLs parsed so far are returned. The size limit is checked against <see cref="Stream.Position"/>
+    /// for seekable streams and against an approximate UTF-8 byte count otherwise.
+    /// </remarks>
+    /// <param name="stream">Sitemap document stream</param>
+    /// <param name="cancellationToken">Cancellation token</param>
+    /// <returns>The parsed <see cref="Sitemap"/></returns>
+    /// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
+    public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
+    {
+        var urlSet = new HashSet<UrlSetItem>();
+        try
+        {
+            /*
+              The text file must use UTF-8 encoding.
+            */
+            using var streamReader = new StreamReader(stream, Encoding.UTF8);
+            string? line;
+            var lineCount = 0;
+            long approximateBytes = 0;
+            var isFirstLine = true;
+            while (((line = await streamReader.ReadLineAsync(cancellationToken)) is not null) && !cancellationToken.IsCancellationRequested)
+            {
+                // Every line after the first was preceded by at least one line terminator byte
+                approximateBytes += Encoding.UTF8.GetByteCount(line) + (isFirstLine ? 0 : 1);
+                isFirstLine = false;
+
+                /*
+                  Each text file ... and must be no larger than 50MiB (52,428,800 bytes)
+                */
+                // Stream.Position throws NotSupportedException on non-seekable streams such as network streams
+                var bytesRead = stream.CanSeek ? stream.Position : approximateBytes;
+                if (bytesRead > ByteCount50MiB) throw new SitemapException("Reached parsing limit");
+
+                if (string.IsNullOrWhiteSpace(line)) continue;
+
+                lineCount++;
+
+                /*
+                  Each text file can contain a maximum of 50,000 URLs
+                */
+                if (lineCount > MaxLines) throw new SitemapException("Reached line limit");
+
+                /*
+                  The text file must have one URL per line. The URLs cannot contain embedded new lines.
+                  You must fully specify URLs, including the http.
+                  The text file should contain no information other than the list of URLs.
+                  The text file should contain no header or footer information.
+                */
+                urlSet.Add(new UrlSetItem(new Uri(line), null, null, null));
+            }
+
+            return new Sitemap(urlSet);
+        }
+        catch (Exception e) when (e is not SitemapException)
+        {
+            throw new SitemapException("Unable to parse sitemap", e);
+        }
+    }
+}
diff --git a/src/Robots.Txt.Parser/SitemapParser.cs b/src/Robots.Txt.Parser/SitemapParser.cs
@@ -8,7 +8,7 @@
 namespace Robots.Txt.Parser;
 
 /// <summary>
-/// Parses a <see cref="Sitemap"/> document
+/// Parses a <see cref="Sitemap"/> XML document
 /// </summary>
 public class SitemapParser
 {
@@ -21,7 +21,7 @@ public class SitemapParser
     /// <param name="modifiedSince">Filters the sitemap on the modified date</param>
     /// <param name="cancellationToken">Cancellation token</param>
     /// <returns>The parsed <see cref="Sitemap"/></returns>
-    /// <exception cref="SitemapException">Thrown when the sitemap document is formatted incorrectly</exception>
+    /// <exception cref="SitemapException">Raised when there is an error parsing the Sitemap</exception>
     public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, DateTime? modifiedSince = null, CancellationToken cancellationToken = default)
     {
         try

diff --git a/tests/Robots.Txt.Parser.Tests.Unit/ProductTokenTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/ProductTokenTests.cs
@@ -145,12 +145,12 @@ public void TryParse_ValidProductToken_ReturnTrue()
     public void Equals_Null_NotEqual()
     {
         // Arrange
-        var a = ProductToken.Wildcard;
+        var productToken = ProductToken.Wildcard;
 
         // Act
-        var isEqualProductToken = a.Equals((ProductToken?)null);
-        var isEqualString = a.Equals((string?)null);
-        var isEqualObject = a.Equals((object?)null);
+        var isEqualProductToken = productToken.Equals((ProductToken?)null);
+        var isEqualString = productToken!.Equals((string?)null);
+        var isEqualObject = productToken!.Equals((object?)null);
 
         // Assert
         isEqualProductToken.Should().Be(false);

diff --git a/tests/Robots.Txt.Parser.Tests.Unit/Robots.Txt.Parser.Tests.Unit.csproj b/tests/Robots.Txt.Parser.Tests.Unit/Robots.Txt.Parser.Tests.Unit.csproj
@@ -25,8 +25,12 @@
   </ItemGroup>
 
   <ItemGroup>
-    <EmbeddedResource Include="over-50kib-robots.txt" />
-    <EmbeddedResource Include="under-50kib-robots.txt" />
+    <EmbeddedResource Include="over-500kib-robots.txt" />
+    <EmbeddedResource Include="exactly-500kib-robots.txt" />
+    <EmbeddedResource Include="over-50k-lines-sitemap.txt" />
+    <EmbeddedResource Include="exactly-50k-lines-sitemap.txt" />
+    <EmbeddedResource Include="over-50mib-sitemap.txt" />
+    <EmbeddedResource Include="exactly-50mib-sitemap.txt" />
   </ItemGroup>
 
   <ItemGroup>

diff --git a/tests/Robots.Txt.Parser.Tests.Unit/RobotsTxtParserTests.cs b/tests/Robots.Txt.Parser.Tests.Unit/RobotsTxtParserTests.cs
@@ -1,4 +1,3 @@
-using System;
 using System.IO;
 using System.Reflection;
 using System.Text;
@@ -71,11 +70,11 @@ public async Task ReadFromStreamAsync_WithEndOfLineComments_CommentsIgnored()
     }
 
     [Fact]
-    public async Task ReadFromStreamAsync_Under50KiB_DoNotThrow()
+    public async Task ReadFromStreamAsync_Exactly500KiB_DoNotThrow()
     {
         // Arrange
         var fileProvider = new EmbeddedFileProvider(Assembly.GetExecutingAssembly());
-        var stream = fileProvider.GetFileInfo("under-50kib-robots.txt").CreateReadStream();
+        var stream = fileProvider.GetFileInfo("exactly-500kib-robots.txt").CreateReadStream();
 
         // Act
         var parse = async () => await _parser.ReadFromStreamAsync(stream);
@@ -85,17 +84,17 @@ public async Task ReadFromStreamAsync_Under50KiB_DoNotThrow()
     }
 
     [Fact]
-    public async Task ReadFromStreamAsync_Over50KiB_ThrowOutOfMemoryException()
+    public async Task ReadFromStreamAsync_Over500KiB_ThrowRobotsTxtException()
     {
         // Arrange
         var fileProvider = new EmbeddedFileProvider(Assembly.GetExecutingAssembly());
-        var stream = fileProvider.GetFileInfo("over-50kib-robots.txt").CreateReadStream();
+        var stream = fileProvider.GetFileInfo("over-500kib-robots.txt").CreateReadStream();
 
         // Act
         var parse = async () => await _parser.ReadFromStreamAsync(stream);
 
         // Assert
-        await parse.Should().ThrowAsync<OutOfMemoryException>();
+        await parse.Should().ThrowAsync<RobotsTxtException>();
     }
 
     [Fact]