-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Site text sitemap support and test improvements
- Loading branch information
Showing
20 changed files
with
159,527 additions
and
8,918 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -475,3 +475,6 @@ $RECYCLE.BIN/ | |
|
||
# Windows shortcuts | ||
*.lnk | ||
|
||
# Custom directory | ||
exclude/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
using System; | ||
using System.Runtime.Serialization; | ||
|
||
namespace Robots.Txt.Parser; | ||
|
||
/// <summary>
/// Represents a failure encountered while parsing a robots.txt file
/// </summary>
public class RobotsTxtException : Exception
{
    /// <summary>
    /// Deserialization constructor; forwards the serialized state to <see cref="Exception"/>
    /// </summary>
    /// <param name="info">Holds the serialized exception data</param>
    /// <param name="context">Describes the source or destination of the serialized data</param>
    protected RobotsTxtException(SerializationInfo info, StreamingContext context) : base(info, context) { }

    /// <summary>
    /// Creates the exception without a custom message
    /// </summary>
    internal RobotsTxtException() : base() { }

    /// <summary>
    /// Creates the exception with a message describing the failure
    /// </summary>
    /// <param name="message">Description of the failure</param>
    internal RobotsTxtException(string? message) : base(message) { }

    /// <summary>
    /// Creates the exception with a message and the exception that caused it
    /// </summary>
    /// <param name="message">Description of the failure</param>
    /// <param name="innerException">Underlying cause of the failure</param>
    internal RobotsTxtException(string? message, Exception? innerException) : base(message, innerException) { }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
|
||
namespace Robots.Txt.Parser; | ||
|
||
/// <summary>
/// Parses a <see cref="Sitemap"/> from a plain-text (TXT) sitemap document, which lists one URL per line
/// </summary>
public static class SimpleTextSitemapParser
{
    // Limits quoted from the text sitemap format: at most 50,000 URLs and at most 50MiB per file.
    private const int MaxLines = 50000;
    private const int ByteCount50MiB = 52_428_800;

    /// <summary>
    /// Parses a <see cref="Sitemap"/> from a <see cref="Stream"/>
    /// </summary>
    /// <remarks>
    /// The stream is wrapped in a <see cref="StreamReader"/> that is disposed when parsing finishes,
    /// which also closes <paramref name="stream"/>. Blank or whitespace-only lines are skipped and do
    /// not count towards the URL limit. Both seekable and non-seekable streams are supported.
    /// </remarks>
    /// <param name="stream">Sitemap document stream</param>
    /// <param name="cancellationToken">Cancellation token</param>
    /// <returns>The parsed <see cref="Sitemap"/></returns>
    /// <exception cref="SitemapException">
    /// Raised when a size or line limit is exceeded, or when any other error occurs while parsing the
    /// Sitemap (including cancellation); in the latter case the original exception is the inner exception
    /// </exception>
    public static async Task<Sitemap> ReadFromStreamAsync(Stream stream, CancellationToken cancellationToken = default)
    {
        var urlSet = new HashSet<UrlSetItem>();
        try
        {
            using var streamReader = new StreamReader(stream);

            // Stream.Position throws NotSupportedException on non-seekable streams (network or
            // decompression streams, for example), so it may only be consulted when seeking is supported.
            var canSeek = stream.CanSeek;
            long estimatedBytes = 0;
            var lineCount = 0;
            string? line;

            while ((line = await streamReader.ReadLineAsync(cancellationToken)) is not null)
            {
                // A line served from the reader's buffer may not observe the token, so check it here.
                // Throwing (rather than leaving the loop) avoids returning a silently truncated sitemap;
                // the catch below wraps it, matching how a cancelled read already surfaces.
                cancellationToken.ThrowIfCancellationRequested();

                /*
                  Each text file ... and must be no larger than 50MiB (52,428,800 bytes)
                */
                long consumedBytes;
                if (canSeek)
                {
                    consumedBytes = stream.Position;
                }
                else
                {
                    // No position available: estimate from the encoded size of each line plus one byte
                    // for its terminator. This is a lower bound (a BOM or "\r\n" is not fully counted).
                    estimatedBytes += streamReader.CurrentEncoding.GetByteCount(line) + 1;
                    consumedBytes = estimatedBytes;
                }

                if (consumedBytes > ByteCount50MiB) throw new SitemapException("Reached parsing limit");

                if (string.IsNullOrWhiteSpace(line)) continue;

                lineCount++;

                /*
                  Each text file can contain a maximum of 50,000 URLs
                */
                if (lineCount > MaxLines) throw new SitemapException("Reached line limit");

                /*
                  The text file must have one URL per line. The URLs cannot contain embedded new lines.
                  You must fully specify URLs, including the http.
                  The text file must use UTF-8 encoding.
                  The text file should contain no information other than the list of URLs.
                  The text file should contain no header or footer information.
                */
                // new Uri(string) rejects anything that is not an absolute URI; the resulting
                // UriFormatException is wrapped by the catch below.
                urlSet.Add(new UrlSetItem(new Uri(line), null, null, null));
            }

            return new Sitemap(urlSet);
        }
        catch (Exception e) when (e is not SitemapException)
        {
            // Limit violations are already SitemapException and pass through untouched; everything else
            // is wrapped so callers only need to handle a single exception type.
            throw new SitemapException("Unable to parse sitemap", e);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.