Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support reading files with missing white space after xref in lenient mode #906

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/CrossReferenceParserTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    /// <summary>
    /// Integration tests that open complete test documents to exercise cross reference parsing.
    /// </summary>
    public class CrossReferenceParserTests
    {
        /// <summary>
        /// Opens the "xref-with-no-whitespace.pdf" test document and checks that
        /// all 3 of its pages are reported.
        /// </summary>
        [Fact]
        public void CanReadDocumentWithMissingWhitespaceAfterXRef()
        {
            string documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("xref-with-no-whitespace.pdf");

            using (var pdf = PdfDocument.Open(documentPath))
            {
                Assert.Equal(3, pdf.NumberOfPages);
            }
        }
    }
}
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -300,18 +300,42 @@ 0000000418 00000 n
trailer
<< >>";
// Strict parsing
var input = StringBytesTestConverter.Scanner(data);
var act = () => CrossReferenceTableParser.Parse(input.scanner, 0, false);
var input = GetReader(data);
var act = () => CrossReferenceTableParser.Parse(input, 0, false);
var ex = Assert.Throws<PdfDocumentFormatException>(act);
Assert.Equal("Found a line with 2 unexpected entries in the cross reference table: 127, 0.", ex.Message);

// Lenient Parsing
input = StringBytesTestConverter.Scanner(data);
var result = CrossReferenceTableParser.Parse(input.scanner, 0, true);
input = GetReader(data);
var result = CrossReferenceTableParser.Parse(input, 0, true);

Assert.Equal(6, result.ObjectOffsets.Count);
}

[Fact]
public void ParsesMissingWhitespaceAfterXref()
{
    // The "xref" keyword runs straight into the first object number ("xref15"),
    // i.e. the whitespace that normally separates them is missing.
    var table = @"xref15 2
0000000190 00000 n
0000000250 00032 n

trailer
<<>>";

    // Strict parsing must reject the fused "xref15" operator.
    var strictScanner = GetReader(table);
    var failure = Assert.Throws<PdfDocumentFormatException>(
        () => CrossReferenceTableParser.Parse(strictScanner, 0, false));
    Assert.Equal("Unexpected operator in xref position: xref15.", failure.Message);

    // Lenient parsing gets a fresh scanner over the same data and must
    // recover both entries of the subsection.
    var lenientScanner = GetReader(table);
    var parsed = CrossReferenceTableParser.Parse(lenientScanner, 0, true);

    Assert.Equal(2, parsed.ObjectOffsets.Count);
}

private static CoreTokenScanner GetReader(string input)
{
return StringBytesTestConverter.Scanner(input).scanner;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public CrossReferenceTable Parse(IInputBytes bytes, bool isLenientParsing, long

tokenScanner.MoveNext();

if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
if (CrossReferenceTableParser.IsCrossReferenceMarker(tokenScanner, isLenientParsing))
{
missedAttempts = 0;
log.Debug("Element was cross reference table.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ internal static class CrossReferenceTableParser
{
private const string InUseEntry = "n";
private const string FreeEntry = "f";

public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{
var builder = new CrossReferenceTablePartBuilder
Expand All @@ -31,10 +31,22 @@ public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long

if (scanner.CurrentToken is OperatorToken operatorToken)
{
if (operatorToken.Data == "xref")
if (operatorToken.Data == OperatorToken.Xref.Data)
{
scanner.MoveNext();
}
else if (isLenientParsing)
{
if (operatorToken.Data.StartsWith(OperatorToken.Xref.Data))
{
scanner.Seek(scanner.CurrentPosition - operatorToken.Data.Length + OperatorToken.Xref.Data.Length);
scanner.MoveNext();
}
else
{
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
}
}
else
{
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
Expand Down Expand Up @@ -106,6 +118,15 @@ public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long
return builder.Build();
}

/// <summary>
/// Determines whether the scanner's current token marks the start of a cross reference table.
/// </summary>
/// <param name="scanner">The scanner whose current token is inspected. The scanner is only read, it is not moved.</param>
/// <param name="isLenientParsing">
/// When <see langword="true"/>, an operator made of the xref keyword immediately followed by
/// decimal digits (for example "xref15", where the separating whitespace is missing) is also accepted.
/// </param>
/// <returns>
/// <see langword="true"/> if the current token is an operator equal to the xref keyword, or, in lenient
/// mode, the xref keyword fused with an unsigned integer; otherwise <see langword="false"/>.
/// </returns>
public static bool IsCrossReferenceMarker(ISeekableTokenScanner scanner, bool isLenientParsing)
{
    if (!(scanner.CurrentToken is OperatorToken operatorToken))
    {
        return false;
    }

    var keyword = OperatorToken.Xref.Data;
    var data = operatorToken.Data;

    if (data == keyword)
    {
        return true;
    }

    // Ordinal comparison: a culture-sensitive StartsWith can match across ignorable
    // characters, after which the fixed-length Substring below would slice at the wrong place.
    if (!isLenientParsing || !data.StartsWith(keyword, StringComparison.Ordinal))
    {
        return false;
    }

    // Only plain decimal digits may follow the keyword. NumberStyles.None rejects signs and
    // surrounding whitespace, and the invariant culture keeps the result machine-independent.
    return int.TryParse(
        data.Substring(keyword.Length),
        System.Globalization.NumberStyles.None,
        System.Globalization.CultureInfo.InvariantCulture,
        out _);
}

private static int ProcessTokens(ReadOnlySpan<IToken> tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing,
int objectCount, ref TableSubsectionDefinition definition)
{
Expand Down
Loading