Skip to content

Commit

Permalink
Use pdfScanner in ReadVerticalDisplacements and fix #693 and return 0…
Browse files Browse the repository at this point in the history
… in CMap on exception in ReadByte() if useLenientParsing is true and fix #692
  • Loading branch information
BobLd committed Oct 18, 2024
1 parent ea95a7a commit e10609e
Show file tree
Hide file tree
Showing 11 changed files with 74 additions and 34 deletions.
Binary file not shown.
Binary file not shown.
30 changes: 30 additions & 0 deletions src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,36 @@

public class GithubIssuesTests
{
[Fact]
public void Issue693()
{
    // Regression test for GitHub issue #693: with lenient parsing enabled and
    // missing fonts skipped, the first page of the sample document must be
    // readable and yield the expected number of letters.
    var path = IntegrationHelpers.GetDocumentPath("reference-2-numeric-error.pdf");
    var options = new ParsingOptions
    {
        UseLenientParsing = true,
        SkipMissingFonts = true
    };

    // Disposed when the method exits, which is right after the assertion.
    using var document = PdfDocument.Open(path, options);

    var firstPage = document.GetPage(1);

    Assert.Equal(1269, firstPage.Letters.Count);
}

[Fact]
public void Issue692()
{
    // Regression test for GitHub issue #692: the same document is opened twice,
    // once leniently (page 1 must parse) and once strictly (page 1 must throw).
    var path = IntegrationHelpers.GetDocumentPath("cmap-parsing-exception.pdf");

    // Lenient mode: reading the page succeeds and produces the expected letters.
    var lenientOptions = new ParsingOptions
    {
        UseLenientParsing = true,
        SkipMissingFonts = true
    };

    using (var lenientDocument = PdfDocument.Open(path, lenientOptions))
    {
        var firstPage = lenientDocument.GetPage(1);

        Assert.Equal(796, firstPage.Letters.Count);
    }

    // Strict mode: reading the page surfaces the "end of byte set" failure.
    var strictOptions = new ParsingOptions
    {
        UseLenientParsing = false,
        SkipMissingFonts = false
    };

    using (var strictDocument = PdfDocument.Open(path, strictOptions))
    {
        var thrown = Assert.Throws<InvalidOperationException>(() => strictDocument.GetPage(1));

        // Only the prefix is compared; the original message is followed by an offset.
        Assert.StartsWith("Read byte called on input bytes which was at end of byte set.", thrown.Message);
    }
}

[Fact]
public void Issue874()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ public class IntegrationDocumentTests
[
"issue_671.pdf",
"GHOSTSCRIPT-698363-0.pdf",
"ErcotFacts.pdf"
"ErcotFacts.pdf",
"cmap-parsing-exception.pdf"
];

[Theory]
Expand Down
2 changes: 1 addition & 1 deletion src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ private static PdfDocument OpenDocument(
cidFontFactory,
filterProvider,
pdfScanner,
parsingOptions.Logger);
parsingOptions);

var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);

Expand Down
5 changes: 2 additions & 3 deletions src/UglyToad.PdfPig/PdfExtensions.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
namespace UglyToad.PdfPig
{
using System;
using System.Collections.Generic;
using System;
using System.Diagnostics.CodeAnalysis;
using Core;
using Filters;
using Parser.Parts;
using Parser.Parts;
using Tokenization.Scanner;
using Tokens;

Expand Down
18 changes: 11 additions & 7 deletions src/UglyToad.PdfPig/PdfFonts/Cmap/CMap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
/// The CMap (character code map) maps character codes to character identifiers (CIDs).
/// The set of characters which a CMap refers to is the "character set" (charset).
/// </summary>
internal class CMap
internal sealed class CMap
{
public CharacterIdentifierSystemInfo Info { get; }

Expand Down Expand Up @@ -140,13 +140,12 @@ public int ConvertToCid(int code)
return 0;
}


public override string ToString()
{
return Name;
}

public int ReadCode(IInputBytes bytes)
public int ReadCode(IInputBytes bytes, bool useLenientParsing)
{
if (hasEmptyCodespace)
{
Expand All @@ -166,7 +165,7 @@ public int ReadCode(IInputBytes bytes)
break;
}

result[i] = ReadByte(bytes);
result[i] = ReadByte(bytes, useLenientParsing);
}

for (int i = minCodeLength - 1; i < maxCodeLength; i++)
Expand All @@ -181,17 +180,23 @@ public int ReadCode(IInputBytes bytes)
}
if (byteCount < maxCodeLength)
{
result[byteCount] = ReadByte(bytes);
result[byteCount] = ReadByte(bytes, useLenientParsing);
}
}

throw new PdfDocumentFormatException($"CMap is invalid, min code length was {minCodeLength}, max was {maxCodeLength}.");
}

private static byte ReadByte(IInputBytes bytes)
private static byte ReadByte(IInputBytes bytes, bool useLenientParsing)
{
if (!bytes.MoveNext())
{
if (useLenientParsing)
{
// See issue #692
return 0;
}

throw new InvalidOperationException("Read byte called on input bytes which was at end of byte set. Current offset: " + bytes.CurrentOffset);
}

Expand All @@ -208,6 +213,5 @@ private static int ByteArrayToInt(ReadOnlySpan<byte> data)
}
return code;
}

}
}
6 changes: 3 additions & 3 deletions src/UglyToad.PdfPig/PdfFonts/Composite/ToUnicodeCMap.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
/// Defines the information content (actual text) of the font
/// as opposed to the display format.
/// </summary>
internal class ToUnicodeCMap
internal sealed class ToUnicodeCMap
{
private readonly CMap? cMap;

Expand Down Expand Up @@ -45,9 +45,9 @@ public bool TryGet(int code, [NotNullWhen(true)] out string? value)
return cMap.TryConvertToUnicode(code, out value);
}

public int ReadCode(IInputBytes inputBytes)
public int ReadCode(IInputBytes inputBytes, bool useLenientParsing)
{
return cMap!.ReadCode(inputBytes);
return cMap!.ReadCode(inputBytes, useLenientParsing);
}
}
}
7 changes: 6 additions & 1 deletion src/UglyToad.PdfPig/PdfFonts/Composite/Type0Font.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ internal sealed class Type0Font : IFont, IVerticalWritingSupported
private readonly Dictionary<int, CharacterBoundingBox> boundingBoxCache
= new Dictionary<int, CharacterBoundingBox>();

private readonly bool useLenientParsing;

public NameToken Name => BaseFont;

public NameToken BaseFont { get; }
Expand All @@ -41,6 +43,7 @@ public Type0Font(
CMap cmap,
CMap? toUnicodeCMap,
CMap? ucs2CMap,
ParsingOptions parsingOptions,
bool isChineseJapaneseOrKorean)
{
this.ucs2CMap = ucs2CMap;
Expand All @@ -52,13 +55,15 @@ public Type0Font(
ToUnicode = new ToUnicodeCMap(toUnicodeCMap);
Details = cidFont.Details?.WithName(Name.Data)
?? FontDetails.GetDefault(Name.Data);

useLenientParsing = parsingOptions.UseLenientParsing;
}

public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
{
var current = bytes.CurrentOffset;

var code = CMap.ReadCode(bytes);
var code = CMap.ReadCode(bytes, useLenientParsing);

codeLength = (int)(bytes.CurrentOffset - current);

Expand Down
10 changes: 6 additions & 4 deletions src/UglyToad.PdfPig/PdfFonts/Parser/Handlers/Type0FontHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@
using Tokens;
using Util;

internal class Type0FontHandler : IFontHandler
internal sealed class Type0FontHandler : IFontHandler
{
private readonly CidFontFactory cidFontFactory;
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner scanner;
private readonly ILog logger;
private readonly ParsingOptions parsingOptions;

public Type0FontHandler(
CidFontFactory cidFontFactory,
ILookupFilterProvider filterProvider,
IPdfTokenScanner scanner,
ILog logger)
ParsingOptions parsingOptions)
{
this.cidFontFactory = cidFontFactory;
this.filterProvider = filterProvider;
this.scanner = scanner;
this.logger = logger;
logger = parsingOptions.Logger;
this.parsingOptions = parsingOptions;
}

public IFont Generate(DictionaryToken dictionary)
Expand Down Expand Up @@ -91,7 +93,7 @@ public IFont Generate(DictionaryToken dictionary)
}
}

var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, isChineseJapaneseOrKorean);
var font = new Type0Font(baseFont!, cidFont, cMap, toUnicodeCMap, ucs2CMap, parsingOptions, isChineseJapaneseOrKorean);

return font;
}
Expand Down
27 changes: 13 additions & 14 deletions src/UglyToad.PdfPig/PdfFonts/Parser/Parts/CidFontFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
using UglyToad.PdfPig.Logging;
using Util;

internal class CidFontFactory
internal sealed class CidFontFactory
{
private readonly ILookupFilterProvider filterProvider;
private readonly IPdfTokenScanner pdfScanner;
Expand Down Expand Up @@ -46,7 +46,7 @@ public CidFontFactory(ILog log, IPdfTokenScanner pdfScanner, ILookupFilterProvid
defaultWidth = defaultWidthToken.Double;
}

var verticalWritingMetrics = ReadVerticalDisplacements(dictionary);
var verticalWritingMetrics = ReadVerticalDisplacements(dictionary, pdfScanner);

FontDescriptor? descriptor = null;
if (TryGetFontDescriptor(dictionary, out var descriptorDictionary))
Expand Down Expand Up @@ -190,7 +190,7 @@ private IReadOnlyDictionary<int, double> ReadWidths(DictionaryToken dict)
return widths;
}

private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict)
private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken dict, IPdfTokenScanner pdfScanner)
{
var verticalDisplacements = new Dictionary<int, double>();
var positionVectors = new Dictionary<int, PdfVector>();
Expand All @@ -210,22 +210,21 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
}

// vertical metrics for individual CIDs.
if (dict.TryGet(NameToken.W2, out var w2Token) && w2Token is ArrayToken w2)
if (dict.TryGet(NameToken.W2, pdfScanner, out ArrayToken? w2))
{
for (var i = 0; i < w2.Data.Count; i++)
{
var c = (NumericToken)w2.Data[i];
var c = DirectObjectFinder.Get<NumericToken>(w2.Data[i], pdfScanner);
var next = w2.Data[++i];

if (next is ArrayToken array)
if (DirectObjectFinder.TryGet(next, pdfScanner, out ArrayToken? array))
{
for (var j = 0; j < array.Data.Count; j++)
{
var cid = c.Int + j;
// ReSharper disable InconsistentNaming
var w1y = (NumericToken)array.Data[j];
var v1x = (NumericToken)array.Data[++j];
var v1y = (NumericToken)array.Data[++j];
var w1y = DirectObjectFinder.Get<NumericToken>(array.Data[j], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(array.Data[++j], pdfScanner);

verticalDisplacements[cid] = w1y.Double;

Expand All @@ -236,9 +235,9 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
{
var first = c.Int;
var last = ((NumericToken)next).Int;
var w1y = (NumericToken)w2.Data[++i];
var v1x = (NumericToken)w2.Data[++i];
var v1y = (NumericToken)w2.Data[++i];
var w1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1x = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
var v1y = DirectObjectFinder.Get<NumericToken>(w2.Data[++i], pdfScanner);
// ReSharper restore InconsistentNaming

for (var cid = first; cid <= last; cid++)
Expand All @@ -250,7 +249,7 @@ private static VerticalWritingMetrics ReadVerticalDisplacements(DictionaryToken
}
}
}

return new VerticalWritingMetrics(dw2, verticalDisplacements, positionVectors);
}

Expand Down

0 comments on commit e10609e

Please sign in to comment.