Skip to content

Commit 0783e08

Browse files
authored
Merge pull request ststeiger#447 from jltrem/jltrem/pdf-image-consolidation
Add image consolidation
2 parents d0b0a42 + f1323f4 commit 0783e08

File tree

4 files changed

+156
-4
lines changed

4 files changed

+156
-4
lines changed
65 KB
Loading

PdfSharpCore.Test/Merge.cs

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,74 @@
1-
using System.IO;
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Linq;
5+
using PdfSharpCore.Drawing;
6+
using PdfSharpCore.Drawing.Layout;
27
using PdfSharpCore.Pdf;
38
using PdfSharpCore.Pdf.IO;
49
using PdfSharpCore.Test.Helpers;
510
using Xunit;
11+
using Xunit.Abstractions;
612

713
namespace PdfSharpCore.Test
814
{
915
public class Merge
1016
{
17+
private readonly ITestOutputHelper _output;
18+
19+
/// <summary>
/// Creates the test class and stores the xUnit output helper supplied by the test runner.
/// </summary>
/// <param name="output">Output helper kept in <c>_output</c> for use by the tests.</param>
public Merge(ITestOutputHelper output) => _output = output;
23+
1124
/// <summary>
/// Merges two asset PDFs into one document, saves it to the "Out" directory and
/// verifies that a non-empty file was actually written.
/// </summary>
[Fact]
public void CanMerge2Documents()
{
    var pdf1Path = PathHelper.GetInstance().GetAssetPath("FamilyTree.pdf");
    var pdf2Path = PathHelper.GetInstance().GetAssetPath("test.pdf");

    var outputDocument = MergeDocuments(new[] { pdf1Path, pdf2Path });

    var outFilePath = CreateOutFilePath("merge.pdf");
    outputDocument.Save(outFilePath);

    // Without an explicit assertion the test would only fail if Save threw;
    // check that the merged file really exists and has content.
    Assert.True(File.Exists(outFilePath));
    Assert.True(new FileInfo(outFilePath).Length > 0);
}
35+
36+
/// <summary>
/// Builds two single-image PDFs, merges 50 copies of each into one document and
/// checks that the file saved after <c>ConsolidateImages</c> is less than a quarter
/// of the size of the file saved before it.
/// </summary>
[Fact]
public void CanConsolidateImageDataInDocument()
{
    var lennaDocument = CreateTestDocumentWithImage("lenna.png");
    var frogDocument = CreateTestDocumentWithImage("frog-and-toad.jpg");

    // Persist both source documents so they can be re-opened for import.
    var lennaPath = CreateOutFilePath("image-doc1.pdf");
    lennaDocument.Save(lennaPath);

    var frogPath = CreateOutFilePath("image-doc2.pdf");
    frogDocument.Save(frogPath);

    // 50 repetitions of the (doc1, doc2) pair => 100 input paths, alternating.
    var sourcePair = new[] { lennaPath, frogPath };
    var mergeInputs = Enumerable.Repeat(sourcePair, 50).SelectMany(pair => pair);
    var merged = MergeDocuments(mergeInputs);

    // Snapshot of the merged document before consolidation.
    var unconsolidatedPath = CreateOutFilePath("images-merged.pdf");
    merged.Save(unconsolidatedPath);

    // Snapshot of the same document after consolidation.
    merged.ConsolidateImages();
    var consolidatedPath = CreateOutFilePath("images-merged-consolidated.pdf");
    merged.Save(consolidatedPath);

    long sizeBefore = new FileInfo(unconsolidatedPath).Length;
    long sizeAfter = new FileInfo(consolidatedPath).Length;
    Assert.True(sizeAfter < sizeBefore / 4);
}
62+
63+
private static PdfDocument MergeDocuments(IEnumerable<string> pdfPaths)
64+
{
1765
var outputDocument = new PdfDocument();
1866

19-
foreach (var pdfPath in new[] { pdf1Path, pdf2Path })
67+
foreach (var pdfPath in pdfPaths)
2068
{
2169
using var fs = File.OpenRead(pdfPath);
2270
var inputDocument = Pdf.IO.PdfReader.Open(fs, PdfDocumentOpenMode.Import);
71+
2372
var count = inputDocument.PageCount;
2473
for (var idx = 0; idx < count; idx++)
2574
{
@@ -28,14 +77,34 @@ public void CanMerge2Documents()
2877
}
2978
}
3079

31-
var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", "merge.pdf");
80+
return outputDocument;
81+
}
82+
83+
/// <summary>
/// Builds the path of <paramref name="filename"/> inside the "Out" folder under the
/// test root directory, creating the containing directory when it does not exist yet.
/// </summary>
/// <param name="filename">File name to place under the "Out" folder.</param>
/// <returns>The combined output file path.</returns>
private static string CreateOutFilePath(string filename)
{
    var targetPath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", filename);
    var parentDirectory = Path.GetDirectoryName(targetPath);

    // Nothing to prepare when the directory is already there.
    if (Directory.Exists(parentDirectory))
    {
        return targetPath;
    }

    Directory.CreateDirectory(parentDirectory);
    return targetPath;
}
94+
95+
/// <summary>
/// Creates a one-page PDF document that shows the image file name as text and,
/// below it, the image itself loaded from the test assets.
/// </summary>
/// <param name="imageFilename">Name of the image file in the assets folder.</param>
/// <returns>The new, unsaved document.</returns>
private static PdfDocument CreateTestDocumentWithImage(string imageFilename)
{
    var document = new PdfDocument();
    var page = document.AddPage();

    // XGraphics and XImage are disposable; release them as soon as the page
    // content has been drawn instead of leaving them to the finalizer.
    using (var renderer = XGraphics.FromPdfPage(page))
    using (var image = XImage.FromFile(PathHelper.GetInstance().GetAssetPath(imageFilename)))
    {
        var textFormatter = new XTextFormatter(renderer);

        var layout = new XRect(12, 12, 400, 50);
        textFormatter.DrawString(imageFilename, new XFont("Arial", 12), XBrushes.Black, layout);
        renderer.DrawImage(image, new XPoint(12, 100));
    }

    return document;
}
40109
}
41110
}

PdfSharpCore.Test/PdfSharpCore.Test.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@
3838
<None Update="Assets\**\*.png">
3939
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
4040
</None>
41+
<None Update="Assets\frog-and-toad.jpg">
42+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
43+
</None>
4144
</ItemGroup>
4245

4346
</Project>

PdfSharpCore/Pdf/PdfDocument.cs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
#endregion
2929

3030
using System;
31+
using System.Collections.Generic;
3132
using System.Diagnostics;
3233
using System.IO;
3334
using System.Linq;
35+
using System.Security.Cryptography;
36+
using System.Text;
3437
using PdfSharpCore.Pdf.Advanced;
3538
using PdfSharpCore.Pdf.Internal;
3639
using PdfSharpCore.Pdf.IO;
@@ -809,6 +812,83 @@ public void MakeAcroFormsReadOnly()
809812
}
810813
}
811814

815+
/// <summary>
/// Makes XObject entries whose image streams are byte-identical share a single item.
/// All images found by <see cref="ImageInfo.FindAll"/> are grouped by the MD5 of their
/// stream data, and every entry in a group is rewritten to point at the same item.
/// </summary>
/// <remarks>
/// Images are matched on stream bytes only; other dictionary entries of the image
/// XObject are not compared.
/// </remarks>
public void ConsolidateImages()
{
    var images = ImageInfo.FindAll(this);

    // Pick one PdfItem per distinct MD5 (the last occurrence wins).
    // The MD5 stored on each ImageInfo is used directly as the key; object hash
    // codes are not unique and must not be used to identify an image.
    var mapMd5ToPdfItem = new Dictionary<string, PdfItem>();
    foreach (ImageInfo img in images)
    {
        mapMd5ToPdfItem[img.XObjectMD5] = img.Item.Value;
    }

    // Point every image entry at the item chosen for its MD5.
    foreach (ImageInfo img in images)
    {
        img.XObjects.Elements[img.Item.Key] = mapMd5ToPdfItem[img.XObjectMD5];
    }
}
836+
837+
/// <summary>
/// Describes one image entry of a page's /XObject resource dictionary together with
/// the MD5 hash of the image's stream data.
/// </summary>
internal class ImageInfo
{
    /// <summary>The /XObject dictionary that contains the entry.</summary>
    public PdfDictionary XObjects { get; }

    /// <summary>The entry (resource name and item) inside <see cref="XObjects"/>.</summary>
    public KeyValuePair<string, PdfItem> Item { get; }

    /// <summary>The image dictionary the entry's reference resolves to.</summary>
    public PdfDictionary XObject { get; }

    /// <summary>Lower-case hexadecimal MD5 of the image's stream bytes.</summary>
    public string XObjectMD5 { get; }

    /// <summary>
    /// Creates the info object and computes the MD5 of the image stream.
    /// </summary>
    /// <param name="xObjects">The /XObject dictionary that contains <paramref name="item"/>.</param>
    /// <param name="item">The dictionary entry referring to the image.</param>
    /// <param name="xObject">The image dictionary; its stream must hold data.</param>
    public ImageInfo(PdfDictionary xObjects, KeyValuePair<string, PdfItem> item, PdfDictionary xObject)
    {
        XObjects = xObjects;
        Item = item;
        XObject = xObject;
        XObjectMD5 = ComputeMD5(xObject.Stream.Value);
    }

    /// <summary>
    /// Get info for each image in the document.
    /// </summary>
    /// <remarks>
    /// Only the /Resources dictionary stored directly on each page is inspected, and
    /// only entries that are references to dictionaries with /Subtype /Image are
    /// returned. Image dictionaries without stream data are skipped because they
    /// cannot be hashed.
    /// </remarks>
    internal static List<ImageInfo> FindAll(PdfDocument doc) =>
        doc.Pages.Cast<PdfPage>()
            .Select(page => page.Elements.GetDictionary("/Resources"))
            .Select(resources => resources?.Elements?.GetDictionary("/XObject"))
            .Where(xObjects => xObjects?.Elements != null)
            .SelectMany(xObjects =>
                from item in xObjects.Elements
                let xObject = (item.Value as PdfReference)?.Value as PdfDictionary
                where xObject?.Elements?.GetString("/Subtype") == "/Image"
                      && xObject.Stream?.Value != null
                select new ImageInfo(xObjects, item, xObject)
            )
            .ToList();

    /// <summary>
    /// Compute and return the MD5 hash of the input data.
    /// </summary>
    /// <param name="input">The bytes to hash.</param>
    /// <returns>The hash as a lower-case hexadecimal string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    internal static string ComputeMD5(byte[] input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        // A short-lived hasher per call avoids shared mutable state and locking.
        byte[] hashBytes;
        using (var hasher = MD5.Create())
        {
            hashBytes = hasher.ComputeHash(input);
        }

        var sb = new StringBuilder(hashBytes.Length * 2);
        foreach (var x in hashBytes)
        {
            sb.Append(x.ToString("x2"));
        }

        return sb.ToString();
    }
}
891+
812892
/// <summary>
813893
/// Gets the security handler.
814894
/// </summary>

0 commit comments

Comments
 (0)