Skip to content

Add option to skip corrupt PDFs in PDFMergerUtility with improved exception handling #208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: trunk
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 147 additions & 82 deletions pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ public class PDFMergerUtility
private static final Logger LOG = LogManager.getLogger(PDFMergerUtility.class);

private final List<Object> sources;

private String destinationFileName;
private OutputStream destinationStream;
private boolean ignoreAcroFormErrors = false;
Expand All @@ -102,6 +103,26 @@ public class PDFMergerUtility
private DocumentMergeMode documentMergeMode = DocumentMergeMode.PDFBOX_LEGACY_MODE;
private AcroFormMergeMode acroFormMergeMode = AcroFormMergeMode.PDFBOX_LEGACY_MODE;

// When true, a source that fails to load is logged and skipped instead of aborting the merge.
private boolean skipCorruptFiles = false;
// Number of sources skipped by the merge methods; they reset it to 0 when a merge starts.
private int lastMergeSkippedCount = 0;
// Descriptions of skipped sources: the absolute path for a File source, otherwise a
// "RandomAccessRead source (index: N)" label.
private final List<String> skippedFiles = new ArrayList<>();

/**
 * Controls how a merge reacts to a source that cannot be loaded.
 *
 * @param skip {@code true} to log and skip such a source, {@code false} to let the merge
 *        fail with an IOException
 */
public void setSkipCorruptFiles(boolean skip)
{
    skipCorruptFiles = skip;
}

/**
 * Tells whether sources that cannot be loaded are skipped during a merge.
 *
 * @return {@code true} if such sources are skipped, {@code false} if they abort the merge
 */
public boolean isSkipCorruptFiles()
{
    return this.skipCorruptFiles;
}

/**
* The mode to use when merging documents:
*
Expand Down Expand Up @@ -370,67 +391,102 @@ else if (documentMergeMode == DocumentMergeMode.OPTIMIZE_RESOURCES_MODE)
{
optimizedMergeDocuments(streamCacheCreateFunction, compressParameters);
}
LOG.info("Merge completed. Total sources: {}, Skipped: {}", sources.size(), lastMergeSkippedCount);

}


/**
 * Returns the descriptions of the sources that were skipped while merging.
 *
 * @return a read-only view of the skipped sources; each entry is either the absolute path
 *         of a file or a {@code "RandomAccessRead source (index: N)"} label
 */
public List<String> getSkippedFiles()
{
    List<String> readOnlyView = Collections.unmodifiableList(skippedFiles);
    return readOnlyView;
}

/**
 * Add a PDF source given as an InputStream.
 * The stream is wrapped into a RandomAccessReadBuffer, which is what the merge loads later.
 * NOTE(review): the buffer presumably copies the stream content when it is constructed;
 * confirm whether the caller may close the stream right after this call.
 *
 * @param inputStream input stream containing the PDF to merge, must not be null
 * @throws IllegalArgumentException if inputStream is null
 * @throws java.io.UncheckedIOException if the stream cannot be read into the buffer
 */
public void addSource(InputStream inputStream)
{
    if (inputStream == null)
    {
        throw new IllegalArgumentException("InputStream cannot be null");
    }
    try
    {
        // RandomAccessReadBuffer(InputStream) declares IOException, so it has to be handled
        // here; wrapping it keeps this method's signature free of checked exceptions.
        sources.add(new RandomAccessReadBuffer(inputStream));
    }
    catch (IOException e)
    {
        throw new java.io.UncheckedIOException("Failed to read PDF source from InputStream", e);
    }
}

private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction,
CompressParameters compressParameters) throws IOException
CompressParameters compressParameters) throws IOException
{
StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null ? streamCacheCreateFunction
if (sources.isEmpty()) {
throw new IllegalStateException("No source PDFs provided. Use addSource() before merging.");
}

StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null
? streamCacheCreateFunction
: IOUtils.createMemoryOnlyStreamCache();
try (PDDocument destination = new PDDocument(strmCacheFunc))
{

try (PDDocument destination = new PDDocument(strmCacheFunc)) {
PDFCloneUtility cloner = new PDFCloneUtility(destination);
PDPageTree destinationPageTree = destination.getPages(); // cache PageTree
for (Object sourceObject : sources)
{
PDPageTree destinationPageTree = destination.getPages();

lastMergeSkippedCount = 0;


for (Object sourceObject : sources) {
PDDocument sourceDoc = null;
try
{
if (sourceObject instanceof File)
{
sourceDoc = Loader.loadPDF((File) sourceObject);
}
else
{
sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject);
try {
try {
if (sourceObject instanceof File) {
File file = (File) sourceObject;
sourceDoc = Loader.loadPDF(file);
} else {
sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject);
}
} catch (IOException e) {
String sourceDesc = (sourceObject instanceof File)
? ((File) sourceObject).getAbsolutePath()
: "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")";
if (skipCorruptFiles) {
LOG.warn("Skipping corrupt file: {}", sourceDesc, e);
skippedFiles.add(sourceDesc);
lastMergeSkippedCount++;
continue;
} else {
throw new IOException("Failed to load PDF from source: " + sourceDesc, e);
}
}
for (PDPage page : sourceDoc.getPages())
{

for (PDPage page : sourceDoc.getPages()) {
PDPage newPage = new PDPage(cloner.cloneForNewDocument(page.getCOSObject()));
newPage.setCropBox(page.getCropBox());
newPage.setMediaBox(page.getMediaBox());
newPage.setRotation(page.getRotation());

PDResources resources = page.getResources();
if (resources != null)
{
// this is smart enough to just create references for resources that are used on multiple
// pages
if (resources != null) {
newPage.setResources(new PDResources(
cloner.cloneForNewDocument(resources.getCOSObject())));
}
else
{
} else {
newPage.setResources(new PDResources());
}
destinationPageTree.add(newPage);
}
}
finally
{
} finally {
IOUtils.closeQuietly(sourceDoc);
}
}

if (destinationStream == null)
{

if (destinationStream == null) {
destination.save(destinationFileName, compressParameters);
}
else
{
} else {
destination.save(destinationStream, compressParameters);
}

}
}



/**
Expand All @@ -441,63 +497,72 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate
*
* @throws IOException If there is an error saving the document.
*/
private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction,
CompressParameters compressParameters) throws IOException
private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction,
CompressParameters compressParameters) throws IOException
{
if (!sources.isEmpty())
{
// Make sure that:
// - first Exception is kept
// - all PDDocuments are closed
// - all FileInputStreams are closed
// - there's a way to see which errors occurred
StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null ? streamCacheCreateFunction
: IOUtils.createMemoryOnlyStreamCache();
try (PDDocument destination = new PDDocument(strmCacheFunc))
{
for (Object sourceObject : sources)
{
PDDocument sourceDoc;
if (sourceObject instanceof File)
{
sourceDoc = Loader.loadPDF((File) sourceObject);
}
else
{
sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject);
if (sources.isEmpty()) {
throw new IllegalStateException("No source PDFs provided. Use addSource() before merging.");
}

StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null
? streamCacheCreateFunction
: IOUtils.createMemoryOnlyStreamCache();

try (PDDocument destination = new PDDocument(strmCacheFunc)) {
lastMergeSkippedCount = 0;

for (Object sourceObject : sources) {
PDDocument sourceDoc;
try {
try {
if (sourceObject instanceof File) {
File file = (File) sourceObject;
sourceDoc = Loader.loadPDF(file);
} else {
sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject);
}
} catch (IOException e) {
String sourceDesc = (sourceObject instanceof File)
? ((File) sourceObject).getAbsolutePath()
: "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")";
if (skipCorruptFiles) {
LOG.warn("Skipping corrupt file: {}", sourceDesc, e);
skippedFiles.add(sourceDesc);
lastMergeSkippedCount++;
continue;
} else {
throw new IOException("Failed to load PDF from source: " + sourceDesc, e);
}
}
try
{

try {
appendDocument(destination, sourceDoc);
}
finally
{
} finally {
IOUtils.closeAndLogException(sourceDoc, LOG, "PDDocument", null);
}
} catch (Exception e) {
LOG.error("Unexpected failure during legacy merge: ", e);
throw e;
}

// optionally set meta data
if (destinationDocumentInformation != null)
{
destination.setDocumentInformation(destinationDocumentInformation);
}
if (destinationMetadata != null)
{
destination.getDocumentCatalog().setMetadata(destinationMetadata);
}

if (destinationStream == null)
{
destination.save(destinationFileName, compressParameters);
}
else
{
destination.save(destinationStream, compressParameters);
}
}

if (destinationDocumentInformation != null) {
destination.setDocumentInformation(destinationDocumentInformation);
}
if (destinationMetadata != null) {
destination.getDocumentCatalog().setMetadata(destinationMetadata);
}

if (destinationStream == null) {
destination.save(destinationFileName, compressParameters);
} else {
destination.save(destinationStream, compressParameters);
}

}
}


/**
* append all pages from source to destination.
*
Expand Down