NXP-8918: allow PDF/A conversion

Gagnavarslan · Feb 24, 2012 · e8475de · e8475de
1 parent 09cc14f
commit e8475de
Show file tree

Hide file tree

Showing 3 changed files with 114 additions and 48 deletions.
diff --git a/...tform-convert/src/main/java/org/nuxeo/ecm/platform/convert/plugins/JODBasedConverter.java b/...tform-convert/src/main/java/org/nuxeo/ecm/platform/convert/plugins/JODBasedConverter.java
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2006-2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
+ * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
  *
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the GNU Lesser General Public License
@@ -12,17 +12,16 @@
  * Lesser General Public License for more details.
  *
  * Contributors:
- *     Nuxeo - initial API and implementation
- *
- * $Id$
+ *     Nuxeo
+ *     Florent Guillaume
  */
-
 package org.nuxeo.ecm.platform.convert.plugins;
 
 import java.io.File;
 import java.io.InputStream;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -50,6 +49,10 @@
 
 import com.sun.star.uno.RuntimeException;
 
+/**
+ * Converter based on JOD which uses an external OpenOffice process to do actual
+ * conversions.
+ */
 public class JODBasedConverter implements ExternalConverter {
 
     protected static final String TMP_PATH_PARAMETER = "TmpDirectory";
@@ -58,6 +61,21 @@ public class JODBasedConverter implements ExternalConverter {
 
     private static final DocumentFormatRegistry formatRegistry = new DefaultDocumentFormatRegistry();
 
+    /**
+     * Boolean conversion parameter for PDF/A-1.
+     *
+     * @since 5.6
+     */
+    public static final String PDFA1_PARAM = "PDF/A-1";
+
+    protected static final Map<DocumentFamily, String> PDF_FILTER_NAMES = new HashMap<DocumentFamily, String>();
+    static { // PDF export filter per source family; static so the shared map is filled once, at class load
+        PDF_FILTER_NAMES.put(DocumentFamily.TEXT, "writer_pdf_Export");
+        PDF_FILTER_NAMES.put(DocumentFamily.SPREADSHEET, "calc_pdf_Export");
+        PDF_FILTER_NAMES.put(DocumentFamily.PRESENTATION, "impress_pdf_Export");
+        PDF_FILTER_NAMES.put(DocumentFamily.DRAWING, "draw_pdf_Export");
+    }
+
     protected ConverterDescriptor descriptor;
 
     protected String getDestinationMimeType() {
@@ -69,11 +87,42 @@ protected String getDestinationMimeType() {
      * <p>
      * It takes the actual destination mimetype from the plugin configuration.
      *
-     * @return the DestinationFormat for this given plugin. {@see
-     *         org.nuxeo.ecm.platform.transform.interfaces.Plugin}
+     * @param sourceFormat the source format
+     * @param pdfa1 true if PDF/A-1 is required
      */
-    private DocumentFormat getDestinationFormat() {
-        return formatRegistry.getFormatByMediaType(getDestinationMimeType());
+    protected DocumentFormat getDestinationFormat(DocumentFormat sourceFormat,
+            boolean pdfa1) {
+        String targetMimeType = getDestinationMimeType();
+        // HTML sources need a dedicated PDF export filter, see below
+        boolean fromHtml = "text/html".equals(sourceFormat.getMediaType());
+        if (!"application/pdf".equals(targetMimeType)) {
+            // not a PDF target: use default JODConverter registry
+            return formatRegistry.getFormatByMediaType(targetMimeType);
+        }
+
+        // PDF target: build a format carrying explicit store properties
+        String formatName = pdfa1 ? "PDF/A-1" : "PDF";
+        DocumentFormat pdfFormat = new DocumentFormat(formatName, "pdf",
+                "application/pdf");
+        DocumentFamily family = sourceFormat.getInputFamily();
+
+        // we have to be strict regarding output FilterName: HTML input must
+        // use "writer_web_pdf_Export" instead of "writer_pdf_Export"
+        String exportFilter = fromHtml ? "writer_web_pdf_Export"
+                : PDF_FILTER_NAMES.get(family);
+        Map<String, Object> properties = new HashMap<String, Object>();
+        properties.put("FilterName", exportFilter);
+
+        if (pdfa1) {
+            // PDF/A-1 export options travel as a nested "FilterData" map
+            Map<String, Object> pdfOptions = new HashMap<String, Object>();
+            pdfOptions.put("SelectPdfVersion", Integer.valueOf(1)); // PDF/A-1
+            pdfOptions.put("UseTaggedPDF", Boolean.TRUE); // per spec
+            properties.put("FilterData", pdfOptions);
+        }
+
+        pdfFormat.setStoreProperties(family, properties);
+        return pdfFormat;
     }
 
     /**
@@ -104,26 +153,7 @@ protected void finalize() throws Throwable {
         super.finalize();
     }
 
-    private static boolean adaptFilterNameForHTML2PDF(DocumentFormat sourceFormat,
-            DocumentFormat destinationFormat) {
-
-        // TODO: solve this
-        // due to a random bug, we have to be strict regarding otuput FilterName
-        // html file have to use "writer_web_pdf_Export" instead of JODConverter
-        // simplification "writer_pdf_Export"
-        // patch dynamically
-        if ("text/html".equals(sourceFormat.getMediaType())
-                && "application/pdf".equals(destinationFormat.getMediaType())) {
-            // change the FilterName
-            DocumentFamily family = sourceFormat.getInputFamily();
-            Map<String, String> storeProperties = new HashMap<String, String>();
-            storeProperties.put("FilterName", "writer_web_pdf_Export");
-            destinationFormat.setStoreProperties(family, storeProperties);
-            return true;
-        }
-        return false;
-    }
-
+    @Override
     public BlobHolder convert(BlobHolder blobHolder,
             Map<String, Serializable> parameters) throws ConversionException {
         Blob inputBlob;
@@ -142,6 +172,9 @@ public BlobHolder convert(BlobHolder blobHolder,
         // This plugin do deal only with one input source.
         String sourceMimetype = inputBlob.getMimeType();
 
+        boolean pdfa1 = parameters != null
+                && Boolean.TRUE.equals(parameters.get(PDFA1_PARAM));
+
         if (documentConverter != null) {
             File sourceFile = null;
             File outFile = null;
@@ -164,13 +197,12 @@ public BlobHolder convert(BlobHolder blobHolder,
                 // stream.reset(); // works on a JCRBlobInputStream
                 // }
                 FileUtils.copyToFile(stream, sourceFile);
-                DocumentFormat sourceFormat = null;
 
+                DocumentFormat sourceFormat = null;
                 if (sourceMimetype != null) {
                     // Try to fetch it from the registry.
                     sourceFormat = getSourceFormat(sourceMimetype);
                 }
-
                 // If not found in the registry or not given as a parameter.
                 // Try to sniff ! What does that smell ? :)
                 if (sourceFormat == null) {
@@ -179,7 +211,8 @@ public BlobHolder convert(BlobHolder blobHolder,
 
                 // From plugin settings because we know the destination
                 // mimetype.
-                DocumentFormat destinationFormat = getDestinationFormat();
+                DocumentFormat destinationFormat = getDestinationFormat(
+                        sourceFormat, pdfa1);
 
                 // allow HTML2PDF filtering
 
@@ -225,8 +258,6 @@ public BlobHolder convert(BlobHolder blobHolder,
                     }
 
                 } else {
-                    adaptFilterNameForHTML2PDF(sourceFormat, destinationFormat);
-
                     outFile = File.createTempFile("NXJOOoConverterDocumentOut",
                             '.' + destinationFormat.getExtension());
 
@@ -273,10 +304,12 @@ sourceMimetype, getDestinationMimeType(),
 
     }
 
+    @Override
     public void init(ConverterDescriptor descriptor) {
         this.descriptor = descriptor;
     }
 
+    @Override
     public ConverterCheckResult isConverterAvailable() {
         ConverterCheckResult result = new ConverterCheckResult();
         try {

diff --git a/...latform-convert/src/test/java/org/nuxeo/ecm/platform/convert/tests/BaseConverterTest.java b/...latform-convert/src/test/java/org/nuxeo/ecm/platform/convert/tests/BaseConverterTest.java
@@ -24,6 +24,9 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.util.PDFTextStripper;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -35,8 +38,8 @@
 import org.nuxeo.ecm.platform.convert.ooomanager.OOoManagerService;
 import org.nuxeo.runtime.api.Framework;
 import org.nuxeo.runtime.test.NXRuntimeTestCase;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.util.PDFTextStripper;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
 
 public abstract class BaseConverterTest extends Assert {
 
@@ -97,4 +100,17 @@ public static String readPdfText(File pdfFile) throws IOException {
         return text.trim();
     }
 
+    /** Checks the PDF's XMP metadata for {@code <pdfaid:conformance>A</pdfaid:conformance>}. */
+    public static boolean isPDFA(File pdfFile) throws Exception {
+        PDDocument pddoc = PDDocument.load(pdfFile);
+        try { // always close the document, even if metadata parsing throws
+            XMPMetadata xmp = pddoc.getDocumentCatalog().getMetadata().exportXMPMetadata();
+            Document doc = xmp.getXMPDocument();
+            NodeList list = doc.getElementsByTagName("pdfaid:conformance"); // never null, may be empty
+            return list.getLength() > 0 && "A".equals(list.item(0).getTextContent());
+        } finally {
+            pddoc.close();
+        }
+    }
+
 }
diff --git a/...rm-convert/src/test/java/org/nuxeo/ecm/platform/convert/tests/TestAnyToPDFConverters.java b/...rm-convert/src/test/java/org/nuxeo/ecm/platform/convert/tests/TestAnyToPDFConverters.java
@@ -1,5 +1,5 @@
 /*
- * (C) Copyright 2006-2009 Nuxeo SAS (http://nuxeo.com/) and contributors.
+ * (C) Copyright 2006-2012 Nuxeo SAS (http://nuxeo.com/) and contributors.
  *
  * All rights reserved. This program and the accompanying materials
  * are made available under the terms of the GNU Lesser General Public License
@@ -12,30 +12,37 @@
  * Lesser General Public License for more details.
  *
  * Contributors:
- *     Nuxeo - initial API and implementation
- *
- * $Id$
+ *     Nuxeo
+ *     Florent Guillaume
  */
-
 package org.nuxeo.ecm.platform.convert.tests;
 
 import java.io.File;
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.junit.Test;
 import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
 import org.nuxeo.ecm.core.convert.api.ConversionService;
 import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
+import org.nuxeo.ecm.platform.convert.plugins.JODBasedConverter;
 import org.nuxeo.runtime.api.Framework;
 
-
 public class TestAnyToPDFConverters extends BaseConverterTest {
 
     private static final Log log = LogFactory.getLog(TestAnyToPDFConverters.class);
 
     protected void doTestPDFConverter(String srcMT, String fileName)
             throws Exception {
+        doTestPDFConverter(srcMT, fileName, false); // plain PDF: no conversion parameter set
+        doTestPDFConverter(srcMT, fileName, true); // same input with PDFA1_PARAM, checked by isPDFA
+    }
+
+    protected void doTestPDFConverter(String srcMT, String fileName,
+            boolean pdfa) throws Exception {
 
         ConversionService cs = Framework.getLocalService(ConversionService.class);
 
@@ -53,14 +60,24 @@ protected void doTestPDFConverter(String srcMT, String fileName)
 
         BlobHolder hg = getBlobFromPath("test-docs/" + fileName, srcMT);
 
-        BlobHolder result = cs.convert(converterName, hg, null);
+        Map<String,Serializable> parameters = new HashMap<String, Serializable>();
+        if (pdfa) {
+            parameters.put(JODBasedConverter.PDFA1_PARAM, Boolean.TRUE);
+        }
+        BlobHolder result = cs.convert(converterName, hg, parameters);
         assertNotNull(result);
 
         File pdfFile = File.createTempFile("testingPDFConverter", ".pdf");
-        result.getBlob().transferTo(pdfFile);
-        String text = readPdfText(pdfFile);
-        assertTrue(text.contains("Hello"));
-        log.info(srcMT + " to PDF conversion : OK");
+        try {
+            result.getBlob().transferTo(pdfFile);
+            String text = readPdfText(pdfFile);
+            assertTrue(text.contains("Hello"));
+            if (pdfa) {
+                assertTrue("Output is not PDF/A", isPDFA(pdfFile));
+            }
+        } finally {
+            pdfFile.delete();
+        }
     }
 
     @Test
@@ -75,7 +92,7 @@ public void testAnyToTextConverter() throws Exception {
             return;
         }
 
-        // doTestPDFConverter("text/html", "hello.html");
+        doTestPDFConverter("text/html", "hello.html");
         // doTestPDFConverter("text/xml", "hello.xml");
         doTestPDFConverter("application/vnd.ms-excel", "hello.xls");
         doTestPDFConverter("application/vnd.sun.xml.writer", "hello.sxw");