From 47314a0f38e7bc3e6d1953dbe48a44d58abfd9f5 Mon Sep 17 00:00:00 2001
From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
Date: Tue, 27 Aug 2024 11:46:18 +0200
Subject: [PATCH] Extract images enhancements (#1757)
* fix
* extract images
* langs
* logging
* cuke fix
---------
Co-authored-by: a
---
cucumber/exampleFiles/images.pdf | Bin 20615 -> 3889 bytes
cucumber/features/general.feature | 6 +-
.../api/misc/ExtractImagesController.java | 52 +++++++++++++-----
.../model/api/PDFExtractImagesRequest.java | 16 ++++++
.../software/SPDF/utils/GeneralUtils.java | 1 +
.../SPDF/utils/ImageProcessingUtils.java | 30 ++++++++++
src/main/resources/messages_ar_AR.properties | 1 +
src/main/resources/messages_bg_BG.properties | 1 +
src/main/resources/messages_ca_CA.properties | 1 +
src/main/resources/messages_cs_CZ.properties | 1 +
src/main/resources/messages_da_DK.properties | 3 +-
src/main/resources/messages_de_DE.properties | 3 +-
src/main/resources/messages_el_GR.properties | 1 +
src/main/resources/messages_en_GB.properties | 1 +
src/main/resources/messages_en_US.properties | 1 +
src/main/resources/messages_es_ES.properties | 1 +
src/main/resources/messages_eu_ES.properties | 1 +
src/main/resources/messages_fr_FR.properties | 1 +
src/main/resources/messages_ga_IE.properties | 3 +-
src/main/resources/messages_hi_IN.properties | 1 +
src/main/resources/messages_hr_HR.properties | 1 +
src/main/resources/messages_hu_HU.properties | 1 +
src/main/resources/messages_id_ID.properties | 1 +
src/main/resources/messages_it_IT.properties | 1 +
src/main/resources/messages_ja_JP.properties | 1 +
src/main/resources/messages_ko_KR.properties | 1 +
src/main/resources/messages_nl_NL.properties | 1 +
src/main/resources/messages_no_NB.properties | 1 +
src/main/resources/messages_pl_PL.properties | 1 +
src/main/resources/messages_pt_BR.properties | 1 +
src/main/resources/messages_pt_PT.properties | 1 +
src/main/resources/messages_ro_RO.properties | 1 +
src/main/resources/messages_ru_RU.properties | 1 +
src/main/resources/messages_sk_SK.properties | 1 +
.../resources/messages_sr_LATN_RS.properties | 1 +
src/main/resources/messages_sv_SE.properties | 1 +
src/main/resources/messages_th_TH.properties | 1 +
src/main/resources/messages_tr_TR.properties | 1 +
src/main/resources/messages_uk_UA.properties | 1 +
src/main/resources/messages_vi_VN.properties | 3 +-
src/main/resources/messages_zh_CN.properties | 1 +
src/main/resources/messages_zh_TW.properties | 1 +
.../templates/misc/extract-images.html | 4 ++
43 files changed, 131 insertions(+), 22 deletions(-)
create mode 100644 src/main/java/stirling/software/SPDF/model/api/PDFExtractImagesRequest.java
diff --git a/cucumber/exampleFiles/images.pdf b/cucumber/exampleFiles/images.pdf
index a87bca4144840dae5b54c7417e88ecb24359d351..cde3b170e6e135138c40668358ac074229939215 100644
GIT binary patch
literal 3889
zcmY!laBj;pw&C^a#c%RR9`!#XlIP{~3i%fl%!AuJ>=
z$T>eL%`nHSSkK5+&8fg%&nrv4I8-yf$UafiHO?R{BiJ}4TqQn5FVWIJMnlolIwl~y
zKsm(UJ)lTYG1fb=&@CoTFRIuv(OXYTS6MgH!7M-DFxEHAB|FbHJ3cQpR54IjQQ6*8
zQ7c`o&Wvxb(v_
zQ-C1`RtXC|14HD{OHwkhHb{s!4^}iya#gkRP4|flaaAhHsiF`lDryuKY8518>tYQd
zOQaAQcF}KWgyg%?_{Ww*42O08HykYhst6PShQqo9Fd8iZ2qgd`Go(TiS~nRP8DZ35
z=oK8awm@zhaOqD
z6y3h$wu;D;<%-8VJySaFaUPEi70~uAKKs$==S_wAUlWth9q(E^^NYvVbM3nQTR3!C
zI<|zTX`e}Hc97
z(1AZkdRzAzpEP|idE=j>7Df{%EbLt$w=(a{aU;E|w&iQoSY6znaT+(fpLRdG-N~i2
zp}<=;?Oj}Kpl>@wI^^h?C0XRhs9cB10-v)<|RlA
z64-zYF3IK6_fs$gHdx&tpeRKts?6iPkc60
zecCuL)6DzG)%Wuok5m>4Z|nRKuzvd&(Y>m%NAj3^tWDodka`%cB=&aQOHcLBm7fCA
z1HT;2t)AS&+L`>p?$Ms?>$|l|OiqVa>MflbG>a!~Nr?9Yre}#<)pIo4P0yrFKA0Y?
zc{zE`)5Vh3&yIXn4D2j}C
zK2`yXC#?wzl8dC<(`PvaNNVceZNzDxT{i_
z@6~m$pS2x%zRNOi+t+&&JNl2ZS)H^LoW6ULyo>PC!_}-uqkiAwUgLZ@UsKwuYx9P6
z-IKrZ2~GVWdF=K_t*(zsIk#iO9g7b46w28?b11!<`eLHex#^a(TCLAi?P|31SBvqf
zkkXE=*Q#$>HUCw~?AP0DR$1`+RmY~i;eMZ+x##Ls?aP~1#ay_5@}1w@ZvtDn46PPbpt-9M?|>ciI4
z`hn}LmK0sh)+oOp`SSR^%e)r#N-vI=&I@dk?4NQ~O?CRmiN*J9Q{8=y_AU?G*|{|R
z^B+bH8#`8PY19xYjRt2{r79Q$>xH2F{1OFFy&A-&@0pjDuV4&e<4MF3L8)o*6l)1A
z{=JcLM;Q6OhmV
literal 20615
zcmeI4>vGgK6vw~&DYQUnU?y41wtOkHl-wX(nvjG-TW;Q6Cu~U8Vb?IghrVJz^bOh<
zsm}2pZ7gLP>e`-;$AN*-KgY6<{yP5cMe@?_#?}g7b*-g8|N8w8i!+<0gJbLVZL5Fq
z`7B}m-S{Y(S^f3&!z5#lZS^0HM>F;-^JL^6i|}8bH+bH1@|Kgg+`Q%HEiZ3*c}wIi
zk+=N3<>##+Zv}ZP%v)jJit<*Jw_aM`e{Us|5nQ)(?)rD9>FC2SnX={G&oWEr#<@86
zTwI>#dYr{4>5-Ku%wQn)&K&?}q=8vaCs{JdW=vpE|8X)J$7|^)IV!gN<8y}vk;7hE
z{k>$CewYrE8G`}clE*PWKSo~Ma6if9A$@smtbdSv%2pJJBGWF)P_NV3u9=WQHZv2WRLUI+U=*{QmvTvp4TnF0u-$od!siAcu-sNGZ)bjGFNiZ(Yc5M1=L-O9TzX*B7Xiv
zZm3#x%W%ob_%|YGZ)bjGFNgj(z%EN1=L-O
z9TzX*B0j}~+)%a1?J{8Gc9Ue}c138+lch0e8yj;|RCaPhv8~fFS16?BlFV&WC+qlV
z^6}F&IkaqPlxNxMhxtOtjSPoOX|N3*e%M55jxF!Dx&{j!bq(&)nkHabsip<+)uFDz
zEkn~H+vtyTz+5qod$7z+(|q`}U(*6xKJiD>LU;&8QFCzSYMKknfHY0O$6}fW4+_$?
z0Jbw1<6Ija{h?_Nd|Rk#9=zexG;u+HK5X~Y;v#r7LQ(TLe5|f%t~sWjXZGg_*cGB3
z8^R9Cq9$zkz(~_vxUcG(*`M%W`Lq@nz*Zwoi{PzCQS%*m;)t%9{rN5|4AkO$SVXUB
zA-pszYJr@9re^jRIIzS-J64$UGziT(9@?58J=h+m#rd#YO4lyvFND`O#W=?{<|%hvSVgYI30MfAY5oQM1-5a0PGrne
z?mDpCve=*N!B)McxSF@uGydPV8f^u7Z~%DiwM55E5`ZU;E~MM?9X>$>ydV>Z_ZOcgirR0aRE2D
z%4Mk<{RNJ}P#y?#o(2KD2`Zi=w2k`}4{dX9$a~2--?C{uK1rtd#K8UWmjqul$j4lz
l@>yf~qq(*xhbfbzn?D3=mc`QyUte%}1M;k;rOll!>u-WTn-Ks2
diff --git a/cucumber/features/general.feature b/cucumber/features/general.feature
index 345f59cbc8d..4255c89e7e4 100644
--- a/cucumber/features/general.feature
+++ b/cucumber/features/general.feature
@@ -95,7 +95,7 @@ Feature: API Validation
@extract-images
- Scenario Outline: Extract Image Scans
+ Scenario Outline: Extract Image Scans duplicates
Given I use an example file at "exampleFiles/images.pdf" as parameter "fileInput"
And the request data includes
| parameter | value |
@@ -103,7 +103,7 @@ Feature: API Validation
When I send the API request to the endpoint "/api/v1/misc/extract-images"
Then the response content type should be "application/octet-stream"
And the response file should have extension ".zip"
- And the response ZIP should contain 20 files
+ And the response ZIP should contain 2 files
And the response file should have size greater than 0
And the response status code should be 200
@@ -112,5 +112,3 @@ Feature: API Validation
| png |
| gif |
| jpeg |
-
-
diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java
index e54cd9d71c5..266fbd35bb0 100644
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/ExtractImagesController.java
@@ -5,6 +5,9 @@
import java.awt.image.RenderedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
@@ -36,7 +39,8 @@
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
-import stirling.software.SPDF.model.api.PDFWithImageFormatRequest;
+import stirling.software.SPDF.model.api.PDFExtractImagesRequest;
+import stirling.software.SPDF.utils.ImageProcessingUtils;
import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@@ -51,11 +55,11 @@ public class ExtractImagesController {
summary = "Extract images from a PDF file",
description =
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input: PDF Output: IMAGE/ZIP Type: SIMO")
- public ResponseEntity extractImages(@ModelAttribute PDFWithImageFormatRequest request)
+ public ResponseEntity extractImages(@ModelAttribute PDFExtractImagesRequest request)
throws IOException, InterruptedException, ExecutionException {
MultipartFile file = request.getFileInput();
String format = request.getFormat();
-
+ boolean allowDuplicates = request.isAllowDuplicates();
System.out.println(
System.currentTimeMillis() + " file=" + file.getName() + ", format=" + format);
PDDocument document = Loader.loadPDF(file.getBytes());
@@ -75,7 +79,7 @@ public ResponseEntity extractImages(@ModelAttribute PDFWithImageFormatRe
String filename =
Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", "");
- Set processedImages = new HashSet<>();
+ Set processedImages = new HashSet<>();
if (useMultithreading) {
// Executor service to handle multithreading
@@ -92,7 +96,13 @@ public ResponseEntity extractImages(@ModelAttribute PDFWithImageFormatRe
executor.submit(
() -> {
extractImagesFromPage(
- page, format, filename, pageNum, processedImages, zos);
+ page,
+ format,
+ filename,
+ pageNum,
+ processedImages,
+ zos,
+ allowDuplicates);
return null;
});
@@ -110,7 +120,8 @@ public ResponseEntity extractImages(@ModelAttribute PDFWithImageFormatRe
// Single-threaded extraction
for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) {
PDPage page = document.getPage(pgNum);
- extractImagesFromPage(page, format, filename, pgNum + 1, processedImages, zos);
+ extractImagesFromPage(
+ page, format, filename, pgNum + 1, processedImages, zos, allowDuplicates);
}
}
@@ -137,21 +148,34 @@ private void extractImagesFromPage(
String format,
String filename,
int pageNum,
- Set processedImages,
- ZipOutputStream zos)
+ Set processedImages,
+ ZipOutputStream zos,
+ boolean allowDuplicates)
throws IOException {
+ MessageDigest md;
+ try {
+ md = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ logger.error("MD5 algorithm not available for extractImages hash.", e);
+ return;
+ }
if (page.getResources() == null || page.getResources().getXObjectNames() == null) {
return;
}
+ int count = 1;
for (COSName name : page.getResources().getXObjectNames()) {
if (page.getResources().isImageXObject(name)) {
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
- int imageHash = image.hashCode();
- synchronized (processedImages) {
- if (processedImages.contains(imageHash)) {
- continue; // Skip already processed images
+ if (!allowDuplicates) {
+ byte[] data = ImageProcessingUtils.getImageData(image.getImage());
+ byte[] imageHash = md.digest(data);
+ synchronized (processedImages) {
+ if (processedImages.stream()
+ .anyMatch(hash -> Arrays.equals(hash, imageHash))) {
+ continue; // Skip already processed images
+ }
+ processedImages.add(imageHash);
}
- processedImages.add(imageHash);
}
RenderedImage renderedImage = image.getImage();
@@ -160,7 +184,7 @@ private void extractImagesFromPage(
BufferedImage bufferedImage = convertToRGB(renderedImage, format);
// Write image to zip file
- String imageName = filename + "_" + imageHash + " (Page " + pageNum + ")." + format;
+ String imageName = filename + "_page_" + pageNum + "_" + count++ + "." + format;
synchronized (zos) {
zos.putNextEntry(new ZipEntry(imageName));
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
diff --git a/src/main/java/stirling/software/SPDF/model/api/PDFExtractImagesRequest.java b/src/main/java/stirling/software/SPDF/model/api/PDFExtractImagesRequest.java
new file mode 100644
index 00000000000..9983d32eacf
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/model/api/PDFExtractImagesRequest.java
@@ -0,0 +1,16 @@
+package stirling.software.SPDF.model.api;
+
+import io.swagger.v3.oas.annotations.media.Schema;
+
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+@Data
+@EqualsAndHashCode(callSuper = true) // callSuper = true: generated equals/hashCode also cover the inherited request fields
+public class PDFExtractImagesRequest extends PDFWithImageFormatRequest { // Request model for the extract-images endpoint: the PDFWithImageFormatRequest fields plus a duplicates flag
+
+ @Schema(
+ description =
+ "Boolean to enable/disable the saving of duplicate images, true to enable duplicates")
+ private boolean allowDuplicates; // read by ExtractImagesController via isAllowDuplicates(); when false, images are de-duplicated by content hash
+}
diff --git a/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java b/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
index 21d921c833e..480badcbe5e 100644
--- a/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
+++ b/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
@@ -262,4 +262,5 @@ public static boolean createDir(String path) {
}
return true;
}
+
}
diff --git a/src/main/java/stirling/software/SPDF/utils/ImageProcessingUtils.java b/src/main/java/stirling/software/SPDF/utils/ImageProcessingUtils.java
index ede9c4f42ac..655e344c3aa 100644
--- a/src/main/java/stirling/software/SPDF/utils/ImageProcessingUtils.java
+++ b/src/main/java/stirling/software/SPDF/utils/ImageProcessingUtils.java
@@ -1,6 +1,10 @@
package stirling.software.SPDF.utils;
import java.awt.image.BufferedImage;
+import java.awt.image.DataBuffer;
+import java.awt.image.DataBufferByte;
+import java.awt.image.DataBufferInt;
+import java.nio.ByteBuffer;
public class ImageProcessingUtils {
@@ -29,4 +33,30 @@ static BufferedImage convertColorType(BufferedImage sourceImage, String colorTyp
}
return convertedImage;
}
+
+ public static byte[] getImageData(BufferedImage image) { // Returns the image's pixel data as bytes, choosing a strategy from the raster's DataBuffer type
+ DataBuffer dataBuffer = image.getRaster().getDataBuffer();
+ if (dataBuffer instanceof DataBufferByte) {
+ return ((DataBufferByte) dataBuffer).getData(); // byte-backed raster: hand back the buffer's own array (no copy is made here)
+ } else if (dataBuffer instanceof DataBufferInt) {
+ int[] intData = ((DataBufferInt) dataBuffer).getData();
+ ByteBuffer byteBuffer = ByteBuffer.allocate(intData.length * 4); // 4 bytes per int sample
+ byteBuffer.asIntBuffer().put(intData); // write the ints through an int view (ByteBuffer's default big-endian order)
+ return byteBuffer.array();
+ } else { // any other buffer type: fall back to per-pixel getRGB and pack 3 bytes per pixel
+ int width = image.getWidth();
+ int height = image.getHeight();
+ byte[] data = new byte[width * height * 3]; // R, G, B for every pixel; the alpha bits of getRGB are not stored
+ int index = 0;
+ for (int y = 0; y < height; y++) { // row-major order: all x for y = 0, then y = 1, ...
+ for (int x = 0; x < width; x++) {
+ int rgb = image.getRGB(x, y);
+ data[index++] = (byte) ((rgb >> 16) & 0xFF); // Red
+ data[index++] = (byte) ((rgb >> 8) & 0xFF); // Green
+ data[index++] = (byte) (rgb & 0xFF); // Blue
+ }
+ }
+ return data;
+ }
+ }
}
diff --git a/src/main/resources/messages_ar_AR.properties b/src/main/resources/messages_ar_AR.properties
index 4027d3fa7bc..f5a53e8d6ab 100644
--- a/src/main/resources/messages_ar_AR.properties
+++ b/src/main/resources/messages_ar_AR.properties
@@ -802,6 +802,7 @@ ocr.submit=معالجة PDF باستخدام OCR
extractImages.title=استخراج الصور
extractImages.header=استخراج الصور
extractImages.selectText=Øدد تنسيق الصورة لتØويل الصور المستخرجة إلى
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=استخراج
diff --git a/src/main/resources/messages_bg_BG.properties b/src/main/resources/messages_bg_BG.properties
index 0cc7916e54c..f2835382d0b 100644
--- a/src/main/resources/messages_bg_BG.properties
+++ b/src/main/resources/messages_bg_BG.properties
@@ -802,6 +802,7 @@ ocr.submit=Обработка на PDF чрез OCR
extractImages.title=Извличане на изображениÑ
extractImages.header=Извличане на изображениÑ
extractImages.selectText=Изберете формат на изображението, в който да преобразувате извлечените изображениÑ
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Извличане
diff --git a/src/main/resources/messages_ca_CA.properties b/src/main/resources/messages_ca_CA.properties
index 3d2279874ec..3b43ca0f542 100644
--- a/src/main/resources/messages_ca_CA.properties
+++ b/src/main/resources/messages_ca_CA.properties
@@ -802,6 +802,7 @@ ocr.submit=Processa PDF amb OCR
extractImages.title=Extreu Imatges
extractImages.header=Extreu Imatges
extractImages.selectText=Selecciona el format d'imatge al qual convertir les imatges extretes
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extreu
diff --git a/src/main/resources/messages_cs_CZ.properties b/src/main/resources/messages_cs_CZ.properties
index 88e91f40aff..a5c11e1b08e 100644
--- a/src/main/resources/messages_cs_CZ.properties
+++ b/src/main/resources/messages_cs_CZ.properties
@@ -802,6 +802,7 @@ ocr.submit=Zpracovat PDF s OCR
extractImages.title=Extrahovat obrázky
extractImages.header=Extrahovat obrázky
extractImages.selectText=Vyberte formát obrázku pro extrahované obrázky
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrahovat
diff --git a/src/main/resources/messages_da_DK.properties b/src/main/resources/messages_da_DK.properties
index d428850dd7b..455fad0197c 100644
--- a/src/main/resources/messages_da_DK.properties
+++ b/src/main/resources/messages_da_DK.properties
@@ -1,7 +1,7 @@
###########
# Generic #
###########
-# the direction that the language is written (ltr = left to right, rtl = right to left)
+# the direction that the language is written (ltr=left to right, rtl = right to left)
language.direction=ltr
pdfPrompt=Vælg PDF-fil(er)
@@ -802,6 +802,7 @@ ocr.submit=Process PDF with OCR
extractImages.title=Extract Images
extractImages.header=Extract Images
extractImages.selectText=Select image format to convert extracted images to
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extract
diff --git a/src/main/resources/messages_de_DE.properties b/src/main/resources/messages_de_DE.properties
index fd91e3ea843..76dde891e27 100644
--- a/src/main/resources/messages_de_DE.properties
+++ b/src/main/resources/messages_de_DE.properties
@@ -4,7 +4,7 @@
# the direction that the language is written (ltr=left to right, rtl = right to left)
language.direction=ltr
-pdfPrompwt=PDF auswählen
+pdfPrompt=Select PDF(s)
multiPdfPrompt=PDFs auswählen(2+)
multiPdfDropPrompt=Wählen Sie alle gewünschten PDFs aus (oder ziehen Sie sie per Drag & Drop hierhin)
imgPrompt=Wählen Sie ein Bild
@@ -802,6 +802,7 @@ ocr.submit=PDF mit OCR verarbeiten
extractImages.title=Bilder extrahieren
extractImages.header=Bilder extrahieren
extractImages.selectText=Wählen Sie das Bildformat aus, in das extrahierte Bilder konvertiert werden sollen
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrahieren
diff --git a/src/main/resources/messages_el_GR.properties b/src/main/resources/messages_el_GR.properties
index a11235c2328..0490d969505 100644
--- a/src/main/resources/messages_el_GR.properties
+++ b/src/main/resources/messages_el_GR.properties
@@ -802,6 +802,7 @@ ocr.submit=ΕπεξεÏγασία PDF με OCR
extractImages.title=Εξαγωγή Εικόνων
extractImages.header=Εξαγωγή Εικόνων
extractImages.selectText=ΕπιλÎξτε μοÏφή εικόνας για να μετατÏÎψετε τις εξαγόμενες εικόνες
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Εξαγωγή
diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties
index 4e6580a6e26..625523b9667 100644
--- a/src/main/resources/messages_en_GB.properties
+++ b/src/main/resources/messages_en_GB.properties
@@ -802,6 +802,7 @@ ocr.submit=Process PDF with OCR
extractImages.title=Extract Images
extractImages.header=Extract Images
extractImages.selectText=Select image format to convert extracted images to
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extract
diff --git a/src/main/resources/messages_en_US.properties b/src/main/resources/messages_en_US.properties
index 17b3b722d5d..2a1804355bf 100644
--- a/src/main/resources/messages_en_US.properties
+++ b/src/main/resources/messages_en_US.properties
@@ -802,6 +802,7 @@ ocr.submit=Process PDF with OCR
extractImages.title=Extract Images
extractImages.header=Extract Images
extractImages.selectText=Select image format to convert extracted images to
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extract
diff --git a/src/main/resources/messages_es_ES.properties b/src/main/resources/messages_es_ES.properties
index c81a6af254d..2cc95ecda58 100644
--- a/src/main/resources/messages_es_ES.properties
+++ b/src/main/resources/messages_es_ES.properties
@@ -802,6 +802,7 @@ ocr.submit=Procesar PDF con OCR
extractImages.title=Extraer imágenes
extractImages.header=Extraer imágenes
extractImages.selectText=Seleccionar el formato de imagen para convertir las imágenes extraídas
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extraer
diff --git a/src/main/resources/messages_eu_ES.properties b/src/main/resources/messages_eu_ES.properties
index d20eca584bc..7ecf22e5b46 100644
--- a/src/main/resources/messages_eu_ES.properties
+++ b/src/main/resources/messages_eu_ES.properties
@@ -802,6 +802,7 @@ ocr.submit=PDF prozesatu OCR-rekin
extractImages.title=Atera irudiak
extractImages.header=Atera irudiak
extractImages.selectText=Hautatu irudi-formatua ateratako irudiak bihurtzeko
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Atera
diff --git a/src/main/resources/messages_fr_FR.properties b/src/main/resources/messages_fr_FR.properties
index 0811d72cb7c..b73f5346740 100644
--- a/src/main/resources/messages_fr_FR.properties
+++ b/src/main/resources/messages_fr_FR.properties
@@ -802,6 +802,7 @@ ocr.submit=Traiter
extractImages.title=Extraire les images
extractImages.header=Extraire les images
extractImages.selectText=Format d’image dans lequel convertir les images extraites
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extraire
diff --git a/src/main/resources/messages_ga_IE.properties b/src/main/resources/messages_ga_IE.properties
index 6d36a5471cf..0ef9ce3f33c 100644
--- a/src/main/resources/messages_ga_IE.properties
+++ b/src/main/resources/messages_ga_IE.properties
@@ -1,7 +1,7 @@
###########
# Generic #
###########
-# the direction that the language is written (ltr = left to right, rtl = right to left)
+# the direction that the language is written (ltr=left to right, rtl = right to left)
language.direction=ltr
pdfPrompt=Roghnaigh PDF(s)
@@ -802,6 +802,7 @@ ocr.submit=Próiseáil PDF le OCR
extractImages.title=Sliocht Ãomhánna
extractImages.header=Sliocht Ãomhánna
extractImages.selectText=Roghnaigh formáid Ãomhá chun Ãomhánna bainte a thiontú go
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Sliocht
diff --git a/src/main/resources/messages_hi_IN.properties b/src/main/resources/messages_hi_IN.properties
index 262ec5f8fee..8a8e665c4ae 100644
--- a/src/main/resources/messages_hi_IN.properties
+++ b/src/main/resources/messages_hi_IN.properties
@@ -802,6 +802,7 @@ ocr.submit=OCR के साथ PDF पà¥à¤°à¥‹à¤¸à¥‡à¤¸ करें
extractImages.title=छवियां निकालें
extractImages.header=छवियां निकालें
extractImages.selectText=निकाली गई छवियों को कनà¥à¤µà¤°à¥à¤Ÿ करने के लिठछवि पà¥à¤°à¤¾à¤°à¥‚प चà¥à¤¨à¥‡à¤‚
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=निकालें
diff --git a/src/main/resources/messages_hr_HR.properties b/src/main/resources/messages_hr_HR.properties
index 4eb17cd1438..192f9da7396 100644
--- a/src/main/resources/messages_hr_HR.properties
+++ b/src/main/resources/messages_hr_HR.properties
@@ -802,6 +802,7 @@ ocr.submit=Obradi PDF sa OCR-om
extractImages.title=Ekstrakt slika
extractImages.header=Ekstrakt slika
extractImages.selectText=Odaberite format slike za pretvaranje izdvojenih slika
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Izdvajanje
diff --git a/src/main/resources/messages_hu_HU.properties b/src/main/resources/messages_hu_HU.properties
index bcd5be266fd..08a1097fcc2 100644
--- a/src/main/resources/messages_hu_HU.properties
+++ b/src/main/resources/messages_hu_HU.properties
@@ -802,6 +802,7 @@ ocr.submit=PDF feldolgozása OCR-rel
extractImages.title=Képek kinyerése
extractImages.header=Képek kinyerése
extractImages.selectText=Válassza ki a képformátumot a kinyert képek konvertálásához
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Kinyerés
diff --git a/src/main/resources/messages_id_ID.properties b/src/main/resources/messages_id_ID.properties
index 1ce1c684720..ffb7cd7049d 100644
--- a/src/main/resources/messages_id_ID.properties
+++ b/src/main/resources/messages_id_ID.properties
@@ -802,6 +802,7 @@ ocr.submit=Memproses PDF dengan OCR
extractImages.title=Ekstrak Gambar
extractImages.header=Mengekstrak Gambar
extractImages.selectText=Pilih format gambar yang akan dikonversi
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Ekstrak
diff --git a/src/main/resources/messages_it_IT.properties b/src/main/resources/messages_it_IT.properties
index 7398c42cc7f..37d5cc283bf 100644
--- a/src/main/resources/messages_it_IT.properties
+++ b/src/main/resources/messages_it_IT.properties
@@ -802,6 +802,7 @@ ocr.submit=Scansiona testo nel PDF con OCR
extractImages.title=Estrai immagini
extractImages.header=Estrai immagini
extractImages.selectText=Seleziona il formato in cui salvare le immagini estratte
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Estrai
diff --git a/src/main/resources/messages_ja_JP.properties b/src/main/resources/messages_ja_JP.properties
index 96669ccae59..4cd41fe1f86 100644
--- a/src/main/resources/messages_ja_JP.properties
+++ b/src/main/resources/messages_ja_JP.properties
@@ -802,6 +802,7 @@ ocr.submit=OCRã§PDFを処ç†ã™ã‚‹
extractImages.title=ç”»åƒã®æŠ½å‡º
extractImages.header=ç”»åƒã®æŠ½å‡º
extractImages.selectText=抽出ã—ãŸç”»åƒã®ãƒ•ã‚©ãƒ¼ãƒžãƒƒãƒˆã‚’é¸æŠž
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=抽出
diff --git a/src/main/resources/messages_ko_KR.properties b/src/main/resources/messages_ko_KR.properties
index c160c6d5d48..4ea72d22a59 100644
--- a/src/main/resources/messages_ko_KR.properties
+++ b/src/main/resources/messages_ko_KR.properties
@@ -802,6 +802,7 @@ ocr.submit=ì¸ì‹
extractImages.title=ì´ë¯¸ì§€ 추출
extractImages.header=ì´ë¯¸ì§€ 추출
extractImages.selectText=ì¶”ì¶œëœ ì´ë¯¸ì§€ë¥¼ ë³€í™˜í• ì´ë¯¸ì§€ 형ì‹ì„ ì„ íƒí•©ë‹ˆë‹¤.
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=추출
diff --git a/src/main/resources/messages_nl_NL.properties b/src/main/resources/messages_nl_NL.properties
index 1ada6fb1da0..af03f60d5ed 100644
--- a/src/main/resources/messages_nl_NL.properties
+++ b/src/main/resources/messages_nl_NL.properties
@@ -802,6 +802,7 @@ ocr.submit=Verwerk PDF met OCR
extractImages.title=Afbeeldingen extraheren
extractImages.header=Afbeeldingen extraheren
extractImages.selectText=Selecteer het beeldformaat voor geëxtraheerde afbeeldingen
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extraheer
diff --git a/src/main/resources/messages_no_NB.properties b/src/main/resources/messages_no_NB.properties
index ed3ec72c3a9..73b8832b59d 100644
--- a/src/main/resources/messages_no_NB.properties
+++ b/src/main/resources/messages_no_NB.properties
@@ -802,6 +802,7 @@ ocr.submit=Behandle PDF med OCR
extractImages.title=Hent ut bilder
extractImages.header=Hent ut bilder
extractImages.selectText=Velg bildeformat for å konvertere de hentede bildene til
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Hent ut
diff --git a/src/main/resources/messages_pl_PL.properties b/src/main/resources/messages_pl_PL.properties
index aaa5dd47611..a09d6e33a0d 100755
--- a/src/main/resources/messages_pl_PL.properties
+++ b/src/main/resources/messages_pl_PL.properties
@@ -802,6 +802,7 @@ ocr.submit=Przetwarzaj PDF za pomocą OCR
extractImages.title=Wyodrębnij obrazy
extractImages.header=Wyodrębnij obrazy
extractImages.selectText=Wybierz format obrazu, na który chcesz przekonwertować wyodrębniony obraz.
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Wyodrębnij
diff --git a/src/main/resources/messages_pt_BR.properties b/src/main/resources/messages_pt_BR.properties
index 1ab7b6121b9..bced7555e73 100644
--- a/src/main/resources/messages_pt_BR.properties
+++ b/src/main/resources/messages_pt_BR.properties
@@ -802,6 +802,7 @@ ocr.submit=Processar PDF com OCR
extractImages.title=Extrair imagens
extractImages.header=Extrair imagens
extractImages.selectText=Selecione o formato de imagem para converter as imagens extraídas
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrair
diff --git a/src/main/resources/messages_pt_PT.properties b/src/main/resources/messages_pt_PT.properties
index 69e2b88bdca..6cc60af3ffa 100644
--- a/src/main/resources/messages_pt_PT.properties
+++ b/src/main/resources/messages_pt_PT.properties
@@ -802,6 +802,7 @@ ocr.submit=Processar PDF com OCR
extractImages.title=Extrair Imagens
extractImages.header=Extrair Imagens
extractImages.selectText=Selecione o formato de imagem para converter as imagens extraídas
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrair
diff --git a/src/main/resources/messages_ro_RO.properties b/src/main/resources/messages_ro_RO.properties
index d89477fb00c..f4cb19bb283 100644
--- a/src/main/resources/messages_ro_RO.properties
+++ b/src/main/resources/messages_ro_RO.properties
@@ -802,6 +802,7 @@ ocr.submit=Procesează PDF-ul cu OCR
extractImages.title=Extrage Imagini
extractImages.header=Extrage Imagini
extractImages.selectText=Selectați formatul imaginii în care să se convertească imaginile extrase
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrage
diff --git a/src/main/resources/messages_ru_RU.properties b/src/main/resources/messages_ru_RU.properties
index ebd032a044b..04193a5ae86 100644
--- a/src/main/resources/messages_ru_RU.properties
+++ b/src/main/resources/messages_ru_RU.properties
@@ -802,6 +802,7 @@ ocr.submit=Обработка PDF Ñ OCR
extractImages.title=Извлечь изображениÑ
extractImages.header=Извлечь изображениÑ
extractImages.selectText=Выберите формат Ð¸Ð·Ð¾Ð±Ñ€Ð°Ð¶ÐµÐ½Ð¸Ñ Ð´Ð»Ñ Ð¿Ñ€ÐµÐ¾Ð±Ñ€Ð°Ð·Ð¾Ð²Ð°Ð½Ð¸Ñ Ð¸Ð·Ð²Ð»ÐµÑ‡ÐµÐ½Ð½Ñ‹Ñ… изображений в
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Извлечь
diff --git a/src/main/resources/messages_sk_SK.properties b/src/main/resources/messages_sk_SK.properties
index a7ed994c589..b837e708d88 100644
--- a/src/main/resources/messages_sk_SK.properties
+++ b/src/main/resources/messages_sk_SK.properties
@@ -802,6 +802,7 @@ ocr.submit=Spracovať PDF s OCR
extractImages.title=Extrahovať obrázky
extractImages.header=Extrahovať obrázky
extractImages.selectText=Vyberte formát obrázka na konverziu extrahovaných obrázkov
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrahovať
diff --git a/src/main/resources/messages_sr_LATN_RS.properties b/src/main/resources/messages_sr_LATN_RS.properties
index 1f20628a01e..8d393b440aa 100644
--- a/src/main/resources/messages_sr_LATN_RS.properties
+++ b/src/main/resources/messages_sr_LATN_RS.properties
@@ -802,6 +802,7 @@ ocr.submit=Obradi PDF sa OCR-om
extractImages.title=Izdvajanje slika
extractImages.header=Izdvajanje slika
extractImages.selectText=Odaberite format slike za konvertovanje izdvojenih slika
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Izdvajanje
diff --git a/src/main/resources/messages_sv_SE.properties b/src/main/resources/messages_sv_SE.properties
index ad5f80c9860..ed8e7ce1f7d 100644
--- a/src/main/resources/messages_sv_SE.properties
+++ b/src/main/resources/messages_sv_SE.properties
@@ -802,6 +802,7 @@ ocr.submit=Bearbeta PDF med OCR
extractImages.title=Extrahera bilder
extractImages.header=Extrahera bilder
extractImages.selectText=Välj bildformat att konvertera extraherade bilder till
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Extrahera
diff --git a/src/main/resources/messages_th_TH.properties b/src/main/resources/messages_th_TH.properties
index 9622c01e9a2..677aa5afd8b 100644
--- a/src/main/resources/messages_th_TH.properties
+++ b/src/main/resources/messages_th_TH.properties
@@ -802,6 +802,7 @@ ocr.submit=ประมวลผล PDF ด้วย OCR
extractImages.title=à¹à¸¢à¸à¸£à¸¹à¸›à¸ าพ
extractImages.header=à¹à¸¢à¸à¸£à¸¹à¸›à¸ าพ
extractImages.selectText=เลืà¸à¸à¸£à¸¹à¸›à¹à¸šà¸šà¸ าพที่จะใช้ในà¸à¸²à¸£à¹à¸›à¸¥à¸‡à¸£à¸¹à¸›à¸ าพที่à¹à¸¢à¸à¹„ด้
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=à¹à¸¢à¸
diff --git a/src/main/resources/messages_tr_TR.properties b/src/main/resources/messages_tr_TR.properties
index 5a814b3b7b9..a4e1170621b 100644
--- a/src/main/resources/messages_tr_TR.properties
+++ b/src/main/resources/messages_tr_TR.properties
@@ -802,6 +802,7 @@ ocr.submit=PDF'i OCR(Metin Tanıma) ile İşle
extractImages.title=Resimleri Çıkar
extractImages.header=Resimleri Çıkar
extractImages.selectText=Çıkarılan resimleri dönüştürmek için resim formatını seçin
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=Çıkar
diff --git a/src/main/resources/messages_uk_UA.properties b/src/main/resources/messages_uk_UA.properties
index cf63c6ed3ae..3a1f37c320b 100644
--- a/src/main/resources/messages_uk_UA.properties
+++ b/src/main/resources/messages_uk_UA.properties
@@ -802,6 +802,7 @@ ocr.submit=Обробка PDF з OCR
extractImages.title=ВитÑгнути зображеннÑ
extractImages.header=ВитÑгнути зображеннÑ
extractImages.selectText=Виберіть формат Ð·Ð¾Ð±Ñ€Ð°Ð¶ÐµÐ½Ð½Ñ Ð´Ð»Ñ Ð¿ÐµÑ€ÐµÑ‚Ð²Ð¾Ñ€ÐµÐ½Ð½Ñ Ð²Ð¸Ñ‚Ñгнутих зображень у
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=ВитÑгнути
diff --git a/src/main/resources/messages_vi_VN.properties b/src/main/resources/messages_vi_VN.properties
index cd700f52c0a..2e54e83ca20 100644
--- a/src/main/resources/messages_vi_VN.properties
+++ b/src/main/resources/messages_vi_VN.properties
@@ -1,7 +1,7 @@
###########
# Generic #
###########
-# the direction that the language is written (ltr = left to right, rtl = right to left)
+# the direction that the language is written (ltr=left to right, rtl = right to left)
language.direction=ltr
pdfPrompt=Chá»n (các) tệp PDF
@@ -802,6 +802,7 @@ ocr.submit=XỠlý PDF với OCR
extractImages.title=TrÃch xuất hình ảnh
extractImages.header=TrÃch xuất hình ảnh
extractImages.selectText=Chá»n định dạng hình ảnh để chuyển đổi hình ảnh đã trÃch xuất
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=TrÃch xuất
diff --git a/src/main/resources/messages_zh_CN.properties b/src/main/resources/messages_zh_CN.properties
index eb91f5c699c..264f7006aab 100644
--- a/src/main/resources/messages_zh_CN.properties
+++ b/src/main/resources/messages_zh_CN.properties
@@ -802,6 +802,7 @@ ocr.submit=用OCR处ç†PDF
extractImages.title=æå–图åƒ
extractImages.header=æå–图åƒ
extractImages.selectText=选择图åƒæ ¼å¼ï¼Œå°†æå–的图åƒè½¬æ¢ä¸º
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=æå–
diff --git a/src/main/resources/messages_zh_TW.properties b/src/main/resources/messages_zh_TW.properties
index 45dd4fc3b92..dadbde356d8 100644
--- a/src/main/resources/messages_zh_TW.properties
+++ b/src/main/resources/messages_zh_TW.properties
@@ -802,6 +802,7 @@ ocr.submit=使用 OCR è™•ç† PDF
extractImages.title=æå–圖片
extractImages.header=æå–圖片
extractImages.selectText=é¸æ“‡è¦è½‰æ›æå–å½±åƒçš„å½±åƒæ ¼å¼
+extractImages.allowDuplicates=Save duplicate images
extractImages.submit=æå–
diff --git a/src/main/resources/templates/misc/extract-images.html b/src/main/resources/templates/misc/extract-images.html
index 36c0d51c30f..d47a72d7c36 100644
--- a/src/main/resources/templates/misc/extract-images.html
+++ b/src/main/resources/templates/misc/extract-images.html
@@ -27,6 +27,10 @@
+
+
+
+