Skip to content

Commit

Permalink
fix for OCR multi lang
Browse files Browse the repository at this point in the history
  • Loading branch information
Frooodle committed Apr 30, 2023
1 parent 585bf4c commit 80c26a9
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 6 deletions.
2 changes: 2 additions & 0 deletions HowToUseOCR.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Depending on your requirements, you can choose the appropriate language pack for
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`

# DO NOT REMOVE THE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.

#### Docker

If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,15 @@ public class OCRController {

private static final Logger logger = LoggerFactory.getLogger(OCRController.class);

/**
 * Lists the language codes for which a {@code .traineddata} file exists in the
 * Tesseract tessdata directory.
 *
 * <p>Each code is the file name with its trailing {@code .traineddata} extension
 * removed. The {@code osd} entry is excluded from the result, as are directories
 * and files whose name consists of the extension alone. The result is sorted
 * alphabetically so callers get a stable order regardless of how the file system
 * enumerates the directory.
 *
 * @return the sorted language codes, or an empty list when the tessdata
 *         directory does not exist or cannot be read
 */
public List<String> getAvailableTesseractLanguages() {
    final String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
    final String suffix = ".traineddata";

    File[] files = new File(tessdataDir).listFiles();
    if (files == null) {
        // listFiles() returns null when the path is not a directory or an I/O error occurs.
        return Collections.emptyList();
    }

    return Arrays.stream(files)
            // Only regular files can be language packs; skip sub-directories.
            .filter(File::isFile)
            .map(File::getName)
            .filter(name -> name.endsWith(suffix))
            // Strip only the trailing extension, not every occurrence of the text.
            .map(name -> name.substring(0, name.length() - suffix.length()))
            .filter(lang -> !lang.isEmpty())
            .filter(lang -> !lang.equalsIgnoreCase("osd"))
            // listFiles() gives no ordering guarantee; sort for a deterministic result.
            .sorted()
            .collect(Collectors.toList());
}

@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
Expand All @@ -49,9 +58,11 @@ public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, va
throw new IOException("Please select at least one language.");
}

// Validate and sanitize selected languages using regex
String languagePattern = "^[a-zA-Z]{3}$"; // Regex pattern for three-letter language codes
selectedLanguages = selectedLanguages.stream().filter(lang -> Pattern.matches(languagePattern, lang)).collect(Collectors.toList());
// Get available Tesseract languages
List<String> availableLanguages = getAvailableTesseractLanguages();

// Validate selected languages
selectedLanguages = selectedLanguages.stream().filter(availableLanguages::contains).collect(Collectors.toList());

if (selectedLanguages.isEmpty()) {
throw new IOException("None of the selected languages are valid.");
Expand All @@ -69,7 +80,7 @@ public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, va
// Run OCR Command
String languageOption = String.join("+", selectedLanguages);

List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf"));
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf", "--pdf-renderer" , "hocr"));

if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
Expand Down
4 changes: 2 additions & 2 deletions src/main/resources/templates/other/ocr-pdf.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ <h2 th:text="#{ocr.header}"></h2>
<label for="languages" class="form-label" th:text="#{ocr.selectText.1}"></label>
<hr>
<div id="languages">
<div th:each="language, iterStat : ${languages}" >
<div th:each="language, iterStat : ${languages}">
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
<label class="form-check-label" th:for="${'language-' + language}" th:text="${(language == 'eng') ? 'English' : language}"></label>
</div>
</div>
<hr>
Expand Down

0 comments on commit 80c26a9

Please sign in to comment.