Skip to content

Commit

Permalink
Bump org.apache.pdfbox:pdfbox from 2.0.29 to 3.0.1 (#1378)
Browse files Browse the repository at this point in the history
* Bump org.apache.pdfbox:pdfbox from 2.0.29 to 3.0.1

Bumps org.apache.pdfbox:pdfbox from 2.0.29 to 3.0.1.

---
updated-dependencies:
- dependency-name: org.apache.pdfbox:pdfbox
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

* refactoring code to upgraded dependency

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Mohab Mohie <Mohab.MohieElDeen@outlook.com>
  • Loading branch information
dependabot[bot] and MohabMohie authored Dec 18, 2023
1 parent f26a49a commit dd1d275
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 38 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
<version>3.0.1</version>
</dependency>

<!-- REST ASSURED -->
Expand Down
57 changes: 20 additions & 37 deletions src/main/java/com/shaft/tools/io/PdfFileManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import com.shaft.tools.io.internal.FailureReporter;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
Expand All @@ -17,7 +17,7 @@
public class PdfFileManager {

private final File file;
private RandomAccessBufferedFileInputStream stream = null;
private RandomAccessReadBufferedFile stream = null;
private PDFParser parser = null;
private COSDocument cosDoc = null;
private PDFTextStripper strip = null;
Expand Down Expand Up @@ -57,22 +57,17 @@ public String readFileContent() {
*/
public static String readFileContent(String relativeFilePath, boolean... deleteFileAfterReading) {
if (FileActions.getInstance().doesFileExist(relativeFilePath)) {
try {
var randomAccessBufferedFileInputStream = new RandomAccessBufferedFileInputStream(new File(FileActions.getInstance().getAbsolutePath(relativeFilePath)));
var pdfParser = new PDFParser(randomAccessBufferedFileInputStream);
pdfParser.parse();
try (var pdfParser = new PDFParser(new RandomAccessReadBufferedFile(new File(FileActions.getInstance().getAbsolutePath(relativeFilePath)))).parse()) {
var pdfTextStripper = new PDFTextStripper();
pdfTextStripper.setSortByPosition(true);
var fileContent = pdfTextStripper.getText(new PDDocument(pdfParser.getDocument()));
randomAccessBufferedFileInputStream.close();

if (deleteFileAfterReading != null
&& deleteFileAfterReading.length > 0
&& deleteFileAfterReading[0]) {
FileActions.getInstance().deleteFile(relativeFilePath);
}
return fileContent;
} catch (java.io.IOException rootCauseException) {
} catch (IOException rootCauseException) {
FailureReporter.fail(PdfFileManager.class, "Failed to read this PDF file [" + relativeFilePath + "].", rootCauseException);
}

Expand All @@ -96,7 +91,6 @@ public String readPDFContentFromDownloadedPDF(int startPageNumber, int endPageNu

stream = readFileInputStream(file);
parser = parseStreamDocument(stream);

cosDoc = getParsedDocument(parser);
String content = getPdfText(cosDoc, startPageNumber, endPageNumber);
closeStreamAndDeleteFile(file, stream, deleteFileAfterValidationStatus);
Expand All @@ -105,27 +99,24 @@ public String readPDFContentFromDownloadedPDF(int startPageNumber, int endPageNu
}

/**
 * Extracts the text of the PDF file held by this manager.
 *
 * <p>The file is opened, parsed and converted to a {@code COSDocument} through the
 * private helpers of this class; each helper also stores its result in the matching
 * instance field ({@code stream}, {@code parser}, {@code cosDoc}). Once the text has
 * been extracted, the file, the open stream and the given status are handed to
 * {@code closeStreamAndDeleteFile}.
 *
 * @param deleteFileAfterValidationStatus forwarded unchanged to {@code closeStreamAndDeleteFile}
 * @return the text produced by {@code getPdfText} for the parsed document
 */
public String readPDFContentFromDownloadedPDF(DeleteFileAfterValidationStatus deleteFileAfterValidationStatus) {
    // Open -> parse -> obtain the low-level document, in that order.
    this.stream = readFileInputStream(this.file);
    this.parser = parseStreamDocument(this.stream);
    this.cosDoc = getParsedDocument(this.parser);

    final String extractedText = getPdfText(this.cosDoc);

    // Release the stream only after the text has been read from the document.
    closeStreamAndDeleteFile(this.file, this.stream, deleteFileAfterValidationStatus);
    return extractedText;
}

/**
 * Opens the given file for random-access reading and stores the handle in the
 * {@code stream} field.
 *
 * <p>This hunk shows two versions of the signature and of the assignment: the first
 * of each pair uses {@code RandomAccessBufferedFileInputStream} and the second uses
 * {@code RandomAccessReadBufferedFile}, matching the import that this commit swaps.
 *
 * <p>If opening the file raises an {@link IOException}, the failure is passed to
 * {@code FailureReporter.fail} together with the file and the root cause.
 *
 * @param file the PDF file to open
 * @return the current value of the {@code stream} field
 */
private RandomAccessBufferedFileInputStream readFileInputStream(File file) {
private RandomAccessReadBufferedFile readFileInputStream(File file) {
try {
stream = new RandomAccessBufferedFileInputStream(file);
stream = new RandomAccessReadBufferedFile(file);
} catch (IOException rootCauseException) {
// Report the failure with the original exception attached as the root cause.
FailureReporter.fail(PdfFileManager.class, "Couldn't read the data from the provided file [" + file + "].", rootCauseException);
}
// The field is returned rather than a local, so callers receive whatever it currently holds.
return stream;
}

private PDFParser parseStreamDocument(RandomAccessBufferedFileInputStream stream) {
private PDFParser parseStreamDocument(RandomAccessReadBufferedFile stream) {
try {
parser = new PDFParser(stream);
parser.parse();
Expand All @@ -136,27 +127,23 @@ private PDFParser parseStreamDocument(RandomAccessBufferedFileInputStream stream
}

private COSDocument getParsedDocument(PDFParser parser) {
try {
cosDoc = parser.getDocument();
try (var parsedDocument = parser.parse()){
cosDoc = parsedDocument.getDocument();
} catch (IOException rootCauseException) {
FailureReporter.fail(PdfFileManager.class, "Couldn't get the document that was parsed. Check that the document parsed before get the document.", rootCauseException);
}
return cosDoc;
}

private String getPdfText(COSDocument cosDoc, int startPageNumber, int endPageNumber) {
try {
strip = new PDFTextStripper();
// By default, text is extracted in the same sequence as it appears in the
// PDF page content stream. PDF is a graphic format, not a text format, and
// unlike HTML, it does not require the text on a page to be rendered in a
// certain order. The order is the one that was determined by the software that
// created the PDF.
// Sort by position to get the text from left to right and top to bottom.
strip.setSortByPosition(true);
} catch (IOException rootCauseException) {
FailureReporter.fail(PdfFileManager.class, "Couldn't load PDFTextStripper properties.", rootCauseException);
}
strip = new PDFTextStripper();
// By default, text is extracted in the same sequence as it appears in the
// PDF page content stream. PDF is a graphic format, not a text format, and
// unlike HTML, it does not require the text on a page to be rendered in a
// certain order. The order is the one that was determined by the software that
// created the PDF.
// Sort by position to get the text from left to right and top to bottom.
strip.setSortByPosition(true);

strip.setStartPage(startPageNumber);
strip.setEndPage(endPageNumber);
Expand All @@ -172,12 +159,8 @@ private String getPdfText(COSDocument cosDoc, int startPageNumber, int endPageNu
}

private String getPdfText(COSDocument cosDoc) {
try {
strip = new PDFTextStripper();
strip.setSortByPosition(true);
} catch (IOException rootCauseException) {
FailureReporter.fail(PdfFileManager.class, "Couldn't load PDFTextStripper properties.", rootCauseException);
}
strip = new PDFTextStripper();
strip.setSortByPosition(true);

PDDocument pdDoc = new PDDocument(cosDoc);

Expand All @@ -190,7 +173,7 @@ private String getPdfText(COSDocument cosDoc) {
return content;
}

private void closeStreamAndDeleteFile(File file, RandomAccessBufferedFileInputStream stream,
private void closeStreamAndDeleteFile(File file, RandomAccessReadBufferedFile stream,
DeleteFileAfterValidationStatus deleteFileAfterValidation) {
try {
stream.close();
Expand Down

0 comments on commit dd1d275

Please sign in to comment.