Skip to content

Commit

Permalink
fix get title logic (#12137)
Browse files Browse the repository at this point in the history
ignore the first extra large character in the abstract
  • Loading branch information
leaf-soba authored Oct 31, 2024
1 parent 9d9a251 commit 36adca5
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
import java.io.StringWriter;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

Expand Down Expand Up @@ -261,31 +264,33 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t
}

private String findLargestFontText(List<TextPosition> textPositions) {
float maxFontSize = 0;
StringBuilder largestFontText = new StringBuilder();
Map<Float, StringBuilder> fontSizeTextMap = new TreeMap<>(Collections.reverseOrder());
TextPosition previousTextPosition = null;
for (TextPosition textPosition : textPositions) {
// Exclude unwanted text based on heuristics
if (isUnwantedText(previousTextPosition, textPosition)) {
continue;
}
float fontSize = textPosition.getFontSizeInPt();
if (fontSize > maxFontSize) {
maxFontSize = fontSize;
largestFontText.setLength(0);
largestFontText.append(textPosition.getUnicode());
previousTextPosition = textPosition;
} else if (fontSize == maxFontSize) {
if (previousTextPosition != null) {
if (isThereSpace(previousTextPosition, textPosition)) {
largestFontText.append(" ");
}
}
largestFontText.append(textPosition.getUnicode());
previousTextPosition = textPosition;
fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder());
if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) {
fontSizeTextMap.get(fontSize).append(" ");
}
fontSizeTextMap.get(fontSize).append(textPosition.getUnicode());
previousTextPosition = textPosition;
}
for (Map.Entry<Float, StringBuilder> entry : fontSizeTextMap.entrySet()) {
String candidateText = entry.getValue().toString().trim();
if (isLegalTitle(candidateText)) {
return candidateText;
}
}
return largestFontText.toString().trim();
return fontSizeTextMap.values().iterator().next().toString().trim();
}

/**
 * Checks whether the given text is long enough to be accepted as a title.
 *
 * @param candidateText the text to check; must not be {@code null}
 * @return {@code true} if the text has at least 4 characters, {@code false} otherwise
 */
private boolean isLegalTitle(String candidateText) {
    // Titles in academic research are typically at least 4 characters long.
    final int minimumTitleLength = 4;
    int candidateLength = candidateText.length();
    return candidateLength >= minimumTitleLength;
}

private boolean isThereSpace(TextPosition previous, TextPosition current) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception

private static Stream<Arguments> providePdfData() {
return Stream.of(
Arguments.of("Fundamentals of Distributed Computing: A Practical Tour of Vector Clock Systems", "/pdfs/PdfContentImporter/Roberto2002.pdf"),
Arguments.of("On How We Can Teach – Exploring New Ways in Professional Software Development for Students", "/pdfs/PdfContentImporter/Kriha2018.pdf"),
Arguments.of("JabRef Example for Reference Parsing", "/pdfs/IEEE/ieee-paper.pdf"),
Arguments.of("Paper Title", "/org/jabref/logic/importer/util/LNCS-minimal.pdf"),
Expand Down
Binary file not shown.

0 comments on commit 36adca5

Please sign in to comment.