Skip to content

Commit

Permalink
Implement new text analyzer features
Browse files Browse the repository at this point in the history
  • Loading branch information
mrtcode committed May 24, 2024
1 parent b006073 commit c91890e
Show file tree
Hide file tree
Showing 20 changed files with 4,145 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ import { StructTreePage } from "./struct_tree.js";
import { writeObject } from "./writer.js";
import { XFAFactory } from "./xfa/factory.js";
import { XRef } from "./xref.js";
import { Module } from "./module/module.js";

Check failure on line 74 in src/core/document.js

View workflow job for this annotation

GitHub Actions / Lint (lts/*)

Imports should be sorted alphabetically

const DEFAULT_USER_UNIT = 1.0;
const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792];
Expand Down Expand Up @@ -917,6 +918,7 @@ class PDFDocument {
this.pdfManager = pdfManager;
this.stream = stream;
this.xref = new XRef(stream, pdfManager);
this.module = new Module(this);
this._pagePromises = new Map();
this._version = null;

Expand Down
121 changes: 121 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2346,6 +2346,7 @@ class PartialEvaluator {
transform: null,
fontName: null,
hasEOL: false,
chars: [],
};

// Use a circular buffer (length === 2) to save the last chars in the
Expand Down Expand Up @@ -2585,6 +2586,7 @@ class PartialEvaluator {
transform: textChunk.transform,
fontName: textChunk.fontName,
hasEOL: textChunk.hasEOL,
chars: textChunk.chars,
};
}

Expand Down Expand Up @@ -2906,6 +2908,9 @@ class PartialEvaluator {
scaledDim = 0;
}

let prevWidth = textChunk.width;
let m = Util.transform(textState.ctm, textState.textMatrix);

if (!font.vertical) {
scaledDim *= textState.textHScale;
textState.translateTextMatrix(scaledDim, 0);
Expand All @@ -2932,6 +2937,121 @@ class PartialEvaluator {
}
textChunk.str.push(glyphUnicode);

/**
 * Snaps an angle to the nearest axis-aligned rotation.
 *
 * Distance is measured around the circle, so angles just below a full turn
 * (e.g. 350) snap to 0 rather than 270. Inputs outside [0, 360) are wrapped
 * first. Exact ties resolve to the earlier entry of [0, 90, 180, 270], and a
 * non-finite input yields 0.
 *
 * @param {number} degrees - Angle in degrees.
 * @returns {number} One of 0, 90, 180 or 270.
 */
function closestStandardAngle(degrees) {
  const standardAngles = [0, 90, 180, 270];
  // Wrap into [0, 360) so negative angles and multiples of a turn behave.
  const normalized = ((degrees % 360) + 360) % 360;

  // Shortest way around the circle between the input and a candidate.
  const circularDistance = angle => {
    const diff = Math.abs(normalized - angle);
    return Math.min(diff, 360 - diff);
  };

  let closestAngle = standardAngles[0];
  let minDistance = circularDistance(closestAngle);
  for (const angle of standardAngles.slice(1)) {
    const dist = circularDistance(angle);
    // Strict comparison keeps the earlier candidate on ties (and on NaN).
    if (dist < minDistance) {
      minDistance = dist;
      closestAngle = angle;
    }
  }

  return closestAngle;
}

/**
 * Derives the axis-aligned rotation encoded in a transform matrix.
 *
 * The angle is taken from the first two matrix entries via `atan2`,
 * shifted into the non-negative range, rounded to whole degrees, wrapped
 * at 360 and finally handed to `closestStandardAngle`.
 *
 * @param {Array<number>} matrix - Transform whose entries [0] and [1] are
 *   read.
 * @returns {number} Whatever `closestStandardAngle` returns for the
 *   computed angle.
 */
function matrixToDegrees(matrix) {
  const rawAngle = Math.atan2(matrix[1], matrix[0]);
  // atan2 yields (-PI, PI]; move negative angles into the positive range.
  const positiveAngle = rawAngle < 0 ? rawAngle + (2 * Math.PI) : rawAngle;
  // Rounding can produce exactly 360, hence the modulo.
  const wrapped = Math.round(positiveAngle * (180 / Math.PI)) % 360;
  return closestStandardAngle(wrapped < 0 ? wrapped + 360 : wrapped);
}

let rotation = matrixToDegrees(m);

let ascent = font.ascent;
let descent = font.descent;
if (descent > 0) {
descent = -descent;
}
if (ascent && descent) {
if (ascent > 1) {
ascent = 0.75;
}
if (descent < -0.5) {
descent = -0.25;
}
}
else {
ascent = 0.75;
descent = -0.25;
}

if (font.capHeight && font.capHeight < ascent && font.capHeight > 0) {
ascent = font.capHeight;
}

let charWidth = textChunk.width - prevWidth;
let rect = [0, textState.fontSize * descent, charWidth, textState.fontSize * ascent]

if (
font.isType3Font &&
textState.fontSize <= 1 &&
!isArrayEqual(textState.fontMatrix, FONT_IDENTITY_MATRIX)
) {
const glyphHeight = font.bbox[3] - font.bbox[1];
if (glyphHeight > 0) {
rect[1] = font.bbox[1] * textState.fontMatrix[3];
rect[3] = font.bbox[3] * textState.fontMatrix[3];
}
}

rect = Util.getAxialAlignedBoundingBox(rect, m);

let baselineRect = Util.getAxialAlignedBoundingBox([0, 0, 0, 0], m);
let baseline = 0;
if (rotation === 0 || rotation === 180) {
baseline = baselineRect[1];
}
else if (rotation === 90 || rotation === 270) {
baseline = baselineRect[0];
}

let p1 = [0, 0];
let p2 = [0, 1];

let [x1, y1] = Util.applyTransform(p1, getCurrentTextTransform());
let [x2, y2] = Util.applyTransform(p2, getCurrentTextTransform());
let fontSize = Math.hypot(x1 - x2, y1 - y2);

let diagonal = rotation % 90 !== 0;

if (
glyph.unicode !== ' ' &&
fontSize !== 0 &&
// Sometimes char can map to null and break strings
glyph.unicode.charCodeAt(0)
) {
textChunk.chars.push({
// Decomposed ligatures, normalized Arabic characters
c: glyphUnicode,
// Normalizes Arabic and other characters whose length remains 1, but preserves
// ligatures and, more importantly, avoids 'e\u00b4' being converted into 'e \u0301',
// which is quite common in Spanish author names; the inserted space would
// otherwise prevent author name recognition
u: glyphUnicode.length === 1 ? glyphUnicode : glyph.unicode,
rect,
fontSize,
fontName: textState.font.name,
bold: textState.font.bold,
italic: textState.font.italic,
glyphWidth,
baseline,
rotation,
diagonal,
});
}

if (charSpacing) {
if (!font.vertical) {
textState.translateTextMatrix(
Expand Down Expand Up @@ -3012,6 +3132,7 @@ class PartialEvaluator {
textContent.items.push(runBidiTransform(textContentItem));
textContentItem.initialized = false;
textContentItem.str.length = 0;
textContentItem.chars = [];
}

function enqueueChunk(batch = false) {
Expand Down
115 changes: 115 additions & 0 deletions src/core/module/content-rect.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import { distance } from './lib/levenstein.js';
import { getPageLabel } from './page-label.js';
import { getCenterRect, getClusters, getRectCenter } from './utilities.js';

// TODO: Take into account horizontal pages

/**
 * Groups a flat character sequence into lines.
 *
 * A line ends at every char whose `lineBreakAfter` flag is truthy. Chars
 * that follow the last such flag are emitted as a final line instead of
 * being dropped.
 *
 * @param {Array<Object>} chars - Chars carrying `rect` ([x1, y1, x2, y2]),
 *   `c` (text) and an optional `lineBreakAfter` flag.
 * @returns {Array<Object>} Lines with `rect` (union of the char rects),
 *   `chars`, `text` (concatenated `c` values) and `centerY` (second
 *   component of `getCenterRect(rect)`).
 */
function getLinesFromChars(chars) {
  const lines = [];
  let line = null;
  for (const char of chars) {
    if (line) {
      // Grow the line's bounding box to include this char.
      line.rect[0] = Math.min(line.rect[0], char.rect[0]);
      line.rect[1] = Math.min(line.rect[1], char.rect[1]);
      line.rect[2] = Math.max(line.rect[2], char.rect[2]);
      line.rect[3] = Math.max(line.rect[3], char.rect[3]);
      line.chars.push(char);
    } else {
      line = {
        // Copy so that growing the line never mutates the char's own rect.
        rect: char.rect.slice(),
        chars: [char],
      };
    }
    if (char.lineBreakAfter) {
      lines.push(line);
      line = null;
    }
  }

  // Flush a trailing line that was never closed by a `lineBreakAfter` flag.
  if (line) {
    lines.push(line);
  }

  for (const finished of lines) {
    finished.text = finished.chars.map(x => x.c).join('');
    finished.centerY = getCenterRect(finished.rect)[1];
  }
  return lines;
}

/**
 * Estimates the rectangle that holds the document's main content.
 *
 * Up to five pages around the middle of the document are sampled. Their
 * lines are clustered by `centerY`, and lines in the same cluster whose
 * texts are nearly identical (Levenshtein distance / length <= 0.2) are
 * discarded; presumably these are running headers/footers. The vertical
 * extent of the remaining lines is then tightened against detected page
 * labels located in the top or bottom eighth of a page. The horizontal
 * extent is always the full page width.
 *
 * @param {Object} pdfDocument - Document exposing `catalog.numPages` and an
 *   async `getPage(index)` resolving to an object with a `view` box.
 *   NOTE(review): page indices are assumed to be zero-based, confirm.
 * @param {Function} structuredCharsProvider - Async function returning the
 *   chars of a page index.
 * @returns {Promise<Array<number>>} `[x1, y1, x2, y2]`. When no line
 *   survives, the vertical bounds stay at `Infinity` / `-Infinity`.
 */
export async function getContentRect(pdfDocument, structuredCharsProvider) {
  const numPages = pdfDocument.catalog.numPages;
  const middlePage = Math.floor(numPages / 2);
  // Clamp the sample window so short documents never request pages that do
  // not exist (negative indices or indices past the last page).
  const startPage = Math.max(0, middlePage - 2);
  const endPage = Math.min(numPages - 1, middlePage + 2);

  const sampledLines = [];
  for (let i = startPage; i <= endPage; i++) {
    const chars = await structuredCharsProvider(i);
    sampledLines.push(...getLinesFromChars(chars));
  }

  const clusters = getClusters(sampledLines, 'centerY', 0.2);

  const keptLines = [];
  for (const cluster of clusters) {
    // Indices of lines that have a near-duplicate inside the same cluster.
    const removeLines = new Set();
    for (let i = 0; i < cluster.length; i++) {
      const currentLine = cluster[i];
      for (let j = 0; j < cluster.length; j++) {
        // Skip self-comparison and pairs that are both already discarded.
        if (i === j || (removeLines.has(i) && removeLines.has(j))) {
          continue;
        }
        const otherLine = cluster[j];
        const dist = distance(currentLine.text, otherLine.text);
        if (dist / currentLine.text.length <= 0.2) {
          removeLines.add(i);
          removeLines.add(j);
        }
      }
    }
    cluster.forEach((line, index) => {
      if (!removeLines.has(index)) {
        keptLines.push(line);
      }
    });
  }

  // Union of the surviving line rects.
  const bounds = [Infinity, Infinity, -Infinity, -Infinity];
  for (const line of keptLines) {
    bounds[0] = Math.min(bounds[0], line.rect[0]);
    bounds[1] = Math.min(bounds[1], line.rect[1]);
    bounds[2] = Math.max(bounds[2], line.rect[2]);
    bounds[3] = Math.max(bounds[3], line.rect[3]);
  }

  // Shrink slightly so content sitting exactly on the border stays outside.
  const eps = 0.1;
  const rect = [
    bounds[0] + eps,
    bounds[1] + eps,
    bounds[2] - eps,
    bounds[3] - eps,
  ];

  for (let i = startPage; i <= endPage; i++) {
    const pageLabel = await getPageLabel(pdfDocument, structuredCharsProvider, i);
    if (!pageLabel) {
      continue;
    }
    const { view } = await pdfDocument.getPage(i);
    const height = view[3] - view[1];

    const labelCenterY = getRectCenter(pageLabel.rect)[1];
    if (labelCenterY < height / 8) {
      // Label in the bottom eighth: content starts above it.
      rect[1] = Math.max(rect[1], pageLabel.rect[3] + eps);
    } else if (labelCenterY > (height / 8) * 7) {
      // Label in the top eighth: content ends below it.
      rect[3] = Math.min(rect[3], pageLabel.rect[1] - eps);
    }
  }

  // The width is read from page index 1, falling back to the only page of a
  // single-page document.
  const widthPage = Math.max(0, Math.min(1, numPages - 1));
  const { view } = await pdfDocument.getPage(widthPage);
  rect[0] = 0;
  rect[2] = view[2] - view[0];

  return rect;
}
Loading

0 comments on commit c91890e

Please sign in to comment.