-
Notifications
You must be signed in to change notification settings - Fork 33
fix: fix and improve PDF highlighting #313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c76e552
2d320ac
c77169e
c08dd88
a047c71
9827c60
5898089
66860f2
4f6bcac
9fab717
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,4 @@ | ||
| import { Bbox, TextSpan } from '../../types'; | ||
| import { bboxesIntersect } from '../../../../utils/box'; | ||
| import { spanIntersection, spanLen } from '../../../../utils/textSpan'; | ||
|
|
||
| /** | ||
|
|
@@ -35,10 +34,6 @@ export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { | |
| // | ||
| const OVERLAP_RATIO = 0.8; | ||
|
|
||
| if (bboxesIntersect(boxA, boxB)) { | ||
| return false; | ||
| } | ||
|
|
||
| const [leftA, topA, rightA, bottomA] = boxA; | ||
| const [leftB, topB, rightB, bottomB] = boxB; | ||
| const heightA = bottomA - topA; | ||
|
|
@@ -56,6 +51,14 @@ export function isNextToEachOther(boxA: Bbox, boxB: Bbox): boolean { | |
| } | ||
|
|
||
| // see if boxes can be neighborhoods | ||
| const verticalGap = Math.max(0, leftB - rightA, leftA - rightB); | ||
| return verticalGap < avgHeight; | ||
| const horizontalGap = Math.max(leftB - rightA, leftA - rightB); | ||
| if (horizontalGap < 0) { | ||
| // gap < 0 means that the boxes are overlapped | ||
| const overlap = -horizontalGap; | ||
| // consider two horizontally aligned overlapped boxes are next to each other when | ||
| // the overlap is smaller than half of the box height. | ||
| return overlap < avgHeight / 2; | ||
| } | ||
|
Comment on lines
+55
to
+61
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the previous code of disallowing any overlapping was too strict. Practically, PDF sometimes includes bboxes overlapped each other for a text. It caused an issue of fragmented highlights like the following screenshot.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't fully understand why you are using height to determine if boxes are horizontally aligned. That seems odd. What's the reasoning behind that?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure, but my guess is that this is for picking a small size that is relative to the size of the characters. (Width wouldn't work due to differences in word length.) My initial thought would be to pick a specific small size, but this solution better accounts for potential issues with zoom.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to to adapt to various font sizes. For font height of 72px, 30px is small enough. But for height of 12px, 30px overlapping means that some characters are most likely overlapped. In the case, given boxes cannot be considered as 'next to each other'. |
||
|
|
||
| return horizontalGap < avgHeight; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,7 @@ | ||
| import minBy from 'lodash/minBy'; | ||
| import { nonEmpty } from 'utils/nonEmpty'; | ||
| import { TextSpan } from '../../types'; | ||
| import { bboxesIntersect } from '../../../../utils/box'; | ||
| import { spanLen, spanMerge } from '../../../../utils/textSpan'; | ||
| import { spanLen } from '../../../../utils/textSpan'; | ||
| import { TextLayout, TextLayoutCell, TextLayoutCellBase } from '../textLayout/types'; | ||
| import { MappingSourceTextProvider } from './MappingSourceTextProvider'; | ||
| import { MappingTargetBoxProvider } from './MappingTargetCellProvider'; | ||
|
|
@@ -30,28 +29,29 @@ export function getTextBoxMappings< | |
| const source = new Source(sourceLayout, targetLayout); | ||
| const builder = new TextBoxMappingBuilder(); | ||
|
|
||
| target.processText((targetCellId, targetText, markTargetAsMapped) => { | ||
| const matchInSource = source.findMatch(targetCellId, targetText); | ||
| if (matchInSource) { | ||
| const mappedTargetCells = markTargetAsMapped(matchInSource.matchLength); | ||
|
|
||
| let mappedSourceFullSpan: TextSpan = [0, 0]; | ||
| mappedTargetCells.forEach(targetCell => { | ||
| const mappedSourceSpan = matchInSource.markSourceAsMapped(targetCell.text); | ||
| if (mappedSourceSpan) { | ||
| builder.addMapping( | ||
| { cell: matchInSource.cell, span: mappedSourceSpan }, | ||
| { cell: targetCell } | ||
| ); | ||
| mappedSourceFullSpan = spanMerge(mappedSourceFullSpan, mappedSourceSpan); | ||
| } | ||
| }); | ||
| if (spanLen(mappedSourceFullSpan) > 0) { | ||
| matchInSource.markSourceMappedBySpan(mappedSourceFullSpan); | ||
| for (const minMatchLength of [27, 9, 3, 1]) { | ||
| debug('getTextBoxMapping: processText with minMatchLength: %d', minMatchLength); | ||
|
Comment on lines
+32
to
+33
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| target.processText((targetCellId, targetText, markTargetAsMapped) => { | ||
| if (targetText.length < minMatchLength) { | ||
| return; | ||
| } | ||
| } | ||
| }); | ||
|
|
||
| const matchInSource = source.findMatch(targetCellId, targetText, minMatchLength); | ||
| if (matchInSource) { | ||
| const mappedTargetCells = markTargetAsMapped(matchInSource.matchLength); | ||
|
|
||
| mappedTargetCells.forEach(targetCell => { | ||
| const mappedSourceSpan = matchInSource.markSourceAsMapped(targetCell.text); | ||
| if (mappedSourceSpan) { | ||
| builder.addMapping( | ||
| { cell: matchInSource.cell, span: mappedSourceSpan }, | ||
| { cell: targetCell } | ||
| ); | ||
| } | ||
| }); | ||
| matchInSource.markAsMapped(); | ||
| } | ||
| }); | ||
| } | ||
| return builder.toTextBoxMapping(); | ||
| } | ||
|
|
||
|
|
@@ -98,6 +98,7 @@ class Target<TargetCell extends TextLayoutCell> { | |
| this.targetProvider.skip(); | ||
| } | ||
| } | ||
| this.targetProvider.rewind(); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -135,14 +136,17 @@ class Source<SourceCell extends TextLayoutCell, TargetCell extends TextLayoutCel | |
| * with the target cell of given `targetCellId` | ||
| * @param targetCellId | ||
| * @param text | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add mention of param |
||
| * @param minLength the minimal length of matched text. this function returns the result | ||
| * only when a match longer than `minLength` is found. Otherwise `null`. | ||
| * In case the `text` is shorter than `minLength`, this always return `null`. | ||
| * @return matched source information and functions to mark the matched span as mapped | ||
| */ | ||
| findMatch(targetCellId: TargetCell['id'], text: string) { | ||
| findMatch(targetCellId: TargetCell['id'], text: string, minLength = 1) { | ||
| const candidateSources = this.targetIndexToSources[targetCellId]; | ||
| const bestMatch = Source.findBestMatch(candidateSources, text); | ||
| const bestMatch = Source.findBestMatch(candidateSources, text, minLength); | ||
| debug('> source cell(s) matched: %o', bestMatch); | ||
|
|
||
| if (!bestMatch?.match || spanLen(bestMatch.match.span) === 0) { | ||
| if (!bestMatch?.match || spanLen(bestMatch.match.span) < minLength) { | ||
| return null; | ||
| } | ||
|
|
||
|
|
@@ -155,13 +159,19 @@ class Source<SourceCell extends TextLayoutCell, TargetCell extends TextLayoutCel | |
| matchLength: spanLen(matchedSourceSpan), | ||
| markSourceAsMapped: (text: string) => { | ||
| const mappedSource = matchedSourceProvider.getMatch(text); | ||
| if (mappedSource?.span) { | ||
| // mark the span in the source provider as 'used' so that other texts from target | ||
| // are not mapped to the same source span | ||
| matchedSourceProvider.consume(mappedSource.span); | ||
| } | ||
| debug('>> target cell %o to source %o', text, mappedSource); | ||
| return mappedSource?.span; | ||
| }, | ||
| markSourceMappedBySpan: (span: TextSpan) => { | ||
| if (spanLen(span) > 0) { | ||
| matchedSourceProvider.consume(span); | ||
| } | ||
| markAsMapped: () => { | ||
| // mark entire the match in the source provider as 'used'. | ||
| // this need to called after mapping to target text are built using `markSourceAsMapped`. | ||
| // the matched span in the source is mapped to target. | ||
| matchedSourceProvider.consume(matchedSourceSpan); | ||
| } | ||
| }; | ||
| } | ||
|
|
@@ -177,11 +187,12 @@ class Source<SourceCell extends TextLayoutCell, TargetCell extends TextLayoutCel | |
| cell: TextLayoutCell; | ||
| provider: MappingSourceTextProvider; | ||
| }[], | ||
| textToMatch: string | ||
| textToMatch: string, | ||
| minLength: number | ||
| ) { | ||
| // find matches | ||
| const matches = sources.map(source => { | ||
| const match = source.provider.getMatch(textToMatch); | ||
| const match = source.provider.getMatch(textToMatch, minLength); | ||
| return { ...source, match }; | ||
| }); | ||
|
|
||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is to avoid useless empty
<div/>containers being generated. Thedivs are generated for all the highlights in a document regardless of the page where the highlight exists. With this fix,divs are generated for highlights on the current page.