Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New figure/table segmentation approach and models #963

Draft
wants to merge 33 commits into
base: master
Choose a base branch
from
Draft
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
2721f64
use batik
kermitt2 Jul 4, 2021
1cbb815
batik integration
kermitt2 Jul 4, 2021
aa654fd
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
5e9e53a
test svg element merging
kermitt2 Jul 4, 2021
7b8917d
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
3f1058a
cleaning
kermitt2 Jul 4, 2021
b18bb88
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 3, 2021
9eecec7
start FigureSegmenterParser
kermitt2 Aug 4, 2021
7a10012
some progress on new models
kermitt2 Aug 9, 2021
d3f0df3
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 9, 2021
e964a41
review direction
kermitt2 Aug 11, 2021
0255b80
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 18, 2021
6d19e34
createTraining for figure segmenter
kermitt2 Aug 26, 2021
a92f692
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 26, 2021
01cf993
fix crop box for reference over 2 pages
kermitt2 Aug 28, 2021
0d4981d
update figure-segmenter features
kermitt2 Aug 28, 2021
6d52e2e
complete create training for figures
kermitt2 Aug 28, 2021
432d40b
various fixes
kermitt2 Aug 29, 2021
ea542e2
cleaning
kermitt2 Aug 30, 2021
711e6c9
update fulltext model with updated vector graphic processing
kermitt2 Sep 12, 2021
31bfc34
Merge branch 'master' into fix-vector-graphics
kermitt2 Sep 24, 2022
f868fe0
fix conflict
kermitt2 Nov 7, 2022
121a1cb
add stacktrace in circleci build
lfoppiano Dec 6, 2022
2b69b37
try github actions
lfoppiano Dec 6, 2022
0e722f4
fix merge conflict
kermitt2 Dec 6, 2022
ab0a514
fix merge
kermitt2 Dec 6, 2022
fb33e5d
fix conflict with latest master
kermitt2 Sep 14, 2023
e2bf621
minor doc update
kermitt2 Sep 27, 2023
d189cb5
Merge branch 'master' into new-figure-table-models
lfoppiano Dec 17, 2023
d649e22
Merge branch 'release-0.8.1' into new-figure-table-models
kermitt2 Aug 10, 2024
44d1801
Merge branch 'master' into new-figure-table-models
kermitt2 Sep 23, 2024
c664d63
working version
kermitt2 Sep 24, 2024
8e09ba4
adapt segmentation
kermitt2 Sep 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
review direction
  • Loading branch information
kermitt2 committed Aug 11, 2021
commit e964a4122866293729051bd6821f69aa82261d41
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import java.util.List;
import java.util.SortedSet;
import java.util.regex.Matcher;
import java.util.Map;
import java.util.HashMap;

/**
* A model for segmenting the figure areas. The model is applied after the Segmentation model and
Expand Down Expand Up @@ -78,7 +80,8 @@ public Document extract(Document doc) {
List<GraphicObject> figureAnchors = this.initFigureAnchors(doc);

// for each figure anchor, we generate sequence to be labeled with features
Pair<List<String>,List<LayoutTokenization>> featureObject = this.getAreasFeatured(doc, figureAnchors);
boolean up = false;
Pair<List<String>,List<LayoutTokenization>> featureObject = this.getAreasFeatured(doc, figureAnchors, up);

List<String> contents = featureObject.getLeft();
List<LayoutTokenization> theTokenizations = featureObject.getRight();
Expand All @@ -87,8 +90,6 @@ public Document extract(Document doc) {
if (contents != null && contents.size() > 0) {
String labelledResults;
try {
boolean up = false;

GenericTagger tagger = up ? figureSegmenterParserUp : figureSegmenterParserDown;
labelledResults = tagger.label(contents);
} catch(Exception e) {
Expand All @@ -109,7 +110,7 @@ private List<GraphicObject> initFigureAnchors(Document doc) {
* Addition of the features at layout token level for the areas before and after the figure anchors.
*
*/
private Pair<List<String>,List<LayoutTokenization>> getAreasFeatured(Document doc, List<GraphicObject> figureAnchors) {
private Pair<List<String>,List<LayoutTokenization>> getAreasFeatured(Document doc, List<GraphicObject> figureAnchors, boolean up) {
List<Block> blocks = doc.getBlocks();
if ((blocks == null) || blocks.size() == 0) {
return null;
Expand All @@ -133,7 +134,10 @@ private Pair<List<String>,List<LayoutTokenization>> getAreasFeatured(Document do
int startPos = startGraphicPos;
int endPos = endGraphicPos;

// start down and determine the end position in the down direction
// position of the blocks in the page where the GraphicObject is located
Map<Integer,Block> blockIndexMap = new HashMap<>();
Page currentPage = null;

for (Page page : doc.getPages()) {
if (page.getNumber() == figureAnchor.getPage()) {
if ((page.getBlocks() == null) || (page.getBlocks().size() == 0)) {
Expand All @@ -153,44 +157,69 @@ private Pair<List<String>,List<LayoutTokenization>> getAreasFeatured(Document do
firstPageBlock = true;
}
List<LayoutToken> tokens = block.getTokens();
if (tokens == null && lastPageBlock) {
break;
}
if (tokens == null) {
if (tokens == null || tokens.size() == 0) {
continue;
}


blockIndexMap.put(tokens.get(0).getOffset(), block);
}

currentPage = page;
break;
}
}

if (currentPage == null || blockIndexMap.size() == 0) {
// we can't process this malformed graphic object (it should not happen ;)
continue;
}

for(int i=startPos; i<=endPos; i++) {
FeaturesVectorFigureSegmenter features = new FeaturesVectorFigureSegmenter();

LayoutToken token = tokenizations.get(i);
localTokenization.add(token);
String localText = token.getText();

if (localText == null) {
continue;
if (up) {
// go up and determine the start position in the upper direction
Block currentBlock = blockIndexMap.get(startPos);
int pos = currentPage.getBlocks().indexOf(currentBlock);
for(int j=1; j<=EXTENSION_SIZE; j++) {
if (pos-j < 0)
break;
List<LayoutToken> localTokens = currentPage.getBlocks().get(pos-j).getTokens();
if (localTokens == null || localTokens.size() == 0)
continue;
startPos = currentPage.getBlocks().get(pos-j).getTokens().get(0).getOffset();
}

localText = localText.replaceAll("[ \n]", "");
if(localText.length() == 0 || TextUtilities.filterLine(localText)) {
continue;
for(int i=endPos; i>=startPos; i--) {
localTokenization.add(tokenizations.get(i));
FeaturesVectorFigureSegmenter features = this.createFeatureVector(tokenizations, i, blockIndexMap);
if (features != null) {
if (i >= startGraphicPos && i <= endGraphicPos)
features.inGraphicBox = true;
content.append(features.printVector());
}
}
} else {
// go down and determine the end position in the down direction
Block currentBlock = blockIndexMap.get(endPos+1);
// if currentBlock here is null, we are already at the end of the page
if (currentBlock != null) {
int pos = currentPage.getBlocks().indexOf(currentBlock);
for(int j=0; j<EXTENSION_SIZE; j++) {
if (pos+j >= currentPage.getBlocks().size())
break;
List<LayoutToken> localTokens = currentPage.getBlocks().get(pos+j).getTokens();
if (localTokens == null || localTokens.size() == 0)
continue;
endPos = currentPage.getBlocks().get(pos+j).getTokens().get(0).getOffset();
}
}

features.token = token;
features.string = localText;

if (i >= startGraphicPos && i <= endGraphicPos)
features.inGraphicBox = true;

content.append(features.printVector());
for(int i=startPos; i<=endPos; i++) {
localTokenization.add(tokenizations.get(i));
FeaturesVectorFigureSegmenter features = this.createFeatureVector(tokenizations, i, blockIndexMap);
if (features != null) {
if (i >= startGraphicPos && i <= endGraphicPos)
features.inGraphicBox = true;
content.append(features.printVector());
}
}
}

LayoutTokenization tokenizationsFigure = new LayoutTokenization(localTokenization);
Expand All @@ -202,6 +231,25 @@ private Pair<List<String>,List<LayoutTokenization>> getAreasFeatured(Document do
return Pair.of(results, tokenizationsFigures);
}

/**
 * Builds the feature vector for the layout token found at index {@code i}.
 *
 * The token text is stripped of every space and newline character before being
 * stored in the vector.
 *
 * @param tokenizations the list of layout tokens to read from
 * @param i index, in {@code tokenizations}, of the token to process
 * @param blockIndexMap map from an integer key to a block; it is not read by this method
 * @return a feature vector with its {@code token} and {@code string} fields set, or
 *         {@code null} when the token has no text, when the stripped text is empty,
 *         or when {@code TextUtilities.filterLine} returns true for the stripped text
 */
private FeaturesVectorFigureSegmenter createFeatureVector(List<LayoutToken> tokenizations, int i, Map<Integer,Block> blockIndexMap) {
    final LayoutToken current = tokenizations.get(i);
    final String rawText = current.getText();
    if (rawText == null) {
        return null;
    }

    // drop spaces and newlines so only the visible characters remain
    final String stripped = rawText.replaceAll("[ \n]", "");
    final boolean usable = !stripped.isEmpty() && !TextUtilities.filterLine(stripped);
    if (!usable) {
        return null;
    }

    final FeaturesVectorFigureSegmenter vector = new FeaturesVectorFigureSegmenter();
    vector.token = current;
    vector.string = stripped;
    return vector;
}

/**
* Create training data based on an input Document (segmented by the segmentation model) and
Expand All @@ -217,8 +265,11 @@ public Pair<String,String> createTrainingData(Document doc, String id) {
// figure anchors are based on VectorGraphicBoxCalculator, which aggregates bitmap and SVG elements
List<GraphicObject> figureAnchors = this.initFigureAnchors(doc);

// we cover first the extension down the graphic object
boolean up = false;

// for each figure anchor, we generate sequence to be labeled with features
Pair<List<String>,List<LayoutTokenization>> featureObject = this.getAreasFeatured(doc, figureAnchors);
Pair<List<String>,List<LayoutTokenization>> featureObject = this.getAreasFeatured(doc, figureAnchors, up);

List<String> featureVectors = featureObject.getLeft();
List<LayoutTokenization> layoutTokenizations = featureObject.getRight();
Expand All @@ -229,9 +280,6 @@ public Pair<String,String> createTrainingData(Document doc, String id) {
return null;
}

// we cover first the extension down the graphic object
boolean up = false;

StringBuilder sb = new StringBuilder();
sb.append("<tei xml:space=\"preserve\">\n" +
" <teiHeader>\n" +
Expand Down