Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New figure/table segmentation approach and models #963

Draft
wants to merge 33 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
2721f64
use batik
kermitt2 Jul 4, 2021
1cbb815
batik integration
kermitt2 Jul 4, 2021
aa654fd
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
5e9e53a
test svg element merging
kermitt2 Jul 4, 2021
7b8917d
Merge branch 'master' into fix-vector-graphics
kermitt2 Jul 4, 2021
3f1058a
cleaning
kermitt2 Jul 4, 2021
b18bb88
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 3, 2021
9eecec7
start FigureSegmenterParser
kermitt2 Aug 4, 2021
7a10012
some progress on new models
kermitt2 Aug 9, 2021
d3f0df3
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 9, 2021
e964a41
review direction
kermitt2 Aug 11, 2021
0255b80
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 18, 2021
6d19e34
createTraining for figure segmenter
kermitt2 Aug 26, 2021
a92f692
Merge branch 'master' into fix-vector-graphics
kermitt2 Aug 26, 2021
01cf993
fix crop box for reference over 2 pages
kermitt2 Aug 28, 2021
0d4981d
update figure-segmenter features
kermitt2 Aug 28, 2021
6d52e2e
complete create training for figures
kermitt2 Aug 28, 2021
432d40b
various fixes
kermitt2 Aug 29, 2021
ea542e2
cleaning
kermitt2 Aug 30, 2021
711e6c9
update fulltext model with updated vector graphic processing
kermitt2 Sep 12, 2021
31bfc34
Merge branch 'master' into fix-vector-graphics
kermitt2 Sep 24, 2022
f868fe0
fix conflict
kermitt2 Nov 7, 2022
121a1cb
add stacktrace in circleci build
lfoppiano Dec 6, 2022
2b69b37
try github actions
lfoppiano Dec 6, 2022
0e722f4
fix merge conflict
kermitt2 Dec 6, 2022
ab0a514
fix merge
kermitt2 Dec 6, 2022
fb33e5d
fix conflict with latest master
kermitt2 Sep 14, 2023
e2bf621
minor doc update
kermitt2 Sep 27, 2023
d189cb5
Merge branch 'master' into new-figure-table-models
lfoppiano Dec 17, 2023
d649e22
Merge branch 'release-0.8.1' into new-figure-table-models
kermitt2 Aug 10, 2024
44d1801
Merge branch 'master' into new-figure-table-models
kermitt2 Sep 23, 2024
c664d63
working version
kermitt2 Sep 24, 2024
8e09ba4
adapt segmentation
kermitt2 Sep 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
test svg element merging
  • Loading branch information
kermitt2 committed Jul 4, 2021
commit 5e9e53a314b425d3a1f9d24aa8ec45616dcf76cb
13 changes: 13 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/document/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ public void setImages(List<GraphicObject> images) {
protected transient Metadata metadata = null;

protected transient Multimap<Integer, GraphicObject> imagesPerPage = LinkedListMultimap.create();
protected transient Multimap<Integer, Block> blocksPerPage = HashMultimap.create();

// some statistics regarding the document - useful for generating the features
protected double maxCharacterDensity = 0.0;
Expand Down Expand Up @@ -225,6 +226,14 @@ public Metadata getMetadata() {
return metadata;
}

/**
 * Gives access to the blocks registered for each page of this document.
 * <p>
 * Entries are added by {@code addPage(Page)}, which files every block of the
 * added page under {@code pages.size() - 1}, i.e. the zero-based position of
 * that page in the page list. The multimap handed back is the internal
 * instance itself, not a copy.
 *
 * @return the multimap associating a page position with its blocks
 */
public Multimap<Integer, Block> getBlocksPerPage() {
    return this.blocksPerPage;
}

/**
 * Gives access to the graphic objects grouped under an integer key.
 * <p>
 * NOTE(review): the key is presumably the page number; the code filling this
 * map is not visible here, so confirm whether it is zero- or one-based.
 * The multimap handed back is the internal instance itself, not a copy.
 *
 * @return the multimap associating an integer key with graphic objects
 */
public Multimap<Integer, GraphicObject> getImagesPerPage() {
    return this.imagesPerPage;
}

/**
* Set the path to the XML file generated by xml2pdf
*/
Expand Down Expand Up @@ -724,6 +733,10 @@ public void addPage(Page page) {
// Lazily create the page list the first time a page is added.
if (pages == null)
pages = new ArrayList<Page>();
pages.add(page);
// Index the blocks the page holds at this moment under the position the
// page has just received in the list (pages.size()-1, zero-based).
// Blocks attached to the page after this call are not indexed here.
if (page.getBlocks() != null) {
for(Block localBlock : page.getBlocks())
blocksPerPage.put(pages.size()-1,localBlock);
}
}

public void setBibDataSets(List<BibDataSet> bibDataSets) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ public class VectorGraphicBoxCalculator {

public static Multimap<Integer, GraphicObject> calculate(org.grobid.core.document.Document document) throws IOException {

Multimap<Integer, Block> blockMultimap = HashMultimap.create();
Multimap<Integer, Block> blockMultimap = document.getBlocksPerPage();
Multimap<Integer, GraphicObject> result = LinkedHashMultimap.create();

// init BATIK stuff
Expand Down Expand Up @@ -149,36 +149,50 @@ public static Multimap<Integer, GraphicObject> calculate(org.grobid.core.documen
SVGElement item = (SVGElement) nodeList.item(i);
SVGLocatable locatable = (SVGLocatable)item;
SVGRect rect = locatable.getBBox();

if (rect == null)
continue;

String coords = pageNum + "," + rect.getX() + "," + rect.getY() + "," + rect.getWidth() + "," + rect.getHeight();

System.out.println(coords);

BoundingBox e = BoundingBox.fromString(coords);
if (!mainPageArea.contains(e) || e.area() / mainPageArea.area() > 0.7) {
System.out.println("filter box, area: " + e.area());
System.out.println("filter this box, area: " + e.area());
continue;
}
boxes.add(e);
}
System.out.println("nb boxes: " + boxes.size());
List<BoundingBox> remainingBoxes = mergeBoxes(boxes);
System.out.println("nb remainingBoxes: " + remainingBoxes.size());

// bound intersecting or very close blocks with text, this is typically to cover
// the case where the text is outside the svg
for (int i = 0; i < remainingBoxes.size(); i++) {
Collection<Block> col = blockMultimap.get(pageNum);
for (Block bl : col) {
// if (!bl.getPage().getMainArea().contains(b)) {
// continue;
// }
// if (!bl.getPage().getMainArea().contains(b)) {
// continue;
// }

BoundingBox b = BoundingBox.fromPointAndDimensions(pageNum, bl.getX(), bl.getY(), bl.getWidth(), bl.getHeight());
if (remainingBoxes.get(i).intersect(b)) {
remainingBoxes.set(i, remainingBoxes.get(i).boundBox(b));
}

/*if (remainingBoxes.get(i).distanceTo(b) < 10) {
remainingBoxes.set(i, remainingBoxes.get(i).boundBox(b));
}*/
}
}

remainingBoxes = mergeBoxes(remainingBoxes);
remainingBoxes = glueBoxes(remainingBoxes, 10.0);
remainingBoxes = glueBoxes(remainingBoxes, 10.0);
remainingBoxes = glueBoxes(remainingBoxes, 10.0);
remainingBoxes = mergeBoxes(remainingBoxes);

System.out.println("nb remainingBoxes after merge: " + remainingBoxes.size());
for (BoundingBox b : remainingBoxes) {
if (b.area() > MINIMUM_VECTOR_BOX_AREA) {
Expand All @@ -195,6 +209,9 @@ public static Multimap<Integer, GraphicObject> calculate(org.grobid.core.documen
return result;
}

/**
* Merge bounding boxes in case of intersection
*/
public static List<BoundingBox> mergeBoxes(List<BoundingBox> boxes) {
boolean allMerged = false;
while (!allMerged) {
Expand All @@ -216,6 +233,44 @@ public static List<BoundingBox> mergeBoxes(List<BoundingBox> boxes) {
}
}

return Lists.newArrayList(Iterables.filter(boxes, new Predicate<BoundingBox>() {
@Override
public boolean apply(BoundingBox boundingBox) {
if (boundingBox == null) {
return false;
}
/*if (boundingBox.getHeight() < 5 || boundingBox.getWidth() < 5) {
return false;
}*/
return true;
}
}));
}

/**
* Merge bounding boxes in case of close proximity defined by a max proximity distance
*/
public static List<BoundingBox> glueBoxes(List<BoundingBox> boxes, double maxProximityDistance) {
boolean allMerged = false;
while (!allMerged) {
allMerged = true;
for (int i = 0; i < boxes.size(); i++) {
BoundingBox a = boxes.get(i);
if (a == null) continue;
for (int j = i + 1; j < boxes.size(); j++) {
BoundingBox b = boxes.get(j);
if (b != null) {
if (a.distanceTo(b) < maxProximityDistance) {
allMerged = false;
a = a.boundBox(b);
boxes.set(i, a);
boxes.set(j, null);
}
}
}
}
}

return Lists.newArrayList(Iterables.filter(boxes, new Predicate<BoundingBox>() {
@Override
public boolean apply(BoundingBox boundingBox) {
Expand Down