forked from tesseract-ocr/tesseract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtablefind.cpp
2102 lines (1957 loc) · 82.4 KB
/
tablefind.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
///////////////////////////////////////////////////////////////////////
// File: tablefind.cpp
// Description: Helper classes to find tables from ColPartitions.
// Author: Faisal Shafait (faisal.shafait@dfki.de)
// Created: Tue Jan 06 11:13:01 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "tablefind.h"
#include <math.h>
#include "allheaders.h"
#include "colpartitionset.h"
#include "tablerecog.h"
namespace tesseract {
// These numbers are used to calculate the global median stats.
// They just set an upper bound on the stats objects.
// Maximum vertical spacing between neighbor partitions.
const int kMaxVerticalSpacing = 500;
// Maximum width of a blob in a partition.
const int kMaxBlobWidth = 500;
// Minimum whitespace size to split a partition (measured as a multiple
// of a partition's median width).
const double kSplitPartitionSize = 2.0;
// To insert text, the partition must satisfy these size constraints
// in AllowTextPartition(). The idea is to filter noise partitions
// determined by the size compared to the global medians.
// TODO(nbeato): Need to find good numbers again.
const double kAllowTextHeight = 0.5;
const double kAllowTextWidth = 0.6;
const double kAllowTextArea = 0.8;
// The same thing applies to blobs (to filter noise).
// TODO(nbeato): These numbers are a shot in the dark...
// height and width are 0.5 * gridsize() in colfind.cpp
// area is a rough guess for the size of a period.
const double kAllowBlobHeight = 0.3;
const double kAllowBlobWidth = 0.4;
const double kAllowBlobArea = 0.05;
// Minimum number of components in a text partition. A partition having fewer
// components than that is more likely a data partition and is a candidate
// table cell.
const int kMinBoxesInTextPartition = 10;
// Maximum number of components that a data partition can have
const int kMaxBoxesInDataPartition = 20;
// Maximum allowed gap in a text partitions as a multiple of its median size.
const double kMaxGapInTextPartition = 4.0;
// Minimum value that the maximum gap in a text partition should have as a
// factor of its median size.
const double kMinMaxGapInTextPartition = 0.5;
// The amount of overlap that is "normal" for adjacent blobs in a text
// partition. This is used to calculate gap between overlapping blobs.
const double kMaxBlobOverlapFactor = 4.0;
// Maximum x-height a table partition can have as a multiple of global
// median x-height
const double kMaxTableCellXheight = 2.0;
// Maximum line spacing between a table column header and column contents
// for merging the two (as a multiple of the partition's median_size).
const int kMaxColumnHeaderDistance = 4;
// Minimum ratio of num_table_partitions to num_text_partitions in a column
// block to be called it a table column
const double kTableColumnThreshold = 3.0;
// Search for horizontal ruling lines within the vertical margin as a
// multiple of grid size
const int kRulingVerticalMargin = 3;
// Minimum overlap that a colpartition must have with a table region
// to become part of that table
const double kMinOverlapWithTable = 0.6;
// Maximum side space (distance from column boundary) that a typical
// text-line in flowing text should have as a multiple of its x-height
// (Median size).
const int kSideSpaceMargin = 10;
// Fraction of the peak of x-projection of a table region to set the
// threshold for the x-projection histogram
const double kSmallTableProjectionThreshold = 0.35;
const double kLargeTableProjectionThreshold = 0.45;
// Minimum number of rows required to look for more rows in the projection.
const int kLargeTableRowCount = 6;
// Minimum number of rows in a table
const int kMinRowsInTable = 3;
// The amount of padding (multiplied by global_median_xheight_ during use)
// that is vertically added to the search adjacent leader search during
// ColPartition marking.
const int kAdjacentLeaderSearchPadding = 2;
// Used when filtering false positives. When finding the last line
// of a paragraph (typically left-aligned), the previous line should have
// its center to the right of the last line by this scaled amount.
const double kParagraphEndingPreviousLineRatio = 1.3;
// The maximum amount of whitespace allowed left of a paragraph ending.
// Do not filter a ColPartition with more than this space left of it.
const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
// Used when filtering false positives. The last line of a paragraph
// should be preceded by a line that is predominantly text. This is the
// ratio of text to whitespace (to the right of the text) that is required
// for the previous line to be a text.
const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
// When counting table columns, this is the required gap between two columns
// (it is multiplied by global_median_xheight_).
const double kMaxXProjectionGapFactor = 2.0;
// Used for similarity in partitions using stroke width. Values copied
// from ColFind.cpp in Ray's CL.
const double kStrokeWidthFractionalTolerance = 0.25;
const double kStrokeWidthConstantTolerance = 2.0;
BOOL_VAR(textord_show_tables, false, "Show table regions");
BOOL_VAR(textord_tablefind_show_mark, false,
"Debug table marking steps in detail");
BOOL_VAR(textord_tablefind_show_stats, false,
"Show page stats used in table finding");
BOOL_VAR(textord_tablefind_recognize_tables, false,
"Enables the table recognizer for table layout and filtering.");
ELISTIZE(ColSegment)
CLISTIZE(ColSegment)
// Templated helper function used to create destructor callbacks for the
// BBGrid::ClearGridData() method.
template <typename T> void DeleteObject(T *object) {
delete object;
}
TableFinder::TableFinder()
: resolution_(0),
global_median_xheight_(0),
global_median_blob_width_(0),
global_median_ledding_(0),
left_to_right_language_(true) {
}
TableFinder::~TableFinder() {
// ColPartitions and ColSegments created by this class for storage in grids
// need to be deleted explicitly.
clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
table_grid_.ClearGridData(&DeleteObject<ColSegment>);
}
void TableFinder::set_left_to_right_language(bool order) {
left_to_right_language_ = order;
}
void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
const ICOORD& top_right) {
// Initialize clean partitions list and grid
clean_part_grid_.Init(grid_size, bottom_left, top_right);
leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
col_seg_grid_.Init(grid_size, bottom_left, top_right);
table_grid_.Init(grid_size, bottom_left, top_right);
}
// Copy cleaned partitions from part_grid_ to clean_part_grid_ and
// insert leaders and rulers into the leader_and_ruling_grid_
void TableFinder::InsertCleanPartitions(ColPartitionGrid* grid,
TO_BLOCK* block) {
// Calculate stats. This lets us filter partitions in AllowTextPartition()
// and filter blobs in AllowBlob().
SetGlobalSpacings(grid);
// Iterate the ColPartitions in the grid.
ColPartitionGridSearch gsearch(grid);
gsearch.SetUniqueMode(true);
gsearch.StartFullSearch();
ColPartition* part = NULL;
while ((part = gsearch.NextFullSearch()) != NULL) {
// Reject partitions with nothing useful inside of them.
if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
continue;
ColPartition* clean_part = part->ShallowCopy();
ColPartition* leader_part = NULL;
if (part->IsLineType()) {
InsertRulingPartition(clean_part);
continue;
}
// Insert all non-text partitions to clean_parts
if (!part->IsTextType()) {
InsertImagePartition(clean_part);
continue;
}
// Insert text colpartitions after removing noisy components from them
// The leaders are split into a separate grid.
BLOBNBOX_CLIST* part_boxes = part->boxes();
BLOBNBOX_C_IT pit(part_boxes);
for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
BLOBNBOX *pblob = pit.data();
// Bad blobs... happens in UNLV set.
// news.3G1, page 17 (around x=6)
if (!AllowBlob(*pblob))
continue;
if (pblob->flow() == BTFT_LEADER) {
if (leader_part == NULL) {
leader_part = part->ShallowCopy();
leader_part->set_flow(BTFT_LEADER);
}
leader_part->AddBox(pblob);
} else if (pblob->region_type() != BRT_NOISE) {
clean_part->AddBox(pblob);
}
}
clean_part->ComputeLimits();
ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
InsertTextPartition(clean_part);
SplitAndInsertFragmentedTextPartition(fragmented);
if (leader_part != NULL) {
// TODO(nbeato): Note that ComputeLimits does not update the column
// information. So the leader may appear to span more columns than it
// really does later on when IsInSameColumnAs gets called to test
// for adjacent leaders.
leader_part->ComputeLimits();
InsertLeaderPartition(leader_part);
}
}
// Make the partition partners better for upper and lower neighbors.
clean_part_grid_.FindPartitionPartners();
clean_part_grid_.RefinePartitionPartners(false);
}
// High level function to perform table detection
void TableFinder::LocateTables(ColPartitionGrid* grid,
ColPartitionSet** all_columns,
WidthCallback* width_cb,
const FCOORD& reskew) {
// initialize spacing, neighbors, and columns
InitializePartitions(all_columns);
#ifndef GRAPHICS_DISABLED
if (textord_show_tables) {
ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColPartitions(table_win, &leader_and_ruling_grid_,
ScrollView::AQUAMARINE);
DisplayColPartitionConnections(table_win, &clean_part_grid_,
ScrollView::ORANGE);
table_win = MakeWindow(100, 300, "Fragmented Text");
DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
}
#endif // GRAPHICS_DISABLED
// mark, filter, and smooth candidate table partitions
MarkTablePartitions();
// Make single-column blocks from good_columns_ partitions. col_segments are
// moved to a grid later which takes the ownership
ColSegment_LIST column_blocks;
GetColumnBlocks(all_columns, &column_blocks);
// Set the ratio of candidate table partitions in each column
SetColumnsType(&column_blocks);
// Move column segments to col_seg_grid_
MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
// Detect split in column layout that might have occurred due to the
// presence of a table. In such a case, merge the corresponding columns.
GridMergeColumnBlocks();
// Group horizontally overlapping table partitions into table columns.
// table_columns created here get deleted at the end of this method.
ColSegment_LIST table_columns;
GetTableColumns(&table_columns);
// Within each column, mark the range table regions occupy based on the
// table columns detected. table_regions are moved to a grid later which
// takes the ownership
ColSegment_LIST table_regions;
GetTableRegions(&table_columns, &table_regions);
#ifndef GRAPHICS_DISABLED
if (textord_tablefind_show_mark) {
ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
}
#endif // GRAPHICS_DISABLED
// Merge table regions across columns for tables spanning multiple
// columns
MoveColSegmentsToGrid(&table_regions, &table_grid_);
GridMergeTableRegions();
// Adjust table boundaries by including nearby horizontal lines and left
// out column headers
AdjustTableBoundaries();
GridMergeTableRegions();
if (textord_tablefind_recognize_tables) {
// Remove false alarms consiting of a single column
DeleteSingleColumnTables();
#ifndef GRAPHICS_DISABLED
if (textord_show_tables) {
ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
table_grid_.DisplayBoxes(table_win);
}
#endif // GRAPHICS_DISABLED
// Find table grid structure and reject tables that are malformed.
RecognizeTables();
GridMergeTableRegions();
RecognizeTables();
#ifndef GRAPHICS_DISABLED
if (textord_show_tables) {
ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
DisplayColPartitions(table_win, &clean_part_grid_,
ScrollView::BLUE, ScrollView::BLUE);
table_grid_.DisplayBoxes(table_win);
}
#endif // GRAPHICS_DISABLED
} else {
// Remove false alarms consiting of a single column
// TODO(nbeato): verify this is a NOP after structured table rejection.
// Right now it isn't. If the recognize function is doing what it is
// supposed to do, this function is obsolete.
DeleteSingleColumnTables();
#ifndef GRAPHICS_DISABLED
if (textord_show_tables) {
ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
DisplayColPartitions(table_win, &clean_part_grid_,
ScrollView::BLUE, ScrollView::BLUE);
table_grid_.DisplayBoxes(table_win);
}
#endif // GRAPHICS_DISABLED
}
// Merge all colpartitions in table regions to make them a single
// colpartition and revert types of isolated table cells not
// assigned to any table to their original types.
MakeTableBlocks(grid, all_columns, width_cb);
}
// All grids have the same dimensions. The clean_part_grid_ sizes are set from
// the part_grid_ that is passed to InsertCleanPartitions, which was the same as
// the grid that is the base of ColumnFinder. Just return the clean_part_grid_
// dimensions instead of duplicated memory.
int TableFinder::gridsize() const {
return clean_part_grid_.gridsize();
}
int TableFinder::gridwidth() const {
return clean_part_grid_.gridwidth();
}
int TableFinder::gridheight() const {
return clean_part_grid_.gridheight();
}
const ICOORD& TableFinder::bleft() const {
return clean_part_grid_.bleft();
}
const ICOORD& TableFinder::tright() const {
return clean_part_grid_.tright();
}
void TableFinder::InsertTextPartition(ColPartition* part) {
ASSERT_HOST(part != NULL);
if (AllowTextPartition(*part)) {
clean_part_grid_.InsertBBox(true, true, part);
} else {
delete part;
}
}
void TableFinder::InsertFragmentedTextPartition(ColPartition* part) {
ASSERT_HOST(part != NULL);
if (AllowTextPartition(*part)) {
fragmented_text_grid_.InsertBBox(true, true, part);
} else {
delete part;
}
}
void TableFinder::InsertLeaderPartition(ColPartition* part) {
ASSERT_HOST(part != NULL);
if (!part->IsEmpty() && part->bounding_box().area() > 0) {
leader_and_ruling_grid_.InsertBBox(true, true, part);
} else {
delete part;
}
}
void TableFinder::InsertRulingPartition(ColPartition* part) {
leader_and_ruling_grid_.InsertBBox(true, true, part);
}
void TableFinder::InsertImagePartition(ColPartition* part) {
// NOTE: If images are placed into a different grid in the future,
// the function SetPartitionSpacings needs to be updated. It should
// be the only thing that cares about image partitions.
clean_part_grid_.InsertBBox(true, true, part);
}
// Splits a partition into its "words". The splits happen
// at locations with wide inter-blob spacing. This is useful
// because it allows the table recognize to "cut through" the
// text lines on the page. The assumption is that a table
// will have several lines with similar overlapping whitespace
// whereas text will not have this type of property.
// Note: The code Assumes that blobs are sorted by the left side x!
// This will not work (as well) if the blobs are sorted by center/right.
void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition* part) {
ASSERT_HOST(part != NULL);
// Bye bye empty partitions!
if (part->boxes()->empty()) {
delete part;
return;
}
// The AllowBlob function prevents this.
ASSERT_HOST(part->median_width() > 0);
const double kThreshold = part->median_width() * kSplitPartitionSize;
ColPartition* right_part = part;
bool found_split = true;
while (found_split) {
found_split = false;
BLOBNBOX_C_IT box_it(right_part->boxes());
// Blobs are sorted left side first. If blobs overlap,
// the previous blob may have a "more right" right side.
// Account for this by always keeping the largest "right"
// so far.
int previous_right = MIN_INT32;
// Look for the next split in the partition.
for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
const TBOX& box = box_it.data()->bounding_box();
if (previous_right != MIN_INT32 &&
box.left() - previous_right > kThreshold) {
// We have a split position. Split the partition in two pieces.
// Insert the left piece in the grid and keep processing the right.
int mid_x = (box.left() + previous_right) / 2;
ColPartition* left_part = right_part;
right_part = left_part->SplitAt(mid_x);
InsertFragmentedTextPartition(left_part);
found_split = true;
break;
}
// The right side of the previous blobs.
previous_right = MAX(previous_right, box.right());
}
}
// When a split is not found, the right part is minimized
// as much as possible, so process it.
InsertFragmentedTextPartition(right_part);
}
// Some simple criteria to filter out now. We want to make sure the
// average blob size in the partition is consistent with the
// global page stats.
// The area metric will almost always pass for multi-blob partitions.
// It is useful when filtering out noise caused by an isolated blob.
bool TableFinder::AllowTextPartition(const ColPartition& part) const {
const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
const int median_area = global_median_xheight_ * global_median_blob_width_;
const double kAreaPerBlobRequired = median_area * kAllowTextArea;
// Keep comparisons strictly greater to disallow 0!
return part.median_size() > kHeightRequired &&
part.median_width() > kWidthRequired &&
part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
}
// Same as above, applied to blobs. Keep in mind that
// leaders, commas, and periods are important in tables.
bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
const TBOX& box = blob.bounding_box();
const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
const int median_area = global_median_xheight_ * global_median_blob_width_;
const double kAreaRequired = median_area * kAllowBlobArea;
// Keep comparisons strictly greater to disallow 0!
return box.height() > kHeightRequired &&
box.width() > kWidthRequired &&
box.area() > kAreaRequired;
}
// TODO(nbeato): The grid that makes the window doesn't seem to matter.
// The only downside is that window messages will be caught by
// clean_part_grid_ instead of a useful object. This is a temporary solution
// for the debug windows created by the TableFinder.
ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
return clean_part_grid_.MakeWindow(x, y, window_name);
}
// Make single-column blocks from good_columns_ partitions.
void TableFinder::GetColumnBlocks(ColPartitionSet** all_columns,
ColSegment_LIST* column_blocks) {
for (int i = 0; i < gridheight(); ++i) {
ColPartitionSet* columns = all_columns[i];
if (columns != NULL) {
ColSegment_LIST new_blocks;
// Get boxes from the current vertical position on the grid
columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
// Merge the new_blocks boxes into column_blocks if they are well-aligned
GroupColumnBlocks(&new_blocks, column_blocks);
}
}
}
// Merge column segments into the current list if they are well aligned.
void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
ColSegment_LIST* column_blocks) {
ColSegment_IT src_it(new_blocks);
ColSegment_IT dest_it(column_blocks);
// iterate through the source list
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
ColSegment* src_seg = src_it.data();
const TBOX& src_box = src_seg->bounding_box();
bool match_found = false;
// iterate through the destination list to find a matching column block
for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
ColSegment* dest_seg = dest_it.data();
TBOX dest_box = dest_seg->bounding_box();
if (ConsecutiveBoxes(src_box, dest_box)) {
// If matching block is found, insert the current block into it
// and delete the soure block
dest_seg->InsertBox(src_box);
match_found = true;
delete src_it.extract();
break;
}
}
// If no match is found, just append the source block to column_blocks
if (!match_found) {
dest_it.add_after_then_move(src_it.extract());
}
}
}
// are the two boxes immediate neighbors along the vertical direction
bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
int x_margin = 20;
int y_margin = 5;
return (abs(b1.left() - b2.left()) < x_margin) &&
(abs(b1.right() - b2.right()) < x_margin) &&
(abs(b1.top()-b2.bottom()) < y_margin ||
abs(b2.top()-b1.bottom()) < y_margin);
}
// Set up info for clean_part_grid_ partitions to be valid during detection
// code.
void TableFinder::InitializePartitions(ColPartitionSet** all_columns) {
FindNeighbors();
SetPartitionSpacings(&clean_part_grid_, all_columns);
SetGlobalSpacings(&clean_part_grid_);
}
// Set left, right and top, bottom spacings of each colpartition.
void TableFinder::SetPartitionSpacings(ColPartitionGrid* grid,
ColPartitionSet** all_columns) {
// Iterate the ColPartitions in the grid.
ColPartitionGridSearch gsearch(grid);
gsearch.StartFullSearch();
ColPartition* part = NULL;
while ((part = gsearch.NextFullSearch()) != NULL) {
ColPartitionSet* columns = all_columns[gsearch.GridY()];
TBOX box = part->bounding_box();
int y = part->MidY();
ColPartition* left_column = columns->ColumnContaining(box.left(), y);
ColPartition* right_column = columns->ColumnContaining(box.right(), y);
// set distance from left column as space to the left
if (left_column) {
int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
part->set_space_to_left(left_space);
}
// set distance from right column as space to the right
if (right_column) {
int right_space = MAX(0, right_column->RightAtY(y) - box.right());
part->set_space_to_right(right_space);
}
// Look for images that may be closer.
// NOTE: used to be part_grid_, might cause issues now
ColPartitionGridSearch hsearch(grid);
hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
ColPartition* neighbor = NULL;
while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
if (neighbor->type() == PT_PULLOUT_IMAGE ||
neighbor->type() == PT_FLOWING_IMAGE ||
neighbor->type() == PT_HEADING_IMAGE) {
int right = neighbor->bounding_box().right();
if (right < box.left()) {
int space = MIN(box.left() - right, part->space_to_left());
part->set_space_to_left(space);
}
}
}
hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
neighbor = NULL;
while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
if (neighbor->type() == PT_PULLOUT_IMAGE ||
neighbor->type() == PT_FLOWING_IMAGE ||
neighbor->type() == PT_HEADING_IMAGE) {
int left = neighbor->bounding_box().left();
if (left > box.right()) {
int space = MIN(left - box.right(), part->space_to_right());
part->set_space_to_right(space);
}
}
}
ColPartition* upper_part = part->SingletonPartner(true);
if (upper_part) {
int space = MAX(0, upper_part->bounding_box().bottom() -
part->bounding_box().bottom());
part->set_space_above(space);
} else {
// TODO(nbeato): What constitutes a good value?
// 0 is the default value when not set, explicitly noting it needs to
// be something else.
part->set_space_above(MAX_INT32);
}
ColPartition* lower_part = part->SingletonPartner(false);
if (lower_part) {
int space = MAX(0, part->bounding_box().bottom() -
lower_part->bounding_box().bottom());
part->set_space_below(space);
} else {
// TODO(nbeato): What constitutes a good value?
// 0 is the default value when not set, explicitly noting it needs to
// be something else.
part->set_space_below(MAX_INT32);
}
}
}
// Set spacing and closest neighbors above and below a given colpartition.
void TableFinder::SetVerticalSpacing(ColPartition* part) {
TBOX box = part->bounding_box();
int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
box.set_top(top_range);
box.set_bottom(bottom_range);
TBOX part_box = part->bounding_box();
// Start a rect search
GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
rectsearch(&clean_part_grid_);
rectsearch.StartRectSearch(box);
ColPartition* neighbor;
int min_space_above = kMaxVerticalSpacing;
int min_space_below = kMaxVerticalSpacing;
ColPartition* above_neighbor = NULL;
ColPartition* below_neighbor = NULL;
while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
if (neighbor == part)
continue;
TBOX neighbor_box = neighbor->bounding_box();
if (neighbor_box.major_x_overlap(part_box)) {
int gap = abs(part->median_bottom() - neighbor->median_bottom());
// If neighbor is below current partition
if (neighbor_box.top() < part_box.bottom() &&
gap < min_space_below) {
min_space_below = gap;
below_neighbor = neighbor;
} // If neighbor is above current partition
else if (part_box.top() < neighbor_box.bottom() &&
gap < min_space_above) {
min_space_above = gap;
above_neighbor = neighbor;
}
}
}
part->set_space_above(min_space_above);
part->set_space_below(min_space_below);
part->set_nearest_neighbor_above(above_neighbor);
part->set_nearest_neighbor_below(below_neighbor);
}
// Set global spacing and x-height estimates
void TableFinder::SetGlobalSpacings(ColPartitionGrid* grid) {
STATS xheight_stats(0, kMaxVerticalSpacing + 1);
STATS width_stats(0, kMaxBlobWidth + 1);
STATS ledding_stats(0, kMaxVerticalSpacing + 1);
// Iterate the ColPartitions in the grid.
ColPartitionGridSearch gsearch(grid);
gsearch.SetUniqueMode(true);
gsearch.StartFullSearch();
ColPartition* part = NULL;
while ((part = gsearch.NextFullSearch()) != NULL) {
// TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
// ComputeLimits needs to get called somewhere outside of TableFinder
// to make sure the partitions are properly initialized.
// When this is called, SmoothPartitionPartners dies in an assert after
// table find runs. Alternative solution.
// part->ComputeLimits();
if (part->IsTextType()) {
// xheight_stats.add(part->median_size(), part->boxes_count());
// width_stats.add(part->median_width(), part->boxes_count());
// This loop can be removed when above issues are fixed.
// Replace it with the 2 lines commented out above.
BLOBNBOX_C_IT it(part->boxes());
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
xheight_stats.add(it.data()->bounding_box().height(), 1);
width_stats.add(it.data()->bounding_box().width(), 1);
}
ledding_stats.add(part->space_above(), 1);
ledding_stats.add(part->space_below(), 1);
}
}
// Set estimates based on median of statistics obtained
set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
#ifndef GRAPHICS_DISABLED
if (textord_tablefind_show_stats) {
const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
}
#endif // GRAPHICS_DISABLED
}
void TableFinder::set_global_median_xheight(int xheight) {
global_median_xheight_ = xheight;
}
void TableFinder::set_global_median_blob_width(int width) {
global_median_blob_width_ = width;
}
void TableFinder::set_global_median_ledding(int ledding) {
global_median_ledding_ = ledding;
}
void TableFinder::FindNeighbors() {
ColPartitionGridSearch gsearch(&clean_part_grid_);
gsearch.StartFullSearch();
ColPartition* part = NULL;
while ((part = gsearch.NextFullSearch()) != NULL) {
// TODO(nbeato): Rename this function, meaning is different now.
// IT is finding nearest neighbors its own way
//SetVerticalSpacing(part);
ColPartition* upper = part->SingletonPartner(true);
if (upper)
part->set_nearest_neighbor_above(upper);
ColPartition* lower = part->SingletonPartner(false);
if (lower)
part->set_nearest_neighbor_below(lower);
}
}
// High level interface. Input is an unmarked ColPartitionGrid
// (namely, clean_part_grid_). Partitions are identified using local
// information and filter/smoothed. The function exit should contain
// a good sampling of the table partitions.
void TableFinder::MarkTablePartitions() {
MarkPartitionsUsingLocalInformation();
if (textord_tablefind_show_mark) {
ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColPartitions(table_win, &leader_and_ruling_grid_,
ScrollView::AQUAMARINE);
}
FilterFalseAlarms();
if (textord_tablefind_show_mark) {
ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColPartitions(table_win, &leader_and_ruling_grid_,
ScrollView::AQUAMARINE);
}
SmoothTablePartitionRuns();
if (textord_tablefind_show_mark) {
ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColPartitions(table_win, &leader_and_ruling_grid_,
ScrollView::AQUAMARINE);
}
FilterFalseAlarms();
if (textord_tablefind_show_mark || textord_show_tables) {
ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
DisplayColPartitions(table_win, &leader_and_ruling_grid_,
ScrollView::AQUAMARINE);
}
}
// These types of partitions are marked as table partitions:
// 1- Partitions that have at lease one large gap between words
// 2- Partitions that consist of only one word (no significant gap
// between components)
// 3- Partitions that vertically overlap with other partitions within the
// same column.
// 4- Partitions with leaders before/after them.
void TableFinder::MarkPartitionsUsingLocalInformation() {
// Iterate the ColPartitions in the grid.
GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
gsearch(&clean_part_grid_);
gsearch.StartFullSearch();
ColPartition* part = NULL;
while ((part = gsearch.NextFullSearch()) != NULL) {
if (!part->IsTextType()) // Only consider text partitions
continue;
// Only consider partitions in dominant font size or smaller
if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
continue;
// Mark partitions with a large gap, or no significant gap as
// table partitions.
// Comments: It produces several false alarms at:
// - last line of a paragraph (fixed)
// - single word section headings
// - page headers and footers
// - numbered equations
// - line drawing regions
// TODO(faisal): detect and fix above-mentioned cases
if (HasWideOrNoInterWordGap(part) ||
HasLeaderAdjacent(*part)) {
part->set_table_type();
}
}
}
// Check if the partition has at least one large gap between words or no
// significant gap at all
bool TableFinder::HasWideOrNoInterWordGap(ColPartition* part) const {
// Should only get text partitions.
ASSERT_HOST(part->IsTextType());
// Blob access
BLOBNBOX_CLIST* part_boxes = part->boxes();
BLOBNBOX_C_IT it(part_boxes);
// Check if this is a relatively small partition (such as a single word)
if (part->bounding_box().width() <
kMinBoxesInTextPartition * part->median_size() &&
part_boxes->length() < kMinBoxesInTextPartition)
return true;
// Variables used to compute inter-blob spacing.
int current_x0 = -1;
int current_x1 = -1;
int previous_x1 = -1;
// Stores the maximum gap detected.
int largest_partition_gap_found = -1;
// Text partition gap limits. If this is text (and not a table),
// there should be at least one gap larger than min_gap and no gap
// larger than max_gap.
const double max_gap = kMaxGapInTextPartition * part->median_size();
const double min_gap = kMinMaxGapInTextPartition * part->median_size();
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
BLOBNBOX* blob = it.data();
current_x0 = blob->bounding_box().left();
current_x1 = blob->bounding_box().right();
if (previous_x1 != -1) {
int gap = current_x0 - previous_x1;
// TODO(nbeato): Boxes may overlap? Huh?
// For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
// on the top right of the page are filtered out with this line.
// Note 2: Iterating over blobs in a partition, so we are looking for
// spacing between the words.
if (gap < 0) {
// More likely case, the blobs slightly overlap. This can happen
// with diacritics (accents) or broken alphabet symbols (characters).
// Merge boxes together by taking max of right sides.
if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
previous_x1 = MAX(previous_x1, current_x1);
continue;
}
// Extreme case, blobs overlap significantly in the same partition...
// This should not happen often (if at all), but it does.
// TODO(nbeato): investigate cases when this happens.
else {
// The behavior before was to completely ignore this case.
}
}
// If a large enough gap is found, mark it as a table cell (return true)
if (gap > max_gap)
return true;
if (gap > largest_partition_gap_found)
largest_partition_gap_found = gap;
}
previous_x1 = current_x1;
}
// Since no large gap was found, return false if the partition is too
// long to be a data cell
if (part->bounding_box().width() >
kMaxBoxesInDataPartition * part->median_size() ||
part_boxes->length() > kMaxBoxesInDataPartition)
return false;
// A partition may be a single blob. In this case, it's an isolated symbol
// or non-text (such as a ruling or image).
// Detect these as table partitions? Shouldn't this be case by case?
// The behavior before was to ignore this, making max_partition_gap < 0
// and implicitly return true. Just making it explicit.
if (largest_partition_gap_found == -1)
return true;
// return true if the maximum gap found is smaller than the minimum allowed
// max_gap in a text partition. This indicates that there is no significant
// space in the partition, hence it is likely a single word.
return largest_partition_gap_found < min_gap;
}
// A criteria for possible tables is that a table may have leaders
// between data cells. An aggressive solution to find such tables is to
// explicitly mark partitions that have adjacent leaders.
// Note that this includes overlapping leaders. However, it does not
// include leaders in different columns on the page.
// Possible false-positive will include lists, such as a table of contents.
// As these arise, the aggressive nature of this search may need to be
// trimmed down.
bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
if (part.flow() == BTFT_LEADER)
return true;
// Search range is left and right bounded by an offset of the
// median xheight. This offset is to allow some tolerance to the
// the leaders on the page in the event that the alignment is still
// a bit off.
const TBOX& box = part.bounding_box();
const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
const int top = box.top() + search_size;
const int bottom = box.bottom() - search_size;
ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
for (int direction = 0; direction < 2; ++direction) {
bool right_to_left = (direction == 0);
int x = right_to_left ? box.right() : box.left();
hsearch.StartSideSearch(x, bottom, top);
ColPartition* leader = NULL;
while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
// The leader could be a horizontal ruling in the grid.
// Make sure it is actually a leader.
if (leader->flow() != BTFT_LEADER)
continue;
// This should not happen, they are in different grids.
ASSERT_HOST(&part != leader);
// Make sure the leader shares a page column with the partition,
// otherwise we are spreading across columns.
if (!part.IsInSameColumnAs(*leader))
break;
// There should be a significant vertical overlap
if (!leader->VSignificantCoreOverlap(part))
continue;
// Leader passed all tests, so it is adjacent.
return true;
}
}
// No leaders are adjacent to the given partition.
return false;
}
// Filter individual text partitions marked as table partitions
// consisting of paragraph endings, small section headings, and
// headers and footers.
void TableFinder::FilterFalseAlarms() {
FilterParagraphEndings();
FilterHeaderAndFooter();
// TODO(nbeato): Fully justified text as non-table?
}
void TableFinder::FilterParagraphEndings() {
// Detect last line of paragraph
// Iterate the ColPartitions in the grid.