-
Notifications
You must be signed in to change notification settings - Fork 9.7k
/
Copy pathcolpartition.cpp
2581 lines (2472 loc) · 101 KB
/
colpartition.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
///////////////////////////////////////////////////////////////////////
// File: colpartition.cpp
// Description: Class to hold partitions of the page that correspond
// roughly to text lines.
// Author: Ray Smith
// Created: Thu Aug 14 10:54:01 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "colpartition.h"
#include "colpartitiongrid.h"
#include "colpartitionset.h"
#include "detlinefit.h"
#include "dppoint.h"
#include "imagefind.h"
#include "workingpartset.h"
namespace tesseract {
ELIST2IZE(ColPartition)
CLISTIZE(ColPartition)
//////////////// ColPartition Implementation ////////////////
// Maximum change in spacing (in inches) to ignore.
const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
// Maximum fraction of line height used as an additional allowance
// for top spacing.
const double kMaxTopSpacingFraction = 0.25;
// What multiple of the largest line height should be used as an upper bound
// for whether lines are in the same text block?
const double kMaxSameBlockLineSpacing = 3;
// Maximum ratio of sizes for lines to be considered the same size.
const double kMaxSizeRatio = 1.5;
// Fraction of max of leader width and gap for max IQR of gaps.
const double kMaxLeaderGapFractionOfMax = 0.25;
// Fraction of min of leader width and gap for max IQR of gaps.
const double kMaxLeaderGapFractionOfMin = 0.5;
// Minimum number of blobs to be considered a leader.
const int kMinLeaderCount = 5;
// Minimum score for a STRONG_CHAIN textline.
const int kMinStrongTextValue = 6;
// Minimum score for a CHAIN textline.
const int kMinChainTextValue = 3;
// Minimum number of blobs for strong horizontal text lines.
const int kHorzStrongTextlineCount = 8;
// Minimum height (in image pixels) for strong horizontal text lines.
const int kHorzStrongTextlineHeight = 10;
// Minimum aspect ratio for strong horizontal text lines.
const int kHorzStrongTextlineAspect = 5;
// Maximum upper quartile error allowed on a baseline fit as a fraction
// of height.
const double kMaxBaselineError = 0.4375;
// Min coverage for a good baseline between vectors
const double kMinBaselineCoverage = 0.5;
// Max RMS color noise to compare colors.
const int kMaxRMSColorNoise = 128;
// Maximum distance to allow a partition color to be to use that partition
// in smoothing neighbouring types. This is a squared distance.
const int kMaxColorDistance = 900;
// blob_type is the blob_region_type_ of the blobs in this partition.
// Vertical is the direction of logical vertical on the possibly skewed image.
ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
: left_margin_(-MAX_INT32), right_margin_(MAX_INT32),
median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
good_width_(false), good_column_(false),
left_key_tab_(false), right_key_tab_(false),
left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
working_set_(NULL), last_add_was_vertical_(false), block_owned_(false),
desperately_merged_(false),
first_column_(-1), last_column_(-1), column_set_(NULL),
side_step_(0), top_spacing_(0), bottom_spacing_(0),
type_before_table_(PT_UNKNOWN), inside_table_column_(false),
nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
owns_blobs_(true) {
memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
}
// Constructs a fake ColPartition with a single fake BLOBNBOX, all made
// from a single TBOX.
// WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
// the ColPartition owns the BLOBNBOX!!!
// Call DeleteBoxes before deleting the ColPartition.
ColPartition* ColPartition::FakePartition(const TBOX& box,
PolyBlockType block_type,
BlobRegionType blob_type,
BlobTextFlowType flow) {
ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
part->set_type(block_type);
part->set_flow(flow);
part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
part->set_left_margin(box.left());
part->set_right_margin(box.right());
part->SetBlobTypes();
part->ComputeLimits();
part->ClaimBoxes();
return part;
}
// Constructs and returns a ColPartition with the given real BLOBNBOX,
// and sets it up to be a "big" partition (single-blob partition bigger
// than the surrounding text that may be a dropcap, two or more vertically
// touching characters, or some graphic element.
// If the given list is not NULL, the partition is also added to the list.
ColPartition* ColPartition::MakeBigPartition(BLOBNBOX* box,
ColPartition_LIST* big_part_list) {
box->set_owner(NULL);
ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
single->set_flow(BTFT_NONE);
single->AddBox(box);
single->ComputeLimits();
single->ClaimBoxes();
single->SetBlobTypes();
single->set_block_owned(true);
if (big_part_list != NULL) {
ColPartition_IT part_it(big_part_list);
part_it.add_to_end(single);
}
return single;
}
ColPartition::~ColPartition() {
// Remove this as a partner of all partners, as we don't want them
// referring to a deleted object.
ColPartition_C_IT it(&upper_partners_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
it.data()->RemovePartner(false, this);
}
it.set_to_list(&lower_partners_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
it.data()->RemovePartner(true, this);
}
}
// Constructs a fake ColPartition with no BLOBNBOXes to represent a
// horizontal or vertical line, given a type and a bounding box.
ColPartition* ColPartition::MakeLinePartition(BlobRegionType blob_type,
const ICOORD& vertical,
int left, int bottom,
int right, int top) {
ColPartition* part = new ColPartition(blob_type, vertical);
part->bounding_box_ = TBOX(left, bottom, right, top);
part->median_bottom_ = bottom;
part->median_top_ = top;
part->median_size_ = top - bottom;
part->median_width_ = right - left;
part->left_key_ = part->BoxLeftKey();
part->right_key_ = part->BoxRightKey();
return part;
}
// Adds the given box to the partition, updating the partition bounds.
// The list of boxes in the partition is updated, ensuring that no box is
// recorded twice, and the boxes are kept in increasing left position.
void ColPartition::AddBox(BLOBNBOX* bbox) {
TBOX box = bbox->bounding_box();
// Update the partition limits.
if (boxes_.length() == 0) {
bounding_box_ = box;
} else {
bounding_box_ += box;
}
if (IsVerticalType()) {
if (!last_add_was_vertical_) {
boxes_.sort(SortByBoxBottom<BLOBNBOX>);
last_add_was_vertical_ = true;
}
boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
} else {
if (last_add_was_vertical_) {
boxes_.sort(SortByBoxLeft<BLOBNBOX>);
last_add_was_vertical_ = false;
}
boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
}
if (!left_key_tab_)
left_key_ = BoxLeftKey();
if (!right_key_tab_)
right_key_ = BoxRightKey();
if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
box.left(), box.bottom(), box.right(), box.top(),
bounding_box_.left(), bounding_box_.right());
}
// Removes the given box from the partition, updating the bounds.
void ColPartition::RemoveBox(BLOBNBOX* box) {
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
if (box == bb_it.data()) {
bb_it.extract();
ComputeLimits();
return;
}
}
}
// Returns the tallest box in the partition, as measured perpendicular to the
// presumed flow of text.
BLOBNBOX* ColPartition::BiggestBox() {
BLOBNBOX* biggest = NULL;
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
BLOBNBOX* bbox = bb_it.data();
if (IsVerticalType()) {
if (biggest == NULL ||
bbox->bounding_box().width() > biggest->bounding_box().width())
biggest = bbox;
} else {
if (biggest == NULL ||
bbox->bounding_box().height() > biggest->bounding_box().height())
biggest = bbox;
}
}
return biggest;
}
// Returns the bounding box excluding the given box.
TBOX ColPartition::BoundsWithoutBox(BLOBNBOX* box) {
TBOX result;
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
if (box != bb_it.data()) {
result += bb_it.data()->bounding_box();
}
}
return result;
}
// Claims the boxes in the boxes_list by marking them with a this owner
// pointer. If a box is already owned, then it must be owned by this.
void ColPartition::ClaimBoxes() {
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
BLOBNBOX* bblob = bb_it.data();
ColPartition* other = bblob->owner();
if (other == NULL) {
// Normal case: ownership is available.
bblob->set_owner(this);
} else {
ASSERT_HOST(other == this);
}
}
}
// NULL the owner of the blobs in this partition, so they can be deleted
// independently of the ColPartition.
void ColPartition::DisownBoxes() {
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
BLOBNBOX* bblob = bb_it.data();
ASSERT_HOST(bblob->owner() == this || bblob->owner() == NULL);
bblob->set_owner(NULL);
}
}
// NULL the owner of the blobs in this partition that are owned by this
// partition, so they can be deleted independently of the ColPartition.
// Any blobs that are not owned by this partition get to keep their owner
// without an assert failure.
void ColPartition::DisownBoxesNoAssert() {
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
BLOBNBOX* bblob = bb_it.data();
if (bblob->owner() == this)
bblob->set_owner(NULL);
}
}
// NULLs the owner of the blobs in this partition that are owned by this
// partition and not leader blobs, removing them from the boxes_ list, thus
// turning this partition back to a leader partition if it contains a leader,
// or otherwise leaving it empty. Returns true if any boxes remain.
bool ColPartition::ReleaseNonLeaderBoxes() {
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
BLOBNBOX* bblob = bb_it.data();
if (bblob->flow() != BTFT_LEADER) {
if (bblob->owner() == this) bblob->set_owner(NULL);
bb_it.extract();
}
}
if (bb_it.empty()) return false;
flow_ = BTFT_LEADER;
ComputeLimits();
return true;
}
// Delete the boxes that this partition owns.
void ColPartition::DeleteBoxes() {
// Although the boxes_ list is a C_LIST, in some cases it owns the
// BLOBNBOXes, as the ColPartition takes ownership from the grid,
// and the BLOBNBOXes own the underlying C_BLOBs.
for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
BLOBNBOX* bblob = bb_it.extract();
delete bblob->cblob();
delete bblob;
}
}
// Reflects the partition in the y-axis, assuming that its blobs have
// already been done. Corrects only a limited part of the members, since
// this function is assumed to be used shortly after initial creation, which
// is before a lot of the members are used.
void ColPartition::ReflectInYAxis() {
BLOBNBOX_CLIST reversed_boxes;
BLOBNBOX_C_IT reversed_it(&reversed_boxes);
// Reverse the order of the boxes_.
BLOBNBOX_C_IT bb_it(&boxes_);
for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
reversed_it.add_before_then_move(bb_it.extract());
}
bb_it.add_list_after(&reversed_boxes);
ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
int tmp = left_margin_;
left_margin_ = -right_margin_;
right_margin_ = -tmp;
ComputeLimits();
}
// Returns true if this is a legal partition - meaning that the conditions
// left_margin <= bounding_box left
// left_key <= bounding box left key
// bounding box left <= bounding box right
// and likewise for right margin and key
// are all met.
bool ColPartition::IsLegal() {
if (bounding_box_.left() > bounding_box_.right()) {
if (textord_debug_bugs) {
tprintf("Bounding box invalid\n");
Print();
}
return false; // Bounding box invalid.
}
if (left_margin_ > bounding_box_.left() ||
right_margin_ < bounding_box_.right()) {
if (textord_debug_bugs) {
tprintf("Margins invalid\n");
Print();
}
return false; // Margins invalid.
}
if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
if (textord_debug_bugs) {
tprintf("Key inside box: %d v %d or %d v %d\n",
left_key_, BoxLeftKey(), right_key_, BoxRightKey());
Print();
}
return false; // Keys inside the box.
}
return true;
}
// Returns true if the left and right edges are approximately equal.
bool ColPartition::MatchingColumns(const ColPartition& other) const {
int y = (MidY() + other.MidY()) / 2;
if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
LeftAtY(y) / kColumnWidthFactor, 1))
return false;
if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
RightAtY(y) / kColumnWidthFactor, 1))
return false;
return true;
}
// Returns true if the colors match for two text partitions.
bool ColPartition::MatchingTextColor(const ColPartition& other) const {
if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
return false; // Too noisy.
// Colors must match for other to count.
double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
other.color2_,
color1_);
double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
other.color2_,
color2_);
double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
other.color1_);
double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
other.color2_);
// All 4 distances must be small enough.
return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
}
// Returns true if the sizes match for two text partitions,
// taking orientation into account. See also SizesSimilar.
bool ColPartition::MatchingSizes(const ColPartition& other) const {
if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
return !TabFind::DifferentSizes(median_width_, other.median_width_);
else
return !TabFind::DifferentSizes(median_size_, other.median_size_);
}
// Returns true if there is no tabstop violation in merging this and other.
bool ColPartition::ConfirmNoTabViolation(const ColPartition& other) const {
if (bounding_box_.right() < other.bounding_box_.left() &&
bounding_box_.right() < other.LeftBlobRule())
return false;
if (other.bounding_box_.right() < bounding_box_.left() &&
other.bounding_box_.right() < LeftBlobRule())
return false;
if (bounding_box_.left() > other.bounding_box_.right() &&
bounding_box_.left() > other.RightBlobRule())
return false;
if (other.bounding_box_.left() > bounding_box_.right() &&
other.bounding_box_.left() > RightBlobRule())
return false;
return true;
}
// Returns true if other has a similar stroke width to this.
bool ColPartition::MatchingStrokeWidth(const ColPartition& other,
double fractional_tolerance,
double constant_tolerance) const {
int match_count = 0;
int nonmatch_count = 0;
BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
box_it.mark_cycle_pt();
other_it.mark_cycle_pt();
while (!box_it.cycled_list() && !other_it.cycled_list()) {
if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
fractional_tolerance,
constant_tolerance))
++match_count;
else
++nonmatch_count;
box_it.forward();
other_it.forward();
}
return match_count > nonmatch_count;
}
// Returns true if base is an acceptable diacritic base char merge
// with this as the diacritic.
// Returns true if:
// (1) this is a ColPartition containing only diacritics, and
// (2) the base characters indicated on the diacritics all believably lie
// within the text line of the candidate ColPartition.
bool ColPartition::OKDiacriticMerge(const ColPartition& candidate,
bool debug) const {
BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
int min_top = MAX_INT32;
int max_bottom = -MAX_INT32;
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
BLOBNBOX* blob = it.data();
if (!blob->IsDiacritic()) {
if (debug) {
tprintf("Blob is not a diacritic:");
blob->bounding_box().print();
}
return false; // All blobs must have diacritic bases.
}
if (blob->base_char_top() < min_top)
min_top = blob->base_char_top();
if (blob->base_char_bottom() > max_bottom)
max_bottom = blob->base_char_bottom();
}
// If the intersection of all vertical ranges of all base characters
// overlaps the median range of this, then it is OK.
bool result = min_top > candidate.median_bottom_ &&
max_bottom < candidate.median_top_;
if (debug) {
if (result)
tprintf("OKDiacritic!\n");
else
tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
max_bottom, min_top, median_bottom_, median_top_);
}
return result;
}
// Sets the sort key using either the tab vector, or the bounding box if
// the tab vector is NULL. If the tab_vector lies inside the bounding_box,
// use the edge of the box as a key any way.
void ColPartition::SetLeftTab(const TabVector* tab_vector) {
if (tab_vector != NULL) {
left_key_ = tab_vector->sort_key();
left_key_tab_ = left_key_ <= BoxLeftKey();
} else {
left_key_tab_ = false;
}
if (!left_key_tab_)
left_key_ = BoxLeftKey();
}
// As SetLeftTab, but with the right.
void ColPartition::SetRightTab(const TabVector* tab_vector) {
if (tab_vector != NULL) {
right_key_ = tab_vector->sort_key();
right_key_tab_ = right_key_ >= BoxRightKey();
} else {
right_key_tab_ = false;
}
if (!right_key_tab_)
right_key_ = BoxRightKey();
}
// Copies the left/right tab from the src partition, but if take_box is
// true, copies the box instead and uses that as a key.
void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
left_key_tab_ = take_box ? false : src.left_key_tab_;
if (left_key_tab_) {
left_key_ = src.left_key_;
} else {
bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
left_key_ = BoxLeftKey();
}
if (left_margin_ > bounding_box_.left())
left_margin_ = src.left_margin_;
}
// As CopyLeftTab, but with the right.
void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
right_key_tab_ = take_box ? false : src.right_key_tab_;
if (right_key_tab_) {
right_key_ = src.right_key_;
} else {
bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
right_key_ = BoxRightKey();
}
if (right_margin_ < bounding_box_.right())
right_margin_ = src.right_margin_;
}
// Returns the left rule line x coord of the leftmost blob.
int ColPartition::LeftBlobRule() const {
BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
return it.data()->left_rule();
}
// Returns the right rule line x coord of the rightmost blob.
int ColPartition::RightBlobRule() const {
BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
it.move_to_last();
return it.data()->right_rule();
}
float ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {
ASSERT_HOST(type < BSTT_COUNT);
return special_blobs_densities_[type];
}
int ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {
ASSERT_HOST(type < BSTT_COUNT);
BLOBNBOX_C_IT blob_it(&boxes_);
int count = 0;
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
BLOBNBOX* blob = blob_it.data();
BlobSpecialTextType blob_type = blob->special_text_type();
if (blob_type == type) {
count++;
}
}
return count;
}
void ColPartition::SetSpecialBlobsDensity(
const BlobSpecialTextType type, const float density) {
ASSERT_HOST(type < BSTT_COUNT);
special_blobs_densities_[type] = density;
}
void ColPartition::ComputeSpecialBlobsDensity() {
memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
if (boxes_.empty()) {
return;
}
BLOBNBOX_C_IT blob_it(&boxes_);
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
BLOBNBOX* blob = blob_it.data();
BlobSpecialTextType type = blob->special_text_type();
special_blobs_densities_[type]++;
}
for (int type = 0; type < BSTT_COUNT; ++type) {
special_blobs_densities_[type] /= boxes_.length();
}
}
// Add a partner above if upper, otherwise below.
// Add them uniquely and keep the list sorted by box left.
// Partnerships are added symmetrically to partner and this.
void ColPartition::AddPartner(bool upper, ColPartition* partner) {
if (upper) {
partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, this);
upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
} else {
partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
true, this);
lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
}
}
// Removes the partner from this, but does not remove this from partner.
// This asymmetric removal is so as not to mess up the iterator that is
// working on partner's partner list.
void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
if (it.data() == partner) {
it.extract();
break;
}
}
}
// Returns the partner if the given partner is a singleton, otherwise NULL.
ColPartition* ColPartition::SingletonPartner(bool upper) {
ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
if (!partners->singleton())
return NULL;
ColPartition_C_IT it(partners);
return it.data();
}
// Merge with the other partition and delete it.
void ColPartition::Absorb(ColPartition* other, WidthCallback* cb) {
// The result has to either own all of the blobs or none of them.
// Verify the flag is consisent.
ASSERT_HOST(owns_blobs() == other->owns_blobs());
// TODO(nbeato): check owns_blobs better. Right now owns_blobs
// should always be true when this is called. So there is no issues.
if (TabFind::WithinTestRegion(2, bounding_box_.left(),
bounding_box_.bottom()) ||
TabFind::WithinTestRegion(2, other->bounding_box_.left(),
other->bounding_box_.bottom())) {
tprintf("Merging:");
Print();
other->Print();
}
// Update the special_blobs_densities_.
memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
for (int type = 0; type < BSTT_COUNT; ++type) {
int w1 = boxes_.length(), w2 = other->boxes_.length();
float new_val = special_blobs_densities_[type] * w1 +
other->special_blobs_densities_[type] * w2;
if (!w1 || !w2) {
special_blobs_densities_[type] = new_val / (w1 + w2);
}
}
// Merge the two sorted lists.
BLOBNBOX_C_IT it(&boxes_);
BLOBNBOX_C_IT it2(&other->boxes_);
for (; !it2.empty(); it2.forward()) {
BLOBNBOX* bbox2 = it2.extract();
ColPartition* prev_owner = bbox2->owner();
if (prev_owner != other && prev_owner != NULL) {
// A blob on other's list is owned by someone else; let them have it.
continue;
}
ASSERT_HOST(prev_owner == other || prev_owner == NULL);
if (prev_owner == other)
bbox2->set_owner(this);
it.add_to_end(bbox2);
}
left_margin_ = MIN(left_margin_, other->left_margin_);
right_margin_ = MAX(right_margin_, other->right_margin_);
if (other->left_key_ < left_key_) {
left_key_ = other->left_key_;
left_key_tab_ = other->left_key_tab_;
}
if (other->right_key_ > right_key_) {
right_key_ = other->right_key_;
right_key_tab_ = other->right_key_tab_;
}
// Combine the flow and blob_type in a sensible way.
// Dominant flows stay.
if (!DominatesInMerge(flow_, other->flow_)) {
flow_ = other->flow_;
blob_type_ = other->blob_type_;
}
SetBlobTypes();
if (IsVerticalType()) {
boxes_.sort(SortByBoxBottom<BLOBNBOX>);
last_add_was_vertical_ = true;
} else {
boxes_.sort(SortByBoxLeft<BLOBNBOX>);
last_add_was_vertical_ = false;
}
ComputeLimits();
// Fix partner lists. other is going away, so remove it as a
// partner of all its partners and add this in its place.
for (int upper = 0; upper < 2; ++upper) {
ColPartition_CLIST partners;
ColPartition_C_IT part_it(&partners);
part_it.add_list_after(upper ? &other->upper_partners_
: &other->lower_partners_);
for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
ColPartition* partner = part_it.extract();
partner->RemovePartner(!upper, other);
partner->RemovePartner(!upper, this);
partner->AddPartner(!upper, this);
}
}
delete other;
if (cb != NULL) {
SetColumnGoodness(cb);
}
}
// Merge1 and merge2 are candidates to be merged, yet their combined box
// overlaps this. Is that allowed?
// Returns true if the overlap between this and the merged pair of
// merge candidates is sufficiently trivial to be allowed.
// The merged box can graze the edge of this by the ok_box_overlap
// if that exceeds the margin to the median top and bottom.
// ok_box_overlap should be set by the caller appropriate to the sizes of
// the text involved, and is usually a fraction of the median size of merge1
// and/or merge2, or this.
// TODO(rays) Determine whether vertical text needs to be considered.
bool ColPartition::OKMergeOverlap(const ColPartition& merge1,
const ColPartition& merge2,
int ok_box_overlap, bool debug) {
// Vertical partitions are not allowed to be involved.
if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
if (debug)
tprintf("Vertical partition\n");
return false;
}
// The merging partitions must strongly overlap each other.
if (!merge1.VSignificantCoreOverlap(merge2)) {
if (debug)
tprintf("Voverlap %d (%d)\n",
merge1.VCoreOverlap(merge2),
merge1.VSignificantCoreOverlap(merge2));
return false;
}
// The merged box must not overlap the median bounds of this.
TBOX merged_box(merge1.bounding_box());
merged_box += merge2.bounding_box();
if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
if (debug)
tprintf("Excessive box overlap\n");
return false;
}
// Looks OK!
return true;
}
// Find the blob at which to split this to minimize the overlap with the
// given box. Returns the first blob to go in the second partition.
BLOBNBOX* ColPartition::OverlapSplitBlob(const TBOX& box) {
if (boxes_.empty() || boxes_.singleton())
return NULL;
BLOBNBOX_C_IT it(&boxes_);
TBOX left_box(it.data()->bounding_box());
for (it.forward(); !it.at_first(); it.forward()) {
BLOBNBOX* bbox = it.data();
left_box += bbox->bounding_box();
if (left_box.overlap(box))
return bbox;
}
return NULL;
}
// Split this partition keeping the first half in this and returning
// the second half.
// Splits by putting the split_blob and the blobs that follow
// in the second half, and the rest in the first half.
ColPartition* ColPartition::SplitAtBlob(BLOBNBOX* split_blob) {
ColPartition* split_part = ShallowCopy();
split_part->set_owns_blobs(owns_blobs());
BLOBNBOX_C_IT it(&boxes_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
BLOBNBOX* bbox = it.data();
ColPartition* prev_owner = bbox->owner();
ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
if (bbox == split_blob || !split_part->boxes_.empty()) {
split_part->AddBox(it.extract());
if (owns_blobs() && prev_owner != NULL)
bbox->set_owner(split_part);
}
}
ASSERT_HOST(!it.empty());
if (split_part->IsEmpty()) {
// Split part ended up with nothing. Possible if split_blob is not
// in the list of blobs.
delete split_part;
return NULL;
}
right_key_tab_ = false;
split_part->left_key_tab_ = false;
ComputeLimits();
// TODO(nbeato) Merge Ray's CL like this:
// if (owns_blobs())
// SetBlobTextlineGoodness();
split_part->ComputeLimits();
// TODO(nbeato) Merge Ray's CL like this:
// if (split_part->owns_blobs())
// split_part->SetBlobTextlineGoodness();
return split_part;
}
// Split this partition at the given x coordinate, returning the right
// half and keeping the left half in this.
ColPartition* ColPartition::SplitAt(int split_x) {
if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
return NULL; // There will be no change.
ColPartition* split_part = ShallowCopy();
split_part->set_owns_blobs(owns_blobs());
BLOBNBOX_C_IT it(&boxes_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
BLOBNBOX* bbox = it.data();
ColPartition* prev_owner = bbox->owner();
ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
const TBOX& box = bbox->bounding_box();
if (box.left() >= split_x) {
split_part->AddBox(it.extract());
if (owns_blobs() && prev_owner != NULL)
bbox->set_owner(split_part);
}
}
if (it.empty()) {
// Possible if split-x passes through the first blob.
it.add_list_after(&split_part->boxes_);
}
ASSERT_HOST(!it.empty());
if (split_part->IsEmpty()) {
// Split part ended up with nothing. Possible if split_x passes
// through the last blob.
delete split_part;
return NULL;
}
right_key_tab_ = false;
split_part->left_key_tab_ = false;
right_margin_ = split_x;
split_part->left_margin_ = split_x;
ComputeLimits();
split_part->ComputeLimits();
return split_part;
}
// Recalculates all the coordinate limits of the partition.
void ColPartition::ComputeLimits() {
bounding_box_ = TBOX(); // Clear it
BLOBNBOX_C_IT it(&boxes_);
BLOBNBOX* bbox = NULL;
int non_leader_count = 0;
if (it.empty()) {
bounding_box_.set_left(left_margin_);
bounding_box_.set_right(right_margin_);
bounding_box_.set_bottom(0);
bounding_box_.set_top(0);
} else {
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
bbox = it.data();
bounding_box_ += bbox->bounding_box();
if (bbox->flow() != BTFT_LEADER)
++non_leader_count;
}
}
if (!left_key_tab_)
left_key_ = BoxLeftKey();
if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
// TODO(rays) investigate the causes of these error messages, to find
// out if they are genuinely harmful, or just indicative of junk input.
tprintf("Computed left-illegal partition\n");
Print();
}
if (!right_key_tab_)
right_key_ = BoxRightKey();
if (right_key_ < BoxRightKey() && textord_debug_bugs) {
tprintf("Computed right-illegal partition\n");
Print();
}
if (it.empty())
return;
if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
blob_type() == BRT_POLYIMAGE) {
median_top_ = bounding_box_.top();
median_bottom_ = bounding_box_.bottom();
median_size_ = bounding_box_.height();
median_left_ = bounding_box_.left();
median_right_ = bounding_box_.right();
median_width_ = bounding_box_.width();
} else {
STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
STATS size_stats(0, bounding_box_.height() + 1);
STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
STATS width_stats(0, bounding_box_.width() + 1);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
bbox = it.data();
if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
TBOX box = bbox->bounding_box();
int area = box.area();
top_stats.add(box.top(), area);
bottom_stats.add(box.bottom(), area);
size_stats.add(box.height(), area);
left_stats.add(box.left(), area);
right_stats.add(box.right(), area);
width_stats.add(box.width(), area);
}
}
median_top_ = static_cast<int>(top_stats.median() + 0.5);
median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
median_size_ = static_cast<int>(size_stats.median() + 0.5);
median_left_ = static_cast<int>(left_stats.median() + 0.5);
median_right_ = static_cast<int>(right_stats.median() + 0.5);
median_width_ = static_cast<int>(width_stats.median() + 0.5);
}
if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
tprintf("Made partition with bad right coords");
Print();
}
if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
tprintf("Made partition with bad left coords");
Print();
}
// Fix partner lists. The bounding box has changed and partners are stored
// in bounding box order, so remove and reinsert this as a partner
// of all its partners.
for (int upper = 0; upper < 2; ++upper) {
ColPartition_CLIST partners;
ColPartition_C_IT part_it(&partners);
part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
ColPartition* partner = part_it.extract();
partner->RemovePartner(!upper, this);
partner->AddPartner(!upper, this);
}
}
if (TabFind::WithinTestRegion(2, bounding_box_.left(),
bounding_box_.bottom())) {
tprintf("Recomputed box for partition %p\n", this);
Print();
}
}
// Returns the number of boxes that overlap the given box.
int ColPartition::CountOverlappingBoxes(const TBOX& box) {
BLOBNBOX_C_IT it(&boxes_);
int overlap_count = 0;
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
BLOBNBOX* bbox = it.data();
if (box.overlap(bbox->bounding_box()))
++overlap_count;
}
return overlap_count;
}
// Computes and sets the type_ and first_column_, last_column_ and column_set_.
// resolution refers to the ppi resolution of the image.
void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
int first_spanned_col = -1;
ColumnSpanningType span_type =
columns->SpanningType(resolution,
bounding_box_.left(), bounding_box_.right(),
MIN(bounding_box_.height(), bounding_box_.width()),
MidY(), left_margin_, right_margin_,
&first_column_, &last_column_,
&first_spanned_col);
column_set_ = columns;
if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
!IsLineType()) {
// Unequal columns may indicate that the pullout spans one of the columns
// it lies in, so force it to be allocated to just that column.
if (first_spanned_col >= 0) {
first_column_ = first_spanned_col;
last_column_ = first_spanned_col;
} else {
if ((first_column_ & 1) == 0)
last_column_ = first_column_;