forked from gali8/Tesseract-OCR-iOS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchar_set.h
174 lines (159 loc) · 6.08 KB
/
char_set.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/**********************************************************************
* File: char_set.h
* Description: Declaration of a Character Set Class
* Author: Ahmad Abdulkader
* Created: 2007
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The CharSet class encapsulates the list of 32-bit strings/characters that
// Cube supports for a specific language. The char set is loaded from the
// .unicharset file corresponding to that language.
// Each string has a corresponding int class-id that is used throughout Cube.
// The class converts in both directions between a class-id and its
// corresponding 32-bit string. The string-to-class-id direction uses a hash
// table that maps the string to the class id.
#ifndef CHAR_SET_H
#define CHAR_SET_H
#include <string.h>
#include <string>
#include <algorithm>
#include "string_32.h"
#include "tessdatamanager.h"
#include "unicharset.h"
#include "cube_const.h"
namespace tesseract {
class CharSet {
public:
CharSet();
~CharSet();
// Returns true if Cube is sharing Tesseract's unicharset.
inline bool SharedUnicharset() { return (unicharset_map_ == NULL); }
// Returns the class id corresponding to a 32-bit string. Returns -1
// if the string is not supported. This is done by hashing the
// string and then looking up the string in the hash-bin if there
// are collisions.
inline int ClassID(const char_32 *str) const {
int hash_val = Hash(str);
if (hash_bin_size_[hash_val] == 0)
return -1;
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0)
return hash_bins_[hash_val][bin];
}
return -1;
}
// Same as above but using a 32-bit char instead of a string
inline int ClassID(char_32 ch) const {
int hash_val = Hash(ch);
if (hash_bin_size_[hash_val] == 0)
return -1;
for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch &&
class_strings_[hash_bins_[hash_val][bin]]->length() == 1) {
return hash_bins_[hash_val][bin];
}
}
return -1;
}
// Retrieve the unicharid in Tesseract's unicharset corresponding
// to a 32-bit string. When Tesseract and Cube share the same
// unicharset, this will just be the class id.
inline int UnicharID(const char_32 *str) const {
int class_id = ClassID(str);
if (class_id == INVALID_UNICHAR_ID)
return INVALID_UNICHAR_ID;
int unichar_id;
if (unicharset_map_)
unichar_id = unicharset_map_[class_id];
else
unichar_id = class_id;
return unichar_id;
}
// Same as above but using a 32-bit char instead of a string
inline int UnicharID(char_32 ch) const {
int class_id = ClassID(ch);
if (class_id == INVALID_UNICHAR_ID)
return INVALID_UNICHAR_ID;
int unichar_id;
if (unicharset_map_)
unichar_id = unicharset_map_[class_id];
else
unichar_id = class_id;
return unichar_id;
}
// Returns the 32-bit string corresponding to a class id
inline const char_32 * ClassString(int class_id) const {
if (class_id < 0 || class_id >= class_cnt_) {
return NULL;
}
return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str());
}
// Returns the count of supported strings
inline int ClassCount() const { return class_cnt_; }
// Creates CharSet object by reading the unicharset from the
// TessDatamanager, and mapping Cube's unicharset to Tesseract's if
// they differ.
static CharSet *Create(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset);
// Return the UNICHARSET cube is using for recognition internally --
// ClassId() returns unichar_id's in this unicharset.
UNICHARSET *InternalUnicharset() { return unicharset_; }
private:
// Hash table configuration params. Determined emperically on
// the supported languages so far (Eng, Ara, Hin). Might need to be
// tuned for speed when more languages are supported
static const int kHashBins = 3001;
static const int kMaxHashSize = 16;
// Using djb2 hashing function to hash a 32-bit string
// introduced in http://www.cse.yorku.ca/~oz/hash.html
static inline int Hash(const char_32 *str) {
unsigned long hash = 5381;
int c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c;
return (hash%kHashBins);
}
// Same as above but for a single char
static inline int Hash(char_32 ch) {
char_32 b[2];
b[0] = ch;
b[1] = 0;
return Hash(b);
}
// Load the list of supported chars from the given data file
// pointer. If tess_unicharset is non-NULL, mapping each Cube class
// id to a tesseract unicharid.
bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset);
// class count
int class_cnt_;
// hash-bin sizes array
int hash_bin_size_[kHashBins];
// hash bins
int hash_bins_[kHashBins][kMaxHashSize];
// supported strings array
string_32 **class_strings_;
// map from class id to secondary (tesseract's) unicharset's ids
int *unicharset_map_;
// A unicharset which is filled in with a Tesseract-style UNICHARSET for
// cube's data if our unicharset is different from tesseract's.
UNICHARSET cube_unicharset_;
// This points to either the tess_unicharset we're passed or cube_unicharset_,
// depending upon whether we just have one unicharset or one for each
// tesseract and cube, respectively.
UNICHARSET *unicharset_;
// has the char set been initialized flag
bool init_;
};
}
#endif // CHAR_SET_H