forked from tesseract-ocr/tesseract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcube_reco_context.cpp
202 lines (177 loc) · 6.58 KB
/
cube_reco_context.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/**********************************************************************
* File: cube_reco_context.cpp
* Description: Implementation of the Cube Recognition Context Class
* Author: Ahmad Abdulkader
* Created: 2007
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <string>
#include <limits.h>
#include "cube_reco_context.h"
#include "classifier_factory.h"
#include "cube_tuning_params.h"
#include "dict.h"
#include "feature_bmp.h"
#include "tessdatamanager.h"
#include "tesseractclass.h"
#include "tess_lang_model.h"
namespace tesseract {
// Instantiate a CubeRecoContext object using a Tesseract object.
// CubeRecoContext will not take ownership of tess_obj, but will
// record the pointer to it and will make use of various Tesseract
// components (language model, flags, etc). Thus the caller should
// keep tess_obj alive so long as the instantiated CubeRecoContext is used.
CubeRecoContext::CubeRecoContext(Tesseract *tess_obj) {
tess_obj_ = tess_obj;
lang_ = "";
loaded_ = false;
lang_mod_ = NULL;
params_ = NULL;
char_classifier_ = NULL;
char_set_ = NULL;
word_size_model_ = NULL;
char_bigrams_ = NULL;
word_unigrams_ = NULL;
noisy_input_ = false;
size_normalization_ = false;
}
CubeRecoContext::~CubeRecoContext() {
if (char_classifier_ != NULL) {
delete char_classifier_;
char_classifier_ = NULL;
}
if (word_size_model_ != NULL) {
delete word_size_model_;
word_size_model_ = NULL;
}
if (char_set_ != NULL) {
delete char_set_;
char_set_ = NULL;
}
if (char_bigrams_ != NULL) {
delete char_bigrams_;
char_bigrams_ = NULL;
}
if (word_unigrams_ != NULL) {
delete word_unigrams_;
word_unigrams_ = NULL;
}
if (lang_mod_ != NULL) {
delete lang_mod_;
lang_mod_ = NULL;
}
if (params_ != NULL) {
delete params_;
params_ = NULL;
}
}
// Returns the path of the data files by looking up the TESSDATA_PREFIX
// environment variable and appending a "tessdata" directory to it
bool CubeRecoContext::GetDataFilePath(string *path) const {
*path = tess_obj_->datadir.string();
return true;
}
// The object initialization function that loads all the necessary
// components of a RecoContext. TessdataManager is used to load the
// data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET
// component is present, Cube will be instantiated with the unicharset
// specified in this component and the corresponding dictionary
// (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to
// Tesseract's. Otherwise, TessdataManager will assume that Cube will
// be using Tesseract's unicharset and dawgs, and will load the
// unicharset from the TESSDATA_UNICHARSET component and will load the
// dawgs from TESSDATA_*_DAWG components.
bool CubeRecoContext::Load(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset) {
ASSERT_HOST(tess_obj_ != NULL);
tess_unicharset_ = tess_unicharset;
string data_file_path;
// Get the data file path.
if (GetDataFilePath(&data_file_path) == false) {
fprintf(stderr, "Unable to get data file path\n");
return false;
}
// Get the language from the Tesseract object.
lang_ = tess_obj_->lang.string();
// Create the char set.
if ((char_set_ =
CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
"CharSet\n");
return false;
}
// Create the language model.
string lm_file_name = data_file_path + lang_ + ".cube.lm";
string lm_params;
if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube "
"language model params from %s\n", lm_file_name.c_str());
return false;
}
lang_mod_ = new TessLangModel(lm_params, data_file_path,
tess_obj_->getDict().load_system_dawg,
tessdata_manager, this);
if (lang_mod_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create "
"TessLangModel\n");
return false;
}
// Create the optional char bigrams object.
char_bigrams_ = CharBigrams::Create(data_file_path, lang_);
// Create the optional word unigrams object.
word_unigrams_ = WordUnigrams::Create(data_file_path, lang_);
// Create the optional size model.
word_size_model_ = WordSizeModel::Create(data_file_path, lang_,
char_set_, Contextual());
// Load tuning params.
params_ = CubeTuningParams::Create(data_file_path, lang_);
if (params_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read "
"CubeTuningParams from %s\n", data_file_path.c_str());
return false;
}
// Create the char classifier.
char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_,
lang_mod_, char_set_,
params_);
if (char_classifier_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
"CharClassifierFactory object from %s\n", data_file_path.c_str());
return false;
}
loaded_ = true;
return true;
}
// Creates a CubeRecoContext object using a tesseract object
CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj,
TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset) {
// create the object
CubeRecoContext *cntxt = new CubeRecoContext(tess_obj);
if (cntxt == NULL) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create "
"CubeRecoContext object\n");
return NULL;
}
// load the necessary components
if (cntxt->Load(tessdata_manager, tess_unicharset) == false) {
fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init "
"CubeRecoContext object\n");
delete cntxt;
return NULL;
}
// success
return cntxt;
}
} // tesseract}