Skip to content

Commit 930074c

Browse files
committed
feat(dictionary): packs extends the dictionary with extra binary table files
1 parent c83b246 commit 930074c

File tree

12 files changed

+303
-125
lines changed

12 files changed

+303
-125
lines changed

src/rime/algo/utilities.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ int CompareVersionString(const string& x, const string& y) {
3030
return 0;
3131
}
3232

33+
// Seeds the running CRC with `initial_remainder` instead of a fixed start
// value, so a caller can chain a new checksum onto an earlier one.
ChecksumComputer::ChecksumComputer(uint32_t initial_remainder)
    : crc_(initial_remainder) {}
35+
3336
void ChecksumComputer::ProcessFile(const string& file_name) {
3437
std::ifstream fin(file_name.c_str());
3538
string file_content((std::istreambuf_iterator<char>(fin)),

src/rime/algo/utilities.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ int CompareVersionString(const string& x,
1818

1919
class ChecksumComputer {
2020
public:
21+
explicit ChecksumComputer(uint32_t initial_remainder = 0);
2122
void ProcessFile(const string& file_name);
2223
uint32_t Checksum();
2324

src/rime/dict/dict_compiler.cc

Lines changed: 149 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -26,85 +26,103 @@ namespace rime {
2626

2727
DictCompiler::DictCompiler(Dictionary *dictionary, const string& prefix)
2828
: dict_name_(dictionary->name()),
29+
packs_(dictionary->packs()),
2930
prism_(dictionary->prism()),
30-
table_(dictionary->table()),
31+
tables_(dictionary->tables()),
3132
prefix_(prefix) {
3233
}
3334

34-
static string LocateFile(const string& file_name) {
35+
// Resolves `file_name` to a path string through a resource resolver that the
// service creates from the {"build_source", "", ""} descriptor.
static string locate_file(const string& file_name) {
  the<ResourceResolver> source_resolver(
      Service::instance().CreateResourceResolver({"build_source", "", ""}));
  const auto resolved_path = source_resolver->ResolvePath(file_name);
  return resolved_path.string();
}
3940

41+
// Opens `dict_file` and lets `settings` parse the dictionary header from it.
// Returns the result of DictSettings::LoadDictHeader. The stream is closed
// when it goes out of scope.
static bool load_dict_settings_from_file(DictSettings* settings,
                                         const string& dict_file) {
  std::ifstream header_stream(dict_file.c_str());
  return settings->LoadDictHeader(header_stream);
}
48+
49+
// Collects the source file path of every table listed by `settings`.
//
// Each table name from settings.GetTables() is mapped to "<name>.dict.yaml"
// and resolved with locate_file(); the resolved paths are appended to
// `dict_files` in list order.
//
// Returns false (after logging an error) as soon as one resolved file does
// not exist; paths appended before that point stay in `dict_files`.
// Returns true otherwise, including when GetTables() yields no list.
static bool get_dict_files_from_settings(vector<string>* dict_files,
                                         DictSettings& settings) {
  auto tables = settings.GetTables();
  if (!tables) {
    return true;  // no table list, nothing to collect
  }
  for (const auto& item : *tables) {
    // Skip list entries that are not scalar values. The loop this helper
    // replaced guarded with Is<ConfigValue>(); without the check,
    // As<ConfigValue>() presumably yields a null pointer for other item
    // kinds and the ->str() call would dereference it.
    auto table_name = As<ConfigValue>(item);
    if (!table_name) {
      continue;
    }
    string dict_file = locate_file(table_name->str() + ".dict.yaml");
    if (!boost::filesystem::exists(dict_file)) {
      LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
      return false;
    }
    dict_files->push_back(dict_file);
  }
  return true;
}
64+
65+
static uint32_t compute_dict_file_checksum(uint32_t initial_checksum,
66+
const vector<string>& dict_files,
67+
DictSettings& settings) {
68+
if (dict_files.empty()) {
69+
return initial_checksum;
70+
}
71+
ChecksumComputer cc(initial_checksum);
72+
for (const auto& file_name : dict_files) {
73+
cc.ProcessFile(file_name);
74+
}
75+
if (settings.use_preset_vocabulary()) {
76+
cc.ProcessFile(PresetVocabulary::DictFilePath(settings.vocabulary()));
77+
}
78+
return cc.Checksum();
79+
}
80+
4081
bool DictCompiler::Compile(const string &schema_file) {
4182
LOG(INFO) << "compiling dictionary for " << schema_file;
4283
bool build_table_from_source = true;
4384
DictSettings settings;
44-
string dict_file = LocateFile(dict_name_ + ".dict.yaml");
85+
string dict_file = locate_file(dict_name_ + ".dict.yaml");
4586
if (!boost::filesystem::exists(dict_file)) {
4687
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
4788
build_table_from_source = false;
4889
}
49-
else {
50-
std::ifstream fin(dict_file.c_str());
51-
if (!settings.LoadDictHeader(fin)) {
52-
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
53-
return false;
54-
}
55-
fin.close();
56-
LOG(INFO) << "dict name: " << settings.dict_name();
57-
LOG(INFO) << "dict version: " << settings.dict_version();
90+
else if (!load_dict_settings_from_file(&settings, dict_file)) {
91+
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
92+
return false;
5893
}
5994
vector<string> dict_files;
60-
auto tables = settings.GetTables();
61-
for(auto it = tables->begin(); it != tables->end(); ++it) {
62-
if (!Is<ConfigValue>(*it))
63-
continue;
64-
string dict_name = As<ConfigValue>(*it)->str();
65-
string dict_file = LocateFile(dict_name + ".dict.yaml");
66-
if (!boost::filesystem::exists(dict_file)) {
67-
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
68-
return false;
69-
}
70-
dict_files.push_back(dict_file);
71-
}
72-
uint32_t dict_file_checksum = 0;
73-
if (!dict_files.empty()) {
74-
ChecksumComputer cc;
75-
for (const auto& file_name : dict_files) {
76-
cc.ProcessFile(file_name);
77-
}
78-
if (settings.use_preset_vocabulary()) {
79-
cc.ProcessFile(PresetVocabulary::DictFilePath(settings.vocabulary()));
80-
}
81-
dict_file_checksum = cc.Checksum();
95+
if (!get_dict_files_from_settings(&dict_files, settings)) {
96+
return false;
8297
}
98+
uint32_t dict_file_checksum =
99+
compute_dict_file_checksum(0, dict_files, settings);
83100
uint32_t schema_file_checksum =
84101
schema_file.empty() ? 0 : Checksum(schema_file);
85-
bool rebuild_table = true;
86-
bool rebuild_prism = true;
87-
if (table_->Exists() && table_->Load()) {
88-
if (!build_table_from_source) {
89-
dict_file_checksum = table_->dict_file_checksum();
90-
LOG(INFO) << "reuse existing table: " << table_->file_name();
91-
}
92-
if (table_->dict_file_checksum() == dict_file_checksum) {
93-
rebuild_table = false;
102+
bool rebuild_table = false;
103+
bool rebuild_prism = false;
104+
const auto& primary_table = tables_[0];
105+
if (primary_table->Exists() && primary_table->Load()) {
106+
if (build_table_from_source) {
107+
rebuild_table = primary_table->dict_file_checksum() != dict_file_checksum;
108+
} else {
109+
dict_file_checksum = primary_table->dict_file_checksum();
110+
LOG(INFO) << "reuse existing table: " << primary_table->file_name();
94111
}
95-
table_->Close();
96-
}
97-
else if (!build_table_from_source) {
112+
primary_table->Close();
113+
} else if (build_table_from_source) {
114+
rebuild_table = true;
115+
} else {
98116
LOG(ERROR) << "neither " << dict_name_ << ".dict.yaml nor "
99117
<< dict_name_ << ".table.bin exists.";
100118
return false;
101119
}
102120
if (prism_->Exists() && prism_->Load()) {
103-
if (prism_->dict_file_checksum() == dict_file_checksum &&
104-
prism_->schema_file_checksum() == schema_file_checksum) {
105-
rebuild_prism = false;
106-
}
121+
rebuild_prism = prism_->dict_file_checksum() != dict_file_checksum ||
122+
prism_->schema_file_checksum() != schema_file_checksum;
107123
prism_->Close();
124+
} else {
125+
rebuild_prism = true;
108126
}
109127
LOG(INFO) << dict_file << "[" << dict_files.size() << " file(s)]"
110128
<< " (" << dict_file_checksum << ")";
@@ -126,11 +144,55 @@ bool DictCompiler::Compile(const string &schema_file) {
126144
if (options_ & kRebuildPrism) {
127145
rebuild_prism = true;
128146
}
129-
if (rebuild_table && !BuildTable(&settings, dict_files, dict_file_checksum))
130-
return false;
131-
if (rebuild_prism && !BuildPrism(schema_file,
132-
dict_file_checksum, schema_file_checksum))
147+
Syllabary syllabary;
148+
if (rebuild_table) {
149+
EntryCollector collector;
150+
if (!BuildTable(0,
151+
collector,
152+
&settings,
153+
dict_files,
154+
dict_file_checksum)) {
155+
return false;
156+
}
157+
syllabary = std::move(collector.syllabary);
158+
}
159+
if (rebuild_prism &&
160+
!BuildPrism(schema_file,
161+
syllabary,
162+
dict_file_checksum,
163+
schema_file_checksum)) {
133164
return false;
165+
}
166+
if (rebuild_table) {
167+
for (int table_index = 1; table_index < tables_.size(); ++table_index) {
168+
const auto& pack_name = packs_[table_index - 1];
169+
EntryCollector collector(std::move(syllabary));
170+
DictSettings settings;
171+
string dict_file = locate_file(pack_name + ".dict.yaml");
172+
if (!boost::filesystem::exists(dict_file)) {
173+
LOG(ERROR) << "source file '" << dict_file << "' does not exist.";
174+
continue;
175+
}
176+
if (!load_dict_settings_from_file(&settings, dict_file)) {
177+
LOG(ERROR) << "failed to load settings from '" << dict_file << "'.";
178+
continue;
179+
}
180+
vector<string> dict_files;
181+
if (!get_dict_files_from_settings(&dict_files, settings)) {
182+
continue;
183+
}
184+
uint32_t pack_file_checksum =
185+
compute_dict_file_checksum(dict_file_checksum, dict_files, settings);
186+
if (!BuildTable(table_index,
187+
collector,
188+
&settings,
189+
dict_files,
190+
pack_file_checksum)) {
191+
LOG(ERROR) << "failed to build pack: " << pack_name;
192+
}
193+
syllabary = std::move(collector.syllabary);
194+
}
195+
}
134196
// done!
135197
return true;
136198
}
@@ -143,17 +205,20 @@ static string RelocateToUserDirectory(const string& prefix,
143205
return resolver.ResolvePath(resource_id).string();
144206
}
145207

146-
bool DictCompiler::BuildTable(DictSettings* settings,
208+
bool DictCompiler::BuildTable(int table_index,
209+
EntryCollector& collector,
210+
DictSettings* settings,
147211
const vector<string>& dict_files,
148212
uint32_t dict_file_checksum) {
149-
LOG(INFO) << "building table...";
150-
table_ = New<Table>(RelocateToUserDirectory(prefix_, table_->file_name()));
213+
auto& table = tables_[table_index];
214+
auto path = RelocateToUserDirectory(prefix_, table->file_name());
215+
LOG(INFO) << "building table: " << path;
216+
table = New<Table>(path);
151217

152-
EntryCollector collector;
153218
collector.Configure(settings);
154219
collector.Collect(dict_files);
155220
if (options_ & kDump) {
156-
boost::filesystem::path path(table_->file_name());
221+
boost::filesystem::path path(table->file_name());
157222
path.replace_extension(".txt");
158223
collector.Dump(path.string());
159224
}
@@ -184,16 +249,34 @@ bool DictCompiler::BuildTable(DictSettings* settings,
184249
if (settings->sort_order() != "original") {
185250
vocabulary.SortHomophones();
186251
}
187-
table_->Remove();
188-
if (!table_->Build(collector.syllabary, vocabulary, collector.num_entries,
189-
dict_file_checksum) ||
190-
!table_->Save()) {
252+
table->Remove();
253+
if (!table->Build(collector.syllabary,
254+
vocabulary,
255+
collector.num_entries,
256+
dict_file_checksum) ||
257+
!table->Save()) {
191258
return false;
192259
}
193260
}
261+
// build reverse db for the primary table
262+
if (table_index == 0 &&
263+
!BuildReverseDb(settings,
264+
collector,
265+
vocabulary,
266+
dict_file_checksum)) {
267+
return false;
268+
}
269+
return true;
270+
}
271+
272+
bool DictCompiler::BuildReverseDb(DictSettings* settings,
273+
const EntryCollector& collector,
274+
const Vocabulary& vocabulary,
275+
uint32_t dict_file_checksum) {
194276
// build .reverse.bin
195-
ReverseDb reverse_db(RelocateToUserDirectory(prefix_,
196-
dict_name_ + ".reverse.bin"));
277+
auto path = RelocateToUserDirectory(prefix_,
278+
dict_name_ + ".reverse.bin");
279+
ReverseDb reverse_db(path);
197280
if (!reverse_db.Build(settings,
198281
collector.syllabary,
199282
vocabulary,
@@ -206,15 +289,12 @@ bool DictCompiler::BuildTable(DictSettings* settings,
206289
}
207290

208291
bool DictCompiler::BuildPrism(const string &schema_file,
292+
const Syllabary& syllabary,
209293
uint32_t dict_file_checksum,
210294
uint32_t schema_file_checksum) {
211295
LOG(INFO) << "building prism...";
212296
prism_ = New<Prism>(RelocateToUserDirectory(prefix_, prism_->file_name()));
213297

214-
// get syllabary from table
215-
Syllabary syllabary;
216-
if (!table_->Load() || !table_->GetSyllabary(&syllabary) || syllabary.empty())
217-
return false;
218298
// apply spelling algebra and prepare corrections (if enabled)
219299
Script script;
220300
if (!schema_file.empty()) {

src/rime/dict/dict_compiler.h

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ class Table;
1818
class ReverseDb;
1919
class DictSettings;
2020
class EditDistanceCorrector;
21+
class EntryCollector;
22+
class Vocabulary;
2123

2224
class DictCompiler {
2325
public:
@@ -34,18 +36,25 @@ class DictCompiler {
3436
void set_options(int options) { options_ = options; }
3537

3638
private:
37-
bool BuildTable(DictSettings* settings,
39+
bool BuildTable(int table_index,
40+
EntryCollector& collector,
41+
DictSettings* settings,
3842
const vector<string>& dict_files,
3943
uint32_t dict_file_checksum);
4044
bool BuildPrism(const string& schema_file,
45+
const Syllabary& syllabary,
4146
uint32_t dict_file_checksum,
4247
uint32_t schema_file_checksum);
43-
bool BuildReverseLookupDict(ReverseDb* db, uint32_t dict_file_checksum);
48+
bool BuildReverseDb(DictSettings* settings,
49+
const EntryCollector& collector,
50+
const Vocabulary& vocabulary,
51+
uint32_t dict_file_checksum);
4452

45-
string dict_name_;
53+
const string& dict_name_;
54+
const vector<string>& packs_;
4655
an<Prism> prism_;
4756
an<EditDistanceCorrector> correction_;
48-
an<Table> table_;
57+
vector<of<Table>> tables_;
4958
int options_ = 0;
5059
string prefix_;
5160
};

src/rime/dict/dict_settings.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ bool DictSettings::LoadDictHeader(std::istream& stream) {
3737
return true;
3838
}
3939

40+
bool DictSettings::empty() {
41+
return (*this)["name"].IsNull();
42+
}
43+
4044
string DictSettings::dict_name() {
4145
return (*this)["name"].ToString();
4246
}
@@ -74,6 +78,8 @@ double DictSettings::min_phrase_weight() {
7478
}
7579

7680
an<ConfigList> DictSettings::GetTables() {
81+
if (empty())
82+
return nullptr;
7783
auto tables = New<ConfigList>();
7884
tables->Append((*this)["name"]);
7985
auto imports = (*this)["import_tables"].AsList();

src/rime/dict/dict_settings.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class DictSettings : public Config {
1717
public:
1818
DictSettings();
1919
bool LoadDictHeader(std::istream& stream);
20+
bool empty();
2021
string dict_name();
2122
string dict_version();
2223
string sort_order();

0 commit comments

Comments
 (0)