-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.hpp
92 lines (67 loc) · 1.93 KB
/
model.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#ifndef __MODEL_HPP_INCLUDED__
#define __MODEL_HPP_INCLUDED__
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/discrete_distribution.hpp>
#include "sequence.hpp"
#include "numerator.hpp"
#include "tokenizer.hpp"
namespace markov {
/**
 * Word-level model living in namespace `markov`.
 *
 * Only the inline members are defined in this header; add_word(), build(),
 * print(), generate() and kahan_sum() are defined out of line.
 *
 * State held by an instance:
 *  - `dimention`        : the value reported by order().
 *  - `transition_table` : maps a Sequence to a Transition (parallel lists of
 *                         word ids and probabilities).
 *  - `numerator`        : a Numerator (declared in numerator.hpp; presumably
 *                         the word <-> uint32_t id mapping - TODO confirm).
 *  - `sequence`, `text_stat` : working state that serialize() does not archive.
 */
class Model {
public:
    /**
     * Creates a model of the given dimension.
     *
     * @param dim stored as the model order and passed to the Sequence
     *            constructor.
     */
    Model(size_t dim):
        dimention(dim),
        // Initialise from the parameter, not from the `dimention` member, so
        // the result never depends on the declaration order of the members.
        sequence(dim)
    {
    }

    /// Creates a model of dimension 0 by delegating to Model(size_t).
    Model(): Model(0) {}

    /// Defined out of line. Presumably feeds one word of training text into
    /// the model - TODO confirm against the implementation.
    void add_word(const std::string &word);

    /// Defined out of line. Presumably turns the collected statistics into
    /// the transition table; the meaning of `sanity_check` is not visible
    /// here - TODO confirm.
    void build(bool sanity_check=false);

    /// Defined out of line. Overload of build() that additionally receives
    /// the input `data` as a string.
    void build(const std::string &data, bool sanity_check=false);

    /// Defined out of line.
    void print();

    /// Defined out of line. Takes a starting sequence of words and a count,
    /// and returns a list of words (presumably `count` generated words -
    /// TODO confirm).
    std::vector<std::string> generate(const std::vector<std::string> &start_sequence, size_t count);

    /// @return the dimension this model was constructed with (or the value
    ///         restored by serialize()).
    size_t order() const {
        return dimention;
    }

private:
    // Lets Boost.Serialization call the private serialize() member below.
    friend class boost::serialization::access;

    // uint32_t key -> size_t value; presumably word id -> occurrence count.
    using Frequencies = std::map<uint32_t, size_t>;

    // Per-sequence statistics.
    struct SequenceStat {
        // NOTE(review): no default initialiser, so a default-initialised
        // SequenceStat leaves this indeterminate; value-initialise it
        // (std::map::operator[] does) before reading or incrementing.
        size_t total_suffixes_count;
        Frequencies frequencies;
    };

    using TextStat = std::map<Sequence, SequenceStat>;

    // Candidate list for one Sequence, stored as two parallel vectors:
    // probabilities[i] belongs to words_id[i].
    class Transition {
    public:
        /// Appends one (id, probability) pair, keeping both vectors the same
        /// length.
        void add(uint32_t id, double probability)
        {
            words_id.push_back(id);
            probabilities.push_back(probability);
        }

        std::vector<uint32_t> words_id;
        std::vector<double> probabilities;

        /// Boost.Serialization hook: archives words_id, then probabilities.
        /// The class version argument is ignored.
        template<typename Archive>
        void serialize(Archive &ar, const unsigned int)
        {
            ar & words_id;
            ar & probabilities;
        }
    };

    using TransitionTable = std::map<Sequence, Transition>;

    // Data members. `dimention`, `transition_table` and `numerator` are the
    // persisted state; their archive order is fixed by serialize() below.
    size_t dimention;
    TransitionTable transition_table;
    Numerator numerator;
    Sequence sequence;
    TextStat text_stat;

    /// Defined out of line. By its name, a Kahan (compensated) summation of
    /// `data` - TODO confirm against the implementation.
    double kahan_sum(const std::vector<double> &data);

    /// Boost.Serialization hook: archives dimention, transition_table and
    /// numerator, in that order. The class version argument is ignored.
    ///
    /// NOTE(review): `sequence` and `text_stat` are not archived, so after
    /// loading, `sequence` keeps whatever it was constructed with rather than
    /// the restored `dimention` - confirm that the load path accounts for it.
    template<typename Archive>
    void serialize(Archive &ar, const unsigned int)
    {
        ar & dimention;
        ar & transition_table;
        ar & numerator;
    }
};
} // namespace
#endif