@@ -22,27 +22,38 @@ class Decoder {
22
22
// Fallback for decoders that do not override float decoding: always yields 0.
virtual float GetFloat() {
  return 0;
}
23
23
// Fallback for decoders that do not override string decoding: returns a
// default-constructed String.
virtual String GetString() {
  return String();
}
24
24
25
+ int value_left () const { return num_values_; }
26
+
25
27
protected:
26
- Decoder (parquet::SchemaElement* schema) : schema_(schema) {}
28
+ Decoder (parquet::SchemaElement* schema, int num_values)
29
+ : schema_(schema), num_values_(num_values) {}
27
30
parquet::SchemaElement* schema_;
31
+ int num_values_;
28
32
};
29
33
30
34
class BoolDecoder : public Decoder {
31
35
public:
32
- BoolDecoder (parquet::SchemaElement* schema) : Decoder(schema) { }
36
+ BoolDecoder (parquet::SchemaElement* schema, int num_values ) : Decoder(schema, num_values ) { }
33
37
34
38
virtual void SetData (const uint8_t * data, int len) {
39
+ decoder_ = impala::RleDecoder (data, len, 1 );
40
+ }
41
+
42
+ virtual bool GetBool () {
43
+ bool result;
44
+ if (!decoder_.Get (&result)) throw " EOF" ;
45
+ --num_values_;
46
+ return result;
35
47
}
36
48
37
- virtual bool GetBool () {
38
- return false ;
39
- }
49
+ private:
50
+ impala::RleDecoder decoder_;
40
51
};
41
52
42
53
class PlainDecoder : public Decoder {
43
54
public:
44
- PlainDecoder (parquet::SchemaElement* schema)
45
- : Decoder(schema), data_(NULL ), len_(0 ) {
55
+ PlainDecoder (parquet::SchemaElement* schema, int num_values )
56
+ : Decoder(schema, num_values ), data_(NULL ), len_(0 ) {
46
57
}
47
58
48
59
virtual void SetData (const uint8_t * data, int len) {
@@ -51,39 +62,43 @@ class PlainDecoder : public Decoder {
51
62
}
52
63
53
64
virtual int32_t GetInt32 () {
54
- if (len_ < sizeof (int32_t )) throw " Bad " ;
65
+ if (len_ < sizeof (int32_t )) throw " EOF " ;
55
66
int32_t val = *reinterpret_cast <const int32_t *>(data_);
56
67
data_ += sizeof (int32_t );
57
68
len_ -= sizeof (int32_t );
69
+ --num_values_;
58
70
return val;
59
71
}
60
72
61
73
virtual int64_t GetInt64 () {
62
- if (len_ < sizeof (int64_t )) throw " Bad " ;
74
+ if (len_ < sizeof (int64_t )) throw " EOF " ;
63
75
int64_t val = *reinterpret_cast <const int64_t *>(data_);
64
76
data_ += sizeof (int64_t );
65
77
len_ -= sizeof (int64_t );
78
+ --num_values_;
66
79
return val;
67
80
}
68
81
69
82
virtual float GetFloat () {
70
- if (len_ < sizeof (float )) throw " Bad " ;
83
+ if (len_ < sizeof (float )) throw " EOF " ;
71
84
float val = *reinterpret_cast <const float *>(data_);
72
85
data_ += sizeof (float );
73
86
len_ -= sizeof (float );
87
+ --num_values_;
74
88
return val;
75
89
}
76
90
77
91
// Reads the next plain-encoded byte array: a 4-byte length prefix followed by
// that many payload bytes. The returned String does not own its bytes; its
// ptr refers directly into the buffer this decoder is reading from.
// Throws a string literal (const char*) if the prefix or the payload would
// run past the end of the buffer. Both checks happen before the cursor is
// moved, so a failed read leaves the decoder position unchanged.
virtual String GetString() {
  String result;
  if (len_ < sizeof(uint32_t)) throw " EOF ";
  // The buffer position carries no alignment guarantee, so the length prefix
  // is copied byte-wise rather than loaded through a uint32_t*.
  uint32_t length;
  uint8_t* dst = reinterpret_cast<uint8_t*>(&length);
  for (unsigned i = 0; i < sizeof(length); ++i) dst[i] = data_[i];
  if (len_ - sizeof(uint32_t) < length) throw " EOF ";
  data_ += sizeof(uint32_t);
  len_ -= sizeof(uint32_t);
  result.len = length;
  result.ptr = data_;
  data_ += result.len;
  len_ -= result.len;
  --num_values_;
  return result;
}
89
104
@@ -94,25 +109,68 @@ class PlainDecoder : public Decoder {
94
109
95
110
class DictionaryDecoder : public Decoder {
96
111
public:
97
- DictionaryDecoder (parquet::SchemaElement* schema, Decoder* dictionary)
98
- : Decoder(schema) {
112
+ DictionaryDecoder (parquet::SchemaElement* schema, int num_values, Decoder* dictionary)
113
+ : Decoder(schema, num_values) {
114
+ int num_dictionary_values = dictionary->value_left ();
115
+ switch (schema->type ) {
116
+ case parquet::Type::BOOLEAN: throw " Boolean cols should not be dictionary encoded." ;
117
+ case parquet::Type::INT32:
118
+ int32_dictionary_.resize (num_dictionary_values);
119
+ for (int i = 0 ; i < num_dictionary_values; ++i) {
120
+ int32_dictionary_[i] = dictionary->GetInt32 ();
121
+ }
122
+ break ;
123
+ case parquet::Type::INT64:
124
+ int64_dictionary_.resize (num_dictionary_values);
125
+ for (int i = 0 ; i < num_dictionary_values; ++i) {
126
+ int64_dictionary_[i] = dictionary->GetInt64 ();
127
+ }
128
+ break ;
129
+ case parquet::Type::FLOAT:
130
+ float_dictionary_.resize (num_dictionary_values);
131
+ for (int i = 0 ; i < num_dictionary_values; ++i) {
132
+ float_dictionary_[i] = dictionary->GetFloat ();
133
+ }
134
+ break ;
135
+ case parquet::Type::BYTE_ARRAY:
136
+ string_dictionary_.resize (num_dictionary_values);
137
+ for (int i = 0 ; i < num_dictionary_values; ++i) {
138
+ string_dictionary_[i] = dictionary->GetString ();
139
+ }
140
+ break ;
141
+ default :
142
+ throw " NYI" ;
143
+ }
99
144
}
100
145
101
146
virtual void SetData (const uint8_t * data, int len) {
147
+ if (len == 0 ) return ;
148
+ uint8_t bit_width = *data;
149
+ ++data;
150
+ --len;
151
+ idx_decoder_ = impala::RleDecoder (data, len, bit_width);
102
152
}
103
153
104
- virtual int32_t GetInt32 () {
105
- return 0 ;
106
- }
107
- virtual int64_t GetInt64 () {
108
- return 0 ;
109
- }
110
- virtual float GetFloat () {
111
- return 0 ;
112
- }
113
- virtual String GetString () {
114
- return String () ;
154
+ virtual int32_t GetInt32 () { return int32_dictionary_[ index ()]; }
155
+ virtual int64_t GetInt64 () { return int64_dictionary_[ index ()]; }
156
+ virtual float GetFloat () { return float_dictionary_[ index ()]; }
157
+ virtual String GetString () { return string_dictionary_[ index ()]; }
158
+
159
+ private:
160
+ int index () {
161
+ int idx ;
162
+ if (!idx_decoder_. Get (&idx)) throw " EOF " ;
163
+ --num_values_;
164
+ return idx ;
115
165
}
166
+
167
+ // Only one is set.
168
+ std::vector<int32_t > int32_dictionary_;
169
+ std::vector<int64_t > int64_dictionary_;
170
+ std::vector<float > float_dictionary_;
171
+ std::vector<String> string_dictionary_;
172
+
173
+ impala::RleDecoder idx_decoder_;
116
174
};
117
175
118
176
}
0 commit comments