Skip to content

Commit e01c54f

Browse files
zuochunwei and zuochunwei authored
[DWIO] refactor the reader of dwrf/orc (#261)
Co-authored-by: zuochunwei <zuochunwei@meituan.com>
1 parent 2d0dd93 commit e01c54f

28 files changed

+1073
-369
lines changed

velox/dwio/dwrf/common/Common.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ std::string writerVersionToString(WriterVersion version) {
3636
return folly::to<std::string>("future - ", version);
3737
}
3838

39+
/* unused
3940
std::string streamKindToString(StreamKind kind) {
4041
switch (static_cast<int32_t>(kind)) {
4142
case StreamKind_PRESENT:
@@ -63,6 +64,7 @@ std::string streamKindToString(StreamKind kind) {
6364
}
6465
return folly::to<std::string>("unknown - ", kind);
6566
}
67+
*/
6668

6769
std::string columnEncodingKindToString(ColumnEncodingKind kind) {
6870
switch (static_cast<int32_t>(kind)) {
@@ -82,6 +84,11 @@ DwrfStreamIdentifier EncodingKey::forKind(const proto::Stream_Kind kind) const {
8284
return DwrfStreamIdentifier(node, sequence, 0, kind);
8385
}
8486

87+
// ORC overload of forKind(): returns the stream identifier for the given ORC
// stream kind, built from this key's node and sequence with the column
// argument fixed to 0.
DwrfStreamIdentifier EncodingKey::forKind(
88+
const proto::orc::Stream_Kind kind) const {
89+
return DwrfStreamIdentifier(node, sequence, 0, kind);
90+
}
91+
8592
namespace {
8693
using dwio::common::CompressionKind;
8794

velox/dwio/dwrf/common/Common.h

Lines changed: 99 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@
2929

3030
namespace facebook::velox::dwrf {
3131

32+
// Distinguishes the two file formats, DWRF and ORC. Stored as a single byte.
// DwrfStreamIdentifier records one of these values to tell which of its
// stream-kind enums (StreamKind or StreamKindOrc) applies.
enum class DwrfFormat : uint8_t {
33+
kDwrf = 0,
34+
kOrc = 1,
35+
};
36+
3237
// Writer version
3338
constexpr folly::StringPiece WRITER_NAME_KEY{"orc.writer.name"};
3439
constexpr folly::StringPiece WRITER_VERSION_KEY{"orc.writer.version"};
@@ -54,6 +59,7 @@ constexpr WriterVersion WriterVersion_CURRENT = WriterVersion::DWRF_7_0;
5459
*/
5560
std::string writerVersionToString(WriterVersion kind);
5661

62+
// Stream kind of dwrf.
5763
enum StreamKind {
5864
StreamKind_PRESENT = 0,
5965
StreamKind_DATA = 1,
@@ -69,15 +75,40 @@ enum StreamKind {
6975
StreamKind_IN_MAP = 11
7076
};
7177

78+
// Stream kind of orc.
79+
// Values 0-10 are contiguous; the two statistics kinds use 100 and 101.
// DwrfStreamIdentifier converts proto::orc::Stream_Kind to this enum with a
// plain static_cast, so the numeric values are presumably meant to mirror the
// ORC protobuf definition -- TODO confirm against the .proto file.
enum StreamKindOrc {
80+
StreamKindOrc_PRESENT = 0,
81+
StreamKindOrc_DATA = 1,
82+
StreamKindOrc_LENGTH = 2,
83+
StreamKindOrc_DICTIONARY_DATA = 3,
84+
StreamKindOrc_DICTIONARY_COUNT = 4,
85+
StreamKindOrc_SECONDARY = 5,
86+
StreamKindOrc_ROW_INDEX = 6,
87+
StreamKindOrc_BLOOM_FILTER = 7,
88+
StreamKindOrc_BLOOM_FILTER_UTF8 = 8,
89+
StreamKindOrc_ENCRYPTED_INDEX = 9,
90+
StreamKindOrc_ENCRYPTED_DATA = 10,
91+
StreamKindOrc_STRIPE_STATISTICS = 100,
92+
StreamKindOrc_FILE_STATISTICS = 101,
93+
94+
// Sentinel; this is what the default StreamInformation::getKindOrc() returns.
StreamKindOrc_INVALID = -1
95+
};
96+
7297
// Returns true when the DWRF stream kind is one of the two kinds treated as
// index streams here: ROW_INDEX or BLOOM_FILTER_UTF8.
inline bool isIndexStream(StreamKind kind) {
7398
return kind == StreamKind::StreamKind_ROW_INDEX ||
7499
kind == StreamKind::StreamKind_BLOOM_FILTER_UTF8;
75100
}
76101

102+
// ORC overload: returns true when the stream kind is ROW_INDEX, BLOOM_FILTER
// or BLOOM_FILTER_UTF8. Unlike the DWRF overload, it also accepts the
// non-UTF8 BLOOM_FILTER kind.
inline bool isIndexStream(StreamKindOrc kind) {
103+
return kind == StreamKindOrc::StreamKindOrc_ROW_INDEX ||
104+
kind == StreamKindOrc::StreamKindOrc_BLOOM_FILTER ||
105+
kind == StreamKindOrc::StreamKindOrc_BLOOM_FILTER_UTF8;
106+
}
107+
77108
/**
78109
* Get the string representation of the StreamKind.
79110
*/
80-
std::string streamKindToString(StreamKind kind);
111+
// std::string streamKindToString(StreamKind kind);
81112

82113
class StreamInformation {
83114
public:
@@ -90,6 +121,12 @@ class StreamInformation {
90121
virtual uint64_t getLength() const = 0;
91122
virtual bool getUseVInts() const = 0;
92123
virtual bool valid() const = 0;
124+
125+
// Returns the ORC stream kind. A default implementation is provided here
126+
// because leaving it out would cause too many compile errors; the default
// returns StreamKindOrc_INVALID.
127+
virtual StreamKindOrc getKindOrc() const {
128+
return StreamKindOrc_INVALID;
129+
}
93130
};
94131

95132
enum ColumnEncodingKind {
@@ -100,21 +137,21 @@ enum ColumnEncodingKind {
100137
};
101138

102139
class DwrfStreamIdentifier;
140+
103141
class EncodingKey {
104142
public:
105143
// Returns a reference to a shared, default-constructed EncodingKey held in a
// function-local static. The default constructor sets both node and sequence
// to dwio::common::MAX_UINT32.
static const EncodingKey& getInvalid() {
106144
static const EncodingKey INVALID;
107145
return INVALID;
108146
}
109147

110-
public:
148+
uint32_t node;
149+
uint32_t sequence;
150+
111151
// Default constructor: delegates to the two-argument constructor with node
// and sequence both set to dwio::common::MAX_UINT32.
EncodingKey()
112152
: EncodingKey(dwio::common::MAX_UINT32, dwio::common::MAX_UINT32) {}
113153

114-
/* implicit */ EncodingKey(uint32_t n, uint32_t s = 0)
115-
: node{n}, sequence{s} {}
116-
uint32_t node;
117-
uint32_t sequence;
154+
// Builds a key from a node id and an optional sequence (default 0).
// Not marked `explicit`, so a bare uint32_t node id converts implicitly; the
// previous revision carried an `/* implicit */` marker on this constructor.
EncodingKey(uint32_t n, uint32_t s = 0) : node{n}, sequence{s} {}
118155

119156
bool operator==(const EncodingKey& other) const {
120157
return node == other.node && sequence == other.sequence;
@@ -133,6 +170,8 @@ class EncodingKey {
133170
}
134171

135172
DwrfStreamIdentifier forKind(const proto::Stream_Kind kind) const;
173+
174+
DwrfStreamIdentifier forKind(const proto::orc::Stream_Kind kind) const;
136175
};
137176

138177
struct EncodingKeyHash {
@@ -150,15 +189,24 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
150189

151190
public:
152191
// Default constructor: column is dwio::common::MAX_UINT32, format is DWRF and
// the kind is StreamKind_DATA. encodingKey_ is not listed in the initializer
// list, so it is default-constructed.
DwrfStreamIdentifier()
153-
: column_(dwio::common::MAX_UINT32), kind_(StreamKind_DATA) {}
192+
: column_(dwio::common::MAX_UINT32),
193+
format_(DwrfFormat::kDwrf),
194+
kind_(StreamKind_DATA) {}
154195

155-
/* implicit */ DwrfStreamIdentifier(const proto::Stream& stream)
196+
// Converting constructor from a DWRF protobuf Stream (not `explicit`).
// Uses stream.node() and stream.kind() directly; sequence falls back to 0 and
// column falls back to dwio::common::MAX_UINT32 when the corresponding
// optional field is not set on the message.
DwrfStreamIdentifier(const proto::Stream& stream)
156197
: DwrfStreamIdentifier(
157198
stream.node(),
158199
stream.has_sequence() ? stream.sequence() : 0,
159200
stream.has_column() ? stream.column() : dwio::common::MAX_UINT32,
160201
stream.kind()) {}
161202

203+
// Converting constructor from an ORC protobuf Stream (not `explicit`).
// The ORC message's column() is passed as the *node* argument, sequence is
// always 0 and the column argument is always dwio::common::MAX_UINT32.
DwrfStreamIdentifier(const proto::orc::Stream& stream)
204+
: DwrfStreamIdentifier(
205+
stream.column(),
206+
0,
207+
dwio::common::MAX_UINT32,
208+
stream.kind()) {}
209+
162210
DwrfStreamIdentifier(
163211
uint32_t node,
164212
uint32_t sequence,
@@ -167,9 +215,22 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
167215
: StreamIdentifier(
168216
velox::cache::TrackingId((node << kNodeShift) | kind).id()),
169217
column_{column},
218+
format_(DwrfFormat::kDwrf),
170219
kind_(kind),
171220
encodingKey_{node, sequence} {}
172221

222+
// Builds an ORC stream identifier: format_ is set to kOrc and the kindOrc_
// union member is initialized. The base-class id is derived from
// (node << kNodeShift) | kind.
// NOTE(review): kNodeShift is 5, so only kind values 0..31 fit below the
// shifted node bits. StreamKindOrc_STRIPE_STATISTICS (100),
// StreamKindOrc_FILE_STATISTICS (101) and StreamKindOrc_INVALID (-1) overlap
// the node bits, so ids for those kinds can collide with other node/kind
// pairs -- confirm whether this is acceptable for the tracking id.
DwrfStreamIdentifier(
223+
uint32_t node,
224+
uint32_t sequence,
225+
uint32_t column,
226+
StreamKindOrc kind)
227+
: StreamIdentifier(
228+
velox::cache::TrackingId((node << kNodeShift) | kind).id()),
229+
column_{column},
230+
format_(DwrfFormat::kOrc),
231+
kindOrc_(kind),
232+
encodingKey_{node, sequence} {}
233+
173234
DwrfStreamIdentifier(
174235
uint32_t node,
175236
uint32_t sequence,
@@ -181,6 +242,17 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
181242
column,
182243
static_cast<StreamKind>(pkind)) {}
183244

245+
// Same as the StreamKindOrc constructor, but accepts the ORC protobuf enum.
// The value is converted with a static_cast, so it relies on
// proto::orc::Stream_Kind and StreamKindOrc sharing numeric values.
DwrfStreamIdentifier(
246+
uint32_t node,
247+
uint32_t sequence,
248+
uint32_t column,
249+
proto::orc::Stream_Kind pkind)
250+
: DwrfStreamIdentifier(
251+
node,
252+
sequence,
253+
column,
254+
static_cast<StreamKindOrc>(pkind)) {}
255+
184256
~DwrfStreamIdentifier() = default;
185257

186258
bool operator==(const DwrfStreamIdentifier& other) const {
@@ -189,37 +261,52 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
189261
return encodingKey_ == other.encodingKey_ && kind_ == other.kind_;
190262
}
191263

192-
std::size_t hash() const {
264+
std::size_t hash() const override {
193265
return encodingKey_.hash() ^ std::hash<uint32_t>()(kind_);
194266
}
195267

196268
// Returns the column value given at construction
// (dwio::common::MAX_UINT32 for a default-constructed identifier).
uint32_t column() const {
197269
return column_;
198270
}
199271

272+
// Returns the file format this identifier was built for. It tells which of
// kind() / kindOrc() holds the stream kind.
DwrfFormat format() const {
273+
return format_;
274+
}
275+
200276
// Returns the DWRF stream kind. kind_ shares storage with kindOrc_, so this
// is the active value only when format() == DwrfFormat::kDwrf.
const StreamKind& kind() const {
201277
return kind_;
202278
}
203279

280+
// Returns the ORC stream kind. kindOrc_ shares storage with kind_, so this is
// the active value only when format() == DwrfFormat::kOrc.
const StreamKindOrc& kindOrc() const {
281+
return kindOrc_;
282+
}
283+
204284
// Returns the (node, sequence) encoding key of this stream.
const EncodingKey& encodingKey() const {
205285
return encodingKey_;
206286
}
207287

208-
std::string toString() const {
288+
std::string toString() const override {
209289
return fmt::format(
210-
"[id={}, node={}, sequence={}, column={}, kind={}]",
290+
"[id={}, node={}, sequence={}, column={}, format={}, kind={}]",
211291
id_,
212292
encodingKey_.node,
213293
encodingKey_.sequence,
214294
column_,
295+
(uint32_t)format_,
215296
static_cast<uint32_t>(kind_));
216297
}
217298

218299
private:
219300
static constexpr int32_t kNodeShift = 5;
220301

221302
uint32_t column_;
222-
StreamKind kind_;
303+
304+
DwrfFormat format_;
305+
// The stream kind. Both members occupy the same storage; format_ records
// which one was initialized by the constructor and is therefore valid to read.
union {
306+
StreamKind kind_; // format_ == kDwrf
307+
StreamKindOrc kindOrc_; // format_ == kOrc
308+
};
309+
223310
EncodingKey encodingKey_;
224311
};
225312

velox/dwio/dwrf/common/FileMetadata.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,6 @@
2525

2626
namespace facebook::velox::dwrf {
2727

28-
enum class DwrfFormat : uint8_t {
29-
kDwrf = 0,
30-
kOrc = 1,
31-
};
32-
3328
class ProtoWrapperBase {
3429
protected:
3530
ProtoWrapperBase(DwrfFormat format, const void* impl)

0 commit comments

Comments
 (0)