2929
3030namespace facebook ::velox::dwrf {
3131
// Distinguishes the two formats handled in this namespace. It is stored in
// DwrfStreamIdentifier to tell which stream-kind enum (StreamKind for DWRF,
// StreamKindOrc for ORC) applies to an identifier.
enum class DwrfFormat : uint8_t {
  kDwrf = 0,
  kOrc = 1,
};
36+
3237// Writer version
3338constexpr folly::StringPiece WRITER_NAME_KEY{" orc.writer.name" };
3439constexpr folly::StringPiece WRITER_VERSION_KEY{" orc.writer.version" };
@@ -54,6 +59,7 @@ constexpr WriterVersion WriterVersion_CURRENT = WriterVersion::DWRF_7_0;
5459 */
5560std::string writerVersionToString (WriterVersion kind);
5661
62+ // Stream kind of dwrf.
5763enum StreamKind {
5864 StreamKind_PRESENT = 0 ,
5965 StreamKind_DATA = 1 ,
@@ -69,15 +75,40 @@ enum StreamKind {
6975 StreamKind_IN_MAP = 11
7076};
7177
// Stream kinds for the ORC format.
//
// DwrfStreamIdentifier converts proto::orc::Stream_Kind to this enum with a
// plain static_cast, so the numeric values presumably have to stay in sync
// with that proto enum (NOTE(review): confirm against the .proto definition).
enum StreamKindOrc {
  StreamKindOrc_PRESENT = 0,
  StreamKindOrc_DATA = 1,
  StreamKindOrc_LENGTH = 2,
  StreamKindOrc_DICTIONARY_DATA = 3,
  StreamKindOrc_DICTIONARY_COUNT = 4,
  StreamKindOrc_SECONDARY = 5,
  StreamKindOrc_ROW_INDEX = 6,
  StreamKindOrc_BLOOM_FILTER = 7,
  StreamKindOrc_BLOOM_FILTER_UTF8 = 8,
  StreamKindOrc_ENCRYPTED_INDEX = 9,
  StreamKindOrc_ENCRYPTED_DATA = 10,
  StreamKindOrc_STRIPE_STATISTICS = 100,
  StreamKindOrc_FILE_STATISTICS = 101,

  // Sentinel meaning "no ORC stream kind"; this is what the default
  // StreamInformation::getKindOrc() returns.
  StreamKindOrc_INVALID = -1
};
96+
7297inline bool isIndexStream (StreamKind kind) {
7398 return kind == StreamKind::StreamKind_ROW_INDEX ||
7499 kind == StreamKind::StreamKind_BLOOM_FILTER_UTF8;
75100}
76101
102+ inline bool isIndexStream (StreamKindOrc kind) {
103+ return kind == StreamKindOrc::StreamKindOrc_ROW_INDEX ||
104+ kind == StreamKindOrc::StreamKindOrc_BLOOM_FILTER ||
105+ kind == StreamKindOrc::StreamKindOrc_BLOOM_FILTER_UTF8;
106+ }
107+
77108/* *
78109 * Get the string representation of the StreamKind.
79110 */
80- std::string streamKindToString (StreamKind kind);
111+ // std::string streamKindToString(StreamKind kind);
81112
82113class StreamInformation {
83114 public:
@@ -90,6 +121,12 @@ class StreamInformation {
90121 virtual uint64_t getLength () const = 0;
91122 virtual bool getUseVInts () const = 0;
92123 virtual bool valid () const = 0;
124+
125+ // providing a default implementation otherwise leading to too much compiling
126+ // errors
127+ virtual StreamKindOrc getKindOrc () const {
128+ return StreamKindOrc_INVALID;
129+ }
93130};
94131
95132enum ColumnEncodingKind {
@@ -100,21 +137,21 @@ enum ColumnEncodingKind {
100137};
101138
102139class DwrfStreamIdentifier ;
140+
103141class EncodingKey {
104142 public:
105143 static const EncodingKey& getInvalid () {
106144 static const EncodingKey INVALID;
107145 return INVALID;
108146 }
109147
110- public:
148+ uint32_t node;
149+ uint32_t sequence;
150+
111151 EncodingKey ()
112152 : EncodingKey(dwio::common::MAX_UINT32, dwio::common::MAX_UINT32) {}
113153
114- /* implicit */ EncodingKey(uint32_t n, uint32_t s = 0 )
115- : node{n}, sequence{s} {}
116- uint32_t node;
117- uint32_t sequence;
154+ EncodingKey (uint32_t n, uint32_t s = 0 ) : node{n}, sequence{s} {}
118155
119156 bool operator ==(const EncodingKey& other) const {
120157 return node == other.node && sequence == other.sequence ;
@@ -133,6 +170,8 @@ class EncodingKey {
133170 }
134171
135172 DwrfStreamIdentifier forKind (const proto::Stream_Kind kind) const ;
173+
174+ DwrfStreamIdentifier forKind (const proto::orc::Stream_Kind kind) const ;
136175};
137176
138177struct EncodingKeyHash {
@@ -150,15 +189,24 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
150189
151190 public:
152191 DwrfStreamIdentifier ()
153- : column_(dwio::common::MAX_UINT32), kind_(StreamKind_DATA) {}
192+ : column_(dwio::common::MAX_UINT32),
193+ format_ (DwrfFormat::kDwrf ),
194+ kind_(StreamKind_DATA) {}
154195
155- /* implicit */ DwrfStreamIdentifier(const proto::Stream& stream)
196+ DwrfStreamIdentifier (const proto::Stream& stream)
156197 : DwrfStreamIdentifier(
157198 stream.node(),
158199 stream.has_sequence() ? stream.sequence() : 0,
159200 stream.has_column() ? stream.column() : dwio::common::MAX_UINT32,
160201 stream.kind()) {}
161202
203+ DwrfStreamIdentifier (const proto::orc::Stream& stream)
204+ : DwrfStreamIdentifier(
205+ stream.column(),
206+ 0,
207+ dwio::common::MAX_UINT32,
208+ stream.kind()) {}
209+
162210 DwrfStreamIdentifier (
163211 uint32_t node,
164212 uint32_t sequence,
@@ -167,9 +215,22 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
167215 : StreamIdentifier(
168216 velox::cache::TrackingId ((node << kNodeShift ) | kind).id()),
169217 column_{column},
218+ format_ (DwrfFormat::kDwrf ),
170219 kind_(kind),
171220 encodingKey_{node, sequence} {}
172221
222+ DwrfStreamIdentifier (
223+ uint32_t node,
224+ uint32_t sequence,
225+ uint32_t column,
226+ StreamKindOrc kind)
227+ : StreamIdentifier(
228+ velox::cache::TrackingId ((node << kNodeShift ) | kind).id()),
229+ column_{column},
230+ format_ (DwrfFormat::kOrc ),
231+ kindOrc_(kind),
232+ encodingKey_{node, sequence} {}
233+
173234 DwrfStreamIdentifier (
174235 uint32_t node,
175236 uint32_t sequence,
@@ -181,6 +242,17 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
181242 column,
182243 static_cast <StreamKind>(pkind)) {}
183244
245+ DwrfStreamIdentifier (
246+ uint32_t node,
247+ uint32_t sequence,
248+ uint32_t column,
249+ proto::orc::Stream_Kind pkind)
250+ : DwrfStreamIdentifier(
251+ node,
252+ sequence,
253+ column,
254+ static_cast <StreamKindOrc>(pkind)) {}
255+
184256 ~DwrfStreamIdentifier () = default ;
185257
186258 bool operator ==(const DwrfStreamIdentifier& other) const {
@@ -189,37 +261,52 @@ class DwrfStreamIdentifier : public dwio::common::StreamIdentifier {
189261 return encodingKey_ == other.encodingKey_ && kind_ == other.kind_ ;
190262 }
191263
192- std::size_t hash () const {
264+ std::size_t hash () const override {
193265 return encodingKey_.hash () ^ std::hash<uint32_t >()(kind_);
194266 }
195267
196268 uint32_t column () const {
197269 return column_;
198270 }
199271
272+ DwrfFormat format () const {
273+ return format_;
274+ }
275+
200276 const StreamKind& kind () const {
201277 return kind_;
202278 }
203279
280+ const StreamKindOrc& kindOrc () const {
281+ return kindOrc_;
282+ }
283+
204284 const EncodingKey& encodingKey () const {
205285 return encodingKey_;
206286 }
207287
208- std::string toString () const {
288+ std::string toString () const override {
209289 return fmt::format (
210- " [id={}, node={}, sequence={}, column={}, kind={}]" ,
290+ " [id={}, node={}, sequence={}, column={}, format={}, kind={}]" ,
211291 id_,
212292 encodingKey_.node ,
213293 encodingKey_.sequence ,
214294 column_,
295+ (uint32_t )format_,
215296 static_cast <uint32_t >(kind_));
216297 }
217298
218299 private:
219300 static constexpr int32_t kNodeShift = 5 ;
220301
221302 uint32_t column_;
222- StreamKind kind_;
303+
304+ DwrfFormat format_;
305+ union {
306+ StreamKind kind_; // format_ == kDwrf
307+ StreamKindOrc kindOrc_; // format_ == kOrc
308+ };
309+
223310 EncodingKey encodingKey_;
224311};
225312
0 commit comments