Skip to content

Commit 9f5b94e

Browse files
committed
[T2] Wide column metadata improvements
1. Make `ColumnMetaData.type` optional 2. Make `ColumnMetaData.path_in_schema` optional 3. Add `ColumnMetaData.schema_index`. This is the ordinal in `FileMetaData.schema` this column corresponds to. This allows sparse representation of columns in a rowgroup.
1 parent 384bedd commit 9f5b94e

File tree

1 file changed

+40
-24
lines changed

1 file changed

+40
-24
lines changed

src/main/thrift/parquet.thrift

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -242,43 +242,36 @@ struct SizeStatistics {
242242
* All fields are optional.
243243
*/
244244
struct Statistics {
245-
/**
246-
* DEPRECATED: min and max value of the column. Use min_value and max_value.
247-
*
248-
* Values are encoded using PLAIN encoding, except that variable-length byte
249-
* arrays do not include a length prefix.
250-
*
251-
* These fields encode min and max values determined by signed comparison
252-
* only. New files should use the correct order for a column's logical type
253-
* and store the values in the min_value and max_value fields.
254-
*
255-
* To support older readers, these may be set when the column order is
256-
* signed.
257-
*/
245+
/* DEPRECATED: do not use */
258246
1: optional binary max;
259247
2: optional binary min;
260248
/** count of null values in the column */
261249
3: optional i64 null_count;
262250
/** count of distinct values occurring */
263251
4: optional i64 distinct_count;
264252
/**
265-
* Lower and upper bound values for the column, determined by its ColumnOrder.
253+
* Only one pair of max_value/min_value, max1/min1, max2/min2, max4/min4,
254+
* max8/min8 can be set. The pair is determined by the physical type of the
255+
* column. Floating-point values are bit-cast to integers. Variable-length
256+
* values are set in min_value/max_value.
257+
*
258+
* Min and Max are the lower and upper bound values for the column,
259+
* respectively, as determined by its ColumnOrder.
266260
*
267261
* These may be the actual minimum and maximum values found on a page or column
268262
* chunk, but can also be (more compact) values that do not exist on a page or
269263
* column chunk. For example, instead of storing "Blart Versenwald III", a writer
270264
* may set min_value="B", max_value="C". Such more compact values must still be
271265
* valid values within the column's logical type.
272-
*
273-
* Values are encoded using PLAIN encoding, except that variable-length byte
274-
* arrays do not include a length prefix.
275266
*/
276267
5: optional binary max_value;
277268
6: optional binary min_value;
278269
/** If true, max_value is the actual maximum value for a column */
279270
7: optional bool is_max_value_exact;
280271
/** If true, min_value is the actual minimum value for a column */
281272
8: optional bool is_min_value_exact;
273+
9: optional i64 max8;
274+
10: optional i64 min8;
282275
}
283276

284277
/** Empty structs to use as logical type annotations */
@@ -490,7 +483,7 @@ enum Encoding {
490483
// GROUP_VAR_INT = 1;
491484

492485
/**
493-
* Deprecated: Dictionary encoding. The values in the dictionary are encoded in the
486+
* DEPRECATED: Dictionary encoding. The values in the dictionary are encoded in the
494487
* plain type.
495488
* in a data page use RLE_DICTIONARY instead.
496489
* in a Dictionary page use PLAIN instead
@@ -772,15 +765,25 @@ struct PageEncodingStats {
772765
* Description for column metadata
773766
*/
774767
struct ColumnMetaData {
775-
/** Type of this column **/
776-
1: required Type type
768+
/**
769+
* DEPRECATED: can be found in SchemaElement
770+
*
771+
* Writers MUST NOT omit this field until 2025-10-01.
772+
* Readers MUST ignore this field before 2025-10-01.
773+
*/
774+
1: optional Type type
777775

778776
/** Set of all encodings used for this column. The purpose is to validate
779777
* whether we can decode those pages. **/
780778
2: required list<Encoding> encodings
781779

782-
/** Path in schema **/
783-
3: required list<string> path_in_schema
780+
/**
781+
* DEPRECATED: can be found in SchemaElement
782+
*
783+
* Writers MUST NOT omit this field until 2025-10-01.
784+
* Readers MUST ignore this field before 2025-10-01.
785+
*/
786+
3: optional list<string> path_in_schema
784787

785788
/** Compression codec **/
786789
4: required CompressionCodec codec
@@ -810,9 +813,13 @@ struct ColumnMetaData {
810813
/** optional statistics for this column chunk */
811814
12: optional Statistics statistics;
812815

813-
/** Set of all encodings used for pages in this column chunk.
816+
/**
817+
* DEPRECATED: use is_fully_dict_encoded instead
818+
*
819+
* Set of all encodings used for pages in this column chunk.
814820
* This information can be used to determine if all data pages are
815-
* dictionary encoded for example **/
821+
* dictionary encoded for example
822+
*/
816823
13: optional list<PageEncodingStats> encoding_stats;
817824

818825
/** Byte offset from beginning of file to Bloom filter data. **/
@@ -833,6 +840,15 @@ struct ColumnMetaData {
833840
* filter pushdown.
834841
*/
835842
16: optional SizeStatistics size_statistics;
843+
844+
/* True if all pages in this column chunk are dictionary encoded */
845+
17: optional bool is_fully_dict_encoded;
846+
/**
847+
* The index into FileMetaData.schema (list<SchemaElement>) for this column.
848+
* This implies that ColumnMetaData can be sparse in a rowgroup, if for example
849+
* a column does not have any data pages in a rowgroup.
850+
*/
851+
18: optional i32 schema_index;
836852
}
837853

838854
struct EncryptionWithFooterKey {

0 commit comments

Comments
 (0)