Skip to content

Commit 9eba113

Browse files
FelixYBWglutenperfbot
authored andcommitted
Revert "fix(parquet): Avoid SEGV if table column type does not match file column type (facebookincubator#12350)"
This reverts commit 5ad65e4.
1 parent 6599e6b commit 9eba113

File tree

3 files changed

+32
-256
lines changed

3 files changed

+32
-256
lines changed

velox/dwio/parquet/reader/ParquetReader.cpp

Lines changed: 32 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -719,82 +719,69 @@ TypePtr ReaderBase::convertType(
719719
schemaElement.__isset.type_length,
720720
"FIXED_LEN_BYTE_ARRAY requires length to be set");
721721

722-
static std::string_view kTypeMappingErrorFmtStr =
723-
"Converted type {} is not allowed for requested type {}";
724722
if (schemaElement.__isset.converted_type) {
725723
switch (schemaElement.converted_type) {
726724
case thrift::ConvertedType::INT_8:
727-
case thrift::ConvertedType::UINT_8:
728725
VELOX_CHECK_EQ(
729726
schemaElement.type,
730727
thrift::Type::INT32,
731-
"{} converted type can only be set for value of thrift::Type::INT32",
732-
schemaElement.converted_type);
733-
VELOX_CHECK(
734-
!requestedType || requestedType->kind() == TypeKind::TINYINT ||
735-
requestedType->kind() == TypeKind::SMALLINT ||
736-
requestedType->kind() == TypeKind::INTEGER ||
737-
requestedType->kind() == TypeKind::BIGINT,
738-
kTypeMappingErrorFmtStr,
739-
"TINYINT",
740-
requestedType->toString());
728+
"INT8 converted type can only be set for value of thrift::Type::INT32");
741729
return TINYINT();
742730

743731
case thrift::ConvertedType::INT_16:
744-
case thrift::ConvertedType::UINT_16:
745732
VELOX_CHECK_EQ(
746733
schemaElement.type,
747734
thrift::Type::INT32,
748-
"{} converted type can only be set for value of thrift::Type::INT32",
749-
schemaElement.converted_type);
750-
VELOX_CHECK(
751-
!requestedType || requestedType->kind() == TypeKind::SMALLINT ||
752-
requestedType->kind() == TypeKind::INTEGER ||
753-
requestedType->kind() == TypeKind::BIGINT,
754-
kTypeMappingErrorFmtStr,
755-
"SMALLINT",
756-
requestedType->toString());
735+
"INT16 converted type can only be set for value of thrift::Type::INT32");
757736
return SMALLINT();
758737

759738
case thrift::ConvertedType::INT_32:
760-
case thrift::ConvertedType::UINT_32:
761739
VELOX_CHECK_EQ(
762740
schemaElement.type,
763741
thrift::Type::INT32,
764-
"{} converted type can only be set for value of thrift::Type::INT32",
765-
schemaElement.converted_type);
766-
VELOX_CHECK(
767-
!requestedType || requestedType->kind() == TypeKind::INTEGER ||
768-
requestedType->kind() == TypeKind::BIGINT,
769-
kTypeMappingErrorFmtStr,
770-
"INTEGER",
771-
requestedType->toString());
742+
"INT32 converted type can only be set for value of thrift::Type::INT32");
772743
return INTEGER();
773744

774745
case thrift::ConvertedType::INT_64:
746+
VELOX_CHECK_EQ(
747+
schemaElement.type,
748+
thrift::Type::INT64,
749+
"INT64 converted type can only be set for value of thrift::Type::INT64");
750+
return BIGINT();
751+
752+
case thrift::ConvertedType::UINT_8:
753+
VELOX_CHECK_EQ(
754+
schemaElement.type,
755+
thrift::Type::INT32,
756+
"UINT_8 converted type can only be set for value of thrift::Type::INT32");
757+
return TINYINT();
758+
759+
case thrift::ConvertedType::UINT_16:
760+
VELOX_CHECK_EQ(
761+
schemaElement.type,
762+
thrift::Type::INT32,
763+
"UINT_16 converted type can only be set for value of thrift::Type::INT32");
764+
return SMALLINT();
765+
766+
case thrift::ConvertedType::UINT_32:
767+
VELOX_CHECK_EQ(
768+
schemaElement.type,
769+
thrift::Type::INT32,
770+
"UINT_32 converted type can only be set for value of thrift::Type::INT32");
771+
return INTEGER();
772+
775773
case thrift::ConvertedType::UINT_64:
776774
VELOX_CHECK_EQ(
777775
schemaElement.type,
778776
thrift::Type::INT64,
779-
"{} converted type can only be set for value of thrift::Type::INT32",
780-
schemaElement.converted_type);
781-
VELOX_CHECK(
782-
!requestedType || requestedType->kind() == TypeKind::BIGINT,
783-
kTypeMappingErrorFmtStr,
784-
"BIGINT",
785-
requestedType->toString());
777+
"UINT_64 converted type can only be set for value of thrift::Type::INT64");
786778
return BIGINT();
787779

788780
case thrift::ConvertedType::DATE:
789781
VELOX_CHECK_EQ(
790782
schemaElement.type,
791783
thrift::Type::INT32,
792784
"DATE converted type can only be set for value of thrift::Type::INT32");
793-
VELOX_CHECK(
794-
!requestedType || requestedType->isDate(),
795-
kTypeMappingErrorFmtStr,
796-
"DATE",
797-
requestedType->toString());
798785
return DATE();
799786

800787
case thrift::ConvertedType::TIMESTAMP_MICROS:
@@ -803,65 +790,19 @@ TypePtr ReaderBase::convertType(
803790
schemaElement.type,
804791
thrift::Type::INT64,
805792
"TIMESTAMP_MICROS or TIMESTAMP_MILLIS converted type can only be set for value of thrift::Type::INT64");
806-
VELOX_CHECK(
807-
!requestedType || requestedType->kind() == TypeKind::TIMESTAMP,
808-
kTypeMappingErrorFmtStr,
809-
"TIMESTAMP",
810-
requestedType->toString());
811793
return TIMESTAMP();
812794

813795
case thrift::ConvertedType::DECIMAL: {
814796
VELOX_CHECK(
815797
schemaElement.__isset.precision && schemaElement.__isset.scale,
816798
"DECIMAL requires a length and scale specifier!");
817-
const auto schemaElementPrecision = schemaElement.precision;
818-
const auto schemaElementScale = schemaElement.scale;
819-
// A long decimal requested type cannot read a value of a short decimal.
820-
// As a result, the mapping from short to long decimal is currently
821-
// restricted.
822-
auto type = DECIMAL(schemaElementPrecision, schemaElementScale);
823-
if (requestedType) {
824-
VELOX_CHECK(
825-
requestedType->isDecimal(),
826-
kTypeMappingErrorFmtStr,
827-
"DECIMAL",
828-
requestedType->toString());
829-
// Reading short decimals with a long decimal requested type is not
830-
// yet possible. To allow for correct interpretation of the values,
831-
// the scale of the file type and requested type must match while
832-
// precision may be larger.
833-
if (requestedType->isShortDecimal()) {
834-
const auto& shortDecimalType = requestedType->asShortDecimal();
835-
VELOX_CHECK(
836-
type->isShortDecimal() &&
837-
shortDecimalType.precision() >= schemaElementPrecision &&
838-
shortDecimalType.scale() == schemaElementScale,
839-
kTypeMappingErrorFmtStr,
840-
type->toString(),
841-
requestedType->toString());
842-
} else {
843-
const auto& longDecimalType = requestedType->asLongDecimal();
844-
VELOX_CHECK(
845-
type->isLongDecimal() &&
846-
longDecimalType.precision() >= schemaElementPrecision &&
847-
longDecimalType.scale() == schemaElementScale,
848-
kTypeMappingErrorFmtStr,
849-
type->toString(),
850-
requestedType->toString());
851-
}
852-
}
853-
return type;
799+
return DECIMAL(schemaElement.precision, schemaElement.scale);
854800
}
855801

856802
case thrift::ConvertedType::UTF8:
857803
switch (schemaElement.type) {
858804
case thrift::Type::BYTE_ARRAY:
859805
case thrift::Type::FIXED_LEN_BYTE_ARRAY:
860-
VELOX_CHECK(
861-
!requestedType || requestedType->kind() == TypeKind::VARCHAR,
862-
kTypeMappingErrorFmtStr,
863-
"VARCHAR",
864-
requestedType->toString());
865806
return VARCHAR();
866807
default:
867808
VELOX_FAIL(
@@ -872,11 +813,6 @@ TypePtr ReaderBase::convertType(
872813
schemaElement.type,
873814
thrift::Type::BYTE_ARRAY,
874815
"ENUM converted type can only be set for value of thrift::Type::BYTE_ARRAY");
875-
VELOX_CHECK(
876-
!requestedType || requestedType->kind() == TypeKind::VARCHAR,
877-
kTypeMappingErrorFmtStr,
878-
"VARCHAR",
879-
requestedType->toString());
880816
return VARCHAR();
881817
}
882818
case thrift::ConvertedType::MAP:
@@ -895,69 +831,27 @@ TypePtr ReaderBase::convertType(
895831
} else {
896832
switch (schemaElement.type) {
897833
case thrift::Type::type::BOOLEAN:
898-
VELOX_CHECK(
899-
!requestedType || requestedType->kind() == TypeKind::BOOLEAN,
900-
kTypeMappingErrorFmtStr,
901-
"BOOLEAN",
902-
requestedType->toString());
903834
return BOOLEAN();
904835
case thrift::Type::type::INT32:
905-
VELOX_CHECK(
906-
!requestedType || requestedType->kind() == TypeKind::INTEGER ||
907-
requestedType->kind() == TypeKind::BIGINT,
908-
kTypeMappingErrorFmtStr,
909-
"INTEGER",
910-
requestedType->toString());
911836
return INTEGER();
912837
case thrift::Type::type::INT64:
913838
// For Int64 Timestamp in nano precision
914839
if (schemaElement.__isset.logicalType &&
915840
schemaElement.logicalType.__isset.TIMESTAMP) {
916-
VELOX_CHECK(
917-
!requestedType || requestedType->kind() == TypeKind::TIMESTAMP,
918-
kTypeMappingErrorFmtStr,
919-
"TIMESTAMP",
920-
requestedType->toString());
921841
return TIMESTAMP();
922842
}
923-
VELOX_CHECK(
924-
!requestedType || requestedType->kind() == TypeKind::BIGINT,
925-
kTypeMappingErrorFmtStr,
926-
"BIGINT",
927-
requestedType->toString());
928843
return BIGINT();
929844
case thrift::Type::type::INT96:
930-
VELOX_CHECK(
931-
!requestedType || requestedType->kind() == TypeKind::TIMESTAMP,
932-
kTypeMappingErrorFmtStr,
933-
"TIMESTAMP",
934-
requestedType->toString());
935845
return TIMESTAMP(); // INT96 only maps to a timestamp
936846
case thrift::Type::type::FLOAT:
937-
VELOX_CHECK(
938-
!requestedType || requestedType->kind() == TypeKind::REAL ||
939-
requestedType->kind() == TypeKind::DOUBLE,
940-
kTypeMappingErrorFmtStr,
941-
"REAL",
942-
requestedType->toString());
943847
return REAL();
944848
case thrift::Type::type::DOUBLE:
945-
VELOX_CHECK(
946-
!requestedType || requestedType->kind() == TypeKind::DOUBLE,
947-
kTypeMappingErrorFmtStr,
948-
"DOUBLE",
949-
requestedType->toString());
950849
return DOUBLE();
951850
case thrift::Type::type::BYTE_ARRAY:
952851
case thrift::Type::type::FIXED_LEN_BYTE_ARRAY:
953852
if (requestedType && requestedType->isVarchar()) {
954853
return VARCHAR();
955854
} else {
956-
VELOX_CHECK(
957-
!requestedType || requestedType->isVarbinary(),
958-
kTypeMappingErrorFmtStr,
959-
"VARBINARY",
960-
requestedType->toString());
961855
return VARBINARY();
962856
}
963857

velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
* limitations under the License.
1515
*/
1616

17-
#include "velox/common/base/tests/GTestUtils.h"
1817
#include "velox/dwio/parquet/tests/ParquetTestBase.h"
1918
#include "velox/expression/ExprToSubfieldFilter.h"
2019
#include "velox/vector/tests/utils/VectorMaker.h"
@@ -1716,44 +1715,3 @@ TEST_F(ParquetReaderTest, parquet251) {
17161715
assertReadWithFilters(
17171716
"parquet-251.parquet", rowType, std::move(filters), expected);
17181717
}
1719-
1720-
TEST_F(ParquetReaderTest, fileColumnVarcharToMetadataColumnMismatchTest) {
1721-
const std::string sample(getExampleFilePath("nation.parquet"));
1722-
1723-
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
1724-
1725-
auto runVarcharColTest = [&](const TypePtr& requestedType) {
1726-
// The type in the file is a BYTE_ARRAY resolving to VARCHAR.
1727-
// The requested type must match with what is requested as otherwise:
1728-
// - errors occur in the column readers
1729-
// - SIGSEGVs can be encountered during partitioning and subsequent
1730-
// operators following the table scan
1731-
auto outputRowType =
1732-
ROW({"nationkey", "name", "regionkey", "comment"},
1733-
{BIGINT(), requestedType, BIGINT(), VARCHAR()});
1734-
1735-
// Sets the metadata schema requested, for example from Hive, and not the
1736-
// schema from the file.
1737-
readerOptions.setFileSchema(outputRowType);
1738-
VELOX_ASSERT_THROW(
1739-
createReader(sample, readerOptions),
1740-
fmt::format(
1741-
"Converted type VARCHAR is not allowed for requested type {}",
1742-
requestedType->toString()));
1743-
};
1744-
1745-
auto types = std::vector<TypePtr>{
1746-
SMALLINT(),
1747-
INTEGER(),
1748-
BIGINT(),
1749-
DECIMAL(10, 2),
1750-
REAL(),
1751-
DOUBLE(),
1752-
TIMESTAMP(),
1753-
DATE(),
1754-
VARBINARY()};
1755-
1756-
for (const auto& type : types) {
1757-
runVarcharColTest(type);
1758-
}
1759-
}

velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp

Lines changed: 0 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,82 +1390,6 @@ TEST_F(ParquetTableScanTest, singleBooleanRle) {
13901390
assertSelect({"c2"}, "SELECT c2 FROM tmp");
13911391
}
13921392

1393-
TEST_F(ParquetTableScanTest, intToBigintRead) {
1394-
vector_size_t kSize = 100;
1395-
RowVectorPtr intDataFileVectors = makeRowVector(
1396-
{"c1"}, {makeFlatVector<int32_t>(kSize, [](auto row) { return row; })});
1397-
1398-
RowVectorPtr bigintDataFileVectors = makeRowVector(
1399-
{"c1"}, {makeFlatVector<int64_t>(kSize, [](auto row) { return row; })});
1400-
1401-
const std::shared_ptr<exec::test::TempDirectoryPath> dataFileFolder =
1402-
exec::test::TempDirectoryPath::create();
1403-
auto filePath = dataFileFolder->getPath() + "/" + "data.parquet";
1404-
WriterOptions options;
1405-
options.writeInt96AsTimestamp = false;
1406-
writeToParquetFile(filePath, {intDataFileVectors}, options);
1407-
1408-
auto rowType = ROW({"c1"}, {BIGINT()});
1409-
auto op = PlanBuilder()
1410-
.startTableScan()
1411-
.outputType(rowType)
1412-
.dataColumns(rowType)
1413-
.endTableScan()
1414-
.planNode();
1415-
1416-
auto split = makeSplit(filePath);
1417-
auto result = AssertQueryBuilder(op).split(split).copyResults(pool());
1418-
auto rows = result->as<RowVector>();
1419-
1420-
assertEqualVectors(bigintDataFileVectors->childAt(0), rows->childAt(0));
1421-
}
1422-
1423-
TEST_F(ParquetTableScanTest, shortAndLongDecimalReadWithLargerPrecision) {
1424-
// decimal.parquet holds two columns (a: DECIMAL(5, 2), b: DECIMAL(20, 5)) and
1425-
// 20 rows (10 rows per group). Data is in plain uncompressed format:
1426-
// a: [100.01 .. 100.20]
1427-
// b: [100000000000000.00001 .. 100000000000000.00020]
1428-
// This test reads the DECIMAL(5, 2)a and DECIMAL(20, 5) file columns
1429-
// with DECIMAL(8, 2) and DECIMAL(22, 5) row types.
1430-
vector_size_t kSize = 20;
1431-
std::vector<int64_t> unscaledShortValues(kSize);
1432-
std::iota(unscaledShortValues.begin(), unscaledShortValues.end(), 10001);
1433-
std::vector<int128_t> longDecimalValues;
1434-
for (int i = 1; i <= kSize; ++i) {
1435-
if (i < 10) {
1436-
longDecimalValues.emplace_back(
1437-
HugeInt::parse(fmt::format("1000000000000000000{}", i)));
1438-
} else {
1439-
longDecimalValues.emplace_back(
1440-
HugeInt::parse(fmt::format("100000000000000000{}", i)));
1441-
}
1442-
}
1443-
1444-
RowVectorPtr expectedDecimalVectors = makeRowVector(
1445-
{"c1", "c2"},
1446-
{makeFlatVector<int64_t>(unscaledShortValues, DECIMAL(8, 2)),
1447-
makeFlatVector<int128_t>(longDecimalValues, DECIMAL(22, 5))});
1448-
1449-
const std::shared_ptr<exec::test::TempDirectoryPath> dataFileFolder =
1450-
exec::test::TempDirectoryPath::create();
1451-
auto filePath = getExampleFilePath("decimal.parquet");
1452-
1453-
auto rowType = ROW({"c1", "c2"}, {DECIMAL(8, 2), DECIMAL(22, 5)});
1454-
auto op = PlanBuilder()
1455-
.startTableScan()
1456-
.outputType(rowType)
1457-
.dataColumns(rowType)
1458-
.endTableScan()
1459-
.planNode();
1460-
1461-
auto split = makeSplit(filePath);
1462-
auto result = AssertQueryBuilder(op).split(split).copyResults(pool());
1463-
auto rows = result->as<RowVector>();
1464-
1465-
assertEqualVectors(expectedDecimalVectors->childAt(0), rows->childAt(0));
1466-
assertEqualVectors(expectedDecimalVectors->childAt(1), rows->childAt(1));
1467-
}
1468-
14691393
int main(int argc, char** argv) {
14701394
testing::InitGoogleTest(&argc, argv);
14711395
folly::Init init{&argc, &argv, false};

0 commit comments

Comments
 (0)