Skip to content

Commit bf74dbb

Browse files
authored
[fix](load) Fix an ingestion load error case that causes a BE core dump. (#55906)
Related PR: #45937 branch-3.1: #55500 Problem Summary: Fix the error case on ingestion load and the core in parquet reader. ``` ==8898==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x62f0020603fc at pc 0x55f634e64ded bp 0x7fba0d03c410 sp 0x7fba0d03bbd8 READ of size 4 at 0x62f0020603fc thread T768 (PUSH-9699) #0 0x55f634e64dec in __asan_memcpy (/mnt/hdd01/ci/doris-deploy-branch-3.1-local/be/lib/doris_be+0x39a24dec) (BuildId: 9b04e7f7d3075dac) #1 0x55f634eca93f in std::char_traits::copy(char*, char const*, unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/char_traits.h:409:33 #2 0x55f634eca93f in std::__cxx11::basic_string, std::allocator>::_S_copy(char*, char const*, unsigned long) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:351:4 #3 0x55f634eca93f in std::__cxx11::basic_string, std::allocator>::_S_copy_chars(char*, char const*, char const*) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:398:9 #4 0x55f634eca93f in void std::__cxx11::basic_string, std::allocator>::_M_construct(char const*, char const*, std::forward_iterator_tag) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.tcc:225:6 #5 0x55f654a4f74d in void std::__cxx11::basic_string, std::allocator>::_M_construct_aux(char const*, char const*, std::__false_type) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:247:11 #6 0x55f654a4f74d in void std::__cxx11::basic_string, std::allocator>::_M_construct(char const*, char const*) /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:266:4 #7 0x55f654a4f74d in std::__cxx11::basic_string, std::allocator>::basic_string(char const*, unsigned long, std::allocator const&) 
/var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/basic_string.h:513:9 #8 0x55f654a4f74d in doris::vectorized::parse_thrift_footer(std::shared_ptr, doris::vectorized::FileMetaData**, unsigned long*, doris::io::IOContext*) /home/zcp/repo_center/doris_branch-3.1/doris/be/src/vec/exec/format/parquet/parquet_thrift_util.h:55:17 ```
1 parent cebcda4 commit bf74dbb

9 files changed

+38
-29
lines changed

be/src/vec/exec/format/parquet/parquet_thrift_util.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,11 @@ static Status parse_thrift_footer(io::FileReaderSPtr file,
4848

4949
// validate magic
5050
uint8_t* magic_ptr = footer.data() + bytes_read - 4;
51-
if (bytes_read < PARQUET_FOOTER_SIZE ||
52-
memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
51+
if (bytes_read < PARQUET_FOOTER_SIZE) {
52+
return Status::Corruption(
53+
"Read parquet file footer fail, bytes read: {}, file size: {}, path: {}",
54+
bytes_read, file_size, file->path().native());
55+
} else if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
5356
return Status::Corruption(
5457
"Invalid magic number in parquet file, bytes read: {}, file size: {}, path: {}, "
5558
"read magic: {}",
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !select --
3+
2024-09-01 5
4+
2024-09-02 1
5+
2024-09-03 1
6+
2024-09-04 3
7+

regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import java.nio.file.StandardCopyOption
2121

2222
suite('test_ingestion_load', 'p0,external') {
2323

24-
def testIngestLoadJob = { testTable, loadLabel, String dataFile ->
24+
def testIngestLoadJob = { testTable, loadLabel, String dataFile , filesize ->
2525

2626
sql "TRUNCATE TABLE ${testTable}"
2727

@@ -85,7 +85,7 @@ suite('test_ingestion_load', 'p0,external') {
8585
"msg": "",
8686
"appId": "",
8787
"dppResult": "${dppResult}",
88-
"filePathToSize": "{\\"${etlResultFilePath}\\": 81758}",
88+
"filePathToSize": "{\\"${etlResultFilePath}\\": ${filesize}}",
8989
"hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}"
9090
}
9191
}"""
@@ -156,7 +156,7 @@ suite('test_ingestion_load', 'p0,external') {
156156

157157
def label = "test_ingestion_load"
158158

159-
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet')
159+
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet',5745)
160160

161161
tableName = 'tbl_test_spark_load_unique_mor'
162162

@@ -189,7 +189,7 @@ suite('test_ingestion_load', 'p0,external') {
189189

190190
label = "test_ingestion_load_unique_mor"
191191

192-
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet')
192+
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet',5745)
193193

194194
tableName = 'tbl_test_spark_load_agg'
195195

@@ -215,7 +215,7 @@ suite('test_ingestion_load', 'p0,external') {
215215

216216
label = "test_ingestion_load_agg"
217217

218-
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data1.parquet')
218+
testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data1.parquet',4057)
219219

220220
}
221221

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ suite('test_ingestion_load_alter_column', 'p0,external') {
8585
"msg": "",
8686
"appId": "",
8787
"dppResult": "${dppResult}",
88-
"filePathToSize": "{\\"${etlResultFilePath}\\": 81758}",
88+
"filePathToSize": "{\\"${etlResultFilePath}\\": 5745}",
8989
"hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}"
9090
}
9191
}"""
@@ -112,7 +112,7 @@ suite('test_ingestion_load_alter_column', 'p0,external') {
112112
while (max_try_milli_secs) {
113113
def result = sql "show load where label = '${loadLabel}'"
114114
if (result[0][2] == "CANCELLED") {
115-
msg = result[0][7]
115+
def msg = result[0][7]
116116
logger.info("err msg: " + msg)
117117
assertTrue((result[0][7] =~ /schema of index \[\d+\] has changed/).find())
118118
break
@@ -134,6 +134,8 @@ suite('test_ingestion_load_alter_column', 'p0,external') {
134134

135135
try {
136136

137+
sql "DROP TABLE if exists ${tableName1}"
138+
sql "DROP TABLE if exists ${tableName2}"
137139
sql """
138140
CREATE TABLE IF NOT EXISTS ${tableName1} (
139141
c_int int(11) NULL,
@@ -199,10 +201,8 @@ suite('test_ingestion_load_alter_column', 'p0,external') {
199201
})
200202

201203
} finally {
202-
//sql "DROP TABLE ${tableName1}"
203-
//sql "DROP TABLE ${tableName2}"
204-
}
205204

205+
}
206206
}
207207

208-
}
208+
}

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ suite('test_ingestion_load_alter_partition', 'p0,external') {
123123
qt_select "select c1, count(*) from ${testTable} group by c1 order by c1"
124124
break
125125
} else if (result[0][2] == "CANCELLED") {
126-
msg = result[0][7]
127-
logger.info("err msg: " + msg)
126+
def msg2 = result[0][7]
127+
logger.info("err msg: " + msg2)
128128
assertTrue((result[0][7] =~ /partition does not exist/).find())
129129
break
130130
} else {
@@ -146,6 +146,10 @@ suite('test_ingestion_load_alter_partition', 'p0,external') {
146146

147147
try {
148148

149+
sql "DROP TABLE if exists ${tableName1}"
150+
sql "DROP TABLE if exists ${tableName2}"
151+
sql "DROP TABLE if exists ${tableName3}"
152+
149153
sql """
150154
CREATE TABLE IF NOT EXISTS ${tableName1} (
151155
c0 int not null,
@@ -214,9 +218,6 @@ suite('test_ingestion_load_alter_partition', 'p0,external') {
214218
})
215219

216220
} finally {
217-
// sql "DROP TABLE ${tableName1}"
218-
// sql "DROP TABLE ${tableName2}"
219-
// sql "DROP TABLE ${tableName3}"
220221
}
221222

222223
}

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ suite('test_ingestion_load_drop_table', 'p0,external') {
8585
"msg": "",
8686
"appId": "",
8787
"dppResult": "${dppResult}",
88-
"filePathToSize": "{\\"${etlResultFilePath}\\": 81758}",
88+
"filePathToSize": "{\\"${etlResultFilePath}\\": 5745}",
8989
"hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}"
9090
}
9191
}"""
@@ -188,7 +188,6 @@ suite('test_ingestion_load_drop_table', 'p0,external') {
188188
})
189189

190190
} finally {
191-
sql "DROP TABLE ${tableName}"
192191
}
193192

194193
}

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ suite('test_ingestion_load_multi_table', 'p0,external') {
103103
"msg": "",
104104
"appId": "",
105105
"dppResult": "${dppResult}",
106-
"filePathToSize": "{\\"${etlResultFilePath1}\\": 81758, \\"${etlResultFilePath2}\\": 81758}",
106+
"filePathToSize": "{\\"${etlResultFilePath1}\\": 5745, \\"${etlResultFilePath2}\\": 5745}",
107107
"hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}"
108108
}
109109
}"""

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ suite('test_ingestion_load_with_inverted_index', 'p0,external') {
8585
"msg": "",
8686
"appId": "",
8787
"dppResult": "${dppResult}",
88-
"filePathToSize": "{\\"${etlResultFilePath}\\": 81758}",
88+
"filePathToSize": "{\\"${etlResultFilePath}\\": 5745}",
8989
"hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}"
9090
}
9191
}"""

regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ suite('test_ingestion_load_with_partition', 'p0,external') {
7171
}
7272
}
7373

74-
etlResultFilePaths = []
74+
def etlResultFilePaths = []
7575
for(int i=0; i < dataFiles.size(); i++) {
7676
Files.copy(Paths.get(dataFiles[i]),
7777
Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileNames[i]}"), StandardCopyOption.REPLACE_EXISTING)
@@ -115,7 +115,7 @@ suite('test_ingestion_load_with_partition', 'p0,external') {
115115

116116
def max_try_milli_secs = 120000
117117
while (max_try_milli_secs) {
118-
result = sql "show load where label = '${loadLabel}'"
118+
def result = sql "show load where label = '${loadLabel}'"
119119
if (result[0][2] == "FINISHED") {
120120
sql "sync"
121121
qt_select "select c1, count(*) from ${testTable} group by c1 order by c1"
@@ -132,9 +132,8 @@ suite('test_ingestion_load_with_partition', 'p0,external') {
132132
}
133133

134134
if (enableHdfs()) {
135-
136-
tableName = 'tbl_test_spark_load_partition'
137-
135+
def tableName = 'tbl_test_spark_load_with_partition'
136+
sql "DROP TABLE if exists ${tableName}"
138137
sql """
139138
CREATE TABLE IF NOT EXISTS ${tableName} (
140139
c0 int not null,
@@ -151,10 +150,10 @@ suite('test_ingestion_load_with_partition', 'p0,external') {
151150
)
152151
"""
153152

154-
def label = "test_ingestion_load_partition"
153+
def label = "test_ingestion_load_with_partition__"
155154

156155
testIngestLoadJob.call(tableName, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet'])
157156

158157
}
159158

160-
}
159+
}

0 commit comments

Comments (0)