Changes from all commits — 44 commits
2bd4b46
Fix bug where hbase reader task field values are null
Sep 9, 2020
a01f3b2
Update mysqlwriter.md
orrrpuse Dec 17, 2022
cbb8196
Merge branch 'alibaba:master' into mysqlwriter_doc_update
orrrpuse Mar 23, 2023
5c040c2
Update mysqlwriter.md
orrrpuse Mar 23, 2023
5d4b55a
update: add column parameter description to the adbpgwriter component doc
Mar 31, 2023
9d5a82b
Update README.md
jiaoyang3 Oct 12, 2023
b0224dc
[BugFix] Fix the checkPath method in the Configuration class not taking effect
saligia-tju Jun 26, 2024
0ec7677
add obhbase reader and writer plugin by cjyyz64
TrafalgarLuo Aug 20, 2024
de8498a
Merge pull request #1954 from jiaoyang3/patch-1
TrafalgarLuo Aug 20, 2024
f5314d5
Merge pull request #1741 from chlin-enginner/updateDoc/adbPgWriterDoc
TrafalgarLuo Aug 20, 2024
9fe9f9f
Merge pull request #1641 from orrrpuse/mysqlwriter_doc_update
TrafalgarLuo Aug 20, 2024
d9f5477
Merge pull request #2144 from saligia-tju/master
TrafalgarLuo Aug 20, 2024
3614c26
fix obhbase reader and writer package error
TrafalgarLuo Aug 20, 2024
269f973
[bugfix] Fix gaussdbreader and gaussdbwriter packaging
yanghaiming01 Aug 23, 2024
7bb69f6
gen default parquet schema if config not set
Oct 25, 2024
5ac1e62
fix: data cannot be viewed after writing timestamp type
Arlowen Nov 26, 2024
906bf3d
add milvus writer plugin
nianliuu Nov 27, 2024
b3c1c07
to #62452156 feat:add milvus writer
Feb 8, 2025
14068ce
to #62452156 feat:add milvus writer remove stream2milvus.json
Feb 8, 2025
14c81a9
to #62452156 feat:add milvus writer remove milvusjob.json
Feb 8, 2025
7940b4f
Merge pull request #2269 from ziming-ai/add-milvus-writer
dingxiaobo Feb 8, 2025
0c8a726
fix: fix defects in the OceanBase plugin
xxsc0529 Feb 13, 2025
b612516
Merge pull request #2271 from xxsc0529/master
dingxiaobo Feb 18, 2025
ee487b0
Merge pull request #2245 from mylemons/master
LitteCandy0511 Mar 28, 2025
cbb098f
Merge pull request #821 from lawlessluo/master
LitteCandy0511 Mar 28, 2025
129924c
Merge pull request #2232 from 7owen/master
LitteCandy0511 Mar 31, 2025
8478f7a
feature:oceanbase plugin add direct path support
Apr 3, 2025
91e599b
feature:oceanbase plugin add direct path support
Apr 3, 2025
f95319a
feature:add oceanbase doc
Apr 3, 2025
d82a5ae
Merge pull request #2283 from xxsc0529/master
dingxiaobo Apr 3, 2025
d40ef26
fix:direct path date type processing
Apr 10, 2025
0824b45
Merge pull request #2286 from xxsc0529/master
dingxiaobo Apr 10, 2025
8e88961
Merge branch 'master' into master
saligia-tju Apr 22, 2025
37915a9
fix:oceanbase datasource support special characters
xxsc0529 May 7, 2025
1bc342e
Merge remote-tracking branch 'origin/master'
xxsc0529 May 7, 2025
947e441
Merge pull request #2292 from xxsc0529/master
dingxiaobo May 7, 2025
c1e34c9
fix:oceanbase datasource support special characters
xxsc0529 May 27, 2025
18cf572
Merge pull request #2302 from xxsc0529/master
dingxiaobo May 27, 2025
4554981
fix:solve the problem of increasing or losing data in incremental sit…
xxsc0529 Jun 20, 2025
452fc91
Merge branch 'alibaba:master' into master
xxsc0529 Jun 20, 2025
1f850d3
fix:solve the problem of increasing or losing data in incremental sit…
xxsc0529 Jun 20, 2025
2c1c527
Merge remote-tracking branch 'origin/master'
xxsc0529 Jun 20, 2025
c5f37f0
Merge pull request #2312 from xxsc0529/master
dingxiaobo Jun 20, 2025
60ea07b
Merge pull request #2194 from saligia-tju/master
LitteCandy0511 Jul 1, 2025
4 changes: 2 additions & 2 deletions README.md
@@ -41,7 +41,7 @@ DataX already has a fairly comprehensive plugin ecosystem; mainstream RDBMS databases, N
|--------------|---------------------------|:---------:|:---------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
| RDBMS 关系型数据库 | MySQL | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/mysqlreader/doc/mysqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/mysqlwriter/doc/mysqlwriter.md) |
| | Oracle | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/oraclereader/doc/oraclereader.md) 、[写](https://github.com/alibaba/DataX/blob/master/oraclewriter/doc/oraclewriter.md) |
| | OceanBase | √ | √ | [读](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) 、[写](https://open.oceanbase.com/docs/community/oceanbase-database/V3.1.0/use-datax-to-full-migration-data-to-oceanbase) |
| | OceanBase | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/oceanbasev10reader/doc/oceanbasev10reader.md) 、[写](https://github.com/alibaba/DataX/blob/master/oceanbasev10writer/doc/oceanbasev10writer.md) |
| | SQLServer | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/sqlserverreader/doc/sqlserverreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/sqlserverwriter/doc/sqlserverwriter.md) |
| | PostgreSQL | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/postgresqlreader/doc/postgresqlreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/postgresqlwriter/doc/postgresqlwriter.md) |
| | DRDS | √ | √ | [读](https://github.com/alibaba/DataX/blob/master/drdsreader/doc/drdsreader.md) 、[写](https://github.com/alibaba/DataX/blob/master/drdswriter/doc/drdswriter.md) |
@@ -108,7 +108,7 @@ DataX already has a fairly comprehensive plugin ecosystem; mainstream RDBMS databases, N

# 重要版本更新说明

DataX plans monthly iterative updates going forward, and interested contributors are welcome to submit Pull requests; the monthly update contents will be introduced introduced as follows
DataX plans monthly iterative updates going forward, and interested contributors are welcome to submit Pull requests; the monthly update contents are as follows

- [datax_v202309](https://github.com/alibaba/DataX/releases/tag/datax_v202309)
- Support adding where conditions to Phoenix data sync
3 changes: 2 additions & 1 deletion adbpgwriter/src/main/doc/adbpgwriter.md
@@ -149,6 +149,7 @@ The COPY command writes data into the ADB PG database.

Note: 1. We strongly discourage configuring it this way, because when the target table's field count, types, etc. change, your job may run incorrectly or fail
2. column must not be configured with any constant values here
3. For uppercase field names, there is no need to prepend the escape sequence \" when configuring here

* Required: yes <br />

@@ -229,4 +230,4 @@ create table schematest.test_datax (

#### 4.2.2 Performance test summary
1. `The number of channels has a large impact on performance`
2. `When writing to the database, more than 32 channels is generally not recommended`
2. `When writing to the database, more than 32 channels is generally not recommended`
@@ -56,7 +56,7 @@ public Adb4pgClientProxy(Configuration configuration,TaskPluginCollector taskPl
int retryIntervalTime = configuration.getInt(Key.RETRY_INTERVAL_TIME, 1000);
databaseConfig.setRetryIntervalTime(retryIntervalTime);

// Set the SQL length for auto-commit (in bytes); defaults to 32KB, generally not recommended to set
// Set the SQL length for auto-commit (in bytes); defaults to 10MB, generally not recommended to set
int commitSize = configuration.getInt("commitSize", 10 * 1024 * 1024);
databaseConfig.setCommitSize(commitSize);
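The corrected comment now matches the code: the fallback `10 * 1024 * 1024` is 10 MB, not 32 KB. A sketch of the defaulted lookup, with DataX's `Configuration.getInt` simplified to a plain `Map` (the names here are illustrative, not the real API):

```java
import java.util.HashMap;
import java.util.Map;

public class CommitSizeSketch {
    // Simplified stand-in for Configuration.getInt(key, defaultValue):
    // return the configured value if present, else the default.
    static int getInt(Map<String, Integer> conf, String key, int defaultValue) {
        Integer v = conf.get(key);
        return v == null ? defaultValue : v;
    }

    public static void main(String[] args) {
        Map<String, Integer> conf = new HashMap<>();
        // "commitSize" unset -> falls back to 10 * 1024 * 1024 = 10 MB
        int commitSize = getInt(conf, "commitSize", 10 * 1024 * 1024);
        System.out.println(commitSize == 10_485_760); // true
    }
}
```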

@@ -1047,7 +1047,7 @@ private void checkPath(final String path) {
"系统编程错误, 该异常代表系统编程错误, 请联系DataX开发团队!.");
}

for (final String each : StringUtils.split(".")) {
for (final String each : StringUtils.split(path, ".")) {
if (StringUtils.isBlank(each)) {
throw new IllegalArgumentException(String.format(
"系统编程错误, 路径[%s]不合法, 路径层次之间不能出现空白字符 .", path));
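The one-line fix above is easy to miss: Commons Lang's `StringUtils.split(".")` splits the literal string `"."` on whitespace, so the loop never saw `path` at all and the validation was dead code. A minimal stdlib sketch of the intended check, using `String.split` in place of Commons Lang (a slight behavioral difference: `String.split` keeps interior empty tokens, while `StringUtils.split(path, ".")` drops them and only whitespace-only segments trip `StringUtils.isBlank`):

```java
public class CheckPathSketch {
    // Validate that no segment of a dot-separated configuration path
    // is blank — what Configuration.checkPath is meant to enforce.
    static void checkPath(String path) {
        for (String each : path.split("\\.")) {
            if (each.trim().isEmpty()) {
                throw new IllegalArgumentException(
                        "invalid path [" + path + "]: blank segment");
            }
        }
    }

    public static void main(String[] args) {
        checkPath("job.content.reader");   // valid: no exception
        boolean caught = false;
        try {
            checkPath("job. .reader");     // whitespace-only segment
        } catch (IllegalArgumentException e) {
            caught = true;
        }
        System.out.println(caught);        // true
    }
}
```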
17 changes: 8 additions & 9 deletions core/src/main/job/job.json
@@ -2,41 +2,40 @@
"job": {
"setting": {
"speed": {
"channel":1
"channel": 2
},
"errorLimit": {
"record": 0,
"percentage": 0.02
"record": 0
}
},
"content": [
{
"reader": {
"name": "streamreader",
"parameter": {
"column" : [
"column": [
{
"value": "DataX",
"type": "string"
},
{
"value": 19890604,
"value": 1724154616370,
"type": "long"
},
{
"value": "1989-06-04 00:00:00",
"value": "2024-01-01 00:00:00",
"type": "date"
},
{
"value": true,
"type": "bool"
},
{
"value": "test",
"value": "TestRawData",
"type": "bytes"
}
],
"sliceRecordCount": 100000
"sliceRecordCount": 100
}
},
"writer": {
@@ -49,4 +48,4 @@
}
]
}
}
}
4 changes: 1 addition & 3 deletions doriswriter/doc/doriswriter.md
@@ -36,8 +36,6 @@ DorisWriter imports data via Doris's natively supported Stream load; DorisWriter
"name": "doriswriter",
"parameter": {
"loadUrl": ["172.16.0.13:8030"],
"loadProps": {
},
"column": ["emp_no", "birth_date", "first_name","last_name","gender","hire_date"],
"username": "root",
"password": "xxxxxx",
@@ -178,4 +176,4 @@ DorisWriter imports data via Doris's natively supported Stream load; DorisWriter
}
```

For more information, see the Doris official site: [Stream load - Apache Doris](https://doris.apache.org/zh-CN/docs/data-operate/import/import-way/stream-load-manual)
For more information, see the Doris official site: [Stream load - Apache Doris](https://doris.apache.org/zh-CN/docs/data-operate/import/import-way/stream-load-manual)
77 changes: 1 addition & 76 deletions elasticsearchwriter/doc/elasticsearchwriter.md
@@ -167,79 +167,4 @@
* dynamic
* Description: do not use datax's mappings; use es's own automatic mappings
* Required: no
* Default: false



## 4 Performance Report

### 4.1 Environment Setup

* Total data volume: 10 million records, 0.1 KB each
* 1 shard, 0 replicas
* No id, which defaults to append_only mode: versions are not checked and insert speed improves by roughly 20%

#### 4.1.1 Input data types (streamreader)

```
{"value": "1.1.1.1", "type": "string"},
{"value": 19890604.0, "type": "double"},
{"value": 19890604, "type": "long"},
{"value": 19890604, "type": "long"},
{"value": "hello world", "type": "string"},
{"value": "hello world", "type": "string"},
{"value": "41.12,-71.34", "type": "string"},
{"value": "2017-05-25", "type": "string"},
```

#### 4.1.2 Output data types (eswriter)

```
{ "name": "col_ip","type": "ip" },
{ "name": "col_double","type": "double" },
{ "name": "col_long","type": "long" },
{ "name": "col_integer","type": "integer" },
{ "name": "col_keyword", "type": "keyword" },
{ "name": "col_text", "type": "text"},
{ "name": "col_geo_point", "type": "geo_point" },
{ "name": "col_date", "type": "date"}
```

#### 4.1.2 Machine specs

1. cpu: 32 Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
2. mem: 128G
3. net: dual gigabit NICs

#### 4.1.3 DataX JVM parameters

-Xms1024m -Xmx1024m -XX:+HeapDumpOnOutOfMemoryError

### 4.2 Test Report

| Channels | Batch commit rows | DataX speed (Rec/s) | DataX throughput (MB/s) |
|--------|--------| --------|--------|
| 4| 256| 11013| 0.828|
| 4| 1024| 19417| 1.43|
| 4| 4096| 23923| 1.76|
| 4| 8172| 24449| 1.80|
| 8| 256| 21459| 1.58|
| 8| 1024| 37037| 2.72|
| 8| 4096| 45454| 3.34|
| 8| 8172| 45871| 3.37|
| 16| 1024| 67567| 4.96|
| 16| 4096| 78125| 5.74|
| 16| 8172| 77519| 5.69|
| 32| 1024| 94339| 6.93|
| 32| 4096| 96153| 7.06|
| 64| 1024| 91743| 6.74|

### 4.3 Test Summary

* The best result was 32 channels with a batch size of 4096; if individual records are large, reduce the batch size appropriately to prevent OOM
* This also scales horizontally easily, and since es is distributed, configuring more shards scales horizontally too

## 5 Constraints

* If ids are imported, failed imports are retried, and re-importing merely overwrites, ensuring data consistency
* If ids are not imported, append_only mode is used: elasticsearch generates ids automatically and speed improves by roughly 20%, but data cannot be repaired; suitable for log-type data (where precision requirements are low)
* Default: false
@@ -1,4 +1,4 @@
package com.alibaba.datax.plugin.reader.gaussdbwriter;
package com.alibaba.datax.plugin.writer.gaussdbwriter;

import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
@@ -86,16 +86,18 @@ private Column convertPhoenixValueToDataxColumn(int sqlType, Object value) {
column = new LongColumn((Integer) value);
break;
case Types.TINYINT:
column = new LongColumn(((Byte) value).longValue());
Byte aByte = (Byte) value;
column = new LongColumn(null == aByte ? null : aByte.longValue());
break;
case Types.SMALLINT:
column = new LongColumn(((Short) value).longValue());
Short aShort = (Short) value;
column = new LongColumn(null == aShort ? null : aShort.longValue());
break;
case Types.BIGINT:
column = new LongColumn((Long) value);
break;
case Types.FLOAT:
column = new DoubleColumn((Float.valueOf(value.toString())));
column = new DoubleColumn(null == value ? null : (Float.valueOf(value.toString())));
break;
case Types.DECIMAL:
column = new DoubleColumn((BigDecimal)value);
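The pattern in this hunk guards against an NPE: casting a null `Object` to `Byte` succeeds, but unboxing it via `longValue()` throws. A standalone sketch of the guard, with DataX's `LongColumn`/`DoubleColumn` simplified to plain boxed types for illustration:

```java
public class NullSafeConvert {
    // Old code: ((Byte) value).longValue() — NullPointerException when the
    // Phoenix cell is null. Casting null is legal; unboxing it is not.
    static Long byteToLong(Object value) {
        Byte aByte = (Byte) value;
        return null == aByte ? null : aByte.longValue();
    }

    // Same guard for the FLOAT branch: value.toString() on null would NPE.
    static Double floatToDouble(Object value) {
        return null == value ? null : Double.valueOf(value.toString());
    }

    public static void main(String[] args) {
        System.out.println(byteToLong((byte) 7));  // 7
        System.out.println(byteToLong(null));      // null
        System.out.println(floatToDouble(1.5f));   // 1.5
        System.out.println(floatToDouble(null));   // null
    }
}
```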
@@ -31,6 +31,7 @@
import parquet.schema.*;

import java.io.IOException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.*;

@@ -440,7 +441,7 @@ public List<ObjectInspector> getColumnTypeInspectors(List<Configuration> column
objectInspector = ObjectInspectorFactory.getReflectionObjectInspector(Double.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
break;
case TIMESTAMP:
objectInspector = ObjectInspectorFactory.getReflectionObjectInspector(java.sql.Timestamp.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
objectInspector = ObjectInspectorFactory.getReflectionObjectInspector(org.apache.hadoop.hive.common.type.Timestamp.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
break;
case DATE:
objectInspector = ObjectInspectorFactory.getReflectionObjectInspector(java.sql.Date.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
@@ -533,7 +534,13 @@ public static MutablePair<List<Object>, Boolean> transportOneRecord(
recordList.add(new java.sql.Date(column.asDate().getTime()));
break;
case TIMESTAMP:
recordList.add(new java.sql.Timestamp(column.asDate().getTime()));
Date date = column.asDate();
if (date == null) {
recordList.add(null);
} else {
Timestamp ts = new Timestamp(date.getTime());
recordList.add(org.apache.hadoop.hive.common.type.Timestamp.ofEpochMilli(ts.getTime(), ts.getNanos()));
}
break;
default:
throw DataXException
@@ -630,7 +637,14 @@ public void parquetFileStartWrite(RecordReceiver lineReceiver, Configuration con
MessageType messageType = null;
ParquetFileProccessor proccessor = null;
Path outputPath = new Path(fileName);
String schema = config.getString(Key.PARQUET_SCHEMA);
String schema = config.getString(Key.PARQUET_SCHEMA, null);
if (schema == null) {
List<Configuration> columns = config.getListConfiguration(Key.COLUMN);
if (columns == null || columns.isEmpty()) {
throw DataXException.asDataXException("parquetSchema or column can't be blank!");
}
schema = HdfsHelper.generateParquetSchemaFromColumnAndType(columns);
}
try {
messageType = MessageTypeParser.parseMessageType(schema);
} catch (Exception e) {
@@ -228,6 +228,12 @@ public List<Configuration> split(int mandatoryNumber) {
String endFullFileName = null;

fileSuffix = UUID.randomUUID().toString().replace('-', '_');
if (fileType.equalsIgnoreCase("PARQUET")) {
if (StringUtils.isNotBlank(this.compress)) {
fileSuffix += "." + this.compress.toLowerCase();
}
fileSuffix += ".parquet";
}

fullFileName = String.format("%s%s%s__%s", defaultFS, storePath, filePrefix, fileSuffix);
endFullFileName = String.format("%s%s%s__%s", defaultFS, endStorePath, filePrefix, fileSuffix);
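The suffix hunk above gives Parquet output files names ending in, e.g., `.snappy.parquet`. A self-contained sketch of that construction, with Commons Lang's `StringUtils.isNotBlank` replaced by a stdlib check:

```java
import java.util.UUID;

public class ParquetSuffixSketch {
    // Mirrors the hunk: a UUID with '-' replaced by '_', then an optional
    // compression codec, then ".parquet" when the file type is PARQUET.
    static String buildSuffix(String fileType, String compress) {
        String fileSuffix = UUID.randomUUID().toString().replace('-', '_');
        if ("PARQUET".equalsIgnoreCase(fileType)) {
            if (compress != null && !compress.trim().isEmpty()) {
                fileSuffix += "." + compress.toLowerCase();
            }
            fileSuffix += ".parquet";
        }
        return fileSuffix;
    }

    public static void main(String[] args) {
        System.out.println(buildSuffix("parquet", "SNAPPY").endsWith(".snappy.parquet")); // true
        System.out.println(buildSuffix("PARQUET", null).endsWith(".parquet"));            // true
        System.out.println(buildSuffix("orc", "SNAPPY").contains("."));                   // false
    }
}
```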