[docs] Add user manual for hdfs load and transaction. (apache#7497)

pengxiangyu authored Dec 30, 2021
1 parent 0894848 commit dc9cd34
Showing 15 changed files with 492 additions and 341 deletions.
14 changes: 7 additions & 7 deletions be/src/common/config.h
@@ -643,13 +643,13 @@ CONF_mInt32(external_table_connect_timeout_sec, "5");
CONF_mInt32(segment_cache_capacity, "1000000");

// s3 config
CONF_String(s3_ak, "");
CONF_String(s3_sk, "");
CONF_String(s3_endpoint, "");
CONF_String(s3_region, "");
CONF_mInt32(s3_max_conn, "50");
CONF_mInt32(s3_request_timeout_ms, "3000");
CONF_mInt32(s3_conn_timeout_ms, "1000");
CONF_String(default_remote_storage_s3_ak, "");
CONF_String(default_remote_storage_s3_sk, "");
CONF_String(default_remote_storage_s3_endpoint, "");
CONF_String(default_remote_storage_s3_region, "");
CONF_mInt32(default_remote_storage_s3_max_conn, "50");
CONF_mInt32(default_remote_storage_s3_request_timeout_ms, "3000");
CONF_mInt32(default_remote_storage_s3_conn_timeout_ms, "1000");
// Set to true to disable the minidump feature.
CONF_Bool(disable_minidump , "false");

18 changes: 11 additions & 7 deletions be/src/env/env_remote.cpp
@@ -158,13 +158,17 @@ class RemoteRandomRWFile : public RandomRWFile {

Status RemoteEnv::init_conf() {
std::map<std::string, std::string> storage_prop;
storage_prop[S3_AK] = doris::config::s3_ak;
storage_prop[S3_SK] = doris::config::s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::s3_endpoint;
storage_prop[S3_REGION] = doris::config::s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::s3_conn_timeout_ms);
if (doris::config::default_remote_storage_s3_ak.empty() || doris::config::default_remote_storage_s3_sk.empty()
|| doris::config::default_remote_storage_s3_endpoint.empty() || doris::config::default_remote_storage_s3_region.empty()) {
return Status::OK();
}
storage_prop[S3_AK] = doris::config::default_remote_storage_s3_ak;
storage_prop[S3_SK] = doris::config::default_remote_storage_s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::default_remote_storage_s3_endpoint;
storage_prop[S3_REGION] = doris::config::default_remote_storage_s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::default_remote_storage_s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_conn_timeout_ms);

if (ClientFactory::is_s3_conf_valid(storage_prop)) {
_storage_backend.reset(new S3StorageBackend(storage_prop));
7 changes: 7 additions & 0 deletions be/src/exec/hdfs_writer.cpp
@@ -193,6 +193,13 @@ Status HDFSWriter::_parse_properties(std::map<std::string, std::string>& prop) {
LOG(ERROR) << "hdfs properties is incorrect.";
return Status::InternalError("hdfs properties is incorrect");
}

// If the format of _path is hdfs://ip:port/path, strip the namenode prefix and keep only /path,
// because a path like hdfs://ip:port/path cannot be used by libhdfs3 directly.
if (_path.find(_namenode) != _path.npos) {
_path = _path.substr(_namenode.size());
}

return Status::OK();
}

2 changes: 1 addition & 1 deletion be/src/olap/data_dir.cpp
@@ -71,12 +71,12 @@ DataDir::DataDir(const std::string& path, int64_t capacity_bytes,
_disk_capacity_bytes(0),
_storage_medium(storage_medium),
_is_used(false),
_env(Env::get_env(storage_medium)),
_tablet_manager(tablet_manager),
_txn_manager(txn_manager),
_cluster_id(-1),
_cluster_id_incomplete(false),
_to_be_deleted(false),
_env(Env::get_env(storage_medium)),
_current_shard(0),
_meta(nullptr) {
_path_desc.storage_medium = storage_medium;
10 changes: 10 additions & 0 deletions be/src/runtime/export_sink.cpp
@@ -22,6 +22,7 @@
#include <sstream>

#include "exec/broker_writer.h"
#include "exec/hdfs_reader_writer.h"
#include "exec/local_file_writer.h"
#include "exec/s3_writer.h"
#include "exprs/expr.h"
@@ -258,6 +259,15 @@ Status ExportSink::open_file_writer() {
_file_writer.reset(s3_writer);
break;
}
case TFileType::FILE_HDFS: {
FileWriter* hdfs_writer;
RETURN_IF_ERROR(HdfsReaderWriter::create_writer(
const_cast<std::map<std::string, std::string>&>(_t_export_sink.properties),
_t_export_sink.export_path + "/" + file_name, &hdfs_writer));
RETURN_IF_ERROR(hdfs_writer->open());
_file_writer.reset(hdfs_writer);
break;
}
default: {
std::stringstream ss;
ss << "Unknown file type, type=" << _t_export_sink.file_type;
2 changes: 2 additions & 0 deletions docs/.vuepress/sidebar/en.js
@@ -563,6 +563,7 @@ module.exports = [
title: "Data Manipulation",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL DELETE",
"CANCEL LABEL",
@@ -575,6 +576,7 @@
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESTORE TABLET",
2 changes: 2 additions & 0 deletions docs/.vuepress/sidebar/zh-CN.js
@@ -569,6 +569,7 @@ module.exports = [
title: "DML",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL LOAD",
"CREATE SYNC JOB",
@@ -578,6 +579,7 @@
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESUME ROUTINE LOAD",
161 changes: 1 addition & 160 deletions docs/en/administrator-guide/outfile.md
@@ -139,166 +139,7 @@ Planning example for concurrent export:
## Usage example
1. Example 1
Export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
```
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
```
If the result is smaller than 100MB, the output will be a single file: `result_0.csv`.
If it is larger than 100MB, it may be split into: `result_0.csv, result_1.csv, ...`.
2. Example 2
Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
```
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
```
If the exported file format is PARQUET, `schema` must be specified.
3. Example 3
Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set hdfs high availability information. Use the default column separators and line delimiter.
```
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
```
If the result is smaller than 1GB, the output will be a single file: `result_0.csv`.
If it is larger than 1GB, it may be split into: `result_0.csv, result_1.csv, ...`.
4. Example 4
Export the query results of the UNION statement to the file `bos://bucket/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set the BOS access information. The PARQUET format does not require a column separator or line delimiter.
```
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
```
5. Example 5
Export simple query results to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV,
and create a success marker file when the export finishes.
```
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
```
If the result is smaller than 1GB, the output will be a single file: `my_file_0.csv`.
If it is larger than 1GB, it may be split into: `my_file_0.csv, my_file_1.csv, ...`.
Please note:
1. Paths that do not exist are created automatically.
2. These parameters (access.key/secret.key/endpoint) need to be confirmed with `Tencent Cloud COS`. In particular, the endpoint value should not include the bucket_name.
6. Example 6
Use the S3 protocol to export to BOS, with concurrent export enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Example 7
Use the S3 protocol to export to BOS and enable the concurrent-export session variable.
```
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
**However, because the query contains a top-level sort node, it cannot be exported concurrently even though the concurrent-export session variable is enabled.**
For details, please refer to [OUTFILE Document](../sql-reference/sql-statements/Data%20Manipulation/OUTFILE.md).
## Return result
@@ -222,9 +222,11 @@ under the License.
(
"fs.defaultFS" = "",
"hdfs_user"="",
"kerb_principal" = "",
"kerb_ticket_cache_path" = "",
"kerb_token" = ""
"dfs.nameservices"="my_ha",
"dfs.ha.namenodes.xxx"="my_nn1,my_nn2",
"dfs.namenode.rpc-address.xxx.my_nn1"="host1:port",
"dfs.namenode.rpc-address.xxx.my_nn2"="host2:port",
"dfs.client.failover.proxy.provider.xxx"="org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
fs.defaultFS: defaultFS
hdfs_user: hdfs user
12 changes: 12 additions & 0 deletions docs/en/sql-reference/sql-statements/Data Manipulation/EXPORT.md
@@ -74,6 +74,15 @@ under the License.
For brokers corresponding to different storage systems, the input parameters differ. For the specific parameters, refer to `help broker load` (the required broker properties).
When exporting to a local file, this part is not needed.

7. hdfs
Specify exporting to HDFS directly via libhdfs.
Grammar:
WITH HDFS ("key"="value"[,...])

The following parameters can be specified (a minimal sketch follows this list):
fs.defaultFS: the default filesystem, e.g. hdfs://ip:port
hdfs_user: the HDFS user name
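A minimal sketch of the WITH HDFS clause described above; the namenode address `hdfs://hdfs_host:port` and the user `hdfs` are placeholders, and example 8 below shows a fuller variant with custom separators:

```
EXPORT TABLE testTbl
TO "hdfs://hdfs_host:port/a/b/c"
WITH HDFS
(
    "fs.defaultFS" = "hdfs://hdfs_host:port",
    "hdfs_user" = "hdfs"
);
```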

## example

1. Export all data from the testTbl table to HDFS
@@ -97,5 +106,8 @@ under the License.
7. Export columns k1 and v1 from the testTbl table to the local filesystem.
EXPORT TABLE testTbl TO "file:///home/data/a" PROPERTIES ("columns" = "k1,v1");

8. Export all data in the testTbl table to hdfs, using the invisible character "\x07" as the column and row separator.
EXPORT TABLE testTbl TO "hdfs://hdfs_host:port/a/b/c" PROPERTIES ("column_separator"="\\x07", "line_delimiter" = "\\x07") WITH HDFS ("fs.defaultFS"="hdfs://hdfs_host:port", "hdfs_user"="yyy")

## keyword
EXPORT