[docs] Add user manual for hdfs load and transaction. (apache#7497)

pengxiangyu authored Dec 30, 2021
1 parent 0894848 commit dc9cd34
Showing 15 changed files with 492 additions and 341 deletions.
14 changes: 7 additions & 7 deletions be/src/common/config.h
@@ -643,13 +643,13 @@ CONF_mInt32(external_table_connect_timeout_sec, "5");
CONF_mInt32(segment_cache_capacity, "1000000");

// s3 config
CONF_String(s3_ak, "");
CONF_String(s3_sk, "");
CONF_String(s3_endpoint, "");
CONF_String(s3_region, "");
CONF_mInt32(s3_max_conn, "50");
CONF_mInt32(s3_request_timeout_ms, "3000");
CONF_mInt32(s3_conn_timeout_ms, "1000");
CONF_String(default_remote_storage_s3_ak, "");
CONF_String(default_remote_storage_s3_sk, "");
CONF_String(default_remote_storage_s3_endpoint, "");
CONF_String(default_remote_storage_s3_region, "");
CONF_mInt32(default_remote_storage_s3_max_conn, "50");
CONF_mInt32(default_remote_storage_s3_request_timeout_ms, "3000");
CONF_mInt32(default_remote_storage_s3_conn_timeout_ms, "1000");
// Set to true to disable the minidump feature.
CONF_Bool(disable_minidump , "false");

18 changes: 11 additions & 7 deletions be/src/env/env_remote.cpp
@@ -158,13 +158,17 @@ class RemoteRandomRWFile : public RandomRWFile {

Status RemoteEnv::init_conf() {
std::map<std::string, std::string> storage_prop;
storage_prop[S3_AK] = doris::config::s3_ak;
storage_prop[S3_SK] = doris::config::s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::s3_endpoint;
storage_prop[S3_REGION] = doris::config::s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::s3_conn_timeout_ms);
if (doris::config::default_remote_storage_s3_ak.empty() || doris::config::default_remote_storage_s3_sk.empty()
|| doris::config::default_remote_storage_s3_endpoint.empty() || doris::config::default_remote_storage_s3_region.empty()) {
return Status::OK();
}
storage_prop[S3_AK] = doris::config::default_remote_storage_s3_ak;
storage_prop[S3_SK] = doris::config::default_remote_storage_s3_sk;
storage_prop[S3_ENDPOINT] = doris::config::default_remote_storage_s3_endpoint;
storage_prop[S3_REGION] = doris::config::default_remote_storage_s3_region;
storage_prop[S3_MAX_CONN_SIZE] = std::to_string(doris::config::default_remote_storage_s3_max_conn);
storage_prop[S3_REQUEST_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_request_timeout_ms);
storage_prop[S3_CONN_TIMEOUT_MS] = std::to_string(doris::config::default_remote_storage_s3_conn_timeout_ms);

if (ClientFactory::is_s3_conf_valid(storage_prop)) {
_storage_backend.reset(new S3StorageBackend(storage_prop));
7 changes: 7 additions & 0 deletions be/src/exec/hdfs_writer.cpp
@@ -193,6 +193,13 @@ Status HDFSWriter::_parse_properties(std::map<std::string, std::string>& prop) {
LOG(ERROR) << "hdfs properties is incorrect.";
return Status::InternalError("hdfs properties is incorrect");
}

// If the format of _path is hdfs://ip:port/path, strip the namenode prefix and keep only /path,
// because a path like hdfs://ip:port/path cannot be used by libhdfs3 directly.
if (_path.find(_namenode) != _path.npos) {
_path = _path.substr(_namenode.size());
}

return Status::OK();
}

2 changes: 1 addition & 1 deletion be/src/olap/data_dir.cpp
@@ -71,12 +71,12 @@ DataDir::DataDir(const std::string& path, int64_t capacity_bytes,
_disk_capacity_bytes(0),
_storage_medium(storage_medium),
_is_used(false),
_env(Env::get_env(storage_medium)),
_tablet_manager(tablet_manager),
_txn_manager(txn_manager),
_cluster_id(-1),
_cluster_id_incomplete(false),
_to_be_deleted(false),
_env(Env::get_env(storage_medium)),
_current_shard(0),
_meta(nullptr) {
_path_desc.storage_medium = storage_medium;
10 changes: 10 additions & 0 deletions be/src/runtime/export_sink.cpp
@@ -22,6 +22,7 @@
#include <sstream>

#include "exec/broker_writer.h"
#include "exec/hdfs_reader_writer.h"
#include "exec/local_file_writer.h"
#include "exec/s3_writer.h"
#include "exprs/expr.h"
@@ -258,6 +259,15 @@ Status ExportSink::open_file_writer() {
_file_writer.reset(s3_writer);
break;
}
case TFileType::FILE_HDFS: {
FileWriter* hdfs_writer;
RETURN_IF_ERROR(HdfsReaderWriter::create_writer(
const_cast<std::map<std::string, std::string>&>(_t_export_sink.properties),
_t_export_sink.export_path + "/" + file_name, &hdfs_writer));
RETURN_IF_ERROR(hdfs_writer->open());
_file_writer.reset(hdfs_writer);
break;
}
default: {
std::stringstream ss;
ss << "Unknown file type, type=" << _t_export_sink.file_type;
2 changes: 2 additions & 0 deletions docs/.vuepress/sidebar/en.js
@@ -563,6 +563,7 @@ module.exports = [
title: "Data Manipulation",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL DELETE",
"CANCEL LABEL",
@@ -575,6 +576,7 @@
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESTORE TABLET",
2 changes: 2 additions & 0 deletions docs/.vuepress/sidebar/zh-CN.js
@@ -569,6 +569,7 @@ module.exports = [
title: "DML",
directoryPath: "Data Manipulation/",
children: [
"BEGIN",
"BROKER LOAD",
"CANCEL LOAD",
"CREATE SYNC JOB",
@@ -578,6 +579,7 @@
"LOAD",
"MINI LOAD",
"MULTI LOAD",
"OUTFILE",
"PAUSE ROUTINE LOAD",
"PAUSE SYNC JOB",
"RESUME ROUTINE LOAD",
161 changes: 1 addition & 160 deletions docs/en/administrator-guide/outfile.md
@@ -139,166 +139,7 @@ Planning example for concurrent export:
## Usage example
1. Example 1
Export simple query results to the file `hdfs://path/to/result.txt`. Specify the export format as CSV. Use `my_broker` and set kerberos authentication information. Specify the column separator as `,` and the line delimiter as `\n`.
```
SELECT * FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "100MB"
);
```
If the result is smaller than 100MB, the output will be a single file: `result_0.csv`.
If it is larger than 100MB, it may be split into: `result_0.csv, result_1.csv, ...`.
2. Example 2
Export simple query results to the file `hdfs://path/to/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set kerberos authentication information.
```
SELECT c1, c2, c3 FROM tbl
INTO OUTFILE "hdfs://path/to/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.hadoop.security.authentication" = "kerberos",
"broker.kerberos_principal" = "doris@YOUR.COM",
"broker.kerberos_keytab" = "/home/doris/my.keytab",
"schema"="required,int32,c1;required,byte_array,c2;required,byte_array,c2"
);
```
If the exported file format is PARQUET, `schema` must be specified.
3. Example 3
Export the query result of the CTE statement to the file `hdfs://path/to/result.txt`. The default export format is CSV. Use `my_broker` and set hdfs high availability information. Use the default column separators and line delimiter.
```
WITH
x1 AS
(SELECT k1, k2 FROM tbl1),
x2 AS
(SELECT k3 FROM tbl2)
SELECT k1 FROM x1 UNION SELECT k3 FROM x2
INTO OUTFILE "hdfs://path/to/result_"
PROPERTIES
(
"broker.name" = "my_broker",
"broker.username"="user",
"broker.password"="passwd",
"broker.dfs.nameservices" = "my_ha",
"broker.dfs.ha.namenodes.my_ha" = "my_namenode1, my_namenode2",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode1" = "nn1_host:rpc_port",
"broker.dfs.namenode.rpc-address.my_ha.my_namenode2" = "nn2_host:rpc_port",
"broker.dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
```
If the result is smaller than 1GB, the output will be a single file: `result_0.csv`.
If it is larger than 1GB, it may be split into: `result_0.csv, result_1.csv, ...`.
4. Example 4
Export the query results of the UNION statement to the file `bos://bucket/result.parquet`. Specify the export format as PARQUET. Use `my_broker` and set the BOS access information. The PARQUET format does not require a column separator or line delimiter.
```
SELECT k1 FROM tbl1 UNION SELECT k2 FROM tbl1
INTO OUTFILE "bos://bucket/result_"
FORMAT AS PARQUET
PROPERTIES
(
"broker.name" = "my_broker",
"broker.bos_endpoint" = "http://bj.bcebos.com",
"broker.bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
"broker.bos_secret_accesskey" = "yyyyyyyyyyyyyyyyyyyyyyyyyy",
"schema"="required,int32,k1;required,byte_array,k2"
);
```
5. Example 5
Export simple query results to the file `cos://${bucket_name}/path/result.txt`. Specify the export format as CSV,
and create a success marker file when the export finishes.
```
select k1,k2,v1 from tbl1 limit 100000
into outfile "s3a://my_bucket/export/my_file_"
FORMAT AS CSV
PROPERTIES
(
"broker.name" = "hdfs_broker",
"broker.fs.s3a.access.key" = "xxx",
"broker.fs.s3a.secret.key" = "xxxx",
"broker.fs.s3a.endpoint" = "https://cos.xxxxxx.myqcloud.com/",
"column_separator" = ",",
"line_delimiter" = "\n",
"max_file_size" = "1024MB",
"success_file_name" = "SUCCESS"
)
```
If the result is smaller than 1GB, the output will be a single file: `my_file_0.csv`.
If it is larger than 1GB, it may be split into: `my_file_0.csv, my_file_1.csv, ...`.
Please note:
1. Paths that do not exist are created automatically.
2. These parameters (access.key/secret.key/endpoint) need to be confirmed with `Tencent Cloud COS`. In particular, the endpoint value should not include the bucket_name.
6. Example 6
Use the S3 protocol to export to BOS, with concurrent export enabled.
```
set enable_parallel_outfile = true;
select k1 from tb1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
The final generated file prefix is `my_file_{fragment_instance_id}_`.
7. Example 7
Use the S3 protocol to export to BOS and enable the concurrent-export session variable.
```
set enable_parallel_outfile = true;
select k1 from tb1 order by k1 limit 1000
into outfile "s3://my_bucket/export/my_file_"
format as csv
properties
(
"AWS_ENDPOINT" = "http://s3.bd.bcebos.com",
"AWS_ACCESS_KEY" = "xxxx",
"AWS_SECRET_KEY" = "xxx",
"AWS_REGION" = "bd"
)
```
**However, because the query contains a top-level sort node, it cannot be exported concurrently even though the concurrent-export session variable is enabled.**
For details, please refer to [OUTFILE Document](../sql-reference/sql-statements/Data%20Manipulation/OUTFILE.md).
## Return result
@@ -222,9 +222,11 @@ under the License.
(
"fs.defaultFS" = "",
"hdfs_user"="",
"kerb_principal" = "",
"kerb_ticket_cache_path" = "",
"kerb_token" = ""
"dfs.nameservices"="my_ha",
"dfs.ha.namenodes.xxx"="my_nn1,my_nn2",
"dfs.namenode.rpc-address.xxx.my_nn1"="host1:port",
"dfs.namenode.rpc-address.xxx.my_nn2"="host2:port",
"dfs.client.failover.proxy.provider.xxx"="org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
)
fs.defaultFS: defaultFS
hdfs_user: hdfs user
12 changes: 12 additions & 0 deletions docs/en/sql-reference/sql-statements/Data Manipulation/EXPORT.md
@@ -74,6 +74,15 @@ under the License.
For brokers corresponding to different storage systems, the input parameters differ. For the specific parameters, refer to `help broker load` (the required broker properties).
When exporting to a local file, this part is not needed.

7. hdfs
Specify exporting to HDFS directly via libhdfs.
Grammar:
WITH HDFS ("key"="value"[,...])

The following parameters can be specified (a minimal sketch follows this list):
fs.defaultFS: the default filesystem, e.g. hdfs://ip:port
hdfs_user: the HDFS user name
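A minimal sketch of the WITH HDFS clause described above; the namenode address `hdfs://hdfs_host:port` and the user `hdfs` are placeholders, and example 8 below shows a fuller variant with custom separators:

```
EXPORT TABLE testTbl
TO "hdfs://hdfs_host:port/a/b/c"
WITH HDFS
(
    "fs.defaultFS" = "hdfs://hdfs_host:port",
    "hdfs_user" = "hdfs"
);
```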

## example

1. Export all data from the testTbl table to HDFS
@@ -97,5 +106,8 @@ under the License.
7. Export columns k1 and v1 from the testTbl table to the local filesystem.
EXPORT TABLE testTbl TO "file:///home/data/a" PROPERTIES ("columns" = "k1,v1");

8. Export all data in the testTbl table to hdfs, using the invisible character "\x07" as the column and row separator.
EXPORT TABLE testTbl TO "hdfs://hdfs_host:port/a/b/c" PROPERTIES ("column_separator"="\\x07", "line_delimiter" = "\\x07") WITH HDFS ("fs.defaultFS"="hdfs://hdfs_host:port", "hdfs_user"="yyy")

## keyword
EXPORT