tiup: add document of tispark support in tiup cluster (#3611)
* tiup: add document of tispark support in tiup cluster

* Apply suggestions from code review

Co-authored-by: Keke Yi <40977455+yikeke@users.noreply.github.com>

* Update the order of warning and heading

* Update yaml file links

* Update tispark-deployment-topology.md

Co-authored-by: Keke Yi <40977455+yikeke@users.noreply.github.com>
Co-authored-by: Lilian Lee <lilin@pingcap.com>
3 people authored Aug 27, 2020
1 parent 976ec8f commit 2dd251c
Showing 6 changed files with 248 additions and 0 deletions.
1 change: 1 addition & 0 deletions TOC.md
@@ -31,6 +31,7 @@
+ [TiFlash Topology](/tiflash-deployment-topology.md)
+ [TiCDC Topology](/ticdc-deployment-topology.md)
+ [TiDB Binlog Topology](/tidb-binlog-deployment-topology.md)
+ [TiSpark Topology](/tispark-deployment-topology.md)
+ [Cross-DC Topology](/geo-distributed-deployment-topology.md)
+ [Hybrid Topology](/hybrid-deployment-topology.md)
+ Install and Start
150 changes: 150 additions & 0 deletions config-templates/complex-tispark.yaml
@@ -0,0 +1,150 @@
# # Global variables are applied to all deployments and used as the default value of
# # the deployments if a specific deployment value is missing.
global:
  user: "tidb"
  ssh_port: 22
  deploy_dir: "/tidb-deploy"
  data_dir: "/tidb-data"

# # Monitored variables are applied to all the machines.
monitored:
  node_exporter_port: 9100
  blackbox_exporter_port: 9115
  # deploy_dir: "/tidb-deploy/monitored-9100"
  # data_dir: "/tidb-data/monitored-9100"
  # log_dir: "/tidb-deploy/monitored-9100/log"

# # Server configs are used to specify the runtime configuration of TiDB components.
# # All configuration items can be found in TiDB docs:
# # - TiDB: https://pingcap.com/docs/stable/reference/configuration/tidb-server/configuration-file/
# # - TiKV: https://pingcap.com/docs/stable/reference/configuration/tikv-server/configuration-file/
# # - PD: https://pingcap.com/docs/stable/reference/configuration/pd-server/configuration-file/
# # All configuration items use dots to represent the hierarchy, for example:
# #   readpool.storage.use-unified-pool
# #
# # You can overwrite this configuration via the instance-level `config` field.

server_configs:
  tidb:
    log.slow-threshold: 300
  tikv:
    # server.grpc-concurrency: 4
    # raftstore.apply-pool-size: 2
    # raftstore.store-pool-size: 2
    # rocksdb.max-sub-compactions: 1
    # storage.block-cache.capacity: "16GB"
    # readpool.unified.max-thread-count: 12
    readpool.storage.use-unified-pool: false
    readpool.coprocessor.use-unified-pool: true
  pd:
    schedule.leader-schedule-limit: 4
    schedule.region-schedule-limit: 2048
    schedule.replica-schedule-limit: 64

pd_servers:
  - host: 10.0.1.4
    # ssh_port: 22
    # name: "pd-1"
    # client_port: 2379
    # peer_port: 2380
    # deploy_dir: "/tidb-deploy/pd-2379"
    # data_dir: "/tidb-data/pd-2379"
    # log_dir: "/tidb-deploy/pd-2379/log"
    # numa_node: "0,1"
    # # The following configs are used to overwrite the `server_configs.pd` values.
    # config:
    #   schedule.max-merge-region-size: 20
    #   schedule.max-merge-region-keys: 200000
  - host: 10.0.1.5
  - host: 10.0.1.6

tidb_servers:
  - host: 10.0.1.1
    # ssh_port: 22
    # port: 4000
    # status_port: 10080
    # deploy_dir: "/tidb-deploy/tidb-4000"
    # log_dir: "/tidb-deploy/tidb-4000/log"
    # numa_node: "0,1"
    # # The following configs are used to overwrite the `server_configs.tidb` values.
    # config:
    #   log.slow-query-file: tidb-slow-overwritten.log
  - host: 10.0.1.2
  - host: 10.0.1.3

tikv_servers:
  - host: 10.0.1.7
    # ssh_port: 22
    # port: 20160
    # status_port: 20180
    # deploy_dir: "/tidb-deploy/tikv-20160"
    # data_dir: "/tidb-data/tikv-20160"
    # log_dir: "/tidb-deploy/tikv-20160/log"
    # numa_node: "0,1"
    # # The following configs are used to overwrite the `server_configs.tikv` values.
    # config:
    #   server.grpc-concurrency: 4
    #   server.labels: { zone: "zone1", dc: "dc1", host: "host1" }
  - host: 10.0.1.8
  - host: 10.0.1.9

# NOTE: TiSpark support is an experimental feature. It is NOT recommended to use it
# in production at present.
# To use TiSpark, you need to manually install Java Runtime Environment (JRE) 8 on the
# host. See the OpenJDK documentation for reference: https://openjdk.java.net/install/
# If you have already installed JRE 8 at a location other than the default path of your
# system's package manager, use the "java_home" field to set the JAVA_HOME variable.
# NOTE: Only one TiSpark master node is supported for now.
tispark_masters:
  - host: 10.0.1.21
    # ssh_port: 22
    # port: 7077
    # web_port: 8080
    # deploy_dir: "/tidb-deploy/tispark-master-7077"
    # java_home: "/usr/local/bin/java-1.8.0"
    # spark_config:
    #   spark.driver.memory: "2g"
    #   spark.eventLog.enabled: "False"
    #   spark.tispark.grpc.framesize: 268435456
    #   spark.tispark.grpc.timeout_in_sec: 100
    #   spark.tispark.meta.reload_period_in_sec: 60
    #   spark.tispark.request.command.priority: "Low"
    #   spark.tispark.table.scan_concurrency: 256
    # spark_env:
    #   SPARK_EXECUTOR_CORES: 5
    #   SPARK_EXECUTOR_MEMORY: "10g"
    #   SPARK_WORKER_CORES: 5
    #   SPARK_WORKER_MEMORY: "10g"

# NOTE: Running multiple worker nodes on the same host is not supported by Spark.
tispark_workers:
  - host: 10.0.1.22
    # ssh_port: 22
    # port: 7078
    # web_port: 8081
    # deploy_dir: "/tidb-deploy/tispark-worker-7078"
    # java_home: "/usr/local/bin/java-1.8.0"
  - host: 10.0.1.23

monitoring_servers:
  - host: 10.0.1.10
    # ssh_port: 22
    # port: 9090
    # deploy_dir: "/tidb-deploy/prometheus-8249"
    # data_dir: "/tidb-data/prometheus-8249"
    # log_dir: "/tidb-deploy/prometheus-8249/log"

grafana_servers:
  - host: 10.0.1.10
    # port: 3000
    # deploy_dir: /tidb-deploy/grafana-3000

alertmanager_servers:
  - host: 10.0.1.10
    # ssh_port: 22
    # web_port: 9093
    # cluster_port: 9094
    # deploy_dir: "/tidb-deploy/alertmanager-9093"
    # data_dir: "/tidb-data/alertmanager-9093"
    # log_dir: "/tidb-deploy/alertmanager-9093/log"
45 changes: 45 additions & 0 deletions config-templates/simple-tispark.yaml
@@ -0,0 +1,45 @@
# # Global variables are applied to all deployments and used as the default value of
# # the deployments if a specific deployment value is missing.
global:
  user: "tidb"
  ssh_port: 22
  deploy_dir: "/tidb-deploy"
  data_dir: "/tidb-data"

pd_servers:
  - host: 10.0.1.4
  - host: 10.0.1.5
  - host: 10.0.1.6

tidb_servers:
  - host: 10.0.1.1
  - host: 10.0.1.2
  - host: 10.0.1.3

tikv_servers:
  - host: 10.0.1.7
  - host: 10.0.1.8
  - host: 10.0.1.9

# NOTE: TiSpark support is an experimental feature. It is NOT recommended to use it
# in production at present.
# To use TiSpark, you need to manually install Java Runtime Environment (JRE) 8 on the
# host. See the OpenJDK documentation for reference: https://openjdk.java.net/install/
# NOTE: Only one TiSpark master node is supported for now.
tispark_masters:
  - host: 10.0.1.21

# NOTE: Running multiple worker nodes on the same host is not supported by Spark.
tispark_workers:
  - host: 10.0.1.22
  - host: 10.0.1.23

monitoring_servers:
  - host: 10.0.1.10

grafana_servers:
  - host: 10.0.1.10

alertmanager_servers:
  - host: 10.0.1.10
4 changes: 4 additions & 0 deletions production-deployment-using-tiup.md
@@ -97,6 +97,10 @@ The following topology documents provide a cluster configuration template for ea

This is to deploy TiDB Binlog along with the minimal cluster topology. TiDB Binlog is a widely used component for replicating incremental data. It provides near real-time backup and replication.

- [TiSpark deployment topology](/tispark-deployment-topology.md)

This is to deploy TiSpark along with the minimal cluster topology. TiSpark is a component built for running Apache Spark on top of TiDB/TiKV to answer OLAP queries. Currently, TiUP cluster's support for TiSpark is still **experimental**.

- [Hybrid deployment topology](/hybrid-deployment-topology.md)

This is to deploy multiple instances on a single machine. You need to add extra configurations for the directory, port, resource ratio, and label.
44 changes: 44 additions & 0 deletions tispark-deployment-topology.md
@@ -0,0 +1,44 @@
---
title: TiSpark Deployment Topology
summary: Learn the deployment topology of TiSpark using TiUP based on the minimal TiDB topology.
---

# TiSpark Deployment Topology

> **Warning:**
>
> TiSpark support in the TiUP cluster is still an experimental feature. It is **NOT** recommended to use it in production environments.

This document introduces the TiSpark deployment topology and how to deploy TiSpark based on the minimum cluster topology.

TiSpark is a component built for running Apache Spark on top of TiDB/TiKV to answer complex OLAP queries. It brings benefits of both the Spark platform and the distributed TiKV cluster to TiDB and makes TiDB a one-stop solution for both online transactions and analytics.

For more information about TiSpark, see [TiSpark User Guide](/tispark-overview.md).

## Topology information

| Instance | Count | Physical machine configuration | IP | Configuration |
| :-- | :-- | :-- | :-- | :-- |
| TiDB | 3 | 16 VCore 32GB * 1 | 10.0.1.1 <br/> 10.0.1.2 <br/> 10.0.1.3 | Default port <br/> Global directory configuration |
| PD | 3 | 4 VCore 8GB * 1 |10.0.1.4 <br/> 10.0.1.5 <br/> 10.0.1.6 | Default port <br/> Global directory configuration |
| TiKV | 3 | 16 VCore 32GB 2TB (nvme ssd) * 1 | 10.0.1.7 <br/> 10.0.1.8 <br/> 10.0.1.9 | Default port <br/> Global directory configuration |
| TiSpark | 3 | 8 VCore 16GB * 1 | 10.0.1.21 (master) <br/> 10.0.1.22 (worker) <br/> 10.0.1.23 (worker) | Default port <br/> Global directory configuration |
| Monitoring & Grafana | 1 | 4 VCore 8GB * 1, 500GB (ssd) | 10.0.1.10 | Default port <br/> Global directory configuration |

## Topology templates

- [Simple TiSpark topology template](/config-templates/simple-tispark.yaml)
- [Complex TiSpark topology template](/config-templates/complex-tispark.yaml)

> **Note:**
>
> - You do not need to manually create the `tidb` user in the configuration file. The TiUP cluster component automatically creates the `tidb` user on the target machines. You can customize the user, or keep the user consistent with the control machine.
> - If you configure the deployment directory as a relative path, the cluster will be deployed in the home directory of the user.
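
After you edit one of the templates above into your own topology file, you can deploy the cluster with the TiUP cluster component. The following is a minimal sketch; the cluster name `tispark-test`, the version `v4.0.5`, and the file name `topology.yaml` are example values rather than fixed requirements:

```shell
# Deploy the cluster described in topology.yaml, then start it.
tiup cluster deploy tispark-test v4.0.5 topology.yaml --user root
tiup cluster start tispark-test
```
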
## Prerequisites

TiSpark is based on the Apache Spark cluster, so before you start the TiDB cluster that contains TiSpark, you must ensure that Java Runtime Environment (JRE) 8 is installed on the server that deploys TiSpark. Otherwise, TiSpark cannot be started.

TiUP does not support installing JRE automatically. You need to install it on your own. For detailed installation instructions, see [How to download and install prebuilt OpenJDK packages](https://openjdk.java.net/install/).

If JRE 8 is already installed on the deployment server but not in the default path of the system's package management tool, you can specify the path of the JRE to be used by setting the `java_home` parameter in the topology configuration. This parameter corresponds to the `JAVA_HOME` system environment variable.
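
For example, if the JRE is installed under `/usr/local/bin/java-1.8.0` (an example path taken from the templates above, not a required location), you can point the TiSpark nodes at it in the topology file. A minimal sketch:

```yaml
tispark_masters:
  - host: 10.0.1.21
    # Directory that contains the JRE; corresponds to the JAVA_HOME variable.
    java_home: "/usr/local/bin/java-1.8.0"

tispark_workers:
  - host: 10.0.1.22
    java_home: "/usr/local/bin/java-1.8.0"
```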
4 changes: 4 additions & 0 deletions tiup/tiup-cluster.md
@@ -430,6 +430,10 @@ tiup cluster patch test-cluster /tmp/tidb-hotfix.tar.gz -N 172.16.4.5:4000

## Import TiDB Ansible cluster

> **Note:**
>
> Currently, TiUP cluster's support for TiSpark is still **experimental**. Importing a TiDB cluster with TiSpark enabled is not supported.

Before TiUP was released, TiDB Ansible was often used to deploy TiDB clusters. To enable TiUP to take over a cluster deployed by TiDB Ansible, use the `import` command.

The usage of the `import` command is as follows:
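
A minimal sketch, assuming the cluster was deployed from a `tidb-ansible` directory on the control machine (the path is an example value):

```shell
# Import the cluster from the tidb-ansible directory into TiUP's management.
tiup cluster import --dir=/path/to/tidb-ansible
```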
