From 592d1de648d1b0e2d4df445e1adf1be694332fda Mon Sep 17 00:00:00 2001 From: Tennix Date: Wed, 15 Aug 2018 16:50:58 +0800 Subject: [PATCH] readme, tispark: update TiSpark and enable sparkR and pyspark (#27) * update tispark to 1.0 * add TiSparkR * upgrade tispark version to 1.0.1 --- README.md | 9 +++++++++ tispark/Dockerfile | 23 ++++++++++++++++++---- tispark/R/DESCRIPTION | 11 +++++++++++ tispark/R/NAMESPACE | 1 + tispark/R/R/tisparkR.R | 41 +++++++++++++++++++++++++++++++++++++++ tispark/R/README.md | 44 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 tispark/R/DESCRIPTION create mode 100644 tispark/R/NAMESPACE create mode 100644 tispark/R/R/tisparkR.R create mode 100644 tispark/R/README.md diff --git a/README.md b/README.md index f926181..c7c13fd 100644 --- a/README.md +++ b/README.md @@ -184,3 +184,12 @@ scala> spark.sql("select count(*) from lineitem").show | 60175| +--------+ ``` + +You can also access Spark with Python or R using the following commands: + +``` +docker-compose exec tispark-master /opt/spark/bin/pyspark +docker-compose exec tispark-master /opt/spark/bin/sparkR +``` + +More documents about TiSpark can be found [here](https://github.com/pingcap/tispark). 
diff --git a/tispark/Dockerfile b/tispark/Dockerfile index 750e377..8271564 100644 --- a/tispark/Dockerfile +++ b/tispark/Dockerfile @@ -2,25 +2,40 @@ FROM anapsix/alpine-java:8 ENV SPARK_VERSION=2.1.1 \ HADOOP_VERSION=2.7 \ - TISPARK_VERSION=0.1.0-SNAPSHOT \ + TISPARK_VERSION=1.0.1 \ + TISPARK_R_VERSION=1.1 \ + TISPARK_PYTHON_VERSION=1.0.1 \ SPARK_HOME=/opt/spark \ SPARK_NO_DAEMONIZE=true \ SPARK_MASTER_PORT=7077 \ SPARK_MASTER_HOST=0.0.0.0 \ SPARK_MASTER_WEBUI_PORT=8080 +ADD R /TiSparkR + # base image only contains busybox version nohup and ps # spark scripts needs nohup in coreutils and ps in procps # and we can use mysql-client to test tidb connection -RUN apk --no-cache add coreutils procps mysql-client python py-pip R \ - && pip install pytispark==1.0.1 pyspark==2.1.2 +RUN apk --no-cache add \ + coreutils \ + mysql-client \ + procps \ + python \ + py-pip \ + R \ + && pip install --no-cache-dir pytispark==${TISPARK_PYTHON_VERSION} \ + && R CMD build TiSparkR \ + && R CMD INSTALL TiSparkR_${TISPARK_R_VERSION}.tar.gz \ + && rm -rf /TiSparkR_${TISPARK_R_VERSION}.tar.gz /TiSparkR RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \ && tar zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /opt/ \ && ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} ${SPARK_HOME} \ - && wget -q http://download.pingcap.org/tispark-${TISPARK_VERSION}-jar-with-dependencies.jar -P ${SPARK_HOME}/jars \ + && wget -q https://github.com/pingcap/tispark/releases/download/${TISPARK_VERSION}/tispark-core-${TISPARK_VERSION}-jar-with-dependencies.jar -P ${SPARK_HOME}/jars \ && wget -q http://download.pingcap.org/tispark-sample-data.tar.gz \ && tar zxf tispark-sample-data.tar.gz -C ${SPARK_HOME}/data/ \ && rm -rf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz tispark-sample-data.tar.gz +ENV PYTHONPATH=${SPARK_HOME}/python/lib/py4j-0.10.4-src.zip:${SPARK_HOME}/python:$PYTHONPATH + WORKDIR 
${SPARK_HOME} diff --git a/tispark/R/DESCRIPTION b/tispark/R/DESCRIPTION new file mode 100644 index 0000000..fbfbe62 --- /dev/null +++ b/tispark/R/DESCRIPTION @@ -0,0 +1,11 @@ +Package: TiSparkR +Type: Package +Title: TiSpark for R +Version: 1.1 +Author: PingCAP +Maintainer: Novemser +Description: A shabby thin layer to support TiSpark in R language. +License: Apache 2.0 +Copyright: 2017 PingCAP, Inc. +Encoding: UTF-8 +LazyData: true diff --git a/tispark/R/NAMESPACE b/tispark/R/NAMESPACE new file mode 100644 index 0000000..d75f824 --- /dev/null +++ b/tispark/R/NAMESPACE @@ -0,0 +1 @@ +exportPattern("^[[:alpha:]]+") diff --git a/tispark/R/R/tisparkR.R b/tispark/R/R/tisparkR.R new file mode 100644 index 0000000..ac7a272 --- /dev/null +++ b/tispark/R/R/tisparkR.R @@ -0,0 +1,41 @@ +# +# Copyright 2017 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Title : TiSparkR +# Objective : TiSpark entry for R +# Created by: novemser +# Created on: 17-11-1 + +# Function:createTiContext +# Create a TiContext (org.apache.spark.sql.TiContext) from a Spark session. +# +# @param session Spark session passed to the TiContext constructor +# @return Reference to the newly created TiContext object +createTiContext <- function(session) { + sparkR.newJObject("org.apache.spark.sql.TiContext", session) +} + +# Function:tidbMapDatabase +# Map the database named `dbName` using the given TiContext.
+# Calls the JVM method, then returns a status string naming `dbName`. +# @param tiContext TiContext to call, e.g. one returned by createTiContext +# @param dbName Name of the database to map +# @param isPrefix Whether to use dbName as a prefix +# @param loadStatistics Whether to use statistics information from TiDB +tidbMapDatabase <- function(tiContext, dbName, isPrefix = FALSE, loadStatistics = TRUE) { + sparkR.callJMethod(tiContext, "tidbMapDatabase", dbName, isPrefix, loadStatistics) + paste("Mapping to database:", dbName) +} diff --git a/tispark/R/README.md b/tispark/R/README.md new file mode 100644 index 0000000..fab5d3f --- /dev/null +++ b/tispark/R/README.md @@ -0,0 +1,44 @@ +## TiSparkR +TiSparkR is a thin layer built to support the R language with TiSpark. + +### Usage +1. Download the TiSparkR source code and build a source package (run `R CMD build R` in the TiSpark root directory). Install it to your local R library (e.g. via `R CMD INSTALL TiSparkR_1.1.tar.gz`). + +2. Build or download the TiSpark dependency jar `tispark-core-1.0.1-jar-with-dependencies.jar` from [here](https://github.com/pingcap/tispark). + +3. `cd` to your Spark home directory, and run: + ``` + ./bin/sparkR --jars /where-ever-it-is/tispark-core-${version}-jar-with-dependencies.jar + ``` + Note that you should replace the `TiSpark` jar path with your own. + +4. 
Use as below in your R console: + ```R + # import tisparkR library + > library(TiSparkR) + # create a TiContext instance + > ti <- createTiContext(spark) + # Map TiContext to database:tpch_test + > tidbMapDatabase(ti, "tpch_test") + + # Run a sql query + > customers <- sql("select * from customer") + # Print schema + > printSchema(customers) + root + |-- c_custkey: long (nullable = true) + |-- c_name: string (nullable = true) + |-- c_address: string (nullable = true) + |-- c_nationkey: long (nullable = true) + |-- c_phone: string (nullable = true) + |-- c_acctbal: decimal(15,2) (nullable = true) + |-- c_mktsegment: string (nullable = true) + |-- c_comment: string (nullable = true) + + # Run a count query + > count <- sql("select count(*) from customer") + # Print count result + > head(count) + count(1) + 1 150 + ```