remove python docs (they're added to the python project in PR#)
fix some links and docs
fix docker build for datafusion-cli and update docs
improve left nav link naming for clarity
consolidated the documentation for the CLI into one page per issue apache#1352
fix the CSV schema inference in datafusion-cli docs per apache#3001
kmitchener committed Aug 1, 2022
1 parent c179102 commit 86cd179
Showing 26 changed files with 280 additions and 706 deletions.
2 changes: 1 addition & 1 deletion .dockerignore
@@ -1,2 +1,2 @@
 .git
-**target
+**/target
12 changes: 4 additions & 8 deletions datafusion-cli/Dockerfile
@@ -17,19 +17,15 @@

 FROM rust:1.59 as builder
 
-COPY ./datafusion /usr/src/datafusion
-
-COPY ./datafusion-cli /usr/src/datafusion-cli
-
-WORKDIR /usr/src/datafusion-cli
-
 RUN rustup component add rustfmt
+WORKDIR /usr/src/build
+COPY . .
+WORKDIR /usr/src/build/datafusion-cli
 
 RUN cargo build --release
 
 FROM debian:bullseye-slim
 
-COPY --from=builder /usr/src/datafusion-cli/target/release/datafusion-cli /usr/local/bin
+COPY --from=builder /usr/src/build/datafusion-cli/target/release/datafusion-cli /usr/local/bin
 
 ENTRYPOINT ["datafusion-cli"]

14 changes: 10 additions & 4 deletions datafusion/core/src/lib.rs
@@ -106,7 +106,7 @@
 //! Specifically, when DataFusion receives an SQL query, there are different steps
 //! that it passes through until a result is obtained. Broadly, they are:
 //!
-//! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.6.1/sqlparser/).
+//! 1. The string is parsed to an Abstract syntax tree (AST) using [sqlparser](https://docs.rs/sqlparser/0.18.0/sqlparser/).
 //! 2. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical expressions on the AST to logical expressions [`Expr`s](logical_plan::Expr).
 //! 3. The planner [`SqlToRel`](sql::planner::SqlToRel) converts logical nodes on the AST to a [`LogicalPlan`](logical_plan::LogicalPlan).
 //! 4. [`OptimizerRules`](optimizer::optimizer::OptimizerRule) are applied to the [`LogicalPlan`](logical_plan::LogicalPlan) to optimize it.
@@ -139,12 +139,12 @@
 //!
 //! A [`ExecutionPlan`](physical_plan::ExecutionPlan) is composed by nodes (implement the trait [`ExecutionPlan`](physical_plan::ExecutionPlan)),
 //! and each node is composed by physical expressions ([`PhysicalExpr`](physical_plan::PhysicalExpr))
-//! or aggreagate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
+//! or aggregate expressions ([`AggregateExpr`](physical_plan::AggregateExpr)).
 //! All of these are located in the module [`physical_plan`](physical_plan).
 //!
 //! Broadly speaking,
 //!
-//! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asyncronosly returns
+//! * an [`ExecutionPlan`](physical_plan::ExecutionPlan) receives a partition number and asynchronously returns
 //! an iterator over [`RecordBatch`](arrow::record_batch::RecordBatch)
 //! (a node-specific struct that implements [`RecordBatchReader`](arrow::record_batch::RecordBatchReader))
 //! * a [`PhysicalExpr`](physical_plan::PhysicalExpr) receives a [`RecordBatch`](arrow::record_batch::RecordBatch)
@@ -249,5 +249,11 @@ doc_comment::doctest!("../../../README.md", readme_example_test);
 #[cfg(doctest)]
 doc_comment::doctest!(
     "../../../docs/source/user-guide/example-usage.md",
-    user_guid_example_tests
+    user_guide_example_usage
 );
+
+#[cfg(doctest)]
+doc_comment::doctest!(
+    "../../../docs/source/user-guide/library.md",
+    user_guide_library
+);
1 change: 0 additions & 1 deletion docs/.gitignore
@@ -16,5 +16,4 @@
 # under the License.
 
 build
-source/python/generated
 venv/
3 changes: 1 addition & 2 deletions docs/requirements.txt
@@ -15,9 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
-datafusion
 sphinx
-pydata-sphinx-theme==0.8.0
+pydata-sphinx-theme==0.8.1
 myst-parser
 maturin
 jinja2
4 changes: 0 additions & 4 deletions docs/source/_templates/docs-sidebar.html
@@ -10,10 +10,6 @@

 <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
     <div class="bd-toc-item active">
-        {% if "python/api" in pagename or "python/generated" in pagename %}
-        {{ generate_nav_html("sidebar", startdepth=0, maxdepth=3, collapse=False, includehidden=True, titles_only=True) }}
-        {% else %}
         {{ generate_nav_html("sidebar", startdepth=0, maxdepth=4, collapse=False, includehidden=True, titles_only=True) }}
-        {% endif %}
     </div>
 </nav>
146 changes: 146 additions & 0 deletions docs/source/cli/index.md
@@ -0,0 +1,146 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# DataFusion Command-line

The Arrow DataFusion CLI is an interactive command-line SQL utility that allows
queries to be executed against CSV, JSON, Parquet, and Avro files. It embeds the DataFusion library
in a command-line utility and is a convenient way to try DataFusion with your own data sources.

## Installation

### Via Cargo

If you already have the Rust toolchain installed, the easiest way to install DataFusion CLI is to install it via cargo:

```shell
cargo install datafusion-cli
```

Or you can build from source, taking the latest code from master:

```shell
git clone https://github.com/apache/arrow-datafusion && cd arrow-datafusion/datafusion-cli
cargo install --path .
```

### Via Docker

If you don't have the Rust toolchain installed, but you do have Docker, you can build DataFusion CLI inside Docker.
There is no officially published Docker image for the DataFusion CLI, so it is necessary to build from source
instead.

Use the following commands to clone this repository and build a Docker image containing the CLI tool.

```shell
git clone https://github.com/apache/arrow-datafusion && cd arrow-datafusion
docker build -f datafusion-cli/Dockerfile . --tag datafusion-cli
# mount your data directory into the container (set $your_data_location first)
docker run -it -v "$your_data_location":/data datafusion-cli
```

### Via Homebrew (on macOS)

DataFusion CLI can also be installed on macOS via Homebrew, like any other pre-built software:

```shell
brew install datafusion
# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/manifests/5.0.0
# ######################################################################## 100.0%
# ==> Downloading https://ghcr.io/v2/homebrew/core/datafusion/blobs/sha256:9ecc8a01be47ceb9a53b39976696afa87c0a8
# ==> Downloading from https://pkg-containers.githubusercontent.com/ghcr1/blobs/sha256:9ecc8a01be47ceb9a53b39976
# ######################################################################## 100.0%
# ==> Pouring datafusion--5.0.0.big_sur.bottle.tar.gz
# 🍺 /usr/local/Cellar/datafusion/5.0.0: 9 files, 17.4MB
datafusion-cli
```

## Usage

```
Apache Arrow <dev@arrow.apache.org>
Command Line Client for DataFusion query engine.
USAGE:
datafusion-cli [OPTIONS]
OPTIONS:
-c, --batch-size <BATCH_SIZE> The batch size of each query, or use DataFusion default
-f, --file <FILE>... Execute commands from file(s), then exit
--format <FORMAT> [default: table] [possible values: csv, tsv, table, json,
nd-json]
-h, --help Print help information
-p, --data-path <DATA_PATH> Path to your data, default to current directory
-q, --quiet Reduce printing other than the results and work quietly
-r, --rc <RC>... Run the provided files on startup instead of ~/.datafusionrc
-V, --version Print version information
```

Type `\q` to exit the CLI.

### Registering Parquet Data Sources

Parquet data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is not necessary to provide schema information for Parquet files.

```sql
CREATE EXTERNAL TABLE taxi
STORED AS PARQUET
LOCATION '/mnt/nyctaxi/tripdata.parquet';
```
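Once the table is registered, it can be queried with standard SQL in the same CLI session; for example, a simple row count:

```sql
SELECT COUNT(*) FROM taxi;
```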

### Registering CSV Data Sources

CSV data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement. It is necessary to provide schema information for CSV files since DataFusion does not automatically infer the schema when using SQL to query CSV files.

```sql
CREATE EXTERNAL TABLE test (
c1 VARCHAR NOT NULL,
c2 INT NOT NULL,
c3 SMALLINT NOT NULL,
c4 SMALLINT NOT NULL,
c5 INT NOT NULL,
c6 BIGINT NOT NULL,
c7 SMALLINT NOT NULL,
c8 INT NOT NULL,
c9 BIGINT NOT NULL,
c10 VARCHAR NOT NULL,
c11 FLOAT NOT NULL,
c12 DOUBLE NOT NULL,
c13 VARCHAR NOT NULL
)
STORED AS CSV
WITH HEADER ROW
LOCATION '/path/to/aggregate_test_100.csv';
```
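After registration, the CSV-backed table is queryable like any other. For example, an illustrative aggregate over the columns declared in the schema above:

```sql
SELECT c1, MIN(c12), MAX(c12)
FROM test
GROUP BY c1;
```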

## CLI Commands

Available commands inside DataFusion CLI are:

| Command | Description |
|:-----------------------|-------------------------------------|
| `\q` | quit |
| `\?` | help |
| `\d` | list tables |
| `\d table_name` | describe table |
| `\quiet <true/false>` | enable/disable quiet mode |
| `\h` | list available commands |
| `\h function` | get help for specific command |
| `\pset [NAME [VALUE]]` | set option (eg: `\pset format csv`) |

128 changes: 0 additions & 128 deletions docs/source/cli/index.rst

This file was deleted.

5 changes: 2 additions & 3 deletions docs/source/community/communication.md
@@ -75,9 +75,8 @@ We will send a summary of all sync ups to the dev@arrow.apache.org mailing list.
 Our source code is hosted on
 [GitHub](https://github.com/apache/arrow-datafusion). More information on contributing is in
 the [Contribution Guide](https://github.com/apache/arrow-datafusion/blob/master/CONTRIBUTING.md)
-, and we have curated a [good-first-issue]
-(https://github.com/apache/arrow-datafusion/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
-list to help you get started. You can find datafusion's major designs in docs/source/specification.
+, and we have curated a [good-first-issue](https://github.com/apache/arrow-datafusion/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
+list to help you get started. You can find DataFusion's major designs in docs/source/specification.
 
 We use GitHub issues for maintaining a queue of development work and as the
 public record. We often use Google docs, Github issues and pull requests for
3 changes: 1 addition & 2 deletions docs/source/conf.py
@@ -31,8 +31,6 @@
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
-import datafusion
-
 # -- Project information -----------------------------------------------------
 
 project = 'Arrow DataFusion'
@@ -88,6 +86,7 @@

 html_theme_options = {
     "use_edit_page_button": True,
+    "show_toc_level": 2,
 }
 
 html_context = {