
Commit ec22209

Authored by AmosAidoo and alamb

Fix documentation warnings and error if any more occur (#14952)

* feat: treat sphinx-build warnings as errors; this should fail during build until warnings are fixed
* fix: `./gen` is not a valid link in the current context; replace the link with code backticks
* fix: replace with valid docs.rs link
* fix: respect hierarchy from H2 -> H3
* apply prettier fixes
* Add build.sh to the CI check
* tweak
* fix
* Update spans elsewhere

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>

1 parent 613d925, commit ec22209

File tree: 8 files changed (+54, -9 lines)

.github/workflows/docs.yaml (17 additions, 0 deletions)

@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
 on:
   push:
     branches:

.github/workflows/docs_pr.yaml (29 additions, 1 deletion)

@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.

+# Tests for Docs that runs on PRs
 name: Docs

 concurrency:
@@ -48,7 +49,34 @@ jobs:
         uses: ./.github/actions/setup-builder
         with:
           rust-version: stable
-      - name: Run doctests
+      - name: Run doctests (embedded rust examples)
        run: cargo test --doc --features avro,json
       - name: Verify Working Directory Clean
        run: git diff --exit-code
+
+  # Test doc build
+  linux-test-doc-build:
+    name: Test doc build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install doc dependencies
+        run: |
+          set -x
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r docs/requirements.txt
+      - name: Build docs html and check for warnings
+        run: |
+          set -x
+          source venv/bin/activate
+          cd docs
+          ./build.sh # fails on errors
+
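The new `linux-test-doc-build` job above can be reproduced locally with the same commands. A minimal sketch, assuming a DataFusion checkout with `docs/requirements.txt` present; it mirrors the workflow steps rather than adding anything new:

```shell
# Reproduce the "Test doc build" CI job locally (run from the repository root).
set -x
python3 -m venv venv
source venv/bin/activate
pip install -r docs/requirements.txt
cd docs
./build.sh   # exits non-zero if the Sphinx build emits any warning
```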
datafusion/common/src/config.rs (1 addition, 1 deletion)

@@ -253,7 +253,7 @@ config_namespace! {
     pub support_varchar_with_length: bool, default = true

     /// When set to true, the source locations relative to the original SQL
-    /// query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected
+    /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected
     /// and recorded in the logical plan nodes.
     pub collect_spans: bool, default = false

datafusion/sqllogictest/test_files/information_schema.slt (1 addition, 1 deletion)

@@ -355,7 +355,7 @@ datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a
 datafusion.optimizer.repartition_windows true Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level
 datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail
 datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys
-datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes.
+datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes.
 datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
 datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
 datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.

docs/build.sh (1 addition, 1 deletion)

@@ -28,4 +28,4 @@ sed -i -e 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/

 python rustdoc_trim.py

-make SOURCEDIR=`pwd`/temp html
+make SOURCEDIR=`pwd`/temp SPHINXOPTS=-W html
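This one-line change works because the Sphinx Makefile forwards `SPHINXOPTS` to `sphinx-build`, whose `-W` flag promotes every warning to an error. A sketch of the equivalent direct invocation (the directory names here are illustrative, not taken from the repository):

```shell
# -W: fail the build on any warning instead of producing HTML with broken
# links. --keep-going (optional) reports all warnings before exiting non-zero.
sphinx-build -W --keep-going -b html source build/html
```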
docs/source/contributor-guide/howtos.md (2 additions, 2 deletions)

@@ -141,9 +141,9 @@ taplo fmt

 ## How to update protobuf/gen dependencies

-The prost/tonic code can be generated by running `./regen.sh`, which in turn invokes the Rust binary located in [gen](./gen)
+The prost/tonic code can be generated by running `./regen.sh`, which in turn invokes the Rust binary located in `./gen`

-This is necessary after modifying the protobuf definitions or altering the dependencies of [gen](./gen), and requires a
+This is necessary after modifying the protobuf definitions or altering the dependencies of `./gen`, and requires a
 valid installation of [protoc] (see [installation instructions] for details).

 ```bash

docs/source/library-user-guide/query-optimizer.md (2 additions, 2 deletions)

@@ -401,7 +401,7 @@ interval arithmetic to take an expression such as `a > 2500 AND a <= 5000` and
 build an accurate selectivity estimate that can then be used to find more efficient
 plans.

-#### `AnalysisContext` API
+### `AnalysisContext` API

 The `AnalysisContext` serves as a shared knowledge base during expression evaluation
 and boundary analysis. Think of it as a dynamic repository that maintains information about:
@@ -414,7 +414,7 @@ What makes `AnalysisContext` particularly powerful is its ability to propagate i
 through the expression tree. As each node in the expression tree is analyzed, it can both
 read from and write to this shared context, allowing for sophisticated boundary analysis and inference.

-#### `ColumnStatistics` for Cardinality Estimation
+### `ColumnStatistics` for Cardinality Estimation

 Column statistics form the foundation of optimization decisions. Rather than just tracking
 simple metrics, DataFusion's `ColumnStatistics` provides a rich set of information including:

docs/source/user-guide/configs.md (1 addition, 1 deletion)

@@ -127,5 +127,5 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. |
 | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. |
 | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. |
-| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. |
+| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. |
 | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries |
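For context, the `collect_spans` option documented in this table can be toggled per session like other DataFusion settings. A sketch using `datafusion-cli`; the `SET`/`SHOW` statements are standard DataFusion syntax, but the session itself is illustrative and assumes `datafusion-cli` is installed:

```shell
# Enable span collection for the current session, then inspect the setting.
datafusion-cli -c "SET datafusion.sql_parser.collect_spans = true;" \
               -c "SHOW datafusion.sql_parser.collect_spans;"
```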
