Skip to content

Commit 1e44417

Browse files
andygroveloic-sharmaalamb
authored
Add DataFrame reference to the user guide (#3067)
* DataFrame reference for user guide * Add DataFrame methods * improvements * improvements * improvements * formatting * Update docs/source/user-guide/dataframe.md Co-authored-by: Loïc Sharma <737941+loic-sharma@users.noreply.github.com> * Update docs/source/user-guide/dataframe.md Co-authored-by: Loïc Sharma <737941+loic-sharma@users.noreply.github.com> * more docs * more docs * Update docs/source/user-guide/dataframe.md Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Update docs/source/user-guide/dataframe.md Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Update docs/source/user-guide/dataframe.md Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Update docs/source/user-guide/dataframe.md Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * add note on lazy evaluation Co-authored-by: Loïc Sharma <737941+loic-sharma@users.noreply.github.com> Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 098f0b0 commit 1e44417

File tree

4 files changed

+315
-24
lines changed

4 files changed

+315
-24
lines changed

datafusion/core/src/dataframe.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ impl DataFrame {
621621
plan_to_json(&state, plan, path).await
622622
}
623623

624-
/// Create a projection based on arbitrary expressions.
624+
/// Add an additional column to the DataFrame.
625625
///
626626
/// ```
627627
/// # use datafusion::prelude::*;

datafusion/expr/src/expr_fn.rs

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -251,12 +251,10 @@ pub fn rollup(exprs: Vec<Expr>) -> Expr {
251251
Expr::GroupingSet(GroupingSet::Rollup(exprs))
252252
}
253253

254-
// TODO(kszucs): this seems buggy, unary_scalar_expr! is used for many
255-
// varying arity functions
256254
/// Create an convenience function representing a unary scalar function
257255
macro_rules! unary_scalar_expr {
258-
($ENUM:ident, $FUNC:ident) => {
259-
#[doc = concat!("Unary scalar function definition for ", stringify!($FUNC) ) ]
256+
($ENUM:ident, $FUNC:ident, $DOC:expr) => {
257+
#[doc = $DOC ]
260258
pub fn $FUNC(e: Expr) -> Expr {
261259
Expr::ScalarFunction {
262260
fun: built_in_function::BuiltinScalarFunction::$ENUM,
@@ -293,25 +291,32 @@ macro_rules! nary_scalar_expr {
293291
// generate methods for creating the supported unary/binary expressions
294292

295293
// math functions
296-
unary_scalar_expr!(Sqrt, sqrt);
297-
unary_scalar_expr!(Sin, sin);
298-
unary_scalar_expr!(Cos, cos);
299-
unary_scalar_expr!(Tan, tan);
300-
unary_scalar_expr!(Asin, asin);
301-
unary_scalar_expr!(Acos, acos);
302-
unary_scalar_expr!(Atan, atan);
303-
unary_scalar_expr!(Floor, floor);
304-
unary_scalar_expr!(Ceil, ceil);
305-
unary_scalar_expr!(Now, now);
306-
unary_scalar_expr!(Round, round);
307-
unary_scalar_expr!(Trunc, trunc);
308-
unary_scalar_expr!(Abs, abs);
309-
unary_scalar_expr!(Signum, signum);
310-
unary_scalar_expr!(Exp, exp);
311-
unary_scalar_expr!(Log2, log2);
312-
unary_scalar_expr!(Log10, log10);
313-
unary_scalar_expr!(Ln, ln);
314-
unary_scalar_expr!(NullIf, nullif);
294+
unary_scalar_expr!(Sqrt, sqrt, "square root of a number");
295+
unary_scalar_expr!(Sin, sin, "sine");
296+
unary_scalar_expr!(Cos, cos, "cosine");
297+
unary_scalar_expr!(Tan, tan, "tangent");
298+
unary_scalar_expr!(Asin, asin, "inverse sine");
299+
unary_scalar_expr!(Acos, acos, "inverse cosine");
300+
unary_scalar_expr!(Atan, atan, "inverse tangent");
301+
unary_scalar_expr!(
302+
Floor,
303+
floor,
304+
"nearest integer less than or equal to argument"
305+
);
306+
unary_scalar_expr!(
307+
Ceil,
308+
ceil,
309+
"nearest integer greater than or equal to argument"
310+
);
311+
unary_scalar_expr!(Round, round, "round to nearest integer");
312+
unary_scalar_expr!(Trunc, trunc, "truncate toward zero");
313+
unary_scalar_expr!(Abs, abs, "absolute value");
314+
unary_scalar_expr!(Signum, signum, "sign of the argument (-1, 0, +1) ");
315+
unary_scalar_expr!(Exp, exp, "base 2 logarithm");
316+
unary_scalar_expr!(Log2, log2, "base 10 logarithm");
317+
unary_scalar_expr!(Log10, log10, "base 10 logarithm");
318+
unary_scalar_expr!(Ln, ln, "natural logarithm");
319+
unary_scalar_expr!(NullIf, nullif, "The NULLIF function returns a null value if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression."); //TODO this is not a unary expression https://github.com/apache/arrow-datafusion/issues/3069
315320
scalar_expr!(Power, power, base, exponent);
316321
scalar_expr!(Atan2, atan2, y, x);
317322

@@ -357,6 +362,7 @@ nary_scalar_expr!(Concat, concat_expr);
357362
nary_scalar_expr!(Now, now_expr);
358363

359364
// date functions
365+
unary_scalar_expr!(Now, now, "current time"); //TODO this is not a unary expression https://github.com/apache/arrow-datafusion/issues/3069
360366
scalar_expr!(DatePart, date_part, part, date);
361367
scalar_expr!(DateTrunc, date_trunc, part, date);
362368
scalar_expr!(DateBin, date_bin, stride, source, origin);

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Table of Contents
4242
user-guide/example-usage
4343
user-guide/library
4444
user-guide/cli
45+
user-guide/dataframe
4546
user-guide/sql/index
4647
user-guide/configs
4748
user-guide/faq
Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
<!---
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# DataFrame API
21+
22+
A DataFrame represents a logical set of rows with the same named columns, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or
23+
[Spark DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html).
24+
25+
DataFrames are typically created by calling a method on
26+
`SessionContext`, such as `read_csv`, and can then be modified
27+
by calling the transformation methods, such as `filter`, `select`, `aggregate`, and `limit`
28+
to build up a query definition.
29+
30+
The query can be executed by calling the `collect` method.
31+
32+
The API is well documented in the [API reference on docs.rs](https://docs.rs/datafusion/latest/datafusion/dataframe/struct.DataFrame.html)
33+
34+
The DataFrame struct is part of DataFusion's prelude and can be imported with the following statement.
35+
36+
```rust
37+
use datafusion::prelude::*;
38+
```
39+
40+
Here is a minimal example showing the execution of a query using the DataFrame API.
41+
42+
```rust
43+
let ctx = SessionContext::new();
44+
let df = ctx.read_csv("tests/example.csv", CsvReadOptions::new()).await?;
45+
let df = df.filter(col("a").lt_eq(col("b")))?
46+
.aggregate(vec![col("a")], vec![min(col("b"))])?
47+
.limit(None, Some(100))?;
48+
// Print results
49+
df.show();
50+
```
51+
52+
## DataFrame Transformations
53+
54+
These methods create a new DataFrame after applying a transformation to the logical plan that the DataFrame represents.
55+
56+
DataFusion DataFrames use lazy evaluation, meaning that each transformation is just creating a new query plan and
57+
not actually performing any transformations. This approach allows for the overall plan to be optimized before
58+
execution. The plan is evaluated (executed) when an action method is invoked, such as `collect`.
59+
60+
| Function | Notes |
61+
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
62+
| aggregate | Perform an aggregate query with optional grouping expressions. |
63+
| distinct | Filter out duplicate rows. |
64+
| except | Calculate the exception of two DataFrames. The two DataFrames must have exactly the same schema |
65+
| filter | Filter a DataFrame to only include rows that match the specified filter expression. |
66+
| intersect | Calculate the intersection of two DataFrames. The two DataFrames must have exactly the same schema |
67+
| join | Join this DataFrame with another DataFrame using the specified columns as join keys. |
68+
| limit | Limit the number of rows returned from this DataFrame. |
69+
| repartition | Repartition a DataFrame based on a logical partitioning scheme. |
70+
| sort | Sort the DataFrame by the specified sorting expressions. Any expression can be turned into a sort expression by calling its `sort` method. |
71+
| select | Create a projection based on arbitrary expressions. Example: `df..select(vec![col("c1"), abs(col("c2"))])?` |
72+
| select_columns | Create a projection based on column names. Example: `df.select_columns(&["id", "name"])?`. |
73+
| union | Calculate the union of two DataFrames, preserving duplicate rows. The two DataFrames must have exactly the same schema. |
74+
| union_distinct | Calculate the distinct union of two DataFrames. The two DataFrames must have exactly the same schema. |
75+
| with_column | Add an additional column to the DataFrame. |
76+
| with_column_renamed | Rename one column by applying a new projection. |
77+
78+
## DataFrame Actions
79+
80+
These methods execute the logical plan represented by the DataFrame and either collects the results into memory, prints them to stdout, or writes them to disk.
81+
82+
| Function | Notes |
83+
| -------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
84+
| collect | Executes this DataFrame and collects all results into a vector of RecordBatch. |
85+
| collect_partitioned | Executes this DataFrame and collects all results into a vector of vector of RecordBatch maintaining the input partitioning. |
86+
| execute_stream | Executes this DataFrame and returns a stream over a single partition. |
87+
| execute_stream_partitioned | Executes this DataFrame and returns one stream per partition. |
88+
| show | Execute this DataFrame and print the results to stdout. |
89+
| show_limit | Execute this DataFrame and print a subset of results to stdout. |
90+
| write_csv | Execute this DataFrame and write the results to disk in CSV format. |
91+
| write_json | Execute this DataFrame and write the results to disk in JSON format. |
92+
| write_parquet | Execute this DataFrame and write the results to disk in Parquet format. |
93+
94+
## Other DataFrame Methods
95+
96+
| Function | Notes |
97+
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
98+
| explain | Return a DataFrame with the explanation of its plan so far. |
99+
| registry | Return a `FunctionRegistry` used to plan udf's calls. |
100+
| schema | Returns the schema describing the output of this DataFrame in terms of columns returned, where each column has a name, data type, and nullability attribute. |
101+
| to_logical_plan | Return the logical plan represented by this DataFrame. |
102+
103+
# Expressions
104+
105+
DataFrame methods such as `select` and `filter` accept one or more logical expressions and there are many functions
106+
available for creating logical expressions. These are documented below.
107+
108+
Expressions can be chained together using a fluent-style API:
109+
110+
```rust
111+
// create the expression `(a > 5) AND (b < 7)`
112+
col("a").gt(lit(5)).and(col("b").lt(lit(7)))
113+
```
114+
115+
## Identifiers
116+
117+
| Function | Notes |
118+
| -------- | -------------------------------------------- |
119+
| col | Reference a column in a dataframe `col("a")` |
120+
121+
## Literal Values
122+
123+
| Function | Notes |
124+
| -------- | -------------------------------------------------- |
125+
| lit | Literal value such as `lit(123)` or `lit("hello")` |
126+
127+
## Boolean Expressions
128+
129+
| Function | Notes |
130+
| -------- | ----------------------------------------- |
131+
| and | `and(expr1, expr2)` or `expr1.and(expr2)` |
132+
| or | `or(expr1, expr2)` or `expr1.or(expr2)` |
133+
| not | `not(expr)` or `expr.not()` |
134+
135+
## Comparison Expressions
136+
137+
| Function | Notes |
138+
| -------- | --------------------- |
139+
| eq | `expr1.eq(expr2)` |
140+
| gt | `expr1.gt(expr2)` |
141+
| gt_eq | `expr1.gt_eq(expr2)` |
142+
| lt | `expr1.lt(expr2)` |
143+
| lt_eq | `expr1.lt_eq(expr2)` |
144+
| not_eq | `expr1.not_eq(expr2)` |
145+
146+
## Math Functions
147+
148+
In addition to the math functions listed here, some Rust operators are implemented for expressions, allowing
149+
expressions such as `col("a") + col("b")` to be used.
150+
151+
| Function | Notes |
152+
| --------------------- | ------------------------------------------------- |
153+
| abs(x) | absolute value |
154+
| acos(x) | inverse cosine |
155+
| asin(x) | inverse sine |
156+
| atan(x) | inverse tangent |
157+
| atan2(y, x) | inverse tangent of y / x |
158+
| ceil(x) | nearest integer greater than or equal to argument |
159+
| cos(x) | cosine |
160+
| exp(x) | exponential |
161+
| floor(x) | nearest integer less than or equal to argument |
162+
| ln(x) | natural logarithm |
163+
| log10(x) | base 10 logarithm |
164+
| log2(x) | base 2 logarithm |
165+
| power(base, exponent) | base raised to the power of exponent |
166+
| round(x) | round to nearest integer |
167+
| signum(x) | sign of the argument (-1, 0, +1) |
168+
| sin(x) | sine |
169+
| sqrt(x) | square root |
170+
| tan(x) | tangent |
171+
| trunc(x) | truncate toward zero |
172+
173+
## Conditional Expressions
174+
175+
| Function | Notes |
176+
| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
177+
| coalesce | Returns the first of its arguments that is not null. Null is returned only if all arguments are null. It is often used to substitute a default value for null values when data is retrieved for display. |
178+
| case | CASE expression. Example: `case(expr).when(expr, expr).when(expr, expr).otherwise(expr).end()`. |
179+
| nullif | Returns a null value if `value1` equals `value2`; otherwise it returns `value1`. This can be used to perform the inverse operation of the `coalesce` expression. |
180+
181+
## String Expressions
182+
183+
| Function | Notes |
184+
| ---------------- | ----- |
185+
| ascii | |
186+
| bit_length | |
187+
| btrim | |
188+
| char_length | |
189+
| character_length | |
190+
| concat | |
191+
| concat_ws | |
192+
| chr | |
193+
| initcap | |
194+
| left | |
195+
| length | |
196+
| lower | |
197+
| lpad | |
198+
| ltrim | |
199+
| md5 | |
200+
| octet_length | |
201+
| repeat | |
202+
| replace | |
203+
| reverse | |
204+
| right | |
205+
| rpad | |
206+
| rtrim | |
207+
| digest | |
208+
| split_part | |
209+
| starts_with | |
210+
| strpos | |
211+
| substr | |
212+
| translate | |
213+
| trim | |
214+
| upper | |
215+
216+
## Regular Expressions
217+
218+
| Function | Notes |
219+
| -------------- | ----- |
220+
| regexp_match | |
221+
| regexp_replace | |
222+
223+
## Temporal Expressions
224+
225+
| Function | Notes |
226+
| -------------------- | ------------ |
227+
| date_part | |
228+
| date_trunc | |
229+
| from_unixtime | |
230+
| to_timestamp | |
231+
| to_timestamp_millis | |
232+
| to_timestamp_micros | |
233+
| to_timestamp_seconds | |
234+
| now() | current time |
235+
236+
## Other Expressions
237+
238+
| Function | Notes |
239+
| -------- | ----- |
240+
| array | |
241+
| in_list | |
242+
| random | |
243+
| sha224 | |
244+
| sha256 | |
245+
| sha384 | |
246+
| sha512 | |
247+
| struct | |
248+
| to_hex | |
249+
250+
## Aggregate Functions
251+
252+
| Function | Notes |
253+
| ---------------------------------- | ----- |
254+
| avg | |
255+
| approx_distinct | |
256+
| approx_median | |
257+
| approx_percentile_cont | |
258+
| approx_percentile_cont_with_weight | |
259+
| count | |
260+
| count_distinct | |
261+
| cube | |
262+
| grouping_set | |
263+
| max | |
264+
| median | |
265+
| min | |
266+
| rollup | |
267+
| sum | |
268+
269+
## Subquery Expressions
270+
271+
| Function | Notes |
272+
| --------------- | --------------------------------------------------------------------------------------------- |
273+
| exists | |
274+
| in_subquery | `df1.filter(in_subquery(col("foo"), df2))?` is the equivalent of the SQL `WHERE foo IN <df2>` |
275+
| not_exists | |
276+
| not_in_subquery | |
277+
| scalar_subquery | |
278+
279+
## User-Defined Function Expressions
280+
281+
| Function | Notes |
282+
| ----------- | ----- |
283+
| create_udf | |
284+
| create_udaf | |

0 commit comments

Comments
 (0)