Skip to content

SQL: use a calendar interval for histograms over 1 month intervals #52586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/reference/sql/functions/grouping.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ actually used will be `INTERVAL '2' DAY`. If the interval specified is less than

[IMPORTANT]
All intervals specified for a date/time HISTOGRAM will use a <<search-aggregations-bucket-datehistogram-aggregation,fixed interval>>
in their `date_histogram` aggregation definition, with the notable exception of `INTERVAL '1' YEAR` where a calendar interval is used.
The choice for a calendar interval was made for having a more intuitive result for YEAR groupings. Calendar intervals consider a one year
in their `date_histogram` aggregation definition, with the notable exceptions of `INTERVAL '1' YEAR` and `INTERVAL '1' MONTH` where a calendar interval is used.
The choice for a calendar interval was made for having a more intuitive result for YEAR and MONTH groupings. In the case of YEAR, for example, a calendar interval considers a one-year
bucket as the one starting on January 1st that specific year, whereas a fixed interval one-year-bucket considers one year as a number
of milliseconds (for example, `31536000000ms` corresponding to 365 days, 24 hours per day, 60 minutes per hour etc.). With fixed intervals,
the day of February 5th, 2019 for example, belongs to a bucket that starts on December 20th, 2018 and {es} (and implicitly {es-sql}) would
Expand Down
28 changes: 28 additions & 0 deletions x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,34 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
null |10
;

histogramOneMonth
schema::h:ts|c:l|birth_date:ts
SELECT HISTOGRAM(birth_date, INTERVAL 1 MONTH) AS h, COUNT(*) as c, birth_date FROM test_emp GROUP BY h, birth_date HAVING c >= 1 ORDER BY h ASC LIMIT 20;

h | c | birth_date
------------------------+---------------+------------------------
null |10 |null
1952-02-01T00:00:00.000Z|1 |1952-02-27T00:00:00.000Z
1952-04-01T00:00:00.000Z|1 |1952-04-19T00:00:00.000Z
1952-05-01T00:00:00.000Z|1 |1952-05-15T00:00:00.000Z
1952-06-01T00:00:00.000Z|1 |1952-06-13T00:00:00.000Z
1952-07-01T00:00:00.000Z|1 |1952-07-08T00:00:00.000Z
1952-08-01T00:00:00.000Z|1 |1952-08-06T00:00:00.000Z
1952-11-01T00:00:00.000Z|1 |1952-11-13T00:00:00.000Z
1952-12-01T00:00:00.000Z|1 |1952-12-24T00:00:00.000Z
1953-01-01T00:00:00.000Z|1 |1953-01-07T00:00:00.000Z
1953-01-01T00:00:00.000Z|1 |1953-01-23T00:00:00.000Z
1953-02-01T00:00:00.000Z|1 |1953-02-08T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-03T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-20T00:00:00.000Z
1953-04-01T00:00:00.000Z|1 |1953-04-21T00:00:00.000Z
1953-07-01T00:00:00.000Z|1 |1953-07-28T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-02T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-19T00:00:00.000Z
1953-09-01T00:00:00.000Z|1 |1953-09-29T00:00:00.000Z
1953-11-01T00:00:00.000Z|1 |1953-11-07T00:00:00.000Z
;

histogramDateTimeWithMonthOnTop
schema::h:i|c:l
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

package org.elasticsearch.xpack.sql.expression.function.grouping;

import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.expression.Expressions.ParamOrdinal;
import org.elasticsearch.xpack.ql.expression.Literal;
Expand All @@ -28,6 +29,8 @@ public class Histogram extends GroupingFunction {

private final Literal interval;
private final ZoneId zoneId;
/** Calendar interval string for one year, taken from {@link DateHistogramInterval#YEAR}. */
public static final String YEAR_INTERVAL = DateHistogramInterval.YEAR.toString();
/** Calendar interval string for one month, taken from {@link DateHistogramInterval#MONTH}. */
public static final String MONTH_INTERVAL = DateHistogramInterval.MONTH.toString();

public Histogram(Source source, Expression field, Expression interval, ZoneId zoneId) {
super(source, field, Collections.singletonList(interval));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
*/
package org.elasticsearch.xpack.sql.expression.function.scalar.datetime;

import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.xpack.ql.expression.Expression;
import org.elasticsearch.xpack.ql.tree.NodeInfo.NodeCtor2;
import org.elasticsearch.xpack.ql.tree.Source;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeProcessor.DateTimeExtractor;

import java.time.ZoneId;
Expand All @@ -18,8 +18,6 @@
*/
public class Year extends DateTimeHistogramFunction {

public static String YEAR_INTERVAL = DateHistogramInterval.YEAR.toString();

/**
 * Creates the function by handing the source, field and zone id to the parent
 * constructor together with {@link DateTimeExtractor#YEAR} as the extractor.
 */
public Year(Source source, Expression field, ZoneId zoneId) {
super(source, field, zoneId, DateTimeExtractor.YEAR);
}
Expand All @@ -41,6 +39,6 @@ public String dateTimeFormat() {

/**
 * Returns the calendar interval string for this function, which is the
 * year interval constant.
 */
@Override
public String calendarInterval() {
return YEAR_INTERVAL;
return Histogram.YEAR_INTERVAL;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
import org.elasticsearch.xpack.sql.expression.function.aggregate.TopHits;
import org.elasticsearch.xpack.sql.expression.function.grouping.Histogram;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.DateTimeHistogramFunction;
import org.elasticsearch.xpack.sql.expression.function.scalar.datetime.Year;
import org.elasticsearch.xpack.sql.expression.literal.interval.IntervalYearMonth;
import org.elasticsearch.xpack.sql.expression.literal.interval.Intervals;
import org.elasticsearch.xpack.sql.plan.logical.Pivot;
Expand Down Expand Up @@ -91,6 +90,8 @@
import java.util.concurrent.atomic.AtomicReference;

import static org.elasticsearch.xpack.ql.util.CollectionUtils.combine;
import static org.elasticsearch.xpack.sql.expression.function.grouping.Histogram.MONTH_INTERVAL;
import static org.elasticsearch.xpack.sql.expression.function.grouping.Histogram.YEAR_INTERVAL;
import static org.elasticsearch.xpack.sql.planner.QueryTranslator.toAgg;
import static org.elasticsearch.xpack.sql.planner.QueryTranslator.toQuery;
import static org.elasticsearch.xpack.sql.type.SqlDataTypes.DATE;
Expand Down Expand Up @@ -283,7 +284,6 @@ static GroupingContext groupBy(List<? extends Expression> groupings) {
field = field.exactAttribute();
key = new GroupByValue(aggId, field.name());
}

// handle functions
else if (exp instanceof Function) {
// dates are handled differently because of date histograms
Expand Down Expand Up @@ -322,13 +322,17 @@ else if (exp instanceof GroupingFunction) {
// date histogram
if (isDateBased(h.dataType())) {
Object value = h.interval().value();
// interval of exactly 1 year
if (value instanceof IntervalYearMonth
&& ((IntervalYearMonth) value).interval().equals(Period.ofYears(1))) {
String calendarInterval = Year.YEAR_INTERVAL;

// When the histogram is `INTERVAL '1' YEAR`, the interval used in the ES date_histogram will be
// a calendar_interval with value "1y". All other intervals will be fixed_intervals expressed in ms.
// interval of exactly 1 year or 1 month
if (value instanceof IntervalYearMonth &&
(((IntervalYearMonth) value).interval().equals(Period.ofYears(1))
|| ((IntervalYearMonth) value).interval().equals(Period.ofMonths(1)))) {
Period yearMonth = ((IntervalYearMonth) value).interval();
String calendarInterval = yearMonth.equals(Period.ofYears(1)) ? YEAR_INTERVAL : MONTH_INTERVAL;

// When the histogram is `INTERVAL '1' YEAR` or `INTERVAL '1' MONTH`, the interval used in
// the ES date_histogram will be a calendar_interval with value "1y" or "1M" respectively.
// All other intervals will be fixed_intervals expressed in ms.
if (field instanceof FieldAttribute) {
key = new GroupByDateHistogram(aggId, QueryTranslator.nameOf(field), calendarInterval, h.zoneId());
} else if (field instanceof Function) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,30 @@ public void testGroupByYearQueryTranslator() {
endsWith("\"date_histogram\":{\"field\":\"date\",\"missing_bucket\":true,\"value_type\":\"date\",\"order\":\"asc\","
+ "\"calendar_interval\":\"1y\",\"time_zone\":\"Z\"}}}]}}}"));
}

/**
 * A histogram over exactly {@code INTERVAL 1 MONTH} must be planned as a
 * {@code date_histogram} that uses a {@code calendar_interval} of {@code 1M}.
 */
public void testGroupByOneMonthHistogramQueryTranslator() {
    // Tail of the whitespace-stripped aggregation JSON the plan is expected to produce.
    String expectedAggSuffix =
        "\"date_histogram\":{\"field\":\"date\",\"missing_bucket\":true,"
            + "\"value_type\":\"date\",\"order\":\"asc\","
            + "\"calendar_interval\":\"1M\",\"time_zone\":\"Z\"}}}]}}}";

    PhysicalPlan plan = optimizeAndPlan("SELECT HISTOGRAM(date, INTERVAL 1 MONTH) AS h FROM test GROUP BY h");
    assertEquals(EsQueryExec.class, plan.getClass());

    // The plan exposes a single DATETIME column named after the histogram alias.
    EsQueryExec queryExec = (EsQueryExec) plan;
    assertEquals(1, queryExec.output().size());
    assertEquals("h", queryExec.output().get(0).qualifiedName());
    assertEquals(DATETIME, queryExec.output().get(0).dataType());

    String compactAggs = queryExec.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", "");
    assertThat(compactAggs, endsWith(expectedAggSuffix));
}

/**
 * A histogram over {@code INTERVAL 5 MONTH} is not the one-month special case, so it must
 * be planned as a {@code date_histogram} with a {@code fixed_interval} expressed in milliseconds.
 */
public void testGroupByMoreMonthsHistogramQueryTranslator() {
    // Tail of the whitespace-stripped aggregation JSON; 12960000000ms equals 150 days.
    String expectedAggSuffix =
        "\"date_histogram\":{\"field\":\"date\",\"missing_bucket\":true,"
            + "\"value_type\":\"date\",\"order\":\"asc\","
            + "\"fixed_interval\":\"12960000000ms\",\"time_zone\":\"Z\"}}}]}}}";

    PhysicalPlan plan = optimizeAndPlan("SELECT HISTOGRAM(date, INTERVAL 5 MONTH) AS h FROM test GROUP BY h");
    assertEquals(EsQueryExec.class, plan.getClass());

    // The plan exposes a single DATETIME column named after the histogram alias.
    EsQueryExec queryExec = (EsQueryExec) plan;
    assertEquals(1, queryExec.output().size());
    assertEquals("h", queryExec.output().get(0).qualifiedName());
    assertEquals(DATETIME, queryExec.output().get(0).dataType());

    String compactAggs = queryExec.queryContainer().aggs().asAggBuilder().toString().replaceAll("\\s+", "");
    assertThat(compactAggs, endsWith(expectedAggSuffix));
}

public void testGroupByYearAndScalarsQueryTranslator() {
PhysicalPlan p = optimizeAndPlan("SELECT YEAR(CAST(date + INTERVAL 5 months AS DATE)) FROM test GROUP BY 1");
Expand Down