Skip to content

Incorrectly losing OFFSETs for LIMIT queries #12423

Closed
@alamb

Description

@alamb

Describe the bug

#12003 introduced a bug where sometimes the OFFSET clause is accidentally removed

I am filing this ticket to track the regression / know it exists

To Reproduce

@wiedld created a reproducer here b6fd751

Add this to the order.slt file:

# all results
query II
SELECT b, sum(a) FROM ordered_table GROUP BY b order by b desc;
----
3 25
2 25
1 0
0 0

# limit only
query II
SELECT b, sum(a) FROM ordered_table GROUP BY b order by b desc LIMIT 3;
----
3 25
2 25
1 0

# offset only
query II
SELECT b, sum(a) FROM ordered_table GROUP BY b order by b desc OFFSET 1;
----
2 25
1 0
0 0

# TODO: fix this to properly apply offset
# offset + limit
query II
SELECT b, sum(a) FROM ordered_table GROUP BY b order by b desc OFFSET 1 LIMIT 2;
----
3 25
2 25
1 0

# TODO: fix this to not remove the skip=1 during the limit pushdown
# Applying offset & limit when multiple streams from groupby
query TT
EXPLAIN SELECT b, sum(a) FROM ordered_table GROUP BY b order by b desc OFFSET 1 LIMIT 2;
----
logical_plan
01)Limit: skip=1, fetch=2
02)--Sort: ordered_table.b DESC NULLS FIRST, fetch=3
03)----Aggregate: groupBy=[[ordered_table.b]], aggr=[[sum(CAST(ordered_table.a AS Int64))]]
04)------TableScan: ordered_table projection=[a, b]
physical_plan
01)SortPreservingMergeExec: [b@0 DESC], fetch=3
02)--SortExec: TopK(fetch=3), expr=[b@0 DESC], preserve_partitioning=[true]
03)----AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[sum(ordered_table.a)]
04)------CoalesceBatchesExec: target_batch_size=8192
05)--------RepartitionExec: partitioning=Hash([b@0], 2), input_partitions=2
06)----------AggregateExec: mode=Partial, gby=[b@1 as b], aggr=[sum(ordered_table.a)]
07)------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
08)--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], has_header=true

# TODO: fix this to not remove the skip=4 during the limit pushdown
# Applying offset & limit when multiple streams from union
query TT
explain select * FROM (
  select c FROM ordered_table
  UNION ALL
  select d FROM ordered_table
) order by 1 desc LIMIT 10 OFFSET 4;
----
logical_plan
01)Limit: skip=4, fetch=10
02)--Sort: ordered_table.c DESC NULLS FIRST, fetch=14
03)----Union
04)------Projection: CAST(ordered_table.c AS Int64) AS c
05)--------TableScan: ordered_table projection=[c]
06)------Projection: CAST(ordered_table.d AS Int64) AS c
07)--------TableScan: ordered_table projection=[d]
physical_plan
01)SortPreservingMergeExec: [c@0 DESC], fetch=14
02)--UnionExec
03)----SortExec: TopK(fetch=14), expr=[c@0 DESC], preserve_partitioning=[true]
04)------ProjectionExec: expr=[CAST(c@0 AS Int64) as c]
05)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[c], output_ordering=[c@0 ASC NULLS LAST], has_header=true
07)----SortExec: TopK(fetch=14), expr=[c@0 DESC], preserve_partitioning=[true]
08)------ProjectionExec: expr=[CAST(d@0 AS Int64) as c]
09)--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
10)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[d], has_header=true

# ApplyingmLIMIT & OFFSET to subquery.
query III
select t1.b, c, c2 FROM (
  select b, c FROM ordered_table ORDER BY b desc, c desc OFFSET 1 LIMIT 4
) as t1 INNER JOIN (
  select b, c as c2 FROM ordered_table ORDER BY b desc, d desc OFFSET 1 LIMIT 4
) as t2
ON t1.b = t2.b
ORDER BY t1.b desc, c desc, c2 desc;
----
3 98 96
3 98 89
3 98 82
3 98 79
3 97 96
3 97 89
3 97 82
3 97 79
3 96 96
3 96 89
3 96 82
3 96 79
3 95 96
3 95 89
3 95 82
3 95 79

# TODO: fix this does not correctly work.
# Apply OFFSET & LIMIT to both parent and child (subquery).
query III
select t1.b, c, c2 FROM (
  select b, c FROM ordered_table ORDER BY b desc, c desc OFFSET 1 LIMIT 4
) as t1 INNER JOIN (
  select b, c as c2 FROM ordered_table ORDER BY b desc, d desc OFFSET 1 LIMIT 4
) as t2
ON t1.b = t2.b
ORDER BY t1.b desc, c desc, c2 desc
OFFSET 3 LIMIT 2;
----
3 99 96
3 99 89
3 99 87
3 99 82
3 99 79

Expected behavior

Query should not lose the offset

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't workingregressionSomething that used to work no longer does

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions