-
Notifications
You must be signed in to change notification settings - Fork 10.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
perf(gatsby): Support lte
for indexed fast filters
#22932
Changes from 14 commits
8a15a90
c1a5984
8849d7a
ca5576e
c0b3c5e
18f78f8
4074561
93a3243
1c56b81
333dd02
2f85a1d
5603814
9232f9a
04349bb
c3525c2
493b85d
b746293
8aebc8f
54e9cf8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,9 +3,23 @@ import { IGatsbyNode } from "./types" | |
import { createPageDependency } from "./actions/add-page-dependency" | ||
import { IDbQueryElemMatch } from "../db/common/query" | ||
|
||
// Only list supported ops here. "CacheableFilterOp" | ||
type FilterOp = "$eq" | "$lte" | ||
type FilterValue = string | number | boolean | ||
export type FilterCacheKey = string | ||
export type FilterCache = Map<string | number | boolean, Set<IGatsbyNode>> | ||
export type FiltersCache = Map<FilterCacheKey, FilterCache> | ||
export interface IFilterCache { | ||
op: FilterOp | ||
byValue: Map<FilterValue, Set<IGatsbyNode>> | ||
meta: { | ||
// Ordered set of all values found by this filter | ||
valuesAsc?: Array<FilterValue> | ||
// Flat set of nodes, ordered by valueAsc, but not ordered per value group | ||
nodesByValueAsc?: Array<IGatsbyNode> | ||
// Ranges of nodes per value, maps to the nodesByValueAsc array | ||
valueRanges?: Map<FilterValue, [number, number]> | ||
} | ||
} | ||
export type FiltersCache = Map<FilterCacheKey, IFilterCache> | ||
|
||
/** | ||
* Get all nodes from redux store. | ||
|
@@ -152,32 +166,65 @@ export const addResolvedNodes = ( | |
return resolvedNodes | ||
} | ||
|
||
export const postIndexingMetaSetup = (filterCache: IFilterCache): void => { | ||
// Create an ordered array of individual nodes, ordered (grouped) by the | ||
// value to which the filter resolves. Nodes are not ordered per value. | ||
// This way non-eq ops can simply slice the array to get a range. | ||
|
||
const entries: Array<[FilterValue, Set<IGatsbyNode>]> = [ | ||
...filterCache.byValue.entries(), | ||
] | ||
|
||
// Sort all sets by its value, asc. Ignore/allow potential type casting. | ||
entries.sort(([a], [b]) => (a <= b ? -1 : a > b ? 1 : 0)) | ||
pvdz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
const orderedNodes: Array<IGatsbyNode> = [] | ||
const orderedValues: Array<FilterValue> = [] | ||
const offsets: Map<FilterValue, [number, number]> = new Map() | ||
entries.forEach(([v, bucket]) => { | ||
// Record the first index containing node with as filter value v | ||
offsets.set(v, [orderedNodes.length, bucket.size]) | ||
// We could do `arr.push(...bucket)` here but that's not safe with very | ||
// large sets, so we use a regular loop | ||
bucket.forEach(node => orderedNodes.push(node)) | ||
pvdz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
orderedValues.push(v) | ||
}) | ||
|
||
filterCache.meta.valuesAsc = orderedValues | ||
filterCache.meta.nodesByValueAsc = orderedNodes | ||
// The nodesByValueAsc is ordered by value, but multiple nodes per value are | ||
// not ordered. To make lt as fast as lte, we must know the start and stop | ||
// index for each value. Similarly useful for for `ne`. | ||
filterCache.meta.valueRanges = offsets | ||
} | ||
|
||
/** | ||
* Given a ("flat") filter path leading up to "eq", a set of node types, and a | ||
* Given a single non-elemMatch filter path, a set of node types, and a | ||
* cache, create a cache that for each resulting value of the filter contains | ||
* all the Nodes in a Set (or, if the property is `id`, just the Nodes). | ||
* all the Nodes in a Set. | ||
* This cache is used for applying the filter and is a massive improvement over | ||
* looping over all the nodes, when the number of pages (/nodes) scale up. | ||
* looping over all the nodes, when the number of pages (/nodes) scales up. | ||
*/ | ||
export const ensureIndexByTypedChain = ( | ||
cacheKey: FilterCacheKey, | ||
chain: string[], | ||
export const ensureIndexByQuery = ( | ||
op: FilterOp, | ||
filterCacheKey: FilterCacheKey, | ||
filterPath: string[], | ||
nodeTypeNames: string[], | ||
filtersCache: FiltersCache | ||
): void => { | ||
const state = store.getState() | ||
const resolvedNodesCache = state.resolvedNodesCache | ||
|
||
const filterCache: FilterCache = new Map() | ||
filtersCache.set(cacheKey, filterCache) | ||
const filterCache: IFilterCache = { op, byValue: new Map(), meta: {} } | ||
filtersCache.set(filterCacheKey, filterCache) | ||
|
||
// We cache the subsets of nodes by type, but only one type. So if searching | ||
// through one node type we can prevent a search through all nodes, otherwise | ||
// it's probably faster to loop through all nodes. Perhaps. Maybe. | ||
|
||
if (nodeTypeNames.length === 1) { | ||
getNodesByType(nodeTypeNames[0]).forEach(node => { | ||
addNodeToFilterCache(node, chain, filterCache, resolvedNodesCache) | ||
addNodeToFilterCache(node, filterPath, filterCache, resolvedNodesCache) | ||
}) | ||
} else { | ||
// Here we must first filter for the node type | ||
|
@@ -187,15 +234,19 @@ export const ensureIndexByTypedChain = ( | |
return | ||
} | ||
|
||
addNodeToFilterCache(node, chain, filterCache, resolvedNodesCache) | ||
addNodeToFilterCache(node, filterPath, filterCache, resolvedNodesCache) | ||
}) | ||
} | ||
|
||
if (op === `$lte`) { | ||
postIndexingMetaSetup(filterCache) | ||
} | ||
} | ||
|
||
function addNodeToFilterCache( | ||
node: IGatsbyNode, | ||
chain: Array<string>, | ||
filterCache: FilterCache, | ||
filterCache: IFilterCache, | ||
resolvedNodesCache, | ||
valueOffset: any = node | ||
): void { | ||
|
@@ -228,16 +279,17 @@ function addNodeToFilterCache( | |
return | ||
} | ||
|
||
let set = filterCache.get(v) | ||
let set = filterCache.byValue.get(v) | ||
if (!set) { | ||
set = new Set() | ||
filterCache.set(v, set) | ||
filterCache.byValue.set(v, set) | ||
} | ||
set.add(node) | ||
} | ||
|
||
export const ensureIndexByElemMatch = ( | ||
cacheKey: FilterCacheKey, | ||
op: FilterOp, | ||
filterCacheKey: FilterCacheKey, | ||
filter: IDbQueryElemMatch, | ||
nodeTypeNames: Array<string>, | ||
filtersCache: FiltersCache | ||
|
@@ -248,8 +300,8 @@ export const ensureIndexByElemMatch = ( | |
const state = store.getState() | ||
const { resolvedNodesCache } = state | ||
|
||
const filterCache: FilterCache = new Map() | ||
filtersCache.set(cacheKey, filterCache) | ||
const filterCache: IFilterCache = { op, byValue: new Map(), meta: {} } | ||
filtersCache.set(filterCacheKey, filterCache) | ||
|
||
if (nodeTypeNames.length === 1) { | ||
getNodesByType(nodeTypeNames[0]).forEach(node => { | ||
|
@@ -277,13 +329,17 @@ export const ensureIndexByElemMatch = ( | |
) | ||
}) | ||
} | ||
|
||
if (op === `$lte`) { | ||
pvdz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
postIndexingMetaSetup(filterCache) | ||
} | ||
} | ||
|
||
function addNodeToBucketWithElemMatch( | ||
node: IGatsbyNode, | ||
valueAtCurrentStep: any, // Arbitrary step on the path inside the node | ||
filter: IDbQueryElemMatch, | ||
filterCache: FilterCache, | ||
filterCache: IFilterCache, | ||
resolvedNodesCache | ||
): void { | ||
// There can be a filter that targets `__gatsby_resolved` so fix that first | ||
|
@@ -337,24 +393,109 @@ function addNodeToBucketWithElemMatch( | |
} | ||
} | ||
|
||
const binarySearch = ( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like this function may be handling certain edge cases incorrectly? For example with repeating values: binarySearch([0, 1], 1)
-> Array [ 1, 1 ]
binarySearch([0, 1, 1], 1)
-> Array [ 1, 1 ]
binarySearch([0, 1, 1, 1], 1)
-> Array [ 2, 2 ]
binarySearch([0, 1, 1, 1, 1], 1)
-> Array [ 2, 2 ]
binarySearch([0, 1, 1, 1, 1, 1], 1)
-> Array [ 3, 3 ]
binarySearch([0, 1, 1, 1, 1, 1, 2], 1)
-> Array [ 3, 3 ]
binarySearch([0, 1, 1, 1, 1, 1, 2, 3], 1)
-> Array [ 4, 4 ] Maybe we can use existing tools like lodash sortedIndex (it also works even if the element is not in the list)? Edit: I realized that it always receive a list with unique values but then it deserves a comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, unique values guaranteed. I'll add a comment, good point :) |
||
values: Array<FilterValue>, | ||
needle: FilterValue | ||
): [number, number] => { | ||
let min = 0 | ||
let max = values.length - 1 | ||
let pivot = Math.floor(values.length / 2) | ||
while (min <= max) { | ||
pvdz marked this conversation as resolved.
Show resolved
Hide resolved
|
||
const value = values[pivot] | ||
if (needle < value) { | ||
// Move pivot to middle of nodes left of current pivot | ||
// assert pivot < max | ||
max = pivot | ||
} else if (needle > value) { | ||
// Move pivot to middle of nodes right of current pivot | ||
// assert pivot > min | ||
min = pivot | ||
} else { | ||
// This means needle === value | ||
// TODO: except for NaN ... and potentially certain type casting cases | ||
return [pivot, pivot] | ||
} | ||
|
||
if (max - min <= 1) { | ||
// End of search. Needle not found (as expected). Use pivot as index. | ||
// If the needle was not found, max-min==1 and max is returned. | ||
return [min, max] | ||
} | ||
|
||
pivot = Math.floor((max - min) / 2) | ||
} | ||
|
||
// Shouldn't be reachable | ||
return [0, 0] | ||
} | ||
|
||
/** | ||
* Given a ("flat") filter path leading up to "eq", a target value to filter | ||
* for, a set of node types, and a pre-generated lookup cache, return the set | ||
* of Nodes (or, if the property is `id` just the Node) which pass the filter. | ||
* This returns `undefined` if there is Node that passes the filter. | ||
* Given the cache key for a filter and a target value return the set of nodes | ||
* that resolve to this value. | ||
* This returns `undefined` if there is no such node | ||
* | ||
* Basically if the filter was {a: {b: {slug: {eq: "foo/bar"}}}} then it will | ||
* return all the nodes that have `node.slug === "foo/bar"`. That usually (but | ||
* not always) at most one node for slug, but this filter can apply to anything. | ||
* | ||
* The only exception is `id`, since internally there can be at most one node | ||
* per `id` so there's a minor optimization for that (no need for Sets). | ||
*/ | ||
export const getFilterCacheByTypedChain = ( | ||
cacheKey: FilterCacheKey, | ||
value: boolean | number | string, | ||
export const getNodesFromCacheByValue = ( | ||
filterCacheKey: FilterCacheKey, | ||
filterValue: FilterValue, | ||
filtersCache: FiltersCache | ||
): Set<IGatsbyNode> | undefined => { | ||
const byTypedKey = filtersCache?.get(cacheKey) | ||
return byTypedKey?.get(value) | ||
const filterCache = filtersCache?.get(filterCacheKey) | ||
if (!filterCache) { | ||
return undefined | ||
} | ||
|
||
const op = filterCache.op | ||
|
||
if (op === `$eq`) { | ||
return filterCache.byValue.get(filterValue) | ||
} | ||
|
||
if (op === `$lte`) { | ||
// First try a direct approach. If a value is queried that also exists then | ||
// we can prevent a binary search through the whole set, O(1) vs O(log n) | ||
const range = filterCache.meta.valueRanges!.get(filterValue) | ||
if (range) { | ||
return new Set( | ||
filterCache.meta.nodesByValueAsc!.slice(0, range[0] + range[1]) | ||
) | ||
} | ||
|
||
// Query may ask for a value that doesn't appear in the set, like if the | ||
// set is [1, 2, 5, 6] and the query is <= 3. In that case we have to | ||
// apply a search (we'll do binary) to determine the offset to slice from. | ||
|
||
// Note: for lte, the valueAsc array must be set at this point | ||
const values = filterCache.meta.valuesAsc as Array<FilterValue> | ||
// It shouldn't find the targetValue (but it might) and return the index of | ||
// the two value between which targetValue sits, or first/last element. | ||
const [pivotMin, pivotMax] = binarySearch(values, filterValue) | ||
// Each pivot index must have a value and a range | ||
// The returned min/max index may include the lower/upper bound, so we still | ||
// have to do lte checks for both values. | ||
let pivotValue = values[pivotMax] | ||
if (pivotValue > filterValue) { | ||
pivotValue = values[pivotMin] | ||
} | ||
|
||
// Note: the pivot value _shouldnt_ match the filter value because that | ||
// means the value was actually found, but those should have been indexed | ||
// so should have yielded a result in the .get() above. | ||
|
||
const [start, stop] = filterCache.meta.valueRanges!.get(pivotValue) as [ | ||
number, | ||
number | ||
] | ||
|
||
// Note: technically, `5 <= "5" === true` but `5` would not be cached. | ||
// So we have to consider weak comparison and may have to include the pivot | ||
const until = start + (pivotValue <= filterValue ? stop : 0) | ||
return new Set(filterCache.meta.nodesByValueAsc!.slice(0, until)) | ||
} | ||
|
||
// Unreachable because we checked all values of FilterOp (which op is) | ||
return undefined | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question here - do we operate on materialized view values or node values? Is it possible we get actual Date type here - and if so, can that cause problems somewhere in the code?
I tried to quickly skim through the code and seems like we only use
<=
and>
comparisons, so we should be good even if we end up with unexpected types ... as long as you can actually use comparisons there to sort values out?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As far as I know the values are only ever str/num/bool and a hint of null.
@vladar or @freiksenet can answer this better.
This approach would later be used for
ne
andin
as well so it's good to know regardless.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did some checks and filter value is always a
string
. Even for dates. So it works fine when the node has a date field expressed as astring
.But there is a bug when a node field contains an instance of
Date
. Sift filtering is broken for this situation (as we compare string vs date). At least I tried it and it didn't work.The proper fix would be to convert the filter value to
Date
when running sift (docs).An alternative option would be to always store dates as strings in nodes (maybe a better one). But this may be a breaking change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added this PR with a failing test case (skipped for now): #23096
We should address it in v3 (as it is a breaking change). It will also ensure that filtering with indexes is consistent with sift.
Also, in the worst case, it will just fallback to querying without index so I think this shouldn't be a blocker for this PR.