Skip to content

[7.x] Allow mixing set-based and regexp-based include and exclude (#63325) #64014

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/reference/aggregations/bucket/terms-aggregation.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,8 @@ expire then we may be missing accounts of interest and have set our numbers too
Ultimately this is a balancing act between managing the Elasticsearch resources required to process a single request and the volume
of requests that the client application must issue to complete a task.

WARNING: Partitions cannot be used together with an `exclude` parameter.

==== Multi-field terms aggregation

The `terms` aggregation does not support collecting terms from multiple fields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
Expand Down Expand Up @@ -78,17 +79,8 @@ public static IncludeExclude merge(IncludeExclude include, IncludeExclude exclud
if (include.isPartitionBased()) {
throw new IllegalArgumentException("Cannot specify any excludes when using a partition-based include");
}
String includeMethod = include.isRegexBased() ? "regex" : "set";
String excludeMethod = exclude.isRegexBased() ? "regex" : "set";
if (includeMethod.equals(excludeMethod) == false) {
throw new IllegalArgumentException("Cannot mix a " + includeMethod + "-based include with a "
+ excludeMethod + "-based method");
}
if (include.isRegexBased()) {
return new IncludeExclude(include.include, exclude.exclude);
} else {
return new IncludeExclude(include.includeValues, exclude.excludeValues);
}

return new IncludeExclude(include.include, exclude.exclude, include.includeValues, exclude.excludeValues);
}

public static IncludeExclude parseInclude(XContentParser parser) throws IOException {
Expand Down Expand Up @@ -196,46 +188,39 @@ public boolean accept(BytesRef value) {
}
}

static class AutomatonBackedStringFilter extends StringFilter {
class SetAndRegexStringFilter extends StringFilter {

private final ByteRunAutomaton runAutomaton;

private AutomatonBackedStringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}

/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}

static class TermListBackedStringFilter extends StringFilter {

private final Set<BytesRef> valids;
private final Set<BytesRef> invalids;

TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
this.valids = includeValues;
this.invalids = excludeValues;
private SetAndRegexStringFilter(DocValueFormat format) {
Automaton automaton = toAutomaton();
this.runAutomaton = automaton == null ? null : new ByteRunAutomaton(automaton);
this.valids = parseForDocValues(includeValues, format);
this.invalids = parseForDocValues(excludeValues, format);
}

/**
* Returns whether the given value is accepted based on the
* {@code include} &amp; {@code exclude} sets.
* Returns whether the given value is accepted based on the {@code includeValues} &amp; {@code excludeValues}
* sets, as well as the {@code include} &amp; {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
if (valids != null && valids.contains(value) == false) {
return false;
}

if (runAutomaton != null && runAutomaton.run(value.bytes, value.offset, value.length) == false) {
return false;
}

return invalids == null || invalids.contains(value) == false;
}
}

/**
 * Base class for filters that decide acceptance per global ordinal instead of per term value.
 */
public abstract static class OrdinalsFilter extends Filter {
/**
 * Computes which global ordinals this filter accepts.
 *
 * @param globalOrdinals the doc values whose ordinals are evaluated
 * @return a bit set in which a set bit marks an accepted ordinal
 * @throws IOException declared for implementations that read terms or look them up in {@code globalOrdinals}
 */
public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException;

}

class PartitionedOrdinalsFilter extends OrdinalsFilter {
Expand All @@ -258,59 +243,64 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
}

static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {
class SetAndRegexOrdinalsFilter extends OrdinalsFilter {

private final CompiledAutomaton compiled;
private final SortedSet<BytesRef> valids;
private final SortedSet<BytesRef> invalids;

private AutomatonBackedOrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
private SetAndRegexOrdinalsFilter(DocValueFormat format) {
Automaton automaton = toAutomaton();
this.compiled = automaton == null ? null : new CompiledAutomaton(automaton);
this.valids = parseForDocValues(includeValues, format);
this.invalids = parseForDocValues(excludeValues, format);
}

/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
* Computes which global ordinals are accepted by this IncludeExclude instance, based on the combination of
* the {@code includeValues} &amp; {@code excludeValues} sets, as well as the {@code include} &amp;
* {@code exclude} patterns.
*/
@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
return acceptedGlobalOrdinals;
}

}

static class TermListBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includeValues;
private final SortedSet<BytesRef> excludeValues;

TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}

@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if (includeValues != null) {
for (BytesRef term : includeValues) {
LongBitSet acceptedGlobalOrdinals = null;
if (valids != null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
for (BytesRef term : valids) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
} else if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
if (excludeValues != null) {
for (BytesRef term : excludeValues) {

if (compiled != null) {
LongBitSet automatonGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
automatonGlobalOrdinals.set(globalTermsEnum.ord());
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = automatonGlobalOrdinals;
} else {
acceptedGlobalOrdinals.and(automatonGlobalOrdinals);
}
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
}

if (invalids != null) {
for (BytesRef term : invalids) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
Expand All @@ -319,9 +309,9 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
return acceptedGlobalOrdinals;
}

}


private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand All @@ -332,17 +322,36 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
* @param exclude The regular expression pattern for the terms to be excluded
*/
public IncludeExclude(RegExp include, RegExp exclude) {
if (include == null && exclude == null) {
this(include, exclude, null, null);
}

public IncludeExclude(RegExp include, RegExp exclude, SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
if (include != null && includeValues != null) {
throw new IllegalArgumentException();
}
if (exclude != null && excludeValues != null) {
throw new IllegalArgumentException();
}
this.include = include;
this.exclude = exclude;
this.includeValues = null;
this.excludeValues = null;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
this.incZeroBasedPartition = 0;
this.incNumPartitions = 0;
}

/**
 * Creates an instance from string patterns and string term lists by converting them and delegating to the
 * {@code (RegExp, RegExp, SortedSet, SortedSet)} constructor.
 *
 * @param include       The regular expression pattern for the terms to be included, or {@code null} for none
 * @param exclude       The regular expression pattern for the terms to be excluded, or {@code null} for none
 * @param includeValues The terms to be included; converted with {@code convertToBytesRefSet}
 *                      (presumably yields {@code null} for a {@code null} array -- confirm against that helper)
 * @param excludeValues The terms to be excluded; converted the same way as {@code includeValues}
 * @throws IllegalArgumentException from the delegated constructor, which rejects the case where all four
 *                                  converted arguments are {@code null} and the case where a pattern and a
 *                                  value set are both present for the same side (include or exclude)
 */
public IncludeExclude(String include, String exclude, String[] includeValues, String[] excludeValues) {
this(
// A null pattern string stays null rather than being compiled.
include == null ? null : new RegExp(include),
exclude == null ? null : new RegExp(exclude),
convertToBytesRefSet(includeValues),
convertToBytesRefSet(excludeValues)
);
}

/**
 * Creates a pattern-only instance from regular expression strings. Each non-null string is compiled into a
 * {@code RegExp}; a {@code null} string is passed through as {@code null}. The result is handed to the
 * {@code (RegExp, RegExp)} constructor.
 *
 * @param include The regular expression pattern for the terms to be included, or {@code null} for none
 * @param exclude The regular expression pattern for the terms to be excluded, or {@code null} for none
 */
public IncludeExclude(String include, String exclude) {
this(include == null ? null : new RegExp(include), exclude == null ? null : new RegExp(exclude));
}
Expand All @@ -352,15 +361,7 @@ public IncludeExclude(String include, String exclude) {
* @param excludeValues The terms to be excluded
*/
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
this.include = null;
this.exclude = null;
this.incZeroBasedPartition = 0;
this.incNumPartitions = 0;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
this(null, null, includeValues, excludeValues);
}

public IncludeExclude(String[] includeValues, String[] excludeValues) {
Expand Down Expand Up @@ -395,18 +396,21 @@ public IncludeExclude(int partition, int numPartitions) {
*/
public IncludeExclude(StreamInput in) throws IOException {
if (in.readBoolean()) {
includeValues = null;
excludeValues = null;
incZeroBasedPartition = 0;
incNumPartitions = 0;
String includeString = in.readOptionalString();
include = includeString == null ? null : new RegExp(includeString);
String excludeString = in.readOptionalString();
exclude = excludeString == null ? null : new RegExp(excludeString);
return;
if (in.getVersion().before(Version.V_7_11_0)) {
incZeroBasedPartition = 0;
incNumPartitions = 0;
includeValues = null;
excludeValues = null;
return;
}
} else {
include = null;
exclude = null;
}
include = null;
exclude = null;
if (in.readBoolean()) {
int size = in.readVInt();
includeValues = new TreeSet<>();
Expand Down Expand Up @@ -436,26 +440,28 @@ public void writeTo(StreamOutput out) throws IOException {
if (regexBased) {
out.writeOptionalString(include == null ? null : include.getOriginalString());
out.writeOptionalString(exclude == null ? null : exclude.getOriginalString());
} else {
boolean hasIncludes = includeValues != null;
out.writeBoolean(hasIncludes);
if (hasIncludes) {
out.writeVInt(includeValues.size());
for (BytesRef value : includeValues) {
out.writeBytesRef(value);
}
if (out.getVersion().before(Version.V_7_11_0)) {
return;
}
boolean hasExcludes = excludeValues != null;
out.writeBoolean(hasExcludes);
if (hasExcludes) {
out.writeVInt(excludeValues.size());
for (BytesRef value : excludeValues) {
out.writeBytesRef(value);
}
}
boolean hasIncludes = includeValues != null;
out.writeBoolean(hasIncludes);
if (hasIncludes) {
out.writeVInt(includeValues.size());
for (BytesRef value : includeValues) {
out.writeBytesRef(value);
}
out.writeVInt(incNumPartitions);
out.writeVInt(incZeroBasedPartition);
}
boolean hasExcludes = excludeValues != null;
out.writeBoolean(hasExcludes);
if (hasExcludes) {
out.writeVInt(excludeValues.size());
for (BytesRef value : excludeValues) {
out.writeBytesRef(value);
}
}
out.writeVInt(incNumPartitions);
out.writeVInt(incZeroBasedPartition);
}

private static SortedSet<BytesRef> convertToBytesRefSet(String[] values) {
Expand Down Expand Up @@ -573,29 +579,25 @@ public boolean isPartitionBased() {

private Automaton toAutomaton() {
Automaton a = null;
if (include == null && exclude == null) {
return a;
}
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}

public StringFilter convertToStringFilter(DocValueFormat format) {
if (isRegexBased()) {
return new AutomatonBackedStringFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedStringFilter();
}
return new TermListBackedStringFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new SetAndRegexStringFilter(format);
}

private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
Expand All @@ -612,15 +614,11 @@ private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUser
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {

if (isRegexBased()) {
return new AutomatonBackedOrdinalsFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedOrdinalsFilter();
}

return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new SetAndRegexOrdinalsFilter(format);
}

public LongFilter convertToLongFilter(DocValueFormat format) {
Expand Down
Loading