Skip to content

Commit e3ba3de

Browse files
liyafan82Pindikura Ravindra
authored and committed
ARROW-6013: [Java] Support range searcher
For a sorted vector, the range searcher finds the first/last occurrence of a particular element. The search is based on binary search, which takes O(log n) time. Closes #4925 from liyafan82/fly_0723_range and squashes the following commits: 4690f69 <liyafan82> Support range searcher Authored-by: liyafan82 <fan_li_ya@foxmail.com> Signed-off-by: Pindikura Ravindra <ravindra@dremio.com>
1 parent 9064571 commit e3ba3de

File tree

3 files changed

+324
-9
lines changed

3 files changed

+324
-9
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.search;
19+
20+
import org.apache.arrow.algorithm.sort.VectorValueComparator;
21+
import org.apache.arrow.vector.ValueVector;
22+
23+
/**
24+
* Search for the range of a particular element in the target vector.
25+
*/
26+
public class VectorRangeSearcher {
27+
28+
/**
29+
* Result returned when a search fails.
30+
*/
31+
public static final int SEARCH_FAIL_RESULT = -1;
32+
33+
/**
34+
* Search for the first occurrence of an element.
35+
* The search is based on the binary search algorithm. So the target vector must be sorted.
* The comparator is attached to (keyVector, targetVector) by this call, so every
* comparison below is between the key element and one element of the target vector.
36+
* @param targetVector the vector from which to perform the search.
37+
* @param comparator the criterion for the comparison.
38+
* @param keyVector the vector containing the element to search.
39+
* @param keyIndex the index of the search key in the key vector.
40+
* @param <V> the vector type.
41+
* @return the index of the first matched element if any, and -1 otherwise.
* @throws IllegalStateException if an element equal to the key is found while the key compares
*     smaller than that element's left neighbour, which means the target vector is not sorted.
42+
*/
43+
public static <V extends ValueVector> int getFirstMatch(
44+
V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
45+
comparator.attachVectors(keyVector, targetVector);
46+
47+
int low = 0;
48+
int high = targetVector.getValueCount() - 1;
49+
50+
while (low <= high) {
51+
// midpoint written as low + (high - low) / 2 so that low + high is never computed
int mid = low + (high - low) / 2;
52+
int result = comparator.compare(keyIndex, mid);
53+
if (result < 0) {
54+
// the key is smaller than the mid value, continue in the left half
55+
high = mid - 1;
56+
} else if (result > 0) {
57+
// the key is larger than the mid value, continue in the right half
58+
low = mid + 1;
59+
} else {
60+
// the key equals the mid value, find the lower bound by going left-ward.
61+
62+
// compare with the left neighbour
63+
int left = mid - 1;
64+
if (left == -1) {
65+
// mid is index 0, so this is the first value in the vector
66+
return mid;
67+
} else {
68+
int leftResult = comparator.compare(keyIndex, left);
69+
if (leftResult > 0) {
70+
// the key is greater than the left neighbour, and equal to the current one
71+
// so mid is the first occurrence
72+
return mid;
73+
} else if (leftResult == 0) {
74+
// the left neighbour is also equal, continue to go left
75+
high = mid - 1;
76+
} else {
77+
// the key is smaller than the left neighbour but equal to the current one,
// this is not possible in a sorted vector
78+
throw new IllegalStateException("The target vector is not sorted ");
79+
}
80+
}
81+
}
82+
}
83+
return SEARCH_FAIL_RESULT;
84+
}
85+
86+
/**
87+
* Search for the last occurrence of an element.
88+
* The search is based on the binary search algorithm. So the target vector must be sorted.
* The comparator is attached to (keyVector, targetVector) by this call, so every
* comparison below is between the key element and one element of the target vector.
89+
* @param targetVector the vector from which to perform the search.
90+
* @param comparator the criterion for the comparison.
91+
* @param keyVector the vector containing the element to search.
92+
* @param keyIndex the index of the search key in the key vector.
93+
* @param <V> the vector type.
94+
* @return the index of the last matched element if any, and -1 otherwise.
* @throws IllegalStateException if an element equal to the key is found while the key compares
*     larger than that element's right neighbour, which means the target vector is not sorted.
95+
*/
96+
public static <V extends ValueVector> int getLastMatch(
97+
V targetVector, VectorValueComparator<V> comparator, V keyVector, int keyIndex) {
98+
comparator.attachVectors(keyVector, targetVector);
99+
100+
int low = 0;
101+
int high = targetVector.getValueCount() - 1;
102+
103+
while (low <= high) {
104+
// midpoint written as low + (high - low) / 2 so that low + high is never computed
int mid = low + (high - low) / 2;
105+
int result = comparator.compare(keyIndex, mid);
106+
if (result < 0) {
107+
// the key is smaller than the mid value, continue in the left half
108+
high = mid - 1;
109+
} else if (result > 0) {
110+
// the key is larger than the mid value, continue in the right half
111+
low = mid + 1;
112+
} else {
113+
// the key equals the mid value, find the upper bound by going right-ward.
114+
115+
// compare with the right neighbour
116+
int right = mid + 1;
117+
if (right == targetVector.getValueCount()) {
118+
// mid is the last valid index, so this is the last value in the vector
119+
return mid;
120+
} else {
121+
int rightResult = comparator.compare(keyIndex, right);
122+
if (rightResult < 0) {
123+
// the key is smaller than the right neighbour, and equal to the current one
124+
// so mid is the last occurrence
125+
return mid;
126+
} else if (rightResult == 0) {
127+
// the right neighbour is also equal, continue to go right
128+
low = mid + 1;
129+
} else {
130+
// the key is larger than the right neighbour but equal to the current one,
// this is not possible in a sorted vector
131+
throw new IllegalStateException("The target vector is not sorted ");
132+
}
133+
}
134+
}
135+
}
136+
return SEARCH_FAIL_RESULT;
137+
}
138+
}

java/algorithm/src/main/java/org/apache/arrow/algorithm/search/VectorSearcher.java

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@
2525
*/
2626
public final class VectorSearcher {
2727

28+
/**
29+
* Result returned when a search fails.
30+
*/
31+
public static final int SEARCH_FAIL_RESULT = -1;
32+
2833
/**
2934
* Search for a particular element from the key vector in the target vector by binary search.
3035
* The target vector must be sorted.
@@ -44,13 +49,7 @@ public static <V extends ValueVector> int binarySearch(
4449
int high = targetVector.getValueCount() - 1;
4550

4651
while (low <= high) {
47-
int mid = (high + low) / 2;
48-
49-
if (mid < 0) {
50-
// overflow has occurred, so calculate the mid by converting to long first
51-
mid = (int) (((long) high + (long) low) / 2L);
52-
}
53-
52+
int mid = low + (high - low) / 2;
5453
int cmp = comparator.compare(keyIndex, mid);
5554
if (cmp < 0) {
5655
high = mid - 1;
@@ -60,7 +59,7 @@ public static <V extends ValueVector> int binarySearch(
6059
return mid;
6160
}
6261
}
63-
return -1;
62+
return SEARCH_FAIL_RESULT;
6463
}
6564

6665
/**
@@ -80,7 +79,7 @@ public static <V extends ValueVector> int linearSearch(
8079
return i;
8180
}
8281
}
83-
return -1;
82+
return SEARCH_FAIL_RESULT;
8483
}
8584

8685
private VectorSearcher() {
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.search;
19+
20+
import static org.junit.Assert.assertEquals;
21+
22+
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
23+
import org.apache.arrow.algorithm.sort.VectorValueComparator;
24+
import org.apache.arrow.memory.BufferAllocator;
25+
import org.apache.arrow.memory.RootAllocator;
26+
import org.apache.arrow.vector.IntVector;
27+
28+
import org.junit.After;
29+
import org.junit.Before;
30+
import org.junit.Test;
31+
32+
/**
33+
* Test cases for {@link VectorRangeSearcher}.
34+
*/
35+
public class TestVectorRangeSearcher {
36+
37+
private BufferAllocator allocator;
38+
39+
/**
* Creates the allocator used by the vectors in each test.
*/
@Before
40+
public void prepare() {
41+
// NOTE(review): 1024 * 1024 is presumably the allocator's limit in bytes - confirm
allocator = new RootAllocator(1024 * 1024);
42+
}
43+
44+
/**
* Closes the allocator created in {@code prepare()}.
*/
@After
45+
public void shutdown() {
46+
allocator.close();
47+
}
48+
49+
/**
* Checks that getFirstMatch returns the first index of every run of equal values,
* including the leading run of nulls.
*/
@Test
50+
public void testGetLowerBounds() {
51+
final int maxValue = 100;
52+
final int repeat = 5;
53+
try (IntVector intVector = new IntVector("int vec", allocator)) {
54+
// allocate vector
55+
intVector.allocateNew(maxValue * repeat);
56+
intVector.setValueCount(maxValue * repeat);
57+
58+
// prepare data in sorted order
59+
// each value i occupies the run of indices [i * repeat, (i + 1) * repeat); the run for i == 0 is null
60+
for (int i = 0; i < maxValue; i++) {
61+
for (int j = 0; j < repeat; j++) {
62+
if (i == 0) {
63+
intVector.setNull(i * repeat + j);
64+
} else {
65+
intVector.set(i * repeat + j, i);
66+
}
67+
}
68+
}
69+
70+
// do search
// the vector serves as both target and key vector; the key is the first element of each run
71+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
72+
for (int i = 0; i < maxValue; i++) {
73+
int result = VectorRangeSearcher.getFirstMatch(intVector, comparator, intVector, i * repeat);
74+
// the first match must be the start of the run
assertEquals(i * repeat, result);
75+
}
76+
}
77+
}
78+
79+
/**
* Checks that getFirstMatch returns -1 for keys that are not stored in the target vector.
*/
@Test
80+
public void testGetLowerBoundsNegative() {
81+
final int maxValue = 100;
82+
final int repeat = 5;
83+
try (IntVector intVector = new IntVector("int vec", allocator);
84+
IntVector negVector = new IntVector("neg vec", allocator)) {
85+
// allocate vector
86+
intVector.allocateNew(maxValue * repeat);
87+
intVector.setValueCount(maxValue * repeat);
88+
89+
negVector.allocateNew(maxValue);
90+
negVector.setValueCount(maxValue);
91+
92+
// prepare data in sorted order
93+
// each value i occupies the run of indices [i * repeat, (i + 1) * repeat); the run for i == 0 is null
94+
for (int i = 0; i < maxValue; i++) {
95+
for (int j = 0; j < repeat; j++) {
96+
if (i == 0) {
97+
intVector.setNull(i * repeat + j);
98+
} else {
99+
intVector.set(i * repeat + j, i);
100+
}
101+
}
102+
// keys are maxValue .. 2 * maxValue - 1, none of which is written to intVector above
negVector.set(i, maxValue + i);
103+
}
104+
105+
// do search
106+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
107+
for (int i = 0; i < maxValue; i++) {
108+
int result = VectorRangeSearcher.getFirstMatch(intVector, comparator, negVector, i);
109+
// no element matches, so the search must fail
assertEquals(-1, result);
110+
}
111+
}
112+
}
113+
114+
/**
* Checks that getLastMatch returns the last index of every run of equal values,
* including the leading run of nulls.
*/
@Test
115+
public void testGetUpperBounds() {
116+
final int maxValue = 100;
117+
final int repeat = 5;
118+
try (IntVector intVector = new IntVector("int vec", allocator)) {
119+
// allocate vector
120+
intVector.allocateNew(maxValue * repeat);
121+
intVector.setValueCount(maxValue * repeat);
122+
123+
// prepare data in sorted order
124+
// each value i occupies the run of indices [i * repeat, (i + 1) * repeat); the run for i == 0 is null
125+
for (int i = 0; i < maxValue; i++) {
126+
for (int j = 0; j < repeat; j++) {
127+
if (i == 0) {
128+
intVector.setNull(i * repeat + j);
129+
} else {
130+
intVector.set(i * repeat + j, i);
131+
}
132+
}
133+
}
134+
135+
// do search
// the vector serves as both target and key vector; the key is the first element of each run
136+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
137+
for (int i = 0; i < maxValue; i++) {
138+
int result = VectorRangeSearcher.getLastMatch(intVector, comparator, intVector, i * repeat);
139+
// the last match must be the final index of the run
assertEquals((i + 1) * repeat - 1, result);
140+
}
141+
}
142+
}
143+
144+
/**
* Checks that getLastMatch returns -1 for keys that are not stored in the target vector.
*/
@Test
145+
public void testGetUpperBoundsNegative() {
146+
final int maxValue = 100;
147+
final int repeat = 5;
148+
try (IntVector intVector = new IntVector("int vec", allocator);
149+
IntVector negVector = new IntVector("neg vec", allocator)) {
150+
// allocate vector
151+
intVector.allocateNew(maxValue * repeat);
152+
intVector.setValueCount(maxValue * repeat);
153+
154+
negVector.allocateNew(maxValue);
155+
negVector.setValueCount(maxValue);
156+
157+
// prepare data in sorted order
158+
// each value i occupies the run of indices [i * repeat, (i + 1) * repeat); the run for i == 0 is null
159+
for (int i = 0; i < maxValue; i++) {
160+
for (int j = 0; j < repeat; j++) {
161+
if (i == 0) {
162+
intVector.setNull(i * repeat + j);
163+
} else {
164+
intVector.set(i * repeat + j, i);
165+
}
166+
}
167+
// keys are maxValue .. 2 * maxValue - 1, none of which is written to intVector above
negVector.set(i, maxValue + i);
168+
}
169+
170+
// do search
171+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(intVector);
172+
for (int i = 0; i < maxValue; i++) {
173+
int result = VectorRangeSearcher.getLastMatch(intVector, comparator, negVector, i);
174+
// no element matches, so the search must fail
assertEquals(-1, result);
175+
}
176+
}
177+
}
178+
}

0 commit comments

Comments
 (0)