Skip to content

Commit 2007b87

Browse files
committed
[ARROW-5862][Java] Provide dictionary builder
1 parent e3ba3de commit 2007b87

File tree

2 files changed

+384
-0
lines changed

2 files changed

+384
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import java.util.TreeSet;
21+
22+
import org.apache.arrow.algorithm.sort.VectorValueComparator;
23+
import org.apache.arrow.vector.ValueVector;
24+
25+
/**
26+
* A dictionary builder is intended for the scenario frequently encountered in practice:
27+
* the dictionary is not known a priori, so it is generated dynamically.
28+
* In particular, when a new value arrives, it is tested to check if it is already
29+
* in the dictionary. If so, it is simply ignored; otherwise, it is added to the dictionary.
30+
*
31+
* <p>
32+
* This class builds the dictionary based on a binary search tree.
33+
* Each add operation can be finished in O(log(n)) time,
34+
* where n is the current dictionary size.
35+
* </p>
36+
* <p>
37+
* The dictionary builder is intended to build a single dictionary.
38+
* So it cannot be used for different dictionaries.
39+
* </p>
40+
* <p>The sample code below shows how to use the dictionary builder:
41+
* <pre>{@code
42+
* SearchTreeBasedDictionaryBuilder dictionaryBuilder = ...
43+
* ...
44+
* dictionaryBuilder.addValue(targetVector, targetIndex);
45+
* ...
46+
* }</pre>
47+
* </p>
48+
* <p>
49+
* With the above code, the dictionary vector will be populated,
50+
* and it can be retrieved by the {@link SearchTreeBasedDictionaryBuilder#getDictionary()} method.
51+
* After that, dictionary encoding can proceed with the populated dictionary.
52+
* </p>
53+
* @param <V> the dictionary vector type.
54+
*/
55+
public class SearchTreeBasedDictionaryBuilder<V extends ValueVector> {
56+
57+
/**
58+
* The dictionary to be built.
59+
*/
60+
private final V dictionary;
61+
62+
/**
63+
* The criteria for sorting in the search tree.
64+
*/
65+
protected final VectorValueComparator<V> comparator;
66+
67+
/**
68+
* If null should be encoded.
69+
*/
70+
private final boolean encodeNull;
71+
72+
/**
73+
* The search tree for storing the value index.
74+
*/
75+
private TreeSet<Integer> searchTree;
76+
77+
/**
78+
* Construct a search tree-based dictionary builder.
79+
* @param dictionary the dictionary vector.
80+
* @param comparator the criteria for value equality.
81+
*/
82+
public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator) {
83+
this(dictionary, comparator, false);
84+
}
85+
86+
/**
87+
* Construct a search tree-based dictionary builder.
88+
* @param dictionary the dictionary vector.
89+
* @param comparator the criteria for value equality.
90+
* @param encodeNull if null values should be added to the dictionary.
91+
*/
92+
public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator, boolean encodeNull) {
93+
this.dictionary = dictionary;
94+
this.comparator = comparator;
95+
this.encodeNull = encodeNull;
96+
this.comparator.attachVector(dictionary);
97+
98+
searchTree = new TreeSet<>((index1, index2) -> comparator.compare(index1, index2));
99+
}
100+
101+
/**
102+
* Gets the dictionary built.
103+
* Please note that the dictionary is not in sorted order.
104+
* Instead, its order is determined by the order of element insertion.
105+
* To get the dictionary in sorted order, please use
106+
* {@link SearchTreeBasedDictionaryBuilder#populateSortedDictionary(ValueVector)}.
107+
* @return the dictionary.
108+
*/
109+
public V getDictionary() {
110+
return dictionary;
111+
}
112+
113+
/**
114+
* Try to add all values from the target vector to the dictionary.
115+
* @param targetVector the target vector containing values to probe.
116+
* @return the number of values actually added to the dictionary.
117+
*/
118+
public int addValues(V targetVector) {
119+
int ret = 0;
120+
for (int i = 0; i < targetVector.getValueCount(); i++) {
121+
if (!encodeNull && targetVector.isNull(i)) {
122+
continue;
123+
}
124+
if (addValue(targetVector, i)) {
125+
dictionary.setValueCount(dictionary.getValueCount() + 1);
126+
ret += 1;
127+
}
128+
}
129+
return ret;
130+
}
131+
132+
/**
133+
* Try to add an element from the target vector to the dictionary.
134+
* @param targetVector the target vector containing new element.
135+
* @param targetIndex the index of the new element in the target vector.
136+
* @return true if the element is added to the dictionary, and false otherwise.
137+
*/
138+
public boolean addValue(V targetVector, int targetIndex) {
139+
// first copy the value to the end of the dictionary
140+
int dictSize = dictionary.getValueCount();
141+
dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
142+
143+
// try to add the value to the dictionary,
144+
// if an equal element does not exist.
145+
// this operation can be done in O(logn) time.
146+
boolean ret = searchTree.add(dictSize);
147+
return ret;
148+
}
149+
150+
/**
151+
* Gets the sorted dictionary.
152+
* Note that given the binary search tree, the sort can finish in O(n).
153+
*/
154+
public void populateSortedDictionary(V sortedDictionary) {
155+
int idx = 0;
156+
for (Integer dictIdx : searchTree) {
157+
sortedDictionary.copyFromSafe(dictIdx, idx++, dictionary);
158+
}
159+
160+
sortedDictionary.setValueCount(dictionary.getValueCount());
161+
}
162+
}
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import static org.junit.Assert.assertEquals;
21+
import static org.junit.Assert.assertTrue;
22+
23+
import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
24+
import org.apache.arrow.algorithm.sort.VectorValueComparator;
25+
import org.apache.arrow.memory.BufferAllocator;
26+
import org.apache.arrow.memory.RootAllocator;
27+
import org.apache.arrow.vector.IntVector;
28+
import org.apache.arrow.vector.VarCharVector;
29+
30+
import org.junit.After;
31+
import org.junit.Before;
32+
import org.junit.Test;
33+
34+
/**
35+
* Test cases for {@link SearchTreeBasedDictionaryBuilder}.
36+
*/
37+
public class TestSearchTreeBasedDictionaryBuilder {
38+
39+
private BufferAllocator allocator;
40+
41+
@Before
42+
public void prepare() {
43+
allocator = new RootAllocator(1024 * 1024);
44+
}
45+
46+
@After
47+
public void shutdown() {
48+
allocator.close();
49+
}
50+
51+
@Test
52+
public void testBuildVariableWidthDictionaryWithNull() {
53+
try (VarCharVector vec = new VarCharVector("", allocator);
54+
VarCharVector dictionary = new VarCharVector("", allocator);
55+
VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
56+
57+
vec.allocateNew(100, 10);
58+
vec.setValueCount(10);
59+
60+
dictionary.allocateNew();
61+
sortedDictionary.allocateNew();
62+
63+
// fill data
64+
vec.set(0, "hello".getBytes());
65+
vec.set(1, "abc".getBytes());
66+
vec.setNull(2);
67+
vec.set(3, "world".getBytes());
68+
vec.set(4, "12".getBytes());
69+
vec.set(5, "dictionary".getBytes());
70+
vec.setNull(6);
71+
vec.set(7, "hello".getBytes());
72+
vec.set(8, "good".getBytes());
73+
vec.set(9, "abc".getBytes());
74+
75+
VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
76+
SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
77+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
78+
79+
int result = dictionaryBuilder.addValues(vec);
80+
81+
assertEquals(7, result);
82+
assertEquals(7, dictionary.getValueCount());
83+
84+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
85+
86+
assertTrue(sortedDictionary.isNull(0));
87+
assertEquals("12", new String(sortedDictionary.get(1)));
88+
assertEquals("abc", new String(sortedDictionary.get(2)));
89+
assertEquals("dictionary", new String(sortedDictionary.get(3)));
90+
assertEquals("good", new String(sortedDictionary.get(4)));
91+
assertEquals("hello", new String(sortedDictionary.get(5)));
92+
assertEquals("world", new String(sortedDictionary.get(6)));
93+
}
94+
}
95+
96+
@Test
97+
public void testBuildVariableWidthDictionaryWithoutNull() {
98+
try (VarCharVector vec = new VarCharVector("", allocator);
99+
VarCharVector dictionary = new VarCharVector("", allocator);
100+
VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
101+
102+
vec.allocateNew(100, 10);
103+
vec.setValueCount(10);
104+
105+
dictionary.allocateNew();
106+
sortedDictionary.allocateNew();
107+
108+
// fill data
109+
vec.set(0, "hello".getBytes());
110+
vec.set(1, "abc".getBytes());
111+
vec.setNull(2);
112+
vec.set(3, "world".getBytes());
113+
vec.set(4, "12".getBytes());
114+
vec.set(5, "dictionary".getBytes());
115+
vec.setNull(6);
116+
vec.set(7, "hello".getBytes());
117+
vec.set(8, "good".getBytes());
118+
vec.set(9, "abc".getBytes());
119+
120+
VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
121+
SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
122+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
123+
124+
int result = dictionaryBuilder.addValues(vec);
125+
126+
assertEquals(6, result);
127+
assertEquals(6, dictionary.getValueCount());
128+
129+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
130+
131+
assertEquals("12", new String(sortedDictionary.get(0)));
132+
assertEquals("abc", new String(sortedDictionary.get(1)));
133+
assertEquals("dictionary", new String(sortedDictionary.get(2)));
134+
assertEquals("good", new String(sortedDictionary.get(3)));
135+
assertEquals("hello", new String(sortedDictionary.get(4)));
136+
assertEquals("world", new String(sortedDictionary.get(5)));
137+
}
138+
}
139+
140+
@Test
141+
public void testBuildFixedWidthDictionaryWithNull() {
142+
try (IntVector vec = new IntVector("", allocator);
143+
IntVector dictionary = new IntVector("", allocator);
144+
IntVector sortedDictionary = new IntVector("", allocator)) {
145+
vec.allocateNew(10);
146+
vec.setValueCount(10);
147+
148+
dictionary.allocateNew();
149+
sortedDictionary.allocateNew();
150+
151+
// fill data
152+
vec.set(0, 4);
153+
vec.set(1, 8);
154+
vec.set(2, 32);
155+
vec.set(3, 8);
156+
vec.set(4, 16);
157+
vec.set(5, 32);
158+
vec.setNull(6);
159+
vec.set(7, 4);
160+
vec.set(8, 4);
161+
vec.setNull(9);
162+
163+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
164+
SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
165+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
166+
167+
int result = dictionaryBuilder.addValues(vec);
168+
169+
assertEquals(5, result);
170+
assertEquals(5, dictionary.getValueCount());
171+
172+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
173+
174+
assertTrue(sortedDictionary.isNull(0));
175+
assertEquals(4, sortedDictionary.get(1));
176+
assertEquals(8, sortedDictionary.get(2));
177+
assertEquals(16, sortedDictionary.get(3));
178+
assertEquals(32, sortedDictionary.get(4));
179+
}
180+
}
181+
182+
@Test
183+
public void testBuildFixedWidthDictionaryWithoutNull() {
184+
try (IntVector vec = new IntVector("", allocator);
185+
IntVector dictionary = new IntVector("", allocator);
186+
IntVector sortedDictionary = new IntVector("", allocator)) {
187+
vec.allocateNew(10);
188+
vec.setValueCount(10);
189+
190+
dictionary.allocateNew();
191+
sortedDictionary.allocateNew();
192+
193+
// fill data
194+
vec.set(0, 4);
195+
vec.set(1, 8);
196+
vec.set(2, 32);
197+
vec.set(3, 8);
198+
vec.set(4, 16);
199+
vec.set(5, 32);
200+
vec.setNull(6);
201+
vec.set(7, 4);
202+
vec.set(8, 4);
203+
vec.setNull(9);
204+
205+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
206+
SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
207+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
208+
209+
int result = dictionaryBuilder.addValues(vec);
210+
211+
assertEquals(4, result);
212+
assertEquals(4, dictionary.getValueCount());
213+
214+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
215+
216+
assertEquals(4, sortedDictionary.get(0));
217+
assertEquals(8, sortedDictionary.get(1));
218+
assertEquals(16, sortedDictionary.get(2));
219+
assertEquals(32, sortedDictionary.get(3));
220+
}
221+
}
222+
}

0 commit comments

Comments
 (0)