Skip to content

Commit 1f5ebd0

Browse files
liyafan82 authored and emkornfield committed
ARROW-5862: [Java] Provide dictionary builder
The dictionary builder serves the following scenario, which is frequently encountered in practice when dictionary encoding is involved: the dictionary values are not known a priori, so they are determined dynamically, as new data arrive continually. In particular, when a new value arrives, it is tested to check if it is already in the dictionary. If so, it is simply ignored; otherwise, it is added to the dictionary. When all values have been evaluated, the dictionary can be considered complete, so encoding can start afterward. The code snippet using a dictionary builder should be like this: DictionaryBuilder<IntVector> dictionaryBuilder = ... dictionaryBuilder.startBuild(); ... dictionaryBuilder.addValue(newValue); ... dictionaryBuilder.endBuild(); Closes #4813 from liyafan82/fly_0705_build and squashes the following commits: 2007b87 <liyafan82> Provide dictionary builder Authored-by: liyafan82 <fan_li_ya@foxmail.com> Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
1 parent dd4532a commit 1f5ebd0

File tree

2 files changed

+384
-0
lines changed

2 files changed

+384
-0
lines changed
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import java.util.TreeSet;
21+
22+
import org.apache.arrow.algorithm.sort.VectorValueComparator;
23+
import org.apache.arrow.vector.ValueVector;
24+
25+
/**
26+
* A dictionary builder is intended for the scenario frequently encountered in practice:
27+
* the dictionary is not known a priori, so it is generated dynamically.
28+
* In particular, when a new value arrives, it is tested to check if it is already
29+
* in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
30+
*
31+
* <p>
32+
* This class builds the dictionary based on a binary search tree.
33+
* Each add operation can be finished in O(log(n)) time,
34+
* where n is the current dictionary size.
35+
* </p>
36+
* <p>
37+
* The dictionary builder is intended to build a single dictionary.
38+
* So it cannot be used for different dictionaries.
39+
* </p>
40+
* <p>Below gives the sample code for using the dictionary builder
41+
* <pre>{@code
42+
* SearchTreeBasedDictionaryBuilder dictionaryBuilder = ...
43+
* ...
44+
* dictionaryBuild.addValue(newValue);
45+
* ...
46+
* }</pre>
47+
* </p>
48+
* <p>
49+
* With the above code, the dictionary vector will be populated,
50+
* and it can be retrieved by the {@link SearchTreeBasedDictionaryBuilder#getDictionary()} method.
51+
* After that, dictionary encoding can proceed with the populated dictionary.
52+
* </p>
53+
* @param <V> the dictionary vector type.
54+
*/
55+
public class SearchTreeBasedDictionaryBuilder<V extends ValueVector> {
56+
57+
/**
58+
* The dictionary to be built.
59+
*/
60+
private final V dictionary;
61+
62+
/**
63+
* The criteria for sorting in the search tree.
64+
*/
65+
protected final VectorValueComparator<V> comparator;
66+
67+
/**
68+
* If null should be encoded.
69+
*/
70+
private final boolean encodeNull;
71+
72+
/**
73+
* The search tree for storing the value index.
74+
*/
75+
private TreeSet<Integer> searchTree;
76+
77+
/**
78+
* Construct a search tree-based dictionary builder.
79+
* @param dictionary the dictionary vector.
80+
* @param comparator the criteria for value equality.
81+
*/
82+
public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator) {
83+
this(dictionary, comparator, false);
84+
}
85+
86+
/**
87+
* Construct a search tree-based dictionary builder.
88+
* @param dictionary the dictionary vector.
89+
* @param comparator the criteria for value equality.
90+
* @param encodeNull if null values should be added to the dictionary.
91+
*/
92+
public SearchTreeBasedDictionaryBuilder(V dictionary, VectorValueComparator<V> comparator, boolean encodeNull) {
93+
this.dictionary = dictionary;
94+
this.comparator = comparator;
95+
this.encodeNull = encodeNull;
96+
this.comparator.attachVector(dictionary);
97+
98+
searchTree = new TreeSet<>((index1, index2) -> comparator.compare(index1, index2));
99+
}
100+
101+
/**
102+
* Gets the dictionary built.
103+
* Please note that the dictionary is not in sorted order.
104+
* Instead, its order is determined by the order of element insertion.
105+
* To get the dictionary in sorted order, please use
106+
* {@link SearchTreeBasedDictionaryBuilder#populateSortedDictionary(ValueVector)}.
107+
* @return the dictionary.
108+
*/
109+
public V getDictionary() {
110+
return dictionary;
111+
}
112+
113+
/**
114+
* Try to add all values from the target vector to the dictionary.
115+
* @param targetVector the target vector containing values to probe.
116+
* @return the number of values actually added to the dictionary.
117+
*/
118+
public int addValues(V targetVector) {
119+
int ret = 0;
120+
for (int i = 0; i < targetVector.getValueCount(); i++) {
121+
if (!encodeNull && targetVector.isNull(i)) {
122+
continue;
123+
}
124+
if (addValue(targetVector, i)) {
125+
dictionary.setValueCount(dictionary.getValueCount() + 1);
126+
ret += 1;
127+
}
128+
}
129+
return ret;
130+
}
131+
132+
/**
133+
* Try to add an element from the target vector to the dictionary.
134+
* @param targetVector the target vector containing new element.
135+
* @param targetIndex the index of the new element in the target vector.
136+
* @return true if the element is added to the dictionary, and false otherwise.
137+
*/
138+
public boolean addValue(V targetVector, int targetIndex) {
139+
// first copy the value to the end of the dictionary
140+
int dictSize = dictionary.getValueCount();
141+
dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
142+
143+
// try to add the value to the dictionary,
144+
// if an equal element does not exist.
145+
// this operation can be done in O(logn) time.
146+
boolean ret = searchTree.add(dictSize);
147+
return ret;
148+
}
149+
150+
/**
151+
* Gets the sorted dictionary.
152+
* Note that given the binary search tree, the sort can finish in O(n).
153+
*/
154+
public void populateSortedDictionary(V sortedDictionary) {
155+
int idx = 0;
156+
for (Integer dictIdx : searchTree) {
157+
sortedDictionary.copyFromSafe(dictIdx, idx++, dictionary);
158+
}
159+
160+
sortedDictionary.setValueCount(dictionary.getValueCount());
161+
}
162+
}
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.nio.charset.StandardCharsets;

import org.apache.arrow.algorithm.sort.DefaultVectorComparators;
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;
33+
34+
/**
35+
* Test cases for {@link SearchTreeBasedDictionaryBuilder}.
36+
*/
37+
public class TestSearchTreeBasedDictionaryBuilder {
38+
39+
private BufferAllocator allocator;
40+
41+
@Before
42+
public void prepare() {
43+
allocator = new RootAllocator(1024 * 1024);
44+
}
45+
46+
@After
47+
public void shutdown() {
48+
allocator.close();
49+
}
50+
51+
@Test
52+
public void testBuildVariableWidthDictionaryWithNull() {
53+
try (VarCharVector vec = new VarCharVector("", allocator);
54+
VarCharVector dictionary = new VarCharVector("", allocator);
55+
VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
56+
57+
vec.allocateNew(100, 10);
58+
vec.setValueCount(10);
59+
60+
dictionary.allocateNew();
61+
sortedDictionary.allocateNew();
62+
63+
// fill data
64+
vec.set(0, "hello".getBytes());
65+
vec.set(1, "abc".getBytes());
66+
vec.setNull(2);
67+
vec.set(3, "world".getBytes());
68+
vec.set(4, "12".getBytes());
69+
vec.set(5, "dictionary".getBytes());
70+
vec.setNull(6);
71+
vec.set(7, "hello".getBytes());
72+
vec.set(8, "good".getBytes());
73+
vec.set(9, "abc".getBytes());
74+
75+
VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
76+
SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
77+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
78+
79+
int result = dictionaryBuilder.addValues(vec);
80+
81+
assertEquals(7, result);
82+
assertEquals(7, dictionary.getValueCount());
83+
84+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
85+
86+
assertTrue(sortedDictionary.isNull(0));
87+
assertEquals("12", new String(sortedDictionary.get(1)));
88+
assertEquals("abc", new String(sortedDictionary.get(2)));
89+
assertEquals("dictionary", new String(sortedDictionary.get(3)));
90+
assertEquals("good", new String(sortedDictionary.get(4)));
91+
assertEquals("hello", new String(sortedDictionary.get(5)));
92+
assertEquals("world", new String(sortedDictionary.get(6)));
93+
}
94+
}
95+
96+
@Test
97+
public void testBuildVariableWidthDictionaryWithoutNull() {
98+
try (VarCharVector vec = new VarCharVector("", allocator);
99+
VarCharVector dictionary = new VarCharVector("", allocator);
100+
VarCharVector sortedDictionary = new VarCharVector("", allocator)) {
101+
102+
vec.allocateNew(100, 10);
103+
vec.setValueCount(10);
104+
105+
dictionary.allocateNew();
106+
sortedDictionary.allocateNew();
107+
108+
// fill data
109+
vec.set(0, "hello".getBytes());
110+
vec.set(1, "abc".getBytes());
111+
vec.setNull(2);
112+
vec.set(3, "world".getBytes());
113+
vec.set(4, "12".getBytes());
114+
vec.set(5, "dictionary".getBytes());
115+
vec.setNull(6);
116+
vec.set(7, "hello".getBytes());
117+
vec.set(8, "good".getBytes());
118+
vec.set(9, "abc".getBytes());
119+
120+
VectorValueComparator<VarCharVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
121+
SearchTreeBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
122+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
123+
124+
int result = dictionaryBuilder.addValues(vec);
125+
126+
assertEquals(6, result);
127+
assertEquals(6, dictionary.getValueCount());
128+
129+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
130+
131+
assertEquals("12", new String(sortedDictionary.get(0)));
132+
assertEquals("abc", new String(sortedDictionary.get(1)));
133+
assertEquals("dictionary", new String(sortedDictionary.get(2)));
134+
assertEquals("good", new String(sortedDictionary.get(3)));
135+
assertEquals("hello", new String(sortedDictionary.get(4)));
136+
assertEquals("world", new String(sortedDictionary.get(5)));
137+
}
138+
}
139+
140+
@Test
141+
public void testBuildFixedWidthDictionaryWithNull() {
142+
try (IntVector vec = new IntVector("", allocator);
143+
IntVector dictionary = new IntVector("", allocator);
144+
IntVector sortedDictionary = new IntVector("", allocator)) {
145+
vec.allocateNew(10);
146+
vec.setValueCount(10);
147+
148+
dictionary.allocateNew();
149+
sortedDictionary.allocateNew();
150+
151+
// fill data
152+
vec.set(0, 4);
153+
vec.set(1, 8);
154+
vec.set(2, 32);
155+
vec.set(3, 8);
156+
vec.set(4, 16);
157+
vec.set(5, 32);
158+
vec.setNull(6);
159+
vec.set(7, 4);
160+
vec.set(8, 4);
161+
vec.setNull(9);
162+
163+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
164+
SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
165+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, true);
166+
167+
int result = dictionaryBuilder.addValues(vec);
168+
169+
assertEquals(5, result);
170+
assertEquals(5, dictionary.getValueCount());
171+
172+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
173+
174+
assertTrue(sortedDictionary.isNull(0));
175+
assertEquals(4, sortedDictionary.get(1));
176+
assertEquals(8, sortedDictionary.get(2));
177+
assertEquals(16, sortedDictionary.get(3));
178+
assertEquals(32, sortedDictionary.get(4));
179+
}
180+
}
181+
182+
@Test
183+
public void testBuildFixedWidthDictionaryWithoutNull() {
184+
try (IntVector vec = new IntVector("", allocator);
185+
IntVector dictionary = new IntVector("", allocator);
186+
IntVector sortedDictionary = new IntVector("", allocator)) {
187+
vec.allocateNew(10);
188+
vec.setValueCount(10);
189+
190+
dictionary.allocateNew();
191+
sortedDictionary.allocateNew();
192+
193+
// fill data
194+
vec.set(0, 4);
195+
vec.set(1, 8);
196+
vec.set(2, 32);
197+
vec.set(3, 8);
198+
vec.set(4, 16);
199+
vec.set(5, 32);
200+
vec.setNull(6);
201+
vec.set(7, 4);
202+
vec.set(8, 4);
203+
vec.setNull(9);
204+
205+
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);
206+
SearchTreeBasedDictionaryBuilder<IntVector> dictionaryBuilder =
207+
new SearchTreeBasedDictionaryBuilder<>(dictionary, comparator, false);
208+
209+
int result = dictionaryBuilder.addValues(vec);
210+
211+
assertEquals(4, result);
212+
assertEquals(4, dictionary.getValueCount());
213+
214+
dictionaryBuilder.populateSortedDictionary(sortedDictionary);
215+
216+
assertEquals(4, sortedDictionary.get(0));
217+
assertEquals(8, sortedDictionary.get(1));
218+
assertEquals(16, sortedDictionary.get(2));
219+
assertEquals(32, sortedDictionary.get(3));
220+
}
221+
}
222+
}

0 commit comments

Comments
 (0)