Skip to content

Commit faf614b

Browse files
liyafan82kou
authored and committed
ARROW-6185: [Java] Provide hash table based dictionary builder
This is related to ARROW-5862. We provide another type of dictionary builder, based on a hash table. Compared with a search-based dictionary encoder, a hash-table-based encoder processes each new element in O(1) time, but requires extra memory space. Closes #5054 from liyafan82/fly_0809_hashbuild and squashes the following commits: 77e24531e <liyafan82> Provide hash table based dictionary builder Authored-by: liyafan82 <fan_li_ya@foxmail.com> Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
1 parent c72b1dc commit faf614b

File tree

2 files changed

+377
-0
lines changed

2 files changed

+377
-0
lines changed
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import java.util.HashMap;
21+
22+
import org.apache.arrow.memory.util.ArrowBufPointer;
23+
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
24+
import org.apache.arrow.memory.util.hash.SimpleHasher;
25+
import org.apache.arrow.vector.ElementAddressableVector;
26+
27+
/**
28+
* A dictionary builder is intended for the scenario frequently encountered in practice:
29+
* the dictionary is not known a priori, so it is generated dynamically.
30+
* In particular, when a new value arrives, it is tested to check if it is already
31+
* in the dictionary. If so, it is simply neglected, otherwise, it is added to the dictionary.
32+
*
33+
* <p>
34+
* This class builds the dictionary based on a hash table.
35+
* Each add operation can be finished in O(1) time,
36+
* where n is the current dictionary size.
37+
* </p>
38+
* <p>
39+
* The dictionary builder is intended to build a single dictionary.
40+
* So it cannot be used for different dictionaries.
41+
* </p>
42+
* <p>Below gives the sample code for using the dictionary builder
43+
* <pre>{@code
44+
* HashTableBasedDictionaryBuilder dictionaryBuilder = ...
45+
* ...
46+
* dictionaryBuild.addValue(newValue);
47+
* ...
48+
* }</pre>
49+
* </p>
50+
* <p>
51+
* With the above code, the dictionary vector will be populated,
52+
* and it can be retrieved by the {@link HashTableBasedDictionaryBuilder#getDictionary()} method.
53+
* After that, dictionary encoding can proceed with the populated dictionary encoder.
54+
* </p>
55+
*
56+
* @param <V> the dictionary vector type.
57+
*/
58+
public class HashTableBasedDictionaryBuilder<V extends ElementAddressableVector> {
59+
60+
/**
61+
* The dictionary to be built.
62+
*/
63+
private final V dictionary;
64+
65+
/**
66+
* If null should be encoded.
67+
*/
68+
private final boolean encodeNull;
69+
70+
/**
71+
* The hash map for distinct dictionary entries.
72+
* The key is the pointer to the dictionary element, whereas the value is the index in the dictionary.
73+
*/
74+
private HashMap<ArrowBufPointer, Integer> hashMap = new HashMap<>();
75+
76+
/**
77+
* The hasher used for calculating the hash code.
78+
*/
79+
private final ArrowBufHasher hasher;
80+
81+
/**
82+
* Next pointer to try to add to the hash table.
83+
*/
84+
private ArrowBufPointer nextPointer;
85+
86+
/**
87+
* Constructs a hash table based dictionary builder.
88+
*
89+
* @param dictionary the dictionary to populate.
90+
*/
91+
public HashTableBasedDictionaryBuilder(V dictionary) {
92+
this(dictionary, false);
93+
}
94+
95+
/**
96+
* Constructs a hash table based dictionary builder.
97+
*
98+
* @param dictionary the dictionary to populate.
99+
* @param encodeNull if null values should be added to the dictionary.
100+
*/
101+
public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull) {
102+
this(dictionary, encodeNull, SimpleHasher.INSTANCE);
103+
}
104+
105+
/**
106+
* Constructs a hash table based dictionary builder.
107+
*
108+
* @param dictionary the dictionary to populate.
109+
* @param encodeNull if null values should be added to the dictionary.
110+
* @param hasher the hasher used to compute the hash code.
111+
*/
112+
public HashTableBasedDictionaryBuilder(V dictionary, boolean encodeNull, ArrowBufHasher hasher) {
113+
this.dictionary = dictionary;
114+
this.encodeNull = encodeNull;
115+
this.hasher = hasher;
116+
this.nextPointer = new ArrowBufPointer(hasher);
117+
}
118+
119+
/**
120+
* Gets the dictionary built.
121+
*
122+
* @return the dictionary.
123+
*/
124+
public V getDictionary() {
125+
return dictionary;
126+
}
127+
128+
/**
129+
* Try to add all values from the target vector to the dictionary.
130+
*
131+
* @param targetVector the target vector containing values to probe.
132+
* @return the number of values actually added to the dictionary.
133+
*/
134+
public int addValues(V targetVector) {
135+
int ret = 0;
136+
for (int i = 0; i < targetVector.getValueCount(); i++) {
137+
if (!encodeNull && targetVector.isNull(i)) {
138+
continue;
139+
}
140+
if (addValue(targetVector, i)) {
141+
ret += 1;
142+
}
143+
}
144+
return ret;
145+
}
146+
147+
/**
148+
* Try to add an element from the target vector to the dictionary.
149+
*
150+
* @param targetVector the target vector containing new element.
151+
* @param targetIndex the index of the new element in the target vector.
152+
* @return true if the element is added to the dictionary, and false otherwise.
153+
*/
154+
public boolean addValue(V targetVector, int targetIndex) {
155+
targetVector.getDataPointer(targetIndex, nextPointer);
156+
157+
if (!hashMap.containsKey(nextPointer)) {
158+
// a new dictionary element is found
159+
160+
// insert it to the dictionary
161+
int dictSize = dictionary.getValueCount();
162+
dictionary.copyFromSafe(targetIndex, dictSize, targetVector);
163+
dictionary.setValueCount(dictSize + 1);
164+
dictionary.getDataPointer(dictSize, nextPointer);
165+
166+
// insert it to the hash map
167+
hashMap.put(nextPointer, dictSize);
168+
nextPointer = new ArrowBufPointer(hasher);
169+
170+
return true;
171+
}
172+
return false;
173+
}
174+
}
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.arrow.algorithm.dictionary;
19+
20+
import static junit.framework.TestCase.assertTrue;
21+
import static org.junit.Assert.assertNull;
22+
import static org.junit.jupiter.api.Assertions.assertEquals;
23+
24+
import org.apache.arrow.memory.BufferAllocator;
25+
import org.apache.arrow.memory.RootAllocator;
26+
import org.apache.arrow.vector.IntVector;
27+
import org.apache.arrow.vector.VarCharVector;
28+
29+
import org.junit.After;
30+
import org.junit.Before;
31+
import org.junit.Test;
32+
33+
/**
34+
* Test cases for {@link HashTableBasedDictionaryBuilder}.
35+
*/
36+
public class TestHashTableBasedDictionaryEncoder {
37+
38+
private BufferAllocator allocator;
39+
40+
@Before
41+
public void prepare() {
42+
allocator = new RootAllocator(1024 * 1024);
43+
}
44+
45+
@After
46+
public void shutdown() {
47+
allocator.close();
48+
}
49+
50+
@Test
51+
public void testBuildVariableWidthDictionaryWithNull() {
52+
try (VarCharVector vec = new VarCharVector("", allocator);
53+
VarCharVector dictionary = new VarCharVector("", allocator)) {
54+
55+
vec.allocateNew(100, 10);
56+
vec.setValueCount(10);
57+
58+
dictionary.allocateNew();
59+
60+
// fill data
61+
vec.set(0, "hello".getBytes());
62+
vec.set(1, "abc".getBytes());
63+
vec.setNull(2);
64+
vec.set(3, "world".getBytes());
65+
vec.set(4, "12".getBytes());
66+
vec.set(5, "dictionary".getBytes());
67+
vec.setNull(6);
68+
vec.set(7, "hello".getBytes());
69+
vec.set(8, "good".getBytes());
70+
vec.set(9, "abc".getBytes());
71+
72+
HashTableBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
73+
new HashTableBasedDictionaryBuilder<>(dictionary, true);
74+
75+
int result = dictionaryBuilder.addValues(vec);
76+
77+
assertEquals(7, result);
78+
assertEquals(7, dictionary.getValueCount());
79+
80+
assertEquals("hello", new String(dictionary.get(0)));
81+
assertEquals("abc", new String(dictionary.get(1)));
82+
assertNull(dictionary.get(2));
83+
assertEquals("world", new String(dictionary.get(3)));
84+
assertEquals("12", new String(dictionary.get(4)));
85+
assertEquals("dictionary", new String(dictionary.get(5)));
86+
assertEquals("good", new String(dictionary.get(6)));
87+
}
88+
}
89+
90+
@Test
91+
public void testBuildVariableWidthDictionaryWithoutNull() {
92+
try (VarCharVector vec = new VarCharVector("", allocator);
93+
VarCharVector dictionary = new VarCharVector("", allocator)) {
94+
95+
vec.allocateNew(100, 10);
96+
vec.setValueCount(10);
97+
98+
dictionary.allocateNew();
99+
100+
// fill data
101+
vec.set(0, "hello".getBytes());
102+
vec.set(1, "abc".getBytes());
103+
vec.setNull(2);
104+
vec.set(3, "world".getBytes());
105+
vec.set(4, "12".getBytes());
106+
vec.set(5, "dictionary".getBytes());
107+
vec.setNull(6);
108+
vec.set(7, "hello".getBytes());
109+
vec.set(8, "good".getBytes());
110+
vec.set(9, "abc".getBytes());
111+
112+
HashTableBasedDictionaryBuilder<VarCharVector> dictionaryBuilder =
113+
new HashTableBasedDictionaryBuilder<>(dictionary, false);
114+
115+
int result = dictionaryBuilder.addValues(vec);
116+
117+
assertEquals(6, result);
118+
assertEquals(6, dictionary.getValueCount());
119+
120+
assertEquals("hello", new String(dictionary.get(0)));
121+
assertEquals("abc", new String(dictionary.get(1)));
122+
assertEquals("world", new String(dictionary.get(2)));
123+
assertEquals("12", new String(dictionary.get(3)));
124+
assertEquals("dictionary", new String(dictionary.get(4)));
125+
assertEquals("good", new String(dictionary.get(5)));
126+
127+
}
128+
}
129+
130+
@Test
131+
public void testBuildFixedWidthDictionaryWithNull() {
132+
try (IntVector vec = new IntVector("", allocator);
133+
IntVector dictionary = new IntVector("", allocator)) {
134+
vec.allocateNew(10);
135+
vec.setValueCount(10);
136+
137+
dictionary.allocateNew();
138+
139+
// fill data
140+
vec.set(0, 4);
141+
vec.set(1, 8);
142+
vec.set(2, 32);
143+
vec.set(3, 8);
144+
vec.set(4, 16);
145+
vec.set(5, 32);
146+
vec.setNull(6);
147+
vec.set(7, 4);
148+
vec.set(8, 4);
149+
vec.setNull(9);
150+
151+
HashTableBasedDictionaryBuilder<IntVector> dictionaryBuilder =
152+
new HashTableBasedDictionaryBuilder<>(dictionary, true);
153+
154+
int result = dictionaryBuilder.addValues(vec);
155+
156+
assertEquals(5, result);
157+
assertEquals(5, dictionary.getValueCount());
158+
159+
assertEquals(4, dictionary.get(0));
160+
assertEquals(8, dictionary.get(1));
161+
assertEquals(32, dictionary.get(2));
162+
assertEquals(16, dictionary.get(3));
163+
assertTrue(dictionary.isNull(4));
164+
}
165+
}
166+
167+
@Test
168+
public void testBuildFixedWidthDictionaryWithoutNull() {
169+
try (IntVector vec = new IntVector("", allocator);
170+
IntVector dictionary = new IntVector("", allocator)) {
171+
vec.allocateNew(10);
172+
vec.setValueCount(10);
173+
174+
dictionary.allocateNew();
175+
176+
// fill data
177+
vec.set(0, 4);
178+
vec.set(1, 8);
179+
vec.set(2, 32);
180+
vec.set(3, 8);
181+
vec.set(4, 16);
182+
vec.set(5, 32);
183+
vec.setNull(6);
184+
vec.set(7, 4);
185+
vec.set(8, 4);
186+
vec.setNull(9);
187+
188+
HashTableBasedDictionaryBuilder<IntVector> dictionaryBuilder =
189+
new HashTableBasedDictionaryBuilder<>(dictionary, false);
190+
191+
int result = dictionaryBuilder.addValues(vec);
192+
193+
assertEquals(4, result);
194+
assertEquals(4, dictionary.getValueCount());
195+
196+
assertEquals(4, dictionary.get(0));
197+
assertEquals(8, dictionary.get(1));
198+
assertEquals(32, dictionary.get(2));
199+
assertEquals(16, dictionary.get(3));
200+
201+
}
202+
}
203+
}

0 commit comments

Comments
 (0)