Skip to content

Commit 6c2fc65

Browse files
committed
LUCENE-3295: fix several issues in BitVector.writeClearedDgaps
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1144942 13f79535-47bb-0310-9956-ffa450edef68
1 parent 74f4f29 commit 6c2fc65

File tree

3 files changed

+91
-12
lines changed

3 files changed

+91
-12
lines changed

lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,6 @@ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
213213
System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF));
214214
}
215215
}
216-
217216
builder.add(text, new BytesRef(spare));
218217
termCount++;
219218
}

lucene/src/java/org/apache/lucene/util/BitVector.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ public final int count() {
162162
}
163163
count = c;
164164
}
165+
assert count <= size: "count=" + count + " size=" + size;
165166
return count;
166167
}
167168

@@ -227,6 +228,7 @@ public final void write(Directory d, String name, IOContext context) throws IOEx
227228
} else {
228229
writeBits(output);
229230
}
231+
assert verifyCount();
230232
} finally {
231233
output.close();
232234
}
@@ -278,14 +280,13 @@ private void writeClearedDgaps(IndexOutput output) throws IOException {
278280
output.writeInt(count()); // write count
279281
int last=0;
280282
int numCleared = size()-count();
281-
int m = bits.length;
282-
for (int i=0; i<m && numCleared>0; i++) {
283-
if (bits[i]!=0xff) {
283+
for (int i=0; i<bits.length && numCleared>0; i++) {
284+
if (bits[i] != (byte) 0xff) {
284285
output.writeVInt(i-last);
285286
output.writeByte(bits[i]);
286287
last = i;
287288
numCleared -= (8-BYTE_COUNTS[bits[i] & 0xFF]);
288-
assert numCleared >= 0;
289+
assert numCleared >= 0 || (i == (bits.length-1) && numCleared == -(8-(size&7)));
289290
}
290291
}
291292
}
@@ -319,7 +320,7 @@ private boolean isSparse() {
319320
final int bytesPerSetBit = expectedDGapBytes + 1;
320321

321322
// note: adding 32 because we start with ((int) -1) to indicate d-gaps format.
322-
final long expectedBits = 32 + 8 * bytesPerSetBit * count();
323+
final long expectedBits = 32 + 8 * bytesPerSetBit * clearedCount;
323324

324325
// note: factor is for read/write of byte-arrays being faster than vints.
325326
final long factor = 10;
@@ -352,11 +353,21 @@ public BitVector(Directory d, String name, IOContext context) throws IOException
352353
} else {
353354
readBits(input);
354355
}
356+
assert verifyCount();
355357
} finally {
356358
input.close();
357359
}
358360
}
359361

362+
// asserts only
363+
private boolean verifyCount() {
364+
assert count != -1;
365+
final int countSav = count;
366+
count = -1;
367+
assert countSav == count(): "saved count was " + countSav + " but recomputed count is " + count;
368+
return true;
369+
}
370+
360371
/** Read as a bit set */
361372
private void readBits(IndexInput input) throws IOException {
362373
count = input.readInt(); // read count
@@ -368,7 +379,7 @@ private void readBits(IndexInput input) throws IOException {
368379
private void readSetDgaps(IndexInput input) throws IOException {
369380
size = input.readInt(); // (re)read size
370381
count = input.readInt(); // read count
371-
bits = new byte[(size >> 3) + 1]; // allocate bits
382+
bits = new byte[getNumBytes(size)]; // allocate bits
372383
int last=0;
373384
int n = count();
374385
while (n>0) {
@@ -383,7 +394,7 @@ private void readSetDgaps(IndexInput input) throws IOException {
383394
private void readClearedDgaps(IndexInput input) throws IOException {
384395
size = input.readInt(); // (re)read size
385396
count = input.readInt(); // read count
386-
bits = new byte[(size >> 3) + 1]; // allocate bits
397+
bits = new byte[getNumBytes(size)]; // allocate bits
387398
Arrays.fill(bits, (byte) 0xff);
388399
clearUnusedBits();
389400
int last=0;
@@ -392,7 +403,7 @@ private void readClearedDgaps(IndexInput input) throws IOException {
392403
last += input.readVInt();
393404
bits[last] = input.readByte();
394405
numCleared -= 8-BYTE_COUNTS[bits[last] & 0xFF];
395-
assert numCleared >= 0;
406+
assert numCleared >= 0 || (last == (bits.length-1) && numCleared == -(8-(size&7)));
396407
}
397408
}
398409
}

lucene/src/test/org/apache/lucene/util/TestBitVector.java

Lines changed: 72 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
import java.io.IOException;
2121

22-
import org.apache.lucene.store.IOContext;
22+
import org.apache.lucene.store.Directory;
2323
import org.apache.lucene.store.MockDirectoryWrapper;
2424
import org.apache.lucene.store.RAMDirectory;
2525

@@ -153,7 +153,7 @@ private void doTestWriteRead(int n) throws Exception {
153153
assertTrue(doCompare(bv,compare));
154154
}
155155
}
156-
156+
157157
/**
158158
* Test r/w when size/count cause switching between bit-set and d-gaps file formats.
159159
*/
@@ -165,6 +165,26 @@ public void testDgaps() throws IOException {
165165
doTestDgaps(10000,40,43);
166166
doTestDgaps(100000,415,418);
167167
doTestDgaps(1000000,3123,3126);
168+
// now exercise skipping of fully populated bytes in the bitset (they are omitted if the bitset is sparse)
169+
MockDirectoryWrapper d = new MockDirectoryWrapper(random, new RAMDirectory());
170+
d.setPreventDoubleWrite(false);
171+
BitVector bv = new BitVector(10000);
172+
bv.set(0);
173+
for (int i = 8; i < 16; i++) {
174+
bv.set(i);
175+
} // make sure we have one byte full of set bits
176+
for (int i = 32; i < 40; i++) {
177+
bv.set(i);
178+
} // get a second byte full of set bits
179+
// add some more bits here
180+
for (int i = 40; i < 10000; i++) {
181+
if (random.nextInt(1000) == 0) {
182+
bv.set(i);
183+
}
184+
}
185+
bv.write(d, "TESTBV", newIOContext(random));
186+
BitVector compare = new BitVector(d, "TESTBV", newIOContext(random));
187+
assertTrue(doCompare(bv,compare));
168188
}
169189

170190
private void doTestDgaps(int size, int count1, int count2) throws IOException {
@@ -183,7 +203,7 @@ private void doTestDgaps(int size, int count1, int count2) throws IOException {
183203
assertTrue(doCompare(bv,bv2));
184204
bv = bv2;
185205
bv.clear(i);
186-
assertEquals(i+1,size-bv.count());
206+
assertEquals(i+1, size-bv.count());
187207
bv.write(d, "TESTBV", newIOContext(random));
188208
}
189209
// now start decreasing number of set bits
@@ -196,6 +216,54 @@ private void doTestDgaps(int size, int count1, int count2) throws IOException {
196216
bv.write(d, "TESTBV", newIOContext(random));
197217
}
198218
}
219+
220+
public void testSparseWrite() throws IOException {
221+
Directory d = newDirectory();
222+
final int numBits = 10240;
223+
BitVector bv = new BitVector(numBits);
224+
bv.invertAll();
225+
int numToClear = random.nextInt(5);
226+
for(int i=0;i<numToClear;i++) {
227+
bv.clear(random.nextInt(numBits));
228+
}
229+
bv.write(d, "test", newIOContext(random));
230+
final long size = d.fileLength("test");
231+
assertTrue("size=" + size, size < 100);
232+
d.close();
233+
}
234+
235+
public void testClearedBitNearEnd() throws IOException {
236+
Directory d = newDirectory();
237+
final int numBits = _TestUtil.nextInt(random, 7, 1000);
238+
BitVector bv = new BitVector(numBits);
239+
bv.invertAll();
240+
bv.clear(numBits-_TestUtil.nextInt(random, 1, 7));
241+
bv.write(d, "test", newIOContext(random));
242+
assertEquals(numBits-1, bv.count());
243+
d.close();
244+
}
245+
246+
public void testMostlySet() throws IOException {
247+
Directory d = newDirectory();
248+
final int numBits = _TestUtil.nextInt(random, 30, 1000);
249+
for(int numClear=0;numClear<20;numClear++) {
250+
BitVector bv = new BitVector(numBits);
251+
bv.invertAll();
252+
int count = 0;
253+
while(count < numClear) {
254+
final int bit = random.nextInt(numBits);
255+
// Don't use getAndClear, so that count is recomputed
256+
if (bv.get(bit)) {
257+
bv.clear(bit);
258+
count++;
259+
assertEquals(numBits-count, bv.count());
260+
}
261+
}
262+
}
263+
264+
d.close();
265+
}
266+
199267
/**
200268
* Compare two BitVectors.
201269
* This should really be an equals method on the BitVector itself.
@@ -211,6 +279,7 @@ private boolean doCompare(BitVector bv, BitVector compare) {
211279
break;
212280
}
213281
}
282+
assertEquals(bv.count(), compare.count());
214283
return equal;
215284
}
216285
}

0 commit comments

Comments
 (0)