Skip to content

Commit 0f4bd8a

Browse files
committed
implement RangePostingsList serialization
1 parent 1a81e36 commit 0f4bd8a

File tree

5 files changed

+173
-17
lines changed

5 files changed

+173
-17
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using System.IO;
2+
3+
using Xunit;
4+
5+
using InformationRetrieval.Indexing.PostingsList;
6+
using Corpus;
7+
using System.Linq;
8+
9+
namespace InformationRetrieval.Test.Indexing.PostingsList
10+
{
11+
/// <summary>
/// Round-trip tests: postings written with <see cref="PostingsListWriter"/> are read back
/// with <see cref="PostingsListReader"/> from the same in-memory stream.
/// </summary>
public class PostingsListTest
{
    /// <summary>
    /// Writes a plain array of document ids and checks that the stored count and the ids
    /// come back unchanged, and that the result is not a <see cref="RangePostingsList"/>.
    /// </summary>
    [Fact]
    public void UncompressedReadWriteTest()
    {
        var expected = GetDocIds(0, 1, 2, 10, 11, 12, 13, 14, 15, 100, 111);
        var buffer = new MemoryStream();

        using var writer = new PostingsListWriter(buffer);
        writer.Write(expected);

        // Rewind so the reader starts at the record that was just written.
        buffer.Position = 0;

        using var reader = new PostingsListReader(buffer, leaveOpen: false);
        var storedCount = reader.ReadCount(0);
        var actual = reader.Read(0);

        Assert.Equal(expected.Length, storedCount);
        Assert.False(actual is RangePostingsList);
        Assert.Equal(expected, actual);
    }

    /// <summary>
    /// Writes a <see cref="RangePostingsList"/> and checks that the stored count and the ids
    /// come back unchanged, and that the result is again a <see cref="RangePostingsList"/>.
    /// </summary>
    [Fact]
    public void RangeReadWriteTest()
    {
        var expected = new RangePostingsList
        {
            0, 1, 2, 10, 11, 12, 13, 14, 15, 100, 111
        };
        var buffer = new MemoryStream();

        using var writer = new PostingsListWriter(buffer);
        writer.Write(expected);

        // Rewind so the reader starts at the record that was just written.
        buffer.Position = 0;

        using var reader = new PostingsListReader(buffer, leaveOpen: false);
        var storedCount = reader.ReadCount(0);
        var actual = reader.Read(0);

        Assert.Equal(expected.Count, storedCount);
        Assert.True(actual is RangePostingsList);
        Assert.Equal(expected, actual);
    }

    /// <summary>
    /// Wraps each raw id in a <see cref="DocumentId"/>, preserving the argument order.
    /// </summary>
    private DocumentId[] GetDocIds(params uint[] ids) =>
        (from id in ids select new DocumentId(id)).ToArray();
}
63+
}

src/InformationRetrieval/Indexing/PostingsList/PostingsListReader.cs

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@ public IReadOnlyCollection<DocumentId> Read(long position)
2222
{
2323
stream.Seek(position, SeekOrigin.Begin);
2424
var length = reader.ReadInt32();
25-
var postings = new DocumentId[length];
26-
for (int i = 0; i < length; ++i)
25+
PostingsListType type = (PostingsListType)reader.ReadByte();
26+
27+
switch (type)
2728
{
28-
postings[i] = new DocumentId(reader.ReadUInt32());
29+
case PostingsListType.Ranged:
30+
return ReadRanged(length);
31+
32+
default:
33+
return ReadUmcompressed(length);
2934
}
30-
return postings;
3135
}
3236

3337
public int ReadCount(long position)
@@ -40,5 +44,41 @@ public void Dispose()
4044
{
4145
reader.Dispose();
4246
}
47+
48+
/// <summary>
/// Reads <paramref name="length"/> consecutive unsigned 32-bit values from the reader
/// and wraps each one in a <see cref="DocumentId"/>.
/// </summary>
/// <param name="length">Number of document ids to read.</param>
/// <returns>The ids, in the order they were read.</returns>
private IReadOnlyCollection<DocumentId> ReadUmcompressed(int length)
{
    var ids = new DocumentId[length];
    var next = 0;
    while (next < ids.Length)
    {
        ids[next] = new DocumentId(reader.ReadUInt32());
        next++;
    }

    return ids;
}
58+
59+
/// <summary>
/// Reads a 16-bit block count followed by that many blocks and builds a
/// <see cref="RangePostingsList"/> from them.
/// </summary>
/// <param name="count">Posting count passed through to the resulting list.</param>
/// <returns>A list holding the blocks in the order they were read.</returns>
private RangePostingsList ReadRanged(int count)
{
    int blockCount = reader.ReadUInt16();
    var blocks = new List<DocumentIdRangeBlock>(blockCount);

    // Each ReadBlock call consumes exactly one serialized block from the reader.
    while (blocks.Count < blockCount)
    {
        blocks.Add(ReadBlock());
    }

    return new RangePostingsList(count, blocks);
}
70+
71+
/// <summary>
/// Reads one block: a 16-bit block id, a 16-bit range count, then that many
/// unsigned 32-bit range values.
/// </summary>
/// <returns>A <see cref="DocumentIdRangeBlock"/> built from the id and the values read.</returns>
private DocumentIdRangeBlock ReadBlock()
{
    ushort id = reader.ReadUInt16();
    int rangeCount = reader.ReadUInt16();

    var ranges = new List<uint>(rangeCount);
    for (int remaining = rangeCount; remaining > 0; --remaining)
    {
        ranges.Add(reader.ReadUInt32());
    }

    return new DocumentIdRangeBlock(id, ranges);
}
4383
}
4484
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
namespace InformationRetrieval.Indexing.PostingsList
2+
{
3+
/// <summary>
/// Identifies how a serialized postings list is encoded.
/// </summary>
/// <remarks>
/// The numeric values are pinned explicitly because the postings writer and reader
/// store this enum as a single raw byte; adding or reordering members must never
/// change the value of an existing one.
/// </remarks>
public enum PostingsListType : byte
{
    /// <summary>Document ids are stored one after another.</summary>
    Uncompressed = 0,

    /// <summary>Postings are stored as range blocks.</summary>
    Ranged = 1,
}
8+
}

src/InformationRetrieval/Indexing/PostingsList/PostingsListWriter.cs

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,20 @@ public long Write(IReadOnlyCollection<DocumentId> postings)
2222
{
2323
var position = stream.Position;
2424
writer.Write(postings.Count);
25-
foreach (var id in postings)
25+
26+
switch (postings)
2627
{
27-
writer.Write(id.Id);
28+
case RangePostingsList range:
29+
writer.Write((byte)PostingsListType.Ranged);
30+
WriteRanged(range.Blocks);
31+
break;
32+
33+
default:
34+
writer.Write((byte)PostingsListType.Uncompressed);
35+
WriteUncompressed(postings);
36+
break;
2837
}
38+
2939
writer.Flush();
3040

3141
return position;
@@ -44,5 +54,32 @@ public void Dispose()
4454
{
4555
writer.Dispose();
4656
}
57+
58+
/// <summary>
/// Writes the <c>Id</c> of every posting to the writer, in enumeration order.
/// </summary>
/// <param name="postings">Postings whose ids are written.</param>
private void WriteUncompressed(IReadOnlyCollection<DocumentId> postings)
{
    using (var cursor = postings.GetEnumerator())
    {
        while (cursor.MoveNext())
        {
            writer.Write(cursor.Current.Id);
        }
    }
}
65+
66+
/// <summary>
/// Writes the number of blocks as a 16-bit value, followed by every block in order.
/// </summary>
/// <param name="list">Blocks to serialize.</param>
/// <exception cref="System.OverflowException">
/// <paramref name="list"/> holds more blocks than a 16-bit count can represent.
/// </exception>
private void WriteRanged(IList<DocumentIdRangeBlock> list)
{
    // The count field is only 16 bits wide. An unchecked cast would wrap silently
    // (65536 becomes 0) and the reader would then see the wrong number of blocks,
    // so fail loudly before anything is written by this method.
    ushort blockCount = checked((ushort)list.Count);

    writer.Write(blockCount);
    foreach (var block in list)
    {
        WriteBlock(block);
    }
}
74+
75+
/// <summary>
/// Writes one block: its block id, the number of range values as a 16-bit value,
/// then every range value as an unsigned 32-bit value.
/// </summary>
/// <param name="block">Block to serialize.</param>
/// <exception cref="System.OverflowException">
/// The block holds more range values than a 16-bit count can represent.
/// </exception>
private void WriteBlock(DocumentIdRangeBlock block)
{
    // Validate the 16-bit count before emitting any byte of this block, so an
    // oversized block fails instead of being written with a wrapped-around count.
    ushort rangeCount = checked((ushort)block.Ranges.Count);

    writer.Write(block.BlockId);
    writer.Write(rangeCount);
    foreach (uint val in block.Ranges)
    {
        writer.Write(val);
    }
}
4784
}
4885
}

src/InformationRetrieval/Indexing/PostingsList/RangePostingsList.cs

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,31 @@ public class RangePostingsList : IReadOnlyCollection<DocumentId>
1717
private const int DefaultCapacity = 8;
1818

1919
[ProtoMember(1)]
20-
private readonly IList<DocumentIdRangeBlock> list = new List<DocumentIdRangeBlock>(DefaultCapacity);
20+
public IList<DocumentIdRangeBlock> Blocks { get; }
2121

2222
[ProtoMember(2)]
23-
public int Count { get; private set; } = 0;
23+
public int Count { get; private set; }
24+
25+
/// <summary>
/// Creates an empty postings list backed by a new block list of default capacity.
/// </summary>
public RangePostingsList() : this(0, new List<DocumentIdRangeBlock>(DefaultCapacity)) { }

/// <summary>
/// Creates a postings list over an existing list of blocks.
/// </summary>
/// <param name="count">
/// Value exposed through <see cref="Count"/>. It is stored as given and is not
/// checked against the contents of <paramref name="blocks"/>.
/// </param>
/// <param name="blocks">Block list to use; it is stored by reference, not copied.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="blocks"/> is null.</exception>
public RangePostingsList(int count, IList<DocumentIdRangeBlock> blocks)
{
    // Reject null here so the failure points at the caller rather than at the
    // first later use of Blocks.
    this.Blocks = blocks ?? throw new System.ArgumentNullException(nameof(blocks));
    this.Count = count;
}
2432

2533
public void Add(DocumentId id)
2634
{
2735
DocumentIdRangeBlock block;
2836

29-
if (list.Count == 0 || list[^1].BlockId != id.BlockId)
37+
if (Blocks.Count == 0 || Blocks[^1].BlockId != id.BlockId)
3038
{
3139
block = new DocumentIdRangeBlock(id.BlockId);
32-
list.Add(block);
40+
Blocks.Add(block);
3341
}
3442
else
3543
{
36-
block = list[^1];
44+
block = Blocks[^1];
3745
}
3846

3947
if (block.Add(id.LocalId)) Count++;
@@ -46,7 +54,7 @@ public void Add(uint id)
4654

4755
public IEnumerator<DocumentId> GetEnumerator()
4856
{
49-
foreach (var block in list)
57+
foreach (var block in Blocks)
5058
{
5159
foreach (var r in block.Ranges)
5260
{
@@ -97,14 +105,14 @@ public class DocumentIdRangeBlock
97105
public IList<uint> Ranges { get; }
98106

99107
// for protobuf deserialization
100-
private DocumentIdRangeBlock() : this(0)
101-
{
102-
}
108+
private DocumentIdRangeBlock() : this(0) { }
109+
110+
public DocumentIdRangeBlock(ushort id) : this(id, new List<uint>(DefaultCapacity)) { }
103111

104-
public DocumentIdRangeBlock(ushort id)
112+
public DocumentIdRangeBlock(ushort id, IList<uint> ranges)
105113
{
106114
BlockId = id;
107-
Ranges = new List<uint>(DefaultCapacity);
115+
Ranges = ranges;
108116
}
109117

110118
public bool Add(ushort localId)

0 commit comments

Comments
 (0)