Commit f32f49e

acogoluegnes authored and kjnilsson committed
Use actual filter size in read ahead
Start with the default filter size and use the actual size once a non-zero value has been read. Attempt to read ahead from parse_header instead of returning the header and no data. The decision is based on the availability of previous read-ahead data.
1 parent: c49addd
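In outline, the reader now keeps two hints, the size of the last chunk body and the last non-zero filter size it has seen, and sizes the next read from them. The snippet below is a minimal sketch of that decision under assumed constants; the module name and the values of ?DEFAULT_FILTER_SIZE and ?READ_AHEAD_LIMIT are illustrative placeholders, not the ones defined in osiris_log.erl.

%% Minimal sketch of the read-ahead sizing described in the commit message.
%% Illustrative module and constants only.
-module(read_ahead_sketch).
-export([next_filter_size/2, read_ahead_size/2]).

-define(DEFAULT_FILTER_SIZE, 16).   %% assumed default filter size
-define(READ_AHEAD_LIMIT, 4096).    %% assumed read-ahead limit

%% keep the previous hint when the chunk carried no filter (size 0),
%% otherwise switch to the filter size that was actually read
next_filter_size(Previous, 0) -> Previous;
next_filter_size(_Previous, Current) -> Current.

%% if the last chunk body was small, read far enough to likely cover the
%% whole next chunk in one pread; otherwise only read optimistically,
%% i.e. the header plus the default filter size
read_ahead_size(LastDataSize, FilterSize)
  when is_integer(LastDataSize), LastDataSize =< ?READ_AHEAD_LIMIT ->
    FilterSize + ?READ_AHEAD_LIMIT;
read_ahead_size(_LastDataSize, _FilterSize) ->
    ?DEFAULT_FILTER_SIZE.

In the diff below, read_ahead_fsize/2 and read_ahead_size/2 carry this logic, and send_file/3 refreshes the filter-size hint each time a chunk header is parsed.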

2 files changed: 124 additions, 40 deletions

src/osiris_log.erl

Lines changed: 43 additions & 20 deletions
@@ -66,7 +66,7 @@
          make_chunk/7,
          orphaned_segments/1,
          read_header0/1,
-         last_data_size/2,
+         read_ahead_hints/3,
          update_read/4
         ]).

@@ -430,6 +430,7 @@
         position = 0 :: non_neg_integer(),
         filter :: undefined | osiris_bloom:mstate(),
         last_data_size = undefined :: undefined | non_neg_integer(),
+        filter_size = ?DEFAULT_FILTER_SIZE :: osiris_bloom:filter_size(),
         read_ahead_data = undefined :: undefined | binary()}).
 -record(write,
         {type = writer :: writer | acceptor,
@@ -1665,7 +1666,8 @@ send_file(Sock, State) ->
 send_file(Sock,
           #?MODULE{mode = #read{type = RType,
                                 chunk_selector = Selector,
-                                transport = Transport}} = State0,
+                                transport = Transport,
+                                filter_size = RaFs}} = State0,
           Callback) ->
     case catch read_header0(State0) of
         {ok, #{type := ChType,
@@ -1687,7 +1689,9 @@
         true ->
             Read = Read0#read{next_offset = ChId + NumRecords,
                               position = NextPos,
-                              last_data_size = DataSize},
+                              last_data_size = DataSize,
+                              filter_size = read_ahead_fsize(RaFs,
+                                                             FilterSize)},
             case MaybeData of
                 undefined ->
                     %% read header
@@ -2903,10 +2907,11 @@ read_header_with_ra(#?MODULE{cfg = #cfg{directory = Dir,
                     mode = #read{next_offset = NextChId0,
                                  position = Pos,
                                  last_data_size = Lds,
+                                 filter_size = FilterSize,
                                  read_ahead_data = undefined} = Read0,
                     current_file = CurFile,
                     fd = Fd} = State) ->
-    ReadAheadSize = read_ahead_size(Lds),
+    ReadAheadSize = read_ahead_size(Lds, FilterSize),
 
     case file:pread(Fd, Pos, ?HEADER_SIZE_B + ReadAheadSize) of
         {ok, Bin} when byte_size(Bin) >= ?HEADER_SIZE_B ->
@@ -2981,7 +2986,6 @@ read_header_with_ra(#?MODULE{mode = #read{last_data_size = Lds,
             read_header_with_ra(State#?MODULE{mode = Read1})
     end.
 
-
 parse_header(<<?MAGIC:4/unsigned,
                ?VERSION:4/unsigned,
                ChType:8/unsigned,
@@ -2998,7 +3002,9 @@ parse_header(<<?MAGIC:4/unsigned,
                MaybeFilterAndRest/binary>> = HeaderData0,
              #?MODULE{mode = #read{type = RType,
                                    chunk_selector = Selector,
-                                   next_offset = NextChId0} = Read0} = State) ->
+                                   next_offset = NextChId0,
+                                   read_ahead_data = RAD,
+                                   filter_size = LFS} = Read0} = State) ->
     {ToSkip, ToSend} = select_amount_to_send(RType, Selector, ChType,
                                              FilterSize, DataSize,
                                              TrailerSize),
@@ -3010,8 +3016,8 @@ parse_header(<<?MAGIC:4/unsigned,
             <<_Skip:ToSkip/binary,
               Ctnt:ToSend/binary,
               Rest/binary>>
-              when byte_size(Rest) > ?HEADER_SIZE_B + ?DEFAULT_FILTER_SIZE ->
-                %% remained is larger than 64 bytes so worth keeping
+              when byte_size(Rest) > ?HEADER_SIZE_B + LFS ->
+                %% remaining is larger than 64 bytes so worth keeping
                 %% around
                 {Rest, Ctnt};
             <<_Skip:ToSkip/binary,
@@ -3026,25 +3032,38 @@ parse_header(<<?MAGIC:4/unsigned,
                                 Epoch, NextChId0, Crc, DataSize, TrailerSize,
                                 FilterSize);
         false ->
-            %% having to throw away the read ahead data here
-            Read1 = Read0#read{read_ahead_data = undefined},
-            maybe_return_header(State#?MODULE{mode = Read1},
-                                HeaderData0, MaybeFilterAndRest, undefined,
-                                ChType, NumEntries, NumRecords, Timestamp,
-                                Epoch, NextChId0, Crc, DataSize, TrailerSize,
-                                FilterSize)
+            case RAD of
+                undefined ->
+                    %% we just read the data, we could not read ahead the whole chunk
+                    %% let's move on to see whether the chunk should be filtered or not
+                    maybe_return_header(State,
+                                        HeaderData0, MaybeFilterAndRest, undefined,
+                                        ChType, NumEntries, NumRecords, Timestamp,
+                                        Epoch, NextChId0, Crc, DataSize, TrailerSize,
+                                        FilterSize);
+                _ ->
+                    %% the data were from a previous read
+                    %% we can ditch them and try to read the chunk ahead
+                    need_more_data
+            end
     end;
 parse_header(_, _) ->
     need_more_data.
 
-read_ahead_size(LastDataSize) ->
+%% keep the previous value if the current one is 0 (i.e. no filter in the chunk)
+read_ahead_fsize(Previous, 0) ->
+    Previous;
+read_ahead_fsize(_, Current) ->
+    Current.
+
+read_ahead_size(LastDataSize, FilterSize) ->
     case LastDataSize =/= undefined andalso
          LastDataSize =< ?READ_AHEAD_LIMIT of
         true ->
             %% the previous chunk was small, try to read
             %% the next chunk fully in one read
             %% this can save us a system call later
-            ?DEFAULT_FILTER_SIZE + ?READ_AHEAD_LIMIT;
+            FilterSize + ?READ_AHEAD_LIMIT;
         false ->
             %% optimistically read the default filter size.
             %% this amounts to 64 bytes with the header (small binary)
@@ -3107,10 +3126,14 @@ maybe_return_header(#?MODULE{cfg = #cfg{counter = CntRef},
             read_header0(State#?MODULE{mode = Read})
     end.
 
--spec last_data_size(state(), non_neg_integer()) -> state().
-last_data_size(#?MODULE{mode = R = #read{}} = S, Lds) ->
-    S#?MODULE{mode = R#read{last_data_size = Lds}}.
+%% for testing
+-spec read_ahead_hints(state(), non_neg_integer(), osiris_bloom:filter_size()) ->
+    state().
+read_ahead_hints(#?MODULE{mode = R = #read{}} = S, Lds, FilterSize) ->
+    S#?MODULE{mode = R#read{last_data_size = Lds,
+                            filter_size = FilterSize}}.
 
+%% for testing
 -spec update_read(state(), offset(), offset(), non_neg_integer()) -> state().
 update_read(#?MODULE{mode = R0 = #read{}} = S, ChId, NumRecords, Pos) ->
     R = R0#read{next_offset = ChId + NumRecords,
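For reference, the two test hooks exported above are driven from the suite below. The snippet here is a sketch only, not part of the commit, showing typical usage against an already initialised reader state; the hint values (1-byte last chunk body, 16-byte filter) are illustrative.

%% Sketch of driving the test hooks; assumes osiris_log is on the code path
%% and Rd0 is an initialised reader state.
-module(read_ahead_hints_usage).
-export([read_with_hints/1]).

read_with_hints(Rd0) ->
    %% seed the hints: size of the last chunk body and last seen filter size
    Rd1 = osiris_log:read_ahead_hints(Rd0, 1, 16),
    {ok, Header, MaybeContent, Rd2} = osiris_log:read_header0(Rd1),
    %% MaybeContent is the chunk body when the read-ahead covered it,
    %% or undefined when only the header could be read
    {Header, MaybeContent, Rd2}.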

test/osiris_log_SUITE.erl

Lines changed: 81 additions & 20 deletions
@@ -1890,24 +1890,23 @@ read_header_ahead_offset_reader(Config) ->
             %% the messages are large enough to be larger than the default
             %% filter size which is always read ahead (16 bytes)
             {_, W1} = write_committed([<<"hiiiiiiiii">>, <<"hooooooo">>], W0),
-            ct:pal("R0 ~p", [R0]),
             {ok, H, Content, R1} = osiris_log:read_header0(R0),
             ?assertEqual(undefined, Content),
             {H, W1, R1}
     end,
-    fun(#{w := W0, r := R0}) ->
+    fun(#{w := W0, r := R0, fsize := FSize}) ->
             %% previous chunk too large to read ahead
-            R1 = osiris_log:last_data_size(R0, RAL * 2),
+            R1 = osiris_log:read_ahead_hints(R0, RAL * 2, FSize),
             {_, W1} = write_committed([<<"hiiiiiiiii">>, <<"hooooooo">>], W0),
             {ok, H, Content, R2} = osiris_log:read_header0(R1),
             ?assertEqual(undefined, Content),
             {H, W1, R2}
     end,
-    fun(#{w := W0, r := R0}) ->
+    fun(#{w := W0, r := R0, fsize := FSize}) ->
            %% trigger reading ahead by setting a small value for the
            %% last chunk read.
            %% this setting stays the same for the rest of the test
-           R1 = osiris_log:last_data_size(R0, 1),
+           R1 = osiris_log:read_ahead_hints(R0, 1, FSize),
            Entries = [<<"foo">>, <<"bar">>],
            {_, W1} = write_committed(Entries, W0),
            {ok, H, Content, R2} = osiris_log:read_header0(R1),
@@ -1941,10 +1940,13 @@ read_header_ahead_offset_reader(Config) ->
            {H, W1, R1}
     end,
     fun(#{w := W0, r := R0}) ->
-           Entries1 = [binary:copy(<<"a">>, 16)],
+           Entries1 = [binary:copy(<<"a">>, 2000)],
            {_, W1} = write_committed(Entries1, W0),
-           Entries2 = [binary:copy(<<"b">>, 32)],
+           Entries2 = [binary:copy(<<"b">>, 2000)],
            {_, W2} = write_committed(Entries2, W1),
+           %% this one is too big to be read ahead fully
+           Entries3 = [binary:copy(<<"c">>, 5000)],
+           {_, W3} = write_committed(Entries3, W2),
 
            {ok, H1, Content1, R1} = osiris_log:read_header0(R0),
            [_, _, D1, _] = fake_chunk(Entries1, ?LINE, 1, 100),
@@ -1953,7 +1955,11 @@ read_header_ahead_offset_reader(Config) ->
            {ok, H2, Content2, R2} = osiris_log:read_header0(update_read(H1, R1)),
            [_, _, D2, _] = fake_chunk(Entries2, ?LINE, 1, 100),
            ?assertEqual(iolist_to_binary(D2), Content2),
-           {H2, W2, R2}
+
+           {ok, H3, Content3, R3} = osiris_log:read_header0(update_read(H2, R2)),
+           ?assertEqual(undefined, Content3),
+
+           {H3, W3, R3}
     end
    ],
 
@@ -1972,7 +1978,7 @@ read_header_ahead_offset_reader(Config) ->
                               FSize, Wr0, Rd0),
                  osiris_log:close(Rd1),
                  osiris_log:close(Wr1)
-         end, [{FSize, RType} || FSize <- FilterSizes, RType <- [offset, data]]),
+         end, [{FSize, RType} || FSize <- FilterSizes, RType <- [data, offset]]),
    ok.
 
 read_header_ahead_offset_reader_filter(Config) ->
@@ -1992,30 +1998,85 @@ read_header_ahead_offset_reader_filter(Config) ->
    Shared = osiris_log:get_shared(Wr0),
    Conf = Conf1#{shared => Shared},
    {ok, Rd0} = osiris_log:init_offset_reader(first, Conf),
-   Rd1 = osiris_log:last_data_size(Rd0, 1),
-   %% we always read ahead the default filter size.
-   %% with a larger-than-default filter, we must consider
-   %% the extra bytes that belong to the filter,
-   %% that is (actual filter size) - (default filter size)
-   %% this reduces the max entry size we can read ahead
-   MES = MaxEntrySize - (FSize - DFS),
+   %% we start by using the default filter size in the read ahead hints
+   Rd1 = osiris_log:read_ahead_hints(Rd0, 1, DFS),
+   %% compute the max entry size
+   %% (meaning we don't read ahead enough above this entry size)
+   %% first we don't know the actual filter size in the stream,
+   %% so we assume the default filter size
+   %% this "reduces" the max size of data we can read in the case
+   %% of a larger-than-default filter size, because of the extra
+   %% bytes that belong to the filter
+   MES1 = MaxEntrySize - (FSize - DFS),
+   %% then the max entry becomes accurate, whatever the actual filter size
+   MES2 = MaxEntrySize,
 
    Tests =
    [
     fun(#{w := W0, r := R0}) ->
-           %% chunk with a non-empty filter
            %% data do not fit in the read ahead
-           EData = binary:copy(<<"a">>, MES + 1),
+           EData = binary:copy(<<"a">>, MES1 + 1),
+           Entries = [{<<"banana">>, EData}],
+           {_, W1} = write_committed(Entries, W0),
+           {ok, H, Content, R1} = osiris_log:read_header0(R0),
+           ?assertEqual(undefined, Content),
+           {H, W1, R1}
+    end,
+    fun(#{w := W0, r := R0}) ->
+           %% data exactly fits in the read ahead
+           EData = binary:copy(<<"a">>, MES1),
+           Entries = [{<<"banana">>, EData}],
+           {_, W1} = write_committed(Entries, W0),
+           {ok, H, Content, R1} = osiris_log:read_header0(R0),
+           [_, _, D, _] = fake_chunk(Entries, ?LINE, 1, 100),
+           ?assertEqual(iolist_to_binary(D), Content),
+           {H, W1, R1}
+    end,
+    fun(#{w := W0, r := R}) ->
+           %% assume we are now using the correct filter size
+           %% (this setting stays the same for the next tests)
+           R0 = osiris_log:read_ahead_hints(R, 1, FSize),
+           %% data just bigger than the first limit
+           EData = binary:copy(<<"a">>, MES1 + 1),
+           Entries = [{<<"banana">>, EData}],
+           {_, W1} = write_committed(Entries, W0),
+           {ok, H, Content, R1} = osiris_log:read_header0(R0),
+           case FSize =:= DFS of
+               true ->
+                   %% default filter size: still does not fit
+                   ?assertEqual(undefined, Content);
+               false ->
+                   %% with the correct filter size, we now read
+                   %% a bit further than with the first limit
+                   [_, _, D, _] = fake_chunk(Entries, ?LINE, 1, 100),
+                   ?assertEqual(iolist_to_binary(D), Content)
+           end,
+           {H, W1, R1}
+    end,
+    fun(#{w := W0, r := R0}) ->
+           %% data exactly fits in the read ahead
+           EData = binary:copy(<<"a">>, MES1),
+           Entries = [{<<"banana">>, EData}],
+           {_, W1} = write_committed(Entries, W0),
+           {ok, H, Content, R1} = osiris_log:read_header0(R0),
+           [_, _, D, _] = fake_chunk(Entries, ?LINE, 1, 100),
+           ?assertEqual(iolist_to_binary(D), Content),
+           {H, W1, R1}
+    end,
+    fun(#{w := W0, r := R0}) ->
+           %% we use the "new" max entry size
+           %% data do not fit in the read ahead
+           EData = binary:copy(<<"a">>, MES2 + 1),
            Entries = [{<<"banana">>, EData}],
            {_, W1} = write_committed(Entries, W0),
            {ok, H, Content, R1} = osiris_log:read_header0(R0),
            ?assertEqual(undefined, Content),
            {H, W1, R1}
    end,
    fun(#{w := W0, r := R0}) ->
-           %% chunk with a non-empty filter
+           %% we use the "new" max entry size
            %% data exactly fits in the read ahead
-           EData = binary:copy(<<"a">>, MES),
+           EData = binary:copy(<<"a">>, MES2),
            Entries = [{<<"banana">>, EData}],
            {_, W1} = write_committed(Entries, W0),
            {ok, H, Content, R1} = osiris_log:read_header0(R0),
