Merge pull request tursodatabase#1777 from tursodatabase/libsql-wal-replicate-from-db-file

libsql wal replicate from db file
MarinPostma authored Oct 9, 2024
2 parents 21ae561 + 18e7c0c commit c050655
Showing 7 changed files with 184 additions and 96 deletions.
11 changes: 11 additions & 0 deletions libsql-wal/src/io/buf.rs
@@ -154,6 +154,17 @@ impl<T> ZeroCopyBoxIoBuf<T> {
Self { init: 0, inner }
}

/// same as new_uninit, but partially fills the buffer starting at offset
///
/// # Safety: The caller must ensure that the remaining bytes are initialized
pub unsafe fn new_uninit_partial(inner: Box<T>, offset: usize) -> Self {
assert!(offset < size_of::<T>());
Self {
inner,
init: offset,
}
}

fn is_init(&self) -> bool {
self.init == size_of::<T>()
}
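The new constructor treats the first `offset` bytes of the boxed value as already counted toward initialization, so a subsequent read only has to fill the tail. A minimal, self-contained sketch of that bookkeeping; `PartialBuf` and its methods are made-up stand-ins, not the crate's `ZeroCopyBoxIoBuf` API:

```rust
// Hypothetical stand-in for the partial-init bookkeeping; illustrative only.
struct PartialBuf {
    data: Box<[u8; 4096]>,
    /// how many leading bytes are already counted as initialized
    init: usize,
}

impl PartialBuf {
    /// Analogue of `new_uninit_partial`: start with the first `offset`
    /// bytes counted as initialized, the rest still to be filled.
    fn new_partial(data: Box<[u8; 4096]>, offset: usize) -> Self {
        assert!(offset < data.len());
        Self { data, init: offset }
    }

    /// The tail a read should fill next.
    fn unfilled_tail(&mut self) -> &mut [u8] {
        let init = self.init;
        &mut self.data[init..]
    }

    fn is_init(&self) -> bool {
        self.init == self.data.len()
    }
}

fn main() {
    let mut buf = PartialBuf::new_partial(Box::new([0u8; 4096]), 64);
    assert!(!buf.is_init());
    // pretend a read filled the tail, then advance the watermark
    let filled = buf.unfilled_tail().len();
    buf.init += filled;
    assert!(buf.is_init());
}
```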
22 changes: 12 additions & 10 deletions libsql-wal/src/io/file.rs
@@ -193,16 +193,17 @@ impl FileExt for File {
let (buffer, ret) = tokio::task::spawn_blocking(move || {
// let mut read = 0;

let len = buf.bytes_total();
let init = buf.bytes_init();
let chunk = unsafe {
let len = buf.bytes_total();
let ptr = buf.stable_mut_ptr();
std::slice::from_raw_parts_mut(ptr, len)
let ptr = buf.stable_mut_ptr().offset(init as _);
std::slice::from_raw_parts_mut(ptr, len - init)
};

let ret = file.read_exact_at(chunk, offset);
if ret.is_ok() {
unsafe {
buf.set_init(buf.bytes_total());
buf.set_init(init + chunk.len());
}
}
(buf, ret)
@@ -222,16 +223,17 @@ impl FileExt for File {
let (buffer, ret) = tokio::task::spawn_blocking(move || {
// let mut read = 0;

let len = buf.bytes_total();
let init = buf.bytes_init();
let chunk = unsafe {
let len = buf.bytes_total();
let ptr = buf.stable_mut_ptr();
std::slice::from_raw_parts_mut(ptr, len)
let ptr = buf.stable_mut_ptr().offset(init as _);
std::slice::from_raw_parts_mut(ptr, len - init)
};

let ret = file.read_at(chunk, offset);
if let Ok(n) = ret {
unsafe {
buf.set_init(n);
buf.set_init(init + n);
}
}
(buf, ret)
@@ -358,13 +360,13 @@ mod test {
file.write_all(&[1; 12345]).unwrap();
file.write_all(&[2; 50]).unwrap();

let buf = vec![0u8; 12345];
let buf = Vec::with_capacity(12345);
let (buf, ret) = file.read_exact_at_async(buf, 0).await;
ret.unwrap();
assert_eq!(buf.len(), 12345);
assert!(buf.iter().all(|x| *x == 1));

let buf = vec![2u8; 50];
let buf = Vec::with_capacity(50);
let (buf, ret) = file.read_exact_at_async(buf, 12345).await;
ret.unwrap();
assert_eq!(buf.len(), 50);
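The read paths above now start the positional read at `bytes_init()` and advance the initialization watermark by exactly what was filled, instead of assuming the whole buffer was read. A rough sketch of the same pattern against std's Unix positional reads; `TrackedBuf` and `read_tail_exact` are hypothetical names, not the crate's types:

```rust
use std::fs::File;
use std::io;
use std::os::unix::fs::FileExt as _; // provides read_exact_at on Unix

// Hypothetical buffer that tracks how many leading bytes are initialized.
struct TrackedBuf {
    data: Vec<u8>,
    init: usize,
}

/// Read only the not-yet-initialized tail of `buf` at `offset`,
/// then bump the watermark by what was filled.
fn read_tail_exact(file: &File, buf: &mut TrackedBuf, offset: u64) -> io::Result<()> {
    let init = buf.init;
    let tail = &mut buf.data[init..];
    let filled = tail.len();
    file.read_exact_at(tail, offset)?;
    buf.init = init + filled;
    Ok(())
}

fn main() -> io::Result<()> {
    // /dev/zero always satisfies positional reads, which keeps the demo portable
    let file = File::open("/dev/zero")?;
    let mut buf = TrackedBuf { data: vec![0u8; 8], init: 4 };
    read_tail_exact(&file, &mut buf, 0)?;
    assert_eq!(buf.init, buf.data.len());
    Ok(())
}
```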
38 changes: 24 additions & 14 deletions libsql-wal/src/replication/replicator.rs
@@ -1,8 +1,9 @@
use std::pin::Pin;
use std::sync::Arc;

use roaring::RoaringBitmap;
use tokio::sync::watch;
use tokio_stream::{Stream, StreamExt};
use tokio_stream::{Stream, StreamExt as _};

use crate::io::Io;
use crate::replication::Error;
@@ -58,13 +59,13 @@ impl<IO: Io> Replicator<IO> {
tracing::debug!(most_recent_frame_no, "new frame_no available");

let mut commit_frame_no = 0;
let tx = self.shared.begin_read(u64::MAX);
// we have stuff to replicate
if most_recent_frame_no >= self.next_frame_no {
// first replicate the most recent version of each page from the current
// segment. We also return how far back we have replicated from the current log
let current = self.shared.current.load();
let mut seen = RoaringBitmap::new();
let (stream, replicated_until, size_after) = current.frame_stream_from(self.next_frame_no, &mut seen);
let (stream, replicated_until) = tx.current.frame_stream_from(self.next_frame_no, &mut seen, &tx);
let should_replicate_from_tail = replicated_until != self.next_frame_no;

{
@@ -78,7 +79,7 @@ impl<IO: Io> Replicator<IO> {
let mut frame = frame.map_err(|e| Error::CurrentSegment(e.into()))?;
commit_frame_no = frame.header().frame_no().max(commit_frame_no);
if stream.peek().await.is_none() && !should_replicate_from_tail {
frame.header_mut().set_size_after(size_after);
frame.header_mut().set_size_after(tx.db_size);
self.next_frame_no = commit_frame_no + 1;
}

@@ -90,9 +91,9 @@ impl<IO: Io> Replicator<IO> {
// we need to take frames from the sealed segments.
if should_replicate_from_tail {
let replicated_until = {
let (stream, replicated_until) = current
let (stream, replicated_until) = tx.current
.tail()
.stream_pages_from(replicated_until, self.next_frame_no, &mut seen).await;
.stream_pages_from(replicated_until, self.next_frame_no, &mut seen, &tx).await;
tokio::pin!(stream);

tracing::debug!(replicated_until, "replicating from tail");
@@ -105,7 +106,7 @@ impl<IO: Io> Replicator<IO> {
let mut frame = frame.map_err(|e| Error::SealedSegment(e.into()))?;
commit_frame_no = frame.header().frame_no().max(commit_frame_no);
if stream.peek().await.is_none() && !should_replicate_from_storage {
frame.header_mut().set_size_after(size_after);
frame.header_mut().set_size_after(tx.db_size);
self.next_frame_no = commit_frame_no + 1;
}

@@ -118,12 +119,21 @@ impl<IO: Io> Replicator<IO> {
// Replicating from sealed segments was not enough, so we replicate from
// durable storage
if let Some(replicated_until) = replicated_until {
tracing::debug!("replicating from durable storage");
let stream = self
.shared
.stored_segments
.stream(&mut seen, replicated_until, self.next_frame_no)
.peekable();
let stream: Pin<Box<dyn Stream<Item = _> + Send>> = if self.next_frame_no == 1 {
// we're replicating from scratch, read straight from the main db
// file
tracing::debug!("replicating main db file");
Box::pin(self.shared.replicate_from_db_file(&mut seen, &tx, replicated_until))
} else {
tracing::debug!("replicating from durable storage");
Box::pin(self
.shared
.stored_segments
.stream(&mut seen, replicated_until, self.next_frame_no)
.peekable())
};

let stream = stream.peekable();

tokio::pin!(stream);

@@ -132,7 +142,7 @@ impl<IO: Io> Replicator<IO> {
let mut frame = frame?;
commit_frame_no = frame.header().frame_no().max(commit_frame_no);
if stream.peek().await.is_none() {
frame.header_mut().set_size_after(size_after);
frame.header_mut().set_size_after(tx.db_size);
self.next_frame_no = commit_frame_no + 1;
}

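The replicator change boxes the frame source so that a replica starting from scratch (`next_frame_no == 1`) streams straight from the main db file, while anything else keeps walking durable storage. A toy version of that dispatch, with `Frame` reduced to an integer and both sources faked with `tokio_stream::iter`; names are illustrative, and it assumes the `tokio` and `tokio-stream` crates:

```rust
use std::pin::Pin;
use tokio_stream::{Stream, StreamExt as _};

// Stand-in for the real frame type.
type Frame = u64;

/// Pick the replication source the way the replicator now does:
/// a brand-new replica reads the main db file, everyone else
/// replays frames from the durable segment store.
fn frame_source(next_frame_no: u64) -> Pin<Box<dyn Stream<Item = Frame> + Send>> {
    if next_frame_no == 1 {
        // replicating from scratch: stream straight from the db file
        Box::pin(tokio_stream::iter(vec![1u64, 2, 3]))
    } else {
        // otherwise replay frames stored in durable storage
        Box::pin(tokio_stream::iter(vec![42u64, 43]))
    }
}

#[tokio::main]
async fn main() {
    let mut stream = frame_source(1);
    while let Some(frame) = stream.next().await {
        println!("replicating frame {frame}");
    }
}
```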
1 change: 1 addition & 0 deletions libsql-wal/src/replication/storage.rs
@@ -61,6 +61,7 @@
let segment = match maybe_seg {
Some(ref seg) => seg,
None => {
tracing::debug!(key = %key, "fetching segment");
maybe_seg = Some(storage.fetch_segment_data(&namespace, &key, None).await?);
maybe_seg.as_ref().unwrap()
},
63 changes: 33 additions & 30 deletions libsql-wal/src/segment/current.rs
@@ -24,7 +24,7 @@ use crate::io::file::FileExt;
use crate::io::Inspect;
use crate::segment::{checked_frame_offset, SegmentFlags};
use crate::segment::{frame_offset, page_offset, sealed::SealedSegment};
use crate::transaction::{Transaction, TxGuardOwned, TxGuardShared};
use crate::transaction::{ReadTransaction, Transaction, TxGuardOwned, TxGuardShared};
use crate::{LIBSQL_MAGIC, LIBSQL_PAGE_SIZE, LIBSQL_WAL_VERSION};

use super::list::SegmentList;
@@ -507,24 +507,25 @@ impl<F> CurrentSegment<F> {
&'a self,
start_frame_no: u64,
seen: &'a mut RoaringBitmap,
) -> (impl Stream<Item = Result<Box<Frame>>> + 'a, u64, u32)
// not actually used, but ensures that a read lock is held while this method is called
tx: &'a ReadTransaction<F>,
) -> (impl Stream<Item = Result<Box<Frame>>> + 'a, u64)
where
F: FileExt,
{
let (seg_start_frame_no, last_committed, db_size) =
self.with_header(|h| (h.start_frame_no.get(), h.last_committed(), h.size_after()));
let seg_start_frame_no = tx.current.with_header(|h| h.start_frame_no.get());
let replicated_until = seg_start_frame_no
// if current is empty, start_frame_no doesn't exist
.min(last_committed)
.min(tx.max_frame_no)
.max(start_frame_no);

// TODO: optim, we could read fewer frames if we had a mapping from frame_no to page_no in
// the index
let stream = async_stream::try_stream! {
if !self.is_empty() {
let mut frame_offset = (last_committed - seg_start_frame_no) as u32;
let mut frame_offset = (tx.max_frame_no - seg_start_frame_no) as u32;
loop {
let buf = ZeroCopyBoxIoBuf::new(Frame::new_box_zeroed());
let buf = ZeroCopyBoxIoBuf::new_uninit(Frame::new_box_zeroed());
let (buf, res) = self.read_frame_offset_async(frame_offset, buf).await;
res?;

@@ -551,7 +552,7 @@ impl<F> CurrentSegment<F> {
}
};

(stream, replicated_until, db_size)
(stream, replicated_until)
}

fn recompute_checksum(&self, start_offset: u32, until_offset: u32) -> Result<u32>
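With a read transaction supplied, the stream's starting point is clamped by `tx.max_frame_no` rather than the header's `last_committed`. The clamp itself is plain min/max arithmetic; a small standalone check of it, under assumed frame numbers:

```rust
/// Starting point for current-segment replication: the segment's first
/// frame, clamped down to the read snapshot and up to the caller's position.
fn replicated_until(seg_start_frame_no: u64, tx_max_frame_no: u64, start_frame_no: u64) -> u64 {
    seg_start_frame_no.min(tx_max_frame_no).max(start_frame_no)
}

fn main() {
    // segment starts at 50, snapshot sees up to 120, replica asks from 1: start at 50
    assert_eq!(replicated_until(50, 120, 1), 50);
    // replica already has frames up to 59, so it asks from 60: start there
    assert_eq!(replicated_until(50, 120, 60), 60);
    // segment whose start lies beyond the snapshot: clamp to the snapshot
    assert_eq!(replicated_until(130, 120, 1), 120);
}
```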
@@ -714,18 +715,20 @@ mod test {
.unwrap();
}

let mut seen = RoaringBitmap::new();
let current = shared.current.load();
let (stream, replicated_until, size_after) = current.frame_stream_from(1, &mut seen);
tokio::pin!(stream);
assert_eq!(replicated_until, 1);
assert_eq!(size_after, 6);

let mut tmp = tempfile().unwrap();
while let Some(frame) = stream.next().await {
let frame = frame.unwrap();
let offset = (frame.header().page_no() - 1) * 4096;
tmp.write_all_at(frame.data(), offset as _).unwrap();
{
let tx = shared.begin_read(u64::MAX);
let mut seen = RoaringBitmap::new();
let (stream, replicated_until) = tx.current.frame_stream_from(1, &mut seen, &tx);
tokio::pin!(stream);
assert_eq!(replicated_until, 1);
assert_eq!(tx.db_size, 6);

while let Some(frame) = stream.next().await {
let frame = frame.unwrap();
let offset = (frame.header().page_no() - 1) * 4096;
tmp.write_all_at(frame.data(), offset as _).unwrap();
}
}

seal_current_segment(&shared);
@@ -768,11 +771,11 @@ mod test {

let mut seen = RoaringBitmap::new();
{
let current = shared.current.load();
let (stream, replicated_until, size_after) = current.frame_stream_from(1, &mut seen);
let tx = shared.begin_read(u64::MAX);
let (stream, replicated_until) = tx.current.frame_stream_from(1, &mut seen, &tx);
tokio::pin!(stream);
assert_eq!(replicated_until, 60);
assert_eq!(size_after, 9);
assert_eq!(tx.db_size, 9);
assert_eq!(stream.fold(0, |count, _| count + 1).await, 6);
}
assert_debug_snapshot!(seen);
@@ -787,12 +790,12 @@ mod test {
conn.execute("create table test (x)", ()).unwrap();

let mut seen = RoaringBitmap::new();
let current = shared.current.load();
let (stream, replicated_until, size_after) = current.frame_stream_from(100, &mut seen);
let tx = shared.begin_read(u64::MAX);
let (stream, replicated_until) = tx.current.frame_stream_from(100, &mut seen, &tx);
tokio::pin!(stream);
assert_eq!(replicated_until, 100);
assert_eq!(stream.fold(0, |count, _| count + 1).await, 0);
assert_eq!(size_after, 2);
assert_eq!(tx.db_size, 2);
}

#[tokio::test]
@@ -805,11 +808,11 @@ mod test {
seal_current_segment(&shared);

let mut seen = RoaringBitmap::new();
let current = shared.current.load();
let (stream, replicated_until, size_after) = current.frame_stream_from(1, &mut seen);
let tx = shared.begin_read(u64::MAX);
let (stream, replicated_until) = tx.current.frame_stream_from(1, &mut seen, &tx);
tokio::pin!(stream);
assert_eq!(replicated_until, 2);
assert_eq!(size_after, 2);
assert_eq!(tx.db_size, 2);
assert_eq!(stream.fold(0, |count, _| count + 1).await, 0);
}

@@ -1014,8 +1017,8 @@ mod test {
}
}

fn db_payload(db: &[u8]) -> &[u8] {
fn db_payload(db: &[u8]) -> u32 {
let size = (db.len() / 4096) * 4096;
&db[..size]
crc32fast::hash(&db[..size])
}
}
