Skip to content

Commit

Permalink
extsort: increased performance
Browse files Browse the repository at this point in the history
- use 10% of total memory for in-memory sorting when it can be detected, otherwise fall back to 100 MB
- use a 1 MB read/write buffer when accessing chunks in the tmp dir
  • Loading branch information
jqnatividad committed May 7, 2022
1 parent 762fd35 commit e2f013f
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 4 deletions.
22 changes: 22 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ self_update = { version = "0.30", features = [
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1.0", features = ["preserve_order"] }
strsim = { version = "0.10", optional = true }
sysinfo = "0.23"
tabwriter = "1.2"
test-data-generation = { version = "0.3", optional = true }
thiserror = { version = "1.0", optional = true }
Expand Down
21 changes: 17 additions & 4 deletions src/cmd/extsort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::util;
use crate::CliResult;
use ext_sort::{buffer::mem::MemoryLimitedBufferBuilder, ExternalSorter, ExternalSorterBuilder};
use serde::Deserialize;
use sysinfo::{System, SystemExt};

static USAGE: &str = "
Sort an arbitrarily large CSV/text file using a multithreaded external sort algorithm.
Expand Down Expand Up @@ -40,19 +41,31 @@ struct Args {
flag_jobs: Option<usize>,
flag_no_headers: bool,
}
// Buffer to use for sorting in memory;
// if the file is larger, additional sorting is done in tmp_dir.

/// Fallback size of the in-memory sort buffer (100 MB), used by `run` when
/// total system memory cannot be detected.
const MEMORY_LIMITED_BUFFER: u64 = 100_000_000;

/// Read/write buffer size handed to the external sorter:
/// 1000 * 1024 = 1_024_000 bytes, i.e. roughly 1 MB.
const RW_BUFFER_CAPACITY: usize = 1_000 << 10;

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;

// buffer to use for sorting in memory,
// use 10% of total memory if we can detect it, otherwise
// set it to MEMORY_LIMITED_BUFFER
let mem_limited_buffer = if System::IS_SUPPORTED {
let mut sys = System::new_all();
sys.refresh_memory();
(sys.total_memory() * 1000) / 10 // 10 percent of total memory
} else {
MEMORY_LIMITED_BUFFER
};

let mut input_reader = io::BufReader::new(fs::File::open(&args.arg_input)?);

let sorter: ExternalSorter<String, io::Error, MemoryLimitedBufferBuilder> =
ExternalSorterBuilder::new()
.with_tmp_dir(path::Path::new("./"))
.with_buffer(MemoryLimitedBufferBuilder::new(MEMORY_LIMITED_BUFFER))
.with_buffer(MemoryLimitedBufferBuilder::new(mem_limited_buffer))
.with_rw_buf_size(RW_BUFFER_CAPACITY)
.with_threads_number(util::njobs(args.flag_jobs))
.build()
.unwrap();
Expand All @@ -72,6 +85,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
for item in sorted.map(Result::unwrap) {
output_writer.write_all(format!("{item}\n").as_bytes())?;
}
output_writer.flush().unwrap();
output_writer.flush()?;
Ok(())
}

0 comments on commit e2f013f

Please sign in to comment.