Skip to content

Commit

Permalink
sort: refactor --ignore-case option
Browse files Browse the repository at this point in the history
- use the same fn that dedup uses (iter_cmp_ignore_case) for case-insensitive comparison
- move the transform helper fn back to join, as sort no longer needs it with the dedup-based iter_cmp_ignore_case
- also, we don't need checkutf8 for the sort function, as we use ByteRecords
  • Loading branch information
jqnatividad committed Dec 20, 2022
1 parent 589af43 commit d9a372e
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 54 deletions.
31 changes: 23 additions & 8 deletions src/cmd/join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ use crate::{
config::{Config, Delimiter, SeekRead},
index::Indexed,
select::{SelectColumns, Selection},
util,
util::ByteString,
CliResult,
util, CliResult,
};

#[derive(Deserialize)]
Expand Down Expand Up @@ -408,10 +406,7 @@ impl<R: io::Read + io::Seek> ValueIndex<R> {
// indexes in one pass.
row_idx.write_u64::<BigEndian>(row.position().unwrap().byte())?;

let fields: Vec<_> = sel
.select(&row)
.map(|v| util::transform(v, casei))
.collect();
let fields: Vec<_> = sel.select(&row).map(|v| transform(v, casei)).collect();
if nulls || !fields.iter().any(std::vec::Vec::is_empty) {
match val_idx.entry(fields) {
Entry::Vacant(v) => {
Expand Down Expand Up @@ -457,5 +452,25 @@ impl<R> fmt::Debug for ValueIndex<R> {

/// Builds the key for `row`: every field picked by `sel` is passed through
/// `transform` together with the `casei` flag, and the results are collected
/// into a vector of byte strings.
#[inline]
fn get_row_key(sel: &Selection, row: &csv::ByteRecord, casei: bool) -> Vec<ByteString> {
sel.select(row).map(|v| util::transform(v, casei)).collect()
sel.select(row).map(|v| transform(v, casei)).collect()
}

/// Owned byte buffer; the value type produced by [`transform`].
pub type ByteString = Vec<u8>;

/// Normalizes a raw field into an owned byte string.
///
/// * Valid UTF-8 input is trimmed of leading/trailing whitespace.
/// * When `casei` is `true`, the trimmed text is additionally lowercased
///   character by character; only the first `char` of each character's
///   lowercase mapping is kept.
/// * Input that is not valid UTF-8 is copied verbatim (no trimming, no
///   lowercasing), regardless of `casei`.
#[inline]
pub fn transform(bs: &[u8], casei: bool) -> ByteString {
    match str::from_utf8(bs) {
        // Not text we can interpret: hand the bytes back untouched.
        Err(_) => bs.to_vec(),
        Ok(text) => {
            let trimmed = text.trim();
            if !casei {
                return trimmed.as_bytes().to_vec();
            }
            let mut lowered = String::with_capacity(trimmed.len());
            // `to_lowercase` always yields at least one char; keep the first only.
            lowered.extend(trimmed.chars().filter_map(|ch| ch.to_lowercase().next()));
            lowered.into_bytes()
        }
    }
}
30 changes: 4 additions & 26 deletions src/cmd/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use serde::Deserialize;

use self::Number::{Float, Int};
use crate::{
cmd::dedup::iter_cmp_ignore_case,
config::{Config, Delimiter},
select::SelectColumns,
util, CliResult,
Expand Down Expand Up @@ -75,6 +76,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers)
.checkutf8(false)
.select(args.flag_select);

let mut rdr = rconfig.reader()?;
Expand Down Expand Up @@ -105,7 +107,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let a = sel.select(r1);
let b = sel.select(r2);
if ignore_case {
iter_cmp_case_insensitive(a, b)
iter_cmp_ignore_case(a, b)
} else {
iter_cmp(a, b)
}
Expand All @@ -119,7 +121,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let a = sel.select(r1);
let b = sel.select(r2);
if ignore_case {
iter_cmp_case_insensitive(b, a)
iter_cmp_ignore_case(b, a)
} else {
iter_cmp(b, a)
}
Expand Down Expand Up @@ -177,30 +179,6 @@ where
}
}

/// Compares the field sequences `a` and `b` lexicographically, ignoring case.
///
/// Fields are compared pairwise after each has been passed through
/// `util::transform` with the case-insensitive flag set; the first unequal
/// pair decides the result. If one sequence ends first, the sequence that
/// ran out orders before the other; two sequences that end together with
/// all pairs equal are `Equal`.
#[inline]
pub fn iter_cmp_case_insensitive<'a, L, R>(mut a: L, mut b: R) -> cmp::Ordering
where
    L: Iterator<Item = &'a [u8]>,
    R: Iterator<Item = &'a [u8]>,
{
    loop {
        // Settle the length-mismatch cases first, then fall through with a pair.
        let (lhs, rhs) = match (a.next(), b.next()) {
            (None, None) => return cmp::Ordering::Equal,
            (None, Some(_)) => return cmp::Ordering::Less,
            (Some(_), None) => return cmp::Ordering::Greater,
            (Some(x), Some(y)) => (x, y),
        };
        let ordering = util::transform(lhs, true).cmp(&util::transform(rhs, true));
        if ordering != cmp::Ordering::Equal {
            return ordering;
        }
    }
}

/// Try parsing `a` and `b` as numbers when ordering
#[inline]
pub fn iter_cmp_num<'a, L, R>(mut a: L, mut b: R) -> cmp::Ordering
Expand Down
20 changes: 0 additions & 20 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -892,23 +892,3 @@ impl ColumnNameParser {
name
}
}

/// Owned byte buffer; the value type produced by [`transform`].
pub type ByteString = Vec<u8>;

/// Normalizes a raw field into an owned byte string.
///
/// * Valid UTF-8 input is trimmed of leading/trailing whitespace.
/// * When `casei` is `true`, the trimmed text is additionally lowercased
///   character by character; only the first `char` of each character's
///   lowercase mapping is kept.
/// * Input that is not valid UTF-8 is copied verbatim (no trimming, no
///   lowercasing), regardless of `casei`.
#[inline]
pub fn transform(bs: &[u8], casei: bool) -> ByteString {
    match str::from_utf8(bs) {
        // Not text we can interpret: hand the bytes back untouched.
        Err(_) => bs.to_vec(),
        Ok(text) => {
            let trimmed = text.trim();
            if !casei {
                return trimmed.as_bytes().to_vec();
            }
            let mut lowered = String::with_capacity(trimmed.len());
            // `to_lowercase` always yields at least one char; keep the first only.
            lowered.extend(trimmed.chars().filter_map(|ch| ch.to_lowercase().next()));
            lowered.into_bytes()
        }
    }
}

0 comments on commit d9a372e

Please sign in to comment.