Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 173 additions & 23 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html

// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef localeconv

mod buffer_hint;
mod check;
Expand Down Expand Up @@ -57,6 +57,11 @@ use uucore::version_cmp::version_cmp;
use crate::buffer_hint::automatic_buffer_size;
use crate::tmp_dir::TmpDirWrapper;

#[cfg(unix)]
use nix::libc;
#[cfg(unix)]
use std::ffi::CStr;

mod options {
pub mod modes {
pub const SORT: &str = "sort";
Expand Down Expand Up @@ -294,9 +299,35 @@ pub struct GlobalSettings {
buffer_size_is_explicit: bool,
compress_prog: Option<String>,
merge_batch_size: usize,
numeric_locale: NumericLocaleSettings,
precomputed: Precomputed,
}

/// Locale-dependent punctuation used when parsing numeric sort keys.
///
/// Each field holds a single byte, or `None` when no such character is configured.
#[derive(Clone, Copy, Debug)]
struct NumericLocaleSettings {
    /// Digit-grouping (thousands) separator byte, if any.
    thousands_sep: Option<u8>,
    /// Decimal point byte, if any.
    decimal_pt: Option<u8>,
}

impl Default for NumericLocaleSettings {
    /// Returns settings with no thousands separator and `DECIMAL_PT` as the decimal point.
    fn default() -> Self {
        let decimal_pt = Some(DECIMAL_PT);
        Self {
            decimal_pt,
            thousands_sep: None,
        }
    }
}

impl NumericLocaleSettings {
    /// Builds the `NumInfoParseSettings` for one key: the locale's separator and decimal
    /// point are copied over, and `accept_si_units` is forwarded unchanged.
    fn num_info_settings(&self, accept_si_units: bool) -> NumInfoParseSettings {
        // The struct is `Copy`, so destructuring through the reference is free.
        let Self {
            thousands_sep,
            decimal_pt,
        } = *self;
        NumInfoParseSettings {
            accept_si_units,
            thousands_separator: thousands_sep,
            decimal_pt,
        }
    }
}

/// Data needed for sorting. Should be computed once before starting to sort
/// by calling `GlobalSettings::init_precomputed`.
#[derive(Clone, Debug, Default)]
Expand All @@ -307,6 +338,8 @@ struct Precomputed {
selections_per_line: usize,
fast_lexicographic: bool,
fast_ascii_insensitive: bool,
tokenize_blank_thousands_sep: bool,
tokenize_allow_unit_after_blank: bool,
}

impl GlobalSettings {
Expand Down Expand Up @@ -348,6 +381,20 @@ impl GlobalSettings {
.filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric))
.count();

let uses_numeric = self
.selectors
.iter()
.any(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric));
let uses_human_numeric = self
.selectors
.iter()
.any(|s| matches!(s.settings.mode, SortMode::HumanNumeric));
self.precomputed.tokenize_blank_thousands_sep = self.separator.is_none()
&& uses_numeric
&& self.numeric_locale.thousands_sep == Some(b' ');
self.precomputed.tokenize_allow_unit_after_blank =
self.precomputed.tokenize_blank_thousands_sep && uses_human_numeric;

self.precomputed.fast_lexicographic = self.can_use_fast_lexicographic();
self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive();
}
Expand Down Expand Up @@ -415,6 +462,7 @@ impl Default for GlobalSettings {
buffer_size_is_explicit: false,
compress_prog: None,
merge_batch_size: default_merge_batch_size(),
numeric_locale: NumericLocaleSettings::default(),
precomputed: Precomputed::default(),
}
}
Expand Down Expand Up @@ -524,7 +572,12 @@ impl<'a> Line<'a> {
) -> Self {
token_buffer.clear();
if settings.precomputed.needs_tokens {
tokenize(line, settings.separator, token_buffer);
tokenize(
line,
settings.separator,
token_buffer,
&settings.precomputed,
);
}
if settings.mode == SortMode::Numeric {
// exclude inf, nan, scientific notation
Expand All @@ -534,11 +587,12 @@ impl<'a> Line<'a> {
.and_then(|s| s.parse::<f64>().ok());
line_data.line_num_floats.push(line_num_float);
}
for (selector, selection) in settings
.selectors
.iter()
.map(|selector| (selector, selector.get_selection(line, token_buffer)))
{
for (selector, selection) in settings.selectors.iter().map(|selector| {
(
selector,
selector.get_selection(line, token_buffer, &settings.numeric_locale),
)
}) {
match selection {
Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float),
Selection::WithNumInfo(str, num_info) => {
Expand Down Expand Up @@ -587,18 +641,22 @@ impl<'a> Line<'a> {
writeln!(writer)?;

let mut fields = vec![];
tokenize(self.line, settings.separator, &mut fields);
tokenize(
self.line,
settings.separator,
&mut fields,
&settings.precomputed,
);
for selector in &settings.selectors {
let mut selection = selector.get_range(self.line, Some(&fields));
match selector.settings.mode {
SortMode::Numeric | SortMode::HumanNumeric => {
// find out which range is used for numeric comparisons
let (_, num_range) = NumInfo::parse(
&self.line[selection.clone()],
&NumInfoParseSettings {
accept_si_units: selector.settings.mode == SortMode::HumanNumeric,
..Default::default()
},
&settings
.numeric_locale
.num_info_settings(selector.settings.mode == SortMode::HumanNumeric),
);
let initial_selection = selection.clone();

Expand Down Expand Up @@ -716,24 +774,50 @@ impl<'a> Line<'a> {
}

/// Tokenize a line into fields. The result is stored into `token_buffer`.
///
/// With `Some(separator)` the work is delegated to `tokenize_with_separator`;
/// otherwise `tokenize_default` is used and receives the two tokenizer flags
/// stored in `precomputed`.
///
/// # Panics
///
/// Panics if `token_buffer` is not empty on entry.
fn tokenize(line: &[u8], separator: Option<u8>, token_buffer: &mut Vec<Field>) {
fn tokenize(
line: &[u8],
separator: Option<u8>,
token_buffer: &mut Vec<Field>,
precomputed: &Precomputed,
) {
// Callers must hand in a cleared buffer.
assert!(token_buffer.is_empty());
if let Some(separator) = separator {
// Explicit separator: the `precomputed` flags are not consulted on this path.
tokenize_with_separator(line, separator, token_buffer);
} else {
tokenize_default(line, token_buffer);
tokenize_default(
line,
token_buffer,
precomputed.tokenize_blank_thousands_sep,
precomputed.tokenize_allow_unit_after_blank,
);
}
}

/// By default fields are separated by the first whitespace after non-whitespace.
/// Whitespace is included in fields at the start.
/// The result is stored into `token_buffer`.
fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
fn tokenize_default(
line: &[u8],
token_buffer: &mut Vec<Field>,
blank_thousands_sep: bool,
allow_unit_after_blank: bool,
) {
token_buffer.push(0..0);
// pretend that there was whitespace in front of the line
let mut previous_was_whitespace = true;
for (idx, char) in line.iter().enumerate() {
if char.is_ascii_whitespace() {
let is_whitespace = char.is_ascii_whitespace();
let treat_as_separator = if is_whitespace {
if blank_thousands_sep && *char == b' ' {
!is_blank_thousands_sep(line, idx, allow_unit_after_blank)
} else {
true
}
} else {
false
};

if treat_as_separator {
if !previous_was_whitespace {
token_buffer.last_mut().unwrap().end = idx;
token_buffer.push(idx..0);
Expand All @@ -746,6 +830,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec<Field>) {
token_buffer.last_mut().unwrap().end = line.len();
}

/// Reports whether the byte at `idx` in `line` is a space acting as a thousands separator
/// rather than a field delimiter.
///
/// That is the case when the byte is `b' '`, the byte directly before it is an ASCII digit,
/// and the byte directly after it is either an ASCII digit or, when `allow_unit_after_blank`
/// is set, one of the unit letters `K k M G T P E Z Y R Q`.
///
/// Returns `false` for `idx == 0`, for an out-of-range `idx`, and when the space is the
/// last byte of `line`.
fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool {
    // A space at the very start of the line has no preceding digit.
    let Some(start) = idx.checked_sub(1) else {
        return false;
    };

    // Look at the three-byte window: previous byte, the blank itself, next byte.
    match line.get(start..) {
        Some(&[before, b' ', after, ..]) if before.is_ascii_digit() => {
            let unit_follows = matches!(
                after,
                b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q'
            );
            after.is_ascii_digit() || (allow_unit_after_blank && unit_follows)
        }
        _ => false,
    }
}

/// Split between separators. These separators are not included in fields.
/// The result is stored into `token_buffer`.
fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec<Field>) {
Expand Down Expand Up @@ -943,7 +1052,12 @@ impl FieldSelector {

/// Get the selection that corresponds to this selector for the line.
/// If `needs_fields` returned false, tokens may be empty.
fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> {
fn get_selection<'a>(
&self,
line: &'a [u8],
tokens: &[Field],
numeric_locale: &NumericLocaleSettings,
) -> Selection<'a> {
// `get_range` expects `None` when we don't need tokens and would get confused by an empty vector.
let tokens = if self.needs_tokens {
Some(tokens)
Expand All @@ -955,10 +1069,7 @@ impl FieldSelector {
// Parse NumInfo for this number.
let (info, num_range) = NumInfo::parse(
range_str,
&NumInfoParseSettings {
accept_si_units: self.settings.mode == SortMode::HumanNumeric,
..Default::default()
},
&numeric_locale.num_info_settings(self.settings.mode == SortMode::HumanNumeric),
);
// Shorten the range to what we need to pass to numeric_str_cmp later.
range_str = &range_str[num_range];
Expand Down Expand Up @@ -1067,6 +1178,41 @@ impl FieldSelector {
}
}

#[cfg(unix)]
fn detect_numeric_locale() -> NumericLocaleSettings {
unsafe {
libc::setlocale(libc::LC_NUMERIC, c"".as_ptr());
let mut settings = NumericLocaleSettings::default();
let conv = libc::localeconv();
if conv.is_null() {
return settings;
}

let decimal_ptr = (*conv).decimal_point;
if !decimal_ptr.is_null() {
let decimal_point = CStr::from_ptr(decimal_ptr).to_bytes();
if decimal_point.len() == 1 {
settings.decimal_pt = Some(decimal_point[0]);
}
}

let thousands_ptr = (*conv).thousands_sep;
if !thousands_ptr.is_null() {
let thousands_sep = CStr::from_ptr(thousands_ptr).to_bytes();
if thousands_sep.len() == 1 {
settings.thousands_sep = Some(thousands_sep[0]);
}
}

settings
}
}

/// Fallback for non-Unix targets: no locale lookup is performed and the default
/// numeric settings are returned as-is.
#[cfg(not(unix))]
fn detect_numeric_locale() -> NumericLocaleSettings {
    Default::default()
}

/// Creates an `Arg` that conflicts with all other sort modes.
fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg {
Arg::new(mode)
Expand Down Expand Up @@ -1274,7 +1420,10 @@ fn default_merge_batch_size() -> usize {
#[uucore::main]
#[allow(clippy::cognitive_complexity)]
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let mut settings = GlobalSettings::default();
let mut settings = GlobalSettings {
numeric_locale: detect_numeric_locale(),
..Default::default()
};

let matches = uucore::clap_localization::handle_clap_result_with_exit_code(
uu_app(),
Expand Down Expand Up @@ -2278,7 +2427,8 @@ mod tests {

// Test helper: runs `tokenize` on `line` with the given `separator` into a fresh
// buffer and returns the resulting fields.
fn tokenize_helper(line: &[u8], separator: Option<u8>) -> Vec<Field> {
let mut buffer = vec![];
tokenize(line, separator, &mut buffer);
// `Precomputed::default()` leaves both blank-thousands-separator flags `false`.
let precomputed = Precomputed::default();
tokenize(line, separator, &mut buffer, &precomputed);
buffer
}

Expand Down
Loading