diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 01ddc63fbe0..30e10a2db07 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -7,7 +7,7 @@ // https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html // https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html -// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD +// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD localeconv mod buffer_hint; mod check; @@ -284,9 +284,35 @@ pub struct GlobalSettings { buffer_size_is_explicit: bool, compress_prog: Option, merge_batch_size: usize, + numeric_locale: NumericLocaleSettings, precomputed: Precomputed, } +#[derive(Clone, Copy, Debug)] +struct NumericLocaleSettings { + thousands_sep: Option, + decimal_pt: Option, +} + +impl Default for NumericLocaleSettings { + fn default() -> Self { + Self { + thousands_sep: None, + decimal_pt: Some(DECIMAL_PT), + } + } +} + +impl NumericLocaleSettings { + fn num_info_settings(&self, accept_si_units: bool) -> NumInfoParseSettings { + NumInfoParseSettings { + accept_si_units, + thousands_separator: self.thousands_sep, + decimal_pt: self.decimal_pt, + } + } +} + /// Data needed for sorting. Should be computed once before starting to sort /// by calling `GlobalSettings::init_precomputed`. #[derive(Clone, Debug, Default)] @@ -297,6 +323,8 @@ struct Precomputed { selections_per_line: usize, fast_lexicographic: bool, fast_ascii_insensitive: bool, + tokenize_blank_thousands_sep: bool, + tokenize_allow_unit_after_blank: bool, } impl GlobalSettings { @@ -341,6 +369,20 @@ impl GlobalSettings { .filter(|s| matches!(s.settings.mode, SortMode::GeneralNumeric)) .count(); + let uses_numeric = self + .selectors + .iter() + .any(|s| matches!(s.settings.mode, SortMode::Numeric | SortMode::HumanNumeric)); + let uses_human_numeric = self + .selectors + .iter() + .any(|s| matches!(s.settings.mode, SortMode::HumanNumeric)); + self.precomputed.tokenize_blank_thousands_sep = self.separator.is_none() + && uses_numeric + && self.numeric_locale.thousands_sep == Some(b' '); + self.precomputed.tokenize_allow_unit_after_blank = + self.precomputed.tokenize_blank_thousands_sep && uses_human_numeric; + self.precomputed.fast_lexicographic = !disable_fast_lexicographic && self.can_use_fast_lexicographic(); self.precomputed.fast_ascii_insensitive = self.can_use_fast_ascii_insensitive(); @@ -413,6 +455,7 @@ impl Default for GlobalSettings { buffer_size_is_explicit: false, compress_prog: None, merge_batch_size: default_merge_batch_size(), + numeric_locale: NumericLocaleSettings::default(), precomputed: Precomputed::default(), } } @@ -597,7 +640,12 @@ impl<'a> Line<'a> { } token_buffer.clear(); if settings.precomputed.needs_tokens { - tokenize(line, settings.separator, token_buffer); + tokenize( + line, + settings.separator, + token_buffer, + &settings.precomputed, + ); } if settings.mode == SortMode::Numeric { // exclude inf, nan, scientific notation @@ -607,11 +655,12 @@ impl<'a> Line<'a> { .and_then(|s| s.parse::().ok()); line_data.line_num_floats.push(line_num_float); } - for (selector, selection) in settings - .selectors - .iter() - .map(|selector| (selector, selector.get_selection(line, token_buffer))) - { + for (selector, selection) in settings.selectors.iter().map(|selector| { + ( + selector, + selector.get_selection(line, token_buffer, &settings.numeric_locale), + ) + }) { match selection { Selection::AsBigDecimal(parsed_float) => line_data.parsed_floats.push(parsed_float), Selection::WithNumInfo(str, num_info) => { @@ -660,19 +709,24 @@ impl<'a> Line<'a> { writeln!(writer)?; let mut fields = vec![]; - tokenize(self.line, settings.separator, &mut fields); + tokenize( + self.line, + settings.separator, + &mut fields, + &settings.precomputed, + ); for selector in &settings.selectors { let mut selection = selector.get_range(self.line, Some(&fields)); match selector.settings.mode { SortMode::Numeric | SortMode::HumanNumeric => { // find out which range is used for numeric comparisons - let (_, num_range) = NumInfo::parse( - &self.line[selection.clone()], - &NumInfoParseSettings { - accept_si_units: selector.settings.mode == SortMode::HumanNumeric, - ..Default::default() - }, - ); + let mut parse_settings = settings + .numeric_locale + .num_info_settings(selector.settings.mode == SortMode::HumanNumeric); + // Debug annotations should ignore thousands separators to match GNU output. + parse_settings.thousands_separator = None; + let (_, num_range) = + NumInfo::parse(&self.line[selection.clone()], &parse_settings); let initial_selection = selection.clone(); // Shorten selection to num_range. @@ -789,24 +843,50 @@ impl<'a> Line<'a> { } /// Tokenize a line into fields. The result is stored into `token_buffer`. -fn tokenize(line: &[u8], separator: Option, token_buffer: &mut Vec) { +fn tokenize( + line: &[u8], + separator: Option, + token_buffer: &mut Vec, + precomputed: &Precomputed, +) { assert!(token_buffer.is_empty()); if let Some(separator) = separator { tokenize_with_separator(line, separator, token_buffer); } else { - tokenize_default(line, token_buffer); + tokenize_default( + line, + token_buffer, + precomputed.tokenize_blank_thousands_sep, + precomputed.tokenize_allow_unit_after_blank, + ); } } /// By default fields are separated by the first whitespace after non-whitespace. /// Whitespace is included in fields at the start. /// The result is stored into `token_buffer`. -fn tokenize_default(line: &[u8], token_buffer: &mut Vec) { +fn tokenize_default( + line: &[u8], + token_buffer: &mut Vec, + blank_thousands_sep: bool, + allow_unit_after_blank: bool, +) { token_buffer.push(0..0); // pretend that there was whitespace in front of the line let mut previous_was_whitespace = true; for (idx, char) in line.iter().enumerate() { - if char.is_ascii_whitespace() { + let is_whitespace = char.is_ascii_whitespace(); + let treat_as_separator = if is_whitespace { + if blank_thousands_sep && *char == b' ' { + !is_blank_thousands_sep(line, idx, allow_unit_after_blank) + } else { + true + } + } else { + false + }; + + if treat_as_separator { if !previous_was_whitespace { token_buffer.last_mut().unwrap().end = idx; token_buffer.push(idx..0); @@ -819,6 +899,31 @@ fn tokenize_default(line: &[u8], token_buffer: &mut Vec) { token_buffer.last_mut().unwrap().end = line.len(); } +fn is_blank_thousands_sep(line: &[u8], idx: usize, allow_unit_after_blank: bool) -> bool { + if line.get(idx) != Some(&b' ') { + return false; + } + + let prev_is_digit = idx + .checked_sub(1) + .and_then(|prev_idx| line.get(prev_idx)) + .is_some_and(u8::is_ascii_digit); + if !prev_is_digit { + return false; + } + + let next = line.get(idx + 1).copied(); + match next { + Some(c) if c.is_ascii_digit() => true, + Some(b'K' | b'k' | b'M' | b'G' | b'T' | b'P' | b'E' | b'Z' | b'Y' | b'R' | b'Q') + if allow_unit_after_blank => + { + true + } + _ => false, + } +} + /// Split between separators. These separators are not included in fields. /// The result is stored into `token_buffer`. fn tokenize_with_separator(line: &[u8], separator: u8, token_buffer: &mut Vec) { @@ -1077,7 +1182,12 @@ impl FieldSelector { /// Get the selection that corresponds to this selector for the line. /// If `needs_fields` returned false, tokens may be empty. - fn get_selection<'a>(&self, line: &'a [u8], tokens: &[Field]) -> Selection<'a> { + fn get_selection<'a>( + &self, + line: &'a [u8], + tokens: &[Field], + numeric_locale: &NumericLocaleSettings, + ) -> Selection<'a> { // `get_range` expects `None` when we don't need tokens and would get confused by an empty vector. let tokens = if self.needs_tokens { Some(tokens) @@ -1086,24 +1196,10 @@ impl FieldSelector { }; let mut range_str = &line[self.get_range(line, tokens)]; if self.settings.mode == SortMode::Numeric || self.settings.mode == SortMode::HumanNumeric { - // Get the thousands separator from the locale, handling cases where the separator is empty or multi-character - let locale_thousands_separator = i18n::decimal::locale_grouping_separator().as_bytes(); - - // Upstream GNU coreutils ignore multibyte thousands separators - // (FIXME in C source). We keep the same single-byte behavior. - let thousands_separator = match locale_thousands_separator { - [b] => Some(*b), - _ => None, - }; - // Parse NumInfo for this number. let (info, num_range) = NumInfo::parse( range_str, - &NumInfoParseSettings { - accept_si_units: self.settings.mode == SortMode::HumanNumeric, - thousands_separator, - ..Default::default() - }, + &numeric_locale.num_info_settings(self.settings.mode == SortMode::HumanNumeric), ); // Shorten the range to what we need to pass to numeric_str_cmp later. range_str = &range_str[num_range]; @@ -1216,6 +1312,33 @@ impl FieldSelector { } } +fn detect_numeric_locale() -> NumericLocaleSettings { + let numeric_locale = i18n::get_numeric_locale(); + let locale = &numeric_locale.0; + let encoding = numeric_locale.1; + let is_c_locale = encoding == i18n::UEncoding::Ascii && locale.to_string() == "und"; + + if is_c_locale { + return NumericLocaleSettings { + decimal_pt: Some(DECIMAL_PT), + thousands_sep: None, + }; + } + + let grouping = i18n::decimal::locale_grouping_separator(); + NumericLocaleSettings { + decimal_pt: Some(locale_decimal_pt()), + // Upstream GNU coreutils ignore multibyte thousands separators + // (FIXME in C source). We keep the same single-byte behavior. + thousands_sep: match grouping.as_bytes() { + [b] => Some(*b), + // ICU returns NBSP as UTF-8 (0xC2 0xA0). In non-UTF8 locales like ISO-8859-1, + // the input byte is 0xA0, so map it to a single-byte separator. + [0xC2, 0xA0] if encoding != i18n::UEncoding::Utf8 => Some(0xA0), + _ => None, + }, + } +} /// Creates an `Arg` for a sort mode flag. fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg { Arg::new(mode) @@ -1847,7 +1970,10 @@ fn emit_debug_warnings( #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let mut settings = GlobalSettings::default(); + let mut settings = GlobalSettings { + numeric_locale: detect_numeric_locale(), + ..Default::default() + }; let (processed_args, mut legacy_warnings) = preprocess_legacy_args(args); if !legacy_warnings.is_empty() { @@ -1954,7 +2080,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { let ignore_non_printing = matches.get_flag(options::IGNORE_NONPRINTING); let ignore_case = matches.get_flag(options::IGNORE_CASE); - if ordering_incompatible(mode_flags, dictionary_order, ignore_non_printing) { + if !matches.contains_id(options::KEY) + && ordering_incompatible(mode_flags, dictionary_order, ignore_non_printing) + { let opts = ordering_opts_string( mode_flags, dictionary_order, @@ -2964,7 +3092,8 @@ mod tests { fn tokenize_helper(line: &[u8], separator: Option) -> Vec { let mut buffer = vec![]; - tokenize(line, separator, &mut buffer); + let precomputed = Precomputed::default(); + tokenize(line, separator, &mut buffer, &precomputed); buffer } diff --git a/src/uucore/src/lib/features/i18n/decimal.rs b/src/uucore/src/lib/features/i18n/decimal.rs index 0a901143c6b..a7ceca2efa3 100644 --- a/src/uucore/src/lib/features/i18n/decimal.rs +++ b/src/uucore/src/lib/features/i18n/decimal.rs @@ -6,7 +6,7 @@ use std::sync::OnceLock; use icu_decimal::provider::DecimalSymbolsV1; -use icu_locale::Locale; +use icu_locale::{Locale, locale}; use icu_provider::prelude::*; use crate::i18n::get_numeric_locale; @@ -60,7 +60,15 @@ fn get_grouping_separator(loc: Locale) -> String { pub fn locale_grouping_separator() -> &'static str { static GROUPING_SEP: OnceLock = OnceLock::new(); - GROUPING_SEP.get_or_init(|| get_grouping_separator(get_numeric_locale().0.clone())) + GROUPING_SEP.get_or_init(|| { + let loc = get_numeric_locale().0.clone(); + // C/POSIX locale (represented as "und") has no grouping separator. + if loc == locale!("und") { + String::new() + } else { + get_grouping_separator(loc) + } + }) } #[cfg(test)] diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index e794898a286..7218060d9c9 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -8,6 +8,8 @@ use std::env; use std::fmt::Write as FmtWrite; +#[cfg(unix)] +use std::process::Command; use std::time::Duration; use uutests::at_and_ucmd; @@ -1665,6 +1667,69 @@ fn test_g_float_locale_decimal_separator() { .stdout_is("1.10\n1.9\n"); } +#[test] +#[cfg(unix)] +fn test_human_numeric_blank_thousands_sep_locale() { + fn thousands_sep_for(locale: &str) -> Option { + let output = Command::new("locale") + .arg("thousands_sep") + .env("LC_ALL", locale) + .output() + .ok()?; + if !output.status.success() { + return None; + } + let sep = String::from_utf8_lossy(&output.stdout); + let sep = sep.trim_end_matches(&['\n', '\r'][..]); + if sep.is_empty() || sep.len() != 1 || !sep.chars().all(|c| c.is_whitespace()) { + return None; + } + Some(sep.to_string()) + } + + let candidates = ["sv_SE.UTF-8", "sv_SE"]; + let mut selected_locale = None; + let mut thousands_sep = None; + for candidate in candidates { + if let Some(sep) = thousands_sep_for(candidate) { + selected_locale = Some(candidate.to_string()); + thousands_sep = Some(sep); + break; + } + } + + let (Some(locale), Some(sep)) = (selected_locale, thousands_sep) else { + return; + }; + + let line1 = format!("1 1k 1 M 4{sep}003 1M"); + let line2 = format!("2k 2M 2 k 4{sep}002 2"); + let line3 = format!("3M 3 3 G 4{sep}001 3k"); + let input = format!("{line1}\n{line2}\n{line3}\n"); + + let ts = TestScenario::new("sort"); + ts.fixtures.write("blank-thousands.txt", &input); + + let cases = [ + (1, format!("{line1}\n{line2}\n{line3}\n")), + (2, format!("{line3}\n{line1}\n{line2}\n")), + (3, format!("{line1}\n{line2}\n{line3}\n")), + (5, format!("{line3}\n{line2}\n{line1}\n")), + ]; + + for (key, expected) in cases { + let key_str = key.to_string(); + ts.ucmd() + .env("LC_ALL", &locale) + .arg("-h") + .arg("-k") + .arg(&key_str) + .arg("blank-thousands.txt") + .succeeds() + .stdout_is(expected); + } +} + #[test] // Test misc numbers ("'a" is not interpreted as literal, trailing text is ignored...) fn test_g_misc() { @@ -2367,18 +2432,18 @@ _ __ 1 _ -2.4 -___ 2,5 _ -2.,,3 -__ 2.4 ___ +2.,,3 +__ 2.4 ___ 2,,3 _ +2.4 +___ 1a _ 2b diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected b/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected index a781a36bba8..59541af3252 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected @@ -21,10 +21,10 @@ CARAvan 8.013 45 46.89 - 4567. - 37800 576,446.88800000 576,446.890 + 4567. + 37800 4798908.340000000000 4798908.45 4798908.8909800 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected.debug b/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected.debug index a00067b1ee6..b7b76e58986 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected.debug +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric.expected.debug @@ -67,18 +67,18 @@ __ 46.89 _____ _____ - 4567. - _____ -____________________ ->>>>37800 - _____ -_________ 576,446.88800000 ___ ________________ 576,446.890 ___ ___________ + 4567. + _____ +____________________ +>>>>37800 + _____ +_________ 4798908.340000000000 ____________________ ____________________ diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected index 36eeda637f7..0ccdd84c059 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected @@ -24,10 +24,10 @@ CARAvan 8.013 45 46.89 +576,446.890 +576,446.88800000 4567. 37800 -576,446.88800000 -576,446.890 4798908.340000000000 4798908.45 4798908.8909800 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected.debug b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected.debug index 3fba8903042..66a98b20879 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected.debug +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_stable.expected.debug @@ -50,14 +50,14 @@ _____ __ 46.89 _____ +576,446.890 +___ +576,446.88800000 +___ 4567. _____ >>>>37800 _____ -576,446.88800000 -___ -576,446.890 -___ 4798908.340000000000 ____________________ 4798908.45 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected index cb27c6664ce..cd4256c5f46 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected @@ -11,10 +11,9 @@ 8.013 45 46.89 +576,446.890 4567. 37800 -576,446.88800000 -576,446.890 4798908.340000000000 4798908.45 4798908.8909800 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected.debug b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected.debug index dd6e8dfcc67..663a4b3a918 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected.debug +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique.expected.debug @@ -24,14 +24,12 @@ _____ __ 46.89 _____ +576,446.890 +___ 4567. _____ >>>>37800 _____ -576,446.88800000 -___ -576,446.890 -___ 4798908.340000000000 ____________________ 4798908.45 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected index bbce169347f..97e261f1452 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected @@ -1,10 +1,9 @@ 4798908.8909800 4798908.45 4798908.340000000000 -576,446.890 -576,446.88800000 37800 4567. +576,446.890 46.89 45 8.013 diff --git a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected.debug b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected.debug index 4b01a840618..01f7abf5bf2 100644 --- a/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected.debug +++ b/tests/fixtures/sort/mixed_floats_ints_chars_numeric_unique_reverse.expected.debug @@ -4,14 +4,12 @@ _______________ __________ 4798908.340000000000 ____________________ -576,446.890 -___ -576,446.88800000 -___ >>>>37800 _____ 4567. _____ +576,446.890 +___ 46.89 _____ 45 diff --git a/tests/fixtures/sort/multiple_decimals_numeric.expected b/tests/fixtures/sort/multiple_decimals_numeric.expected index 3ef4d22e881..8f42e7ce5da 100644 --- a/tests/fixtures/sort/multiple_decimals_numeric.expected +++ b/tests/fixtures/sort/multiple_decimals_numeric.expected @@ -21,6 +21,8 @@ CARAvan 8.013 45 46.89 +576,446.88800000 +576,446.890 4567..457 4567. 4567.1 @@ -28,8 +30,6 @@ CARAvan 37800 45670.89079.098 45670.89079.1 -576,446.88800000 -576,446.890 4798908.340000000000 4798908.45 4798908.8909800 diff --git a/tests/fixtures/sort/multiple_decimals_numeric.expected.debug b/tests/fixtures/sort/multiple_decimals_numeric.expected.debug index 0ae6d2958a5..948c4869c32 100644 --- a/tests/fixtures/sort/multiple_decimals_numeric.expected.debug +++ b/tests/fixtures/sort/multiple_decimals_numeric.expected.debug @@ -67,6 +67,12 @@ __ 46.89 _____ _____ +576,446.88800000 +___ +________________ +576,446.890 +___ +___________ >>>>>>>>>>4567..457 _____ ___________________ @@ -88,12 +94,6 @@ _____________________ >>>>>>45670.89079.1 ___________ ___________________ -576,446.88800000 -___ -________________ -576,446.890 -___ -___________ 4798908.340000000000 ____________________ ____________________ diff --git a/tests/fixtures/sort/multiple_groupings_numeric.expected b/tests/fixtures/sort/multiple_groupings_numeric.expected index a6daab83676..9dd5b5f6553 100644 --- a/tests/fixtures/sort/multiple_groupings_numeric.expected +++ b/tests/fixtures/sort/multiple_groupings_numeric.expected @@ -2,14 +2,14 @@ CARAvan + 1,999.99 +1,234 1.234 + 2,000 2.000 2.000,50 +12,34 22 23,. 111 210 -1,234 -12,34 - 1,999.99 - 2,000 diff --git a/tests/fixtures/sort/multiple_groupings_numeric.expected.debug b/tests/fixtures/sort/multiple_groupings_numeric.expected.debug index 57a4ae01b9a..62e98a46a18 100644 --- a/tests/fixtures/sort/multiple_groupings_numeric.expected.debug +++ b/tests/fixtures/sort/multiple_groupings_numeric.expected.debug @@ -10,15 +10,27 @@ CARAvan ^ no match for key _______ +>>1,999.99 + _ +__________ +1,234 +_ +_____ >1.234 _____ ______ +>>>2,000 + _ +________ 2.000 _____ _____ 2.000,50 _____ ________ +12,34 +__ +_____ 22 __ __ @@ -31,15 +43,3 @@ ___ >210 ___ ____ -1,234 -_ -_____ -12,34 -__ -_____ ->>1,999.99 - _ -__________ ->>>2,000 - _ -________