Skip to content

Commit

Permalink
uniq: fix multibyte input
Browse files Browse the repository at this point in the history
Should fix tests/uniq/uniq.pl
  • Loading branch information
sylvestre committed Jan 1, 2025
1 parent 805754b commit 1e23a3f
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 30 deletions.
61 changes: 31 additions & 30 deletions src/uu/uniq/src/uniq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,43 +154,44 @@ impl Uniq {

fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
where
F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
{
let fields_to_check = self.skip_fields(line);
let len = fields_to_check.len();
let slice_start = self.slice_start.unwrap_or(0);
let slice_stop = self.slice_stop.unwrap_or(len);
if len > 0 {
// fast path: avoid doing any work if there is no need to skip or map to lower-case
if !self.ignore_case && slice_start == 0 && slice_stop == len {
return closure(&mut fields_to_check.iter().copied());
}

// fast path: avoid skipping
if self.ignore_case && slice_start == 0 && slice_stop == len {
return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
}
// Skip self.slice_start bytes (if -s was used).
// self.slice_start is how many characters to skip, but historically
// uniq’s `-s N` means “skip N *bytes*,” so do that literally:
let skip_bytes = self.slice_start.unwrap_or(0);
let fields_to_check = if skip_bytes < fields_to_check.len() {
&fields_to_check[skip_bytes..]
} else {
// If skipping beyond end-of-line, leftover is empty => effectively ""
&[]
};

// fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
if !self.ignore_case {
return closure(
&mut fields_to_check
.iter()
.skip(slice_start)
.take(slice_stop)
.copied(),
);
// Convert the leftover bytes to UTF-8 for character-based -w
// If invalid UTF-8, just compare them as individual bytes (fallback).
let string_after_skip = match std::str::from_utf8(fields_to_check) {
Ok(s) => s,
Err(_) => {
// Fallback: if invalid UTF-8, treat them as single-byte “chars”
return closure(&mut fields_to_check.iter().map(|&b| b as char));
}
};

closure(
&mut fields_to_check
.iter()
.skip(slice_start)
.take(slice_stop)
.map(|u| u.to_ascii_lowercase()),
)
let total_chars = string_after_skip.chars().count();

// `-w N` => Compare no more than N characters
let slice_stop = self.slice_stop.unwrap_or(total_chars);
let slice_start = slice_stop.min(total_chars);

let mut iter = string_after_skip.chars().take(slice_start);

if self.ignore_case {
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
} else {
closure(&mut fields_to_check.iter().copied())
closure(&mut iter)
}
}

Expand Down
10 changes: 10 additions & 0 deletions tests/by-util/test_uniq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1172,3 +1172,13 @@ fn gnu_tests() {
}
}
}

/// `-w1` must compare one *character*, not one byte.
///
/// `à` (C3 A0) and `á` (C3 A1) share their first UTF-8 byte, so a byte-based
/// `-w1` would treat the two lines as duplicates and drop the second one.
// NOTE(review): the opening quote and first character of both literals were
// missing; `à` is the presumed original -- confirm against upstream.
#[test]
fn test_stdin_w1_multibyte() {
    let input = "à\ná\n";
    new_ucmd!()
        .args(&["-w1"])
        .pipe_in(input)
        .run()
        .stdout_is("à\ná\n");
}

0 comments on commit 1e23a3f

Please sign in to comment.