uniq: fix multibyte input

Should fix tests/uniq/uniq.pl
uutils · Jan 1, 2025 · 1e23a3f · 1e23a3f
1 parent 805754b
commit 1e23a3f
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 30 deletions.
diff --git a/src/uu/uniq/src/uniq.rs b/src/uu/uniq/src/uniq.rs
@@ -154,43 +154,44 @@ impl Uniq {
 
     fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
     where
-        F: FnMut(&mut dyn Iterator<Item = u8>) -> bool,
+        F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
     {
         let fields_to_check = self.skip_fields(line);
-        let len = fields_to_check.len();
-        let slice_start = self.slice_start.unwrap_or(0);
-        let slice_stop = self.slice_stop.unwrap_or(len);
-        if len > 0 {
-            // fast path: avoid doing any work if there is no need to skip or map to lower-case
-            if !self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().copied());
-            }
 
-            // fast path: avoid skipping
-            if self.ignore_case && slice_start == 0 && slice_stop == len {
-                return closure(&mut fields_to_check.iter().map(|u| u.to_ascii_lowercase()));
-            }
+        // Skip self.slice_start bytes (if -s was used).
+        // self.slice_start is how many characters to skip, but historically
+        // uniq’s `-s N` means “skip N *bytes*,” so do that literally:
+        let skip_bytes = self.slice_start.unwrap_or(0);
+        let fields_to_check = if skip_bytes < fields_to_check.len() {
+            &fields_to_check[skip_bytes..]
+        } else {
+            // If skipping beyond end-of-line, leftover is empty => effectively ""
+            &[]
+        };
 
-            // fast path: we can avoid mapping chars to lower-case, if we don't want to ignore the case
-            if !self.ignore_case {
-                return closure(
-                    &mut fields_to_check
-                        .iter()
-                        .skip(slice_start)
-                        .take(slice_stop)
-                        .copied(),
-                );
+        // Convert the leftover bytes to UTF-8 for character-based -w
+        // If invalid UTF-8, just compare them as individual bytes (fallback).
+        let string_after_skip = match std::str::from_utf8(fields_to_check) {
+            Ok(s) => s,
+            Err(_) => {
+                // Fallback: if invalid UTF-8, treat them as single-byte “chars”
+                return closure(&mut fields_to_check.iter().map(|&b| b as char));
             }
+        };
 
-            closure(
-                &mut fields_to_check
-                    .iter()
-                    .skip(slice_start)
-                    .take(slice_stop)
-                    .map(|u| u.to_ascii_lowercase()),
-            )
+        let total_chars = string_after_skip.chars().count();
+
+        // `-w N` => Compare no more than N characters
+        let slice_stop = self.slice_stop.unwrap_or(total_chars);
+        let slice_start = slice_stop.min(total_chars);
+
+        let mut iter = string_after_skip.chars().take(slice_start);
+
+        if self.ignore_case {
+            // We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
+            closure(&mut iter.map(|c| c.to_ascii_lowercase()))
         } else {
-            closure(&mut fields_to_check.iter().copied())
+            closure(&mut iter)
         }
     }
 

diff --git a/tests/by-util/test_uniq.rs b/tests/by-util/test_uniq.rs
@@ -1172,3 +1172,13 @@ fn gnu_tests() {
         }
     }
 }
+
+#[test]
+fn test_stdin_w1_multibyte() {
+    let lines = "à\ná\n"; // distinct characters that share their first UTF-8 byte
+    new_ucmd!()
+        .args(&["-w1"])
+        .pipe_in(lines)
+        .run()
+        .stdout_is(lines);
+}