Skip to content

Commit 31d1f06

Browse files
authored
Refactor uniq field skipping to avoid allocations (#8703)
* refactor uniq skip_fields to avoid allocations * feat(uniq): add read error handling and optimize line processing - Add read error localization keys to en-US and fr-FR locales - Refactor print_uniq to use buffered line reading with LineMeta for case-insensitive comparisons, avoiding memory issues with large inputs - Improve error handling by detecting read failures and exiting appropriately
1 parent 40d0cf0 commit 31d1f06

File tree

3 files changed

+205
-87
lines changed

3 files changed

+205
-87
lines changed

src/uu/uniq/locales/en-US.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ uniq-help-zero-terminated = end lines with 0 byte, not newline
2121
# Error messages
2222
uniq-error-write-line-terminator = Could not write line terminator
2323
uniq-error-write-error = write error
24+
uniq-error-read-error = read error
2425
uniq-error-invalid-argument = Invalid argument for { $opt_name }: { $arg }
2526
uniq-error-try-help = Try 'uniq --help' for more information.
2627
uniq-error-group-mutually-exclusive = --group is mutually exclusive with -c/-d/-D/-u

src/uu/uniq/locales/fr-FR.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ uniq-help-zero-terminated = terminer les lignes avec un octet 0, pas une nouvell
2020
# Messages d'erreur
2121
uniq-error-write-line-terminator = Impossible d'écrire le terminateur de ligne
2222
uniq-error-write-error = erreur d'écriture
23+
uniq-error-read-error = erreur de lecture
2324
uniq-error-invalid-argument = Argument invalide pour { $opt_name } : { $arg }
2425
uniq-error-try-help = Essayez 'uniq --help' pour plus d'informations.
2526
uniq-error-group-mutually-exclusive = --group est mutuellement exclusif avec -c/-d/-D/-u

src/uu/uniq/src/uniq.rs

Lines changed: 203 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ enum Delimiters {
4242
None,
4343
}
4444

45+
const OUTPUT_BUFFER_CAPACITY: usize = 128 * 1024;
46+
4547
struct Uniq {
4648
repeats_only: bool,
4749
uniques_only: bool,
@@ -55,6 +57,14 @@ struct Uniq {
5557
zero_terminated: bool,
5658
}
5759

60+
#[derive(Default)]
61+
struct LineMeta {
62+
key_start: usize,
63+
key_end: usize,
64+
lowercase: Vec<u8>,
65+
use_lowercase: bool,
66+
}
67+
5868
macro_rules! write_line_terminator {
5969
($writer:expr, $line_terminator:expr) => {
6070
$writer
@@ -64,42 +74,53 @@ macro_rules! write_line_terminator {
6474
}
6575

6676
impl Uniq {
67-
pub fn print_uniq(&self, reader: impl BufRead, mut writer: impl Write) -> UResult<()> {
77+
pub fn print_uniq(&self, mut reader: impl BufRead, mut writer: impl Write) -> UResult<()> {
6878
let mut first_line_printed = false;
6979
let mut group_count = 1;
7080
let line_terminator = self.get_line_terminator();
71-
let mut lines = reader.split(line_terminator);
72-
let mut line = match lines.next() {
73-
Some(l) => l?,
74-
None => return Ok(()),
75-
};
76-
7781
let writer = &mut writer;
7882

79-
// compare current `line` with consecutive lines (`next_line`) of the input
80-
// and if needed, print `line` based on the command line options provided
81-
for next_line in lines {
82-
let next_line = next_line?;
83-
if self.cmp_keys(&line, &next_line) {
83+
let mut current_buf = Vec::with_capacity(1024);
84+
if !Self::read_line(&mut reader, &mut current_buf, line_terminator)? {
85+
return Ok(());
86+
}
87+
let mut current_meta = LineMeta::default();
88+
self.build_meta(&current_buf, &mut current_meta);
89+
90+
let mut next_buf = Vec::with_capacity(1024);
91+
let mut next_meta = LineMeta::default();
92+
93+
loop {
94+
if !Self::read_line(&mut reader, &mut next_buf, line_terminator)? {
95+
break;
96+
}
97+
98+
self.build_meta(&next_buf, &mut next_meta);
99+
100+
if self.keys_differ(&current_buf, &current_meta, &next_buf, &next_meta) {
84101
if (group_count == 1 && !self.repeats_only)
85102
|| (group_count > 1 && !self.uniques_only)
86103
{
87-
self.print_line(writer, &line, group_count, first_line_printed)?;
104+
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
88105
first_line_printed = true;
89106
}
90-
line = next_line;
107+
std::mem::swap(&mut current_buf, &mut next_buf);
108+
std::mem::swap(&mut current_meta, &mut next_meta);
91109
group_count = 1;
92110
} else {
93111
if self.all_repeated {
94-
self.print_line(writer, &line, group_count, first_line_printed)?;
112+
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
95113
first_line_printed = true;
96-
line = next_line;
114+
std::mem::swap(&mut current_buf, &mut next_buf);
115+
std::mem::swap(&mut current_meta, &mut next_meta);
97116
}
98117
group_count += 1;
99118
}
119+
next_buf.clear();
100120
}
121+
101122
if (group_count == 1 && !self.repeats_only) || (group_count > 1 && !self.uniques_only) {
102-
self.print_line(writer, &line, group_count, first_line_printed)?;
123+
self.print_line(writer, &current_buf, group_count, first_line_printed)?;
103124
first_line_printed = true;
104125
}
105126
if (self.delimiters == Delimiters::Append || self.delimiters == Delimiters::Both)
@@ -113,79 +134,134 @@ impl Uniq {
113134
Ok(())
114135
}
115136

116-
fn skip_fields(&self, line: &[u8]) -> Vec<u8> {
137+
fn get_line_terminator(&self) -> u8 {
138+
if self.zero_terminated { 0 } else { b'\n' }
139+
}
140+
141+
fn keys_differ(
142+
&self,
143+
first_line: &[u8],
144+
first_meta: &LineMeta,
145+
second_line: &[u8],
146+
second_meta: &LineMeta,
147+
) -> bool {
148+
let first_slice = &first_line[first_meta.key_start..first_meta.key_end];
149+
let second_slice = &second_line[second_meta.key_start..second_meta.key_end];
150+
151+
if !self.ignore_case {
152+
return first_slice != second_slice;
153+
}
154+
155+
let first_cmp = if first_meta.use_lowercase {
156+
first_meta.lowercase.as_slice()
157+
} else {
158+
first_slice
159+
};
160+
let second_cmp = if second_meta.use_lowercase {
161+
second_meta.lowercase.as_slice()
162+
} else {
163+
second_slice
164+
};
165+
166+
first_cmp != second_cmp
167+
}
168+
169+
fn key_bounds(&self, line: &[u8]) -> (usize, usize) {
170+
let mut start = self.skip_fields_offset(line);
171+
if let Some(skip_bytes) = self.slice_start {
172+
start = start.saturating_add(skip_bytes).min(line.len());
173+
}
174+
175+
let end = self.key_end_index(line, start);
176+
(start, end)
177+
}
178+
179+
fn skip_fields_offset(&self, line: &[u8]) -> usize {
117180
if let Some(skip_fields) = self.skip_fields {
118-
let mut line = line.iter();
119-
let mut line_after_skipped_field: Vec<u8>;
181+
let mut idx = 0;
120182
for _ in 0..skip_fields {
121-
if line.all(|u| u.is_ascii_whitespace()) {
122-
return Vec::new();
183+
while idx < line.len() && line[idx].is_ascii_whitespace() {
184+
idx += 1;
123185
}
124-
line_after_skipped_field = line
125-
.by_ref()
126-
.skip_while(|u| !u.is_ascii_whitespace())
127-
.copied()
128-
.collect::<Vec<u8>>();
129-
130-
if line_after_skipped_field.is_empty() {
131-
return Vec::new();
186+
if idx >= line.len() {
187+
return line.len();
188+
}
189+
while idx < line.len() && !line[idx].is_ascii_whitespace() {
190+
idx += 1;
191+
}
192+
if idx >= line.len() {
193+
return line.len();
132194
}
133-
line = line_after_skipped_field.iter();
134195
}
135-
line.copied().collect::<Vec<u8>>()
196+
idx
136197
} else {
137-
line.to_vec()
198+
0
138199
}
139200
}
140201

141-
fn get_line_terminator(&self) -> u8 {
142-
if self.zero_terminated { 0 } else { b'\n' }
202+
fn key_end_index(&self, line: &[u8], key_start: usize) -> usize {
203+
let remainder = &line[key_start..];
204+
match self.slice_stop {
205+
None => line.len(),
206+
Some(limit) => {
207+
if remainder.is_empty() {
208+
return key_start;
209+
}
210+
if let Ok(valid) = std::str::from_utf8(remainder) {
211+
let prefix_len = Self::char_prefix_len(valid, limit);
212+
key_start + prefix_len
213+
} else {
214+
key_start + remainder.len().min(limit)
215+
}
216+
}
217+
}
143218
}
144219

145-
fn cmp_keys(&self, first: &[u8], second: &[u8]) -> bool {
146-
self.cmp_key(first, |first_iter| {
147-
self.cmp_key(second, |second_iter| first_iter.ne(second_iter))
148-
})
220+
fn char_prefix_len(text: &str, limit: usize) -> usize {
221+
for (count, (idx, _)) in text.char_indices().enumerate() {
222+
if count == limit {
223+
return idx;
224+
}
225+
}
226+
text.len()
149227
}
150228

151-
fn cmp_key<F>(&self, line: &[u8], mut closure: F) -> bool
152-
where
153-
F: FnMut(&mut dyn Iterator<Item = char>) -> bool,
154-
{
155-
let fields_to_check = self.skip_fields(line);
156-
157-
// Skip self.slice_start bytes (if -s was used).
158-
// self.slice_start is how many characters to skip, but historically
159-
// uniq's `-s N` means "skip N *bytes*," so do that literally:
160-
let skip_bytes = self.slice_start.unwrap_or(0);
161-
let fields_to_check = if skip_bytes < fields_to_check.len() {
162-
&fields_to_check[skip_bytes..]
163-
} else {
164-
// If skipping beyond end-of-line, leftover is empty => effectively ""
165-
&[]
166-
};
167-
168-
// Convert the leftover bytes to UTF-8 for character-based -w
169-
// If invalid UTF-8, just compare them as individual bytes (fallback).
170-
let Ok(string_after_skip) = std::str::from_utf8(fields_to_check) else {
171-
// Fallback: if invalid UTF-8, treat them as single-byte "chars"
172-
return closure(&mut fields_to_check.iter().map(|&b| b as char));
173-
};
174-
175-
let total_chars = string_after_skip.chars().count();
176-
177-
// `-w N` => Compare no more than N characters
178-
let slice_stop = self.slice_stop.unwrap_or(total_chars);
179-
let slice_start = slice_stop.min(total_chars);
229+
fn build_meta(&self, line: &[u8], meta: &mut LineMeta) {
230+
let (key_start, key_end) = self.key_bounds(line);
231+
meta.key_start = key_start;
232+
meta.key_end = key_end;
233+
234+
if self.ignore_case && key_start < key_end {
235+
let slice = &line[key_start..key_end];
236+
if slice.iter().any(|b| b.is_ascii_uppercase()) {
237+
meta.lowercase.clear();
238+
meta.lowercase.reserve(slice.len());
239+
meta.lowercase
240+
.extend(slice.iter().map(|b| b.to_ascii_lowercase()));
241+
meta.use_lowercase = true;
242+
return;
243+
}
244+
}
180245

181-
let mut iter = string_after_skip.chars().take(slice_start);
246+
meta.use_lowercase = false;
247+
}
182248

183-
if self.ignore_case {
184-
// We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
185-
closure(&mut iter.map(|c| c.to_ascii_lowercase()))
186-
} else {
187-
closure(&mut iter)
249+
fn read_line(
250+
reader: &mut impl BufRead,
251+
buffer: &mut Vec<u8>,
252+
line_terminator: u8,
253+
) -> UResult<bool> {
254+
buffer.clear();
255+
let bytes_read = reader
256+
.read_until(line_terminator, buffer)
257+
.map_err_context(|| translate!("uniq-error-read-error"))?;
258+
if bytes_read == 0 {
259+
return Ok(false);
188260
}
261+
if buffer.last().is_some_and(|last| *last == line_terminator) {
262+
buffer.pop();
263+
}
264+
Ok(true)
189265
}
190266

191267
fn should_print_delimiter(&self, group_count: usize, first_line_printed: bool) -> bool {
@@ -214,22 +290,59 @@ impl Uniq {
214290
write_line_terminator!(writer, line_terminator)?;
215291
}
216292

293+
let mut count_buf = [0u8; Self::COUNT_PREFIX_BUF_SIZE];
294+
217295
if self.show_counts {
218-
let prefix = format!("{count:7} ");
219-
let out = prefix
220-
.as_bytes()
221-
.iter()
222-
.chain(line.iter())
223-
.copied()
224-
.collect::<Vec<u8>>();
225-
writer.write_all(out.as_slice())
226-
} else {
227-
writer.write_all(line)
296+
// Call the associated function (no &self) after the refactor above.
297+
let prefix = Self::build_count_prefix(count, &mut count_buf);
298+
writer
299+
.write_all(prefix)
300+
.map_err_context(|| translate!("uniq-error-write-error"))?;
228301
}
229-
.map_err_context(|| translate!("uniq-error-write-error"))?;
302+
303+
writer
304+
.write_all(line)
305+
.map_err_context(|| translate!("uniq-error-write-error"))?;
230306

231307
write_line_terminator!(writer, line_terminator)
232308
}
309+
310+
const COUNT_PREFIX_WIDTH: usize = 7;
311+
const COUNT_PREFIX_BUF_SIZE: usize = 32;
312+
313+
// This function does not use `self`, so make it an associated function.
314+
// Also remove needless explicit lifetimes to satisfy clippy::needless-lifetimes.
315+
fn build_count_prefix(count: usize, buf: &mut [u8; Self::COUNT_PREFIX_BUF_SIZE]) -> &[u8] {
316+
let mut digits_buf = [0u8; 20];
317+
let mut value = count;
318+
let mut idx = digits_buf.len();
319+
320+
if value == 0 {
321+
idx -= 1;
322+
digits_buf[idx] = b'0';
323+
} else {
324+
while value > 0 {
325+
idx -= 1;
326+
digits_buf[idx] = b'0' + (value % 10) as u8;
327+
value /= 10;
328+
}
329+
}
330+
331+
let digits = &digits_buf[idx..];
332+
let width = Self::COUNT_PREFIX_WIDTH;
333+
334+
if digits.len() <= width {
335+
let pad = width - digits.len();
336+
buf[..pad].fill(b' ');
337+
buf[pad..pad + digits.len()].copy_from_slice(digits);
338+
buf[width] = b' ';
339+
&buf[..=width]
340+
} else {
341+
buf[..digits.len()].copy_from_slice(digits);
342+
buf[digits.len()] = b' ';
343+
&buf[..=digits.len()]
344+
}
345+
}
233346
}
234347

235348
fn opt_parsed(opt_name: &str, matches: &ArgMatches) -> UResult<Option<usize>> {
@@ -741,8 +854,11 @@ fn open_output_file(out_file_name: Option<&OsStr>) -> UResult<Box<dyn Write>> {
741854
let out_file = File::create(path).map_err_context(
742855
|| translate!("uniq-error-could-not-open", "path" => path.maybe_quote()),
743856
)?;
744-
Box::new(BufWriter::new(out_file))
857+
Box::new(BufWriter::with_capacity(OUTPUT_BUFFER_CAPACITY, out_file))
745858
}
746-
_ => Box::new(stdout().lock()),
859+
_ => Box::new(BufWriter::with_capacity(
860+
OUTPUT_BUFFER_CAPACITY,
861+
stdout().lock(),
862+
)),
747863
})
748864
}

0 commit comments

Comments
 (0)