@@ -42,6 +42,8 @@ enum Delimiters {
42
42
None ,
43
43
}
44
44
45
// Capacity in bytes (128 KiB) passed to `BufWriter::with_capacity` for the
// writers created in `open_output_file` (both the output file and stdout).
+ const OUTPUT_BUFFER_CAPACITY : usize = 128 * 1024 ;
46
+
45
47
struct Uniq {
46
48
repeats_only : bool ,
47
49
uniques_only : bool ,
@@ -55,6 +57,14 @@ struct Uniq {
55
57
zero_terminated : bool ,
56
58
}
57
59
60
/// Per-line comparison data, computed once per input line by `build_meta`
/// and reused across lines so the `lowercase` buffer is not reallocated.
#[derive(Debug, Default)]
struct LineMeta {
    /// Byte offset into the line where the comparison key starts.
    key_start: usize,
    /// Byte offset into the line where the comparison key ends (exclusive).
    key_end: usize,
    /// ASCII-lowercased copy of the key. Only refreshed when `use_lowercase`
    /// is set for the current line; otherwise it may hold stale bytes from an
    /// earlier line and must not be read.
    lowercase: Vec<u8>,
    /// Whether `lowercase` holds the key of the current line and should be
    /// compared instead of the raw key bytes.
    use_lowercase: bool,
}
67
+
58
68
macro_rules! write_line_terminator {
59
69
( $writer: expr, $line_terminator: expr) => {
60
70
$writer
@@ -64,42 +74,53 @@ macro_rules! write_line_terminator {
64
74
}
65
75
66
76
impl Uniq {
67
- pub fn print_uniq ( & self , reader : impl BufRead , mut writer : impl Write ) -> UResult < ( ) > {
77
+ pub fn print_uniq ( & self , mut reader : impl BufRead , mut writer : impl Write ) -> UResult < ( ) > {
68
78
let mut first_line_printed = false ;
69
79
let mut group_count = 1 ;
70
80
let line_terminator = self . get_line_terminator ( ) ;
71
- let mut lines = reader. split ( line_terminator) ;
72
- let mut line = match lines. next ( ) {
73
- Some ( l) => l?,
74
- None => return Ok ( ( ) ) ,
75
- } ;
76
-
77
81
let writer = & mut writer;
78
82
79
- // compare current `line` with consecutive lines (`next_line`) of the input
80
- // and if needed, print `line` based on the command line options provided
81
- for next_line in lines {
82
- let next_line = next_line?;
83
- if self . cmp_keys ( & line, & next_line) {
83
+ let mut current_buf = Vec :: with_capacity ( 1024 ) ;
84
+ if !Self :: read_line ( & mut reader, & mut current_buf, line_terminator) ? {
85
+ return Ok ( ( ) ) ;
86
+ }
87
+ let mut current_meta = LineMeta :: default ( ) ;
88
+ self . build_meta ( & current_buf, & mut current_meta) ;
89
+
90
+ let mut next_buf = Vec :: with_capacity ( 1024 ) ;
91
+ let mut next_meta = LineMeta :: default ( ) ;
92
+
93
+ loop {
94
+ if !Self :: read_line ( & mut reader, & mut next_buf, line_terminator) ? {
95
+ break ;
96
+ }
97
+
98
+ self . build_meta ( & next_buf, & mut next_meta) ;
99
+
100
+ if self . keys_differ ( & current_buf, & current_meta, & next_buf, & next_meta) {
84
101
if ( group_count == 1 && !self . repeats_only )
85
102
|| ( group_count > 1 && !self . uniques_only )
86
103
{
87
- self . print_line ( writer, & line , group_count, first_line_printed) ?;
104
+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
88
105
first_line_printed = true ;
89
106
}
90
- line = next_line;
107
+ std:: mem:: swap ( & mut current_buf, & mut next_buf) ;
108
+ std:: mem:: swap ( & mut current_meta, & mut next_meta) ;
91
109
group_count = 1 ;
92
110
} else {
93
111
if self . all_repeated {
94
- self . print_line ( writer, & line , group_count, first_line_printed) ?;
112
+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
95
113
first_line_printed = true ;
96
- line = next_line;
114
+ std:: mem:: swap ( & mut current_buf, & mut next_buf) ;
115
+ std:: mem:: swap ( & mut current_meta, & mut next_meta) ;
97
116
}
98
117
group_count += 1 ;
99
118
}
119
+ next_buf. clear ( ) ;
100
120
}
121
+
101
122
if ( group_count == 1 && !self . repeats_only ) || ( group_count > 1 && !self . uniques_only ) {
102
- self . print_line ( writer, & line , group_count, first_line_printed) ?;
123
+ self . print_line ( writer, & current_buf , group_count, first_line_printed) ?;
103
124
first_line_printed = true ;
104
125
}
105
126
if ( self . delimiters == Delimiters :: Append || self . delimiters == Delimiters :: Both )
@@ -113,79 +134,134 @@ impl Uniq {
113
134
Ok ( ( ) )
114
135
}
115
136
116
- fn skip_fields ( & self , line : & [ u8 ] ) -> Vec < u8 > {
137
+ fn get_line_terminator ( & self ) -> u8 {
138
+ if self . zero_terminated { 0 } else { b'\n' }
139
+ }
140
+
141
+ fn keys_differ (
142
+ & self ,
143
+ first_line : & [ u8 ] ,
144
+ first_meta : & LineMeta ,
145
+ second_line : & [ u8 ] ,
146
+ second_meta : & LineMeta ,
147
+ ) -> bool {
148
+ let first_slice = & first_line[ first_meta. key_start ..first_meta. key_end ] ;
149
+ let second_slice = & second_line[ second_meta. key_start ..second_meta. key_end ] ;
150
+
151
+ if !self . ignore_case {
152
+ return first_slice != second_slice;
153
+ }
154
+
155
+ let first_cmp = if first_meta. use_lowercase {
156
+ first_meta. lowercase . as_slice ( )
157
+ } else {
158
+ first_slice
159
+ } ;
160
+ let second_cmp = if second_meta. use_lowercase {
161
+ second_meta. lowercase . as_slice ( )
162
+ } else {
163
+ second_slice
164
+ } ;
165
+
166
+ first_cmp != second_cmp
167
+ }
168
+
169
+ fn key_bounds ( & self , line : & [ u8 ] ) -> ( usize , usize ) {
170
+ let mut start = self . skip_fields_offset ( line) ;
171
+ if let Some ( skip_bytes) = self . slice_start {
172
+ start = start. saturating_add ( skip_bytes) . min ( line. len ( ) ) ;
173
+ }
174
+
175
+ let end = self . key_end_index ( line, start) ;
176
+ ( start, end)
177
+ }
178
+
179
+ fn skip_fields_offset ( & self , line : & [ u8 ] ) -> usize {
117
180
if let Some ( skip_fields) = self . skip_fields {
118
- let mut line = line. iter ( ) ;
119
- let mut line_after_skipped_field: Vec < u8 > ;
181
+ let mut idx = 0 ;
120
182
for _ in 0 ..skip_fields {
121
- if line. all ( |u| u . is_ascii_whitespace ( ) ) {
122
- return Vec :: new ( ) ;
183
+ while idx < line. len ( ) && line [ idx ] . is_ascii_whitespace ( ) {
184
+ idx += 1 ;
123
185
}
124
- line_after_skipped_field = line
125
- . by_ref ( )
126
- . skip_while ( |u| !u . is_ascii_whitespace ( ) )
127
- . copied ( )
128
- . collect :: < Vec < u8 > > ( ) ;
129
-
130
- if line_after_skipped_field . is_empty ( ) {
131
- return Vec :: new ( ) ;
186
+ if idx > = line. len ( ) {
187
+ return line . len ( ) ;
188
+ }
189
+ while idx < line . len ( ) && !line [ idx ] . is_ascii_whitespace ( ) {
190
+ idx += 1 ;
191
+ }
192
+ if idx >= line . len ( ) {
193
+ return line . len ( ) ;
132
194
}
133
- line = line_after_skipped_field. iter ( ) ;
134
195
}
135
- line . copied ( ) . collect :: < Vec < u8 > > ( )
196
+ idx
136
197
} else {
137
- line . to_vec ( )
198
+ 0
138
199
}
139
200
}
140
201
141
- fn get_line_terminator ( & self ) -> u8 {
142
- if self . zero_terminated { 0 } else { b'\n' }
202
+ fn key_end_index ( & self , line : & [ u8 ] , key_start : usize ) -> usize {
203
+ let remainder = & line[ key_start..] ;
204
+ match self . slice_stop {
205
+ None => line. len ( ) ,
206
+ Some ( limit) => {
207
+ if remainder. is_empty ( ) {
208
+ return key_start;
209
+ }
210
+ if let Ok ( valid) = std:: str:: from_utf8 ( remainder) {
211
+ let prefix_len = Self :: char_prefix_len ( valid, limit) ;
212
+ key_start + prefix_len
213
+ } else {
214
+ key_start + remainder. len ( ) . min ( limit)
215
+ }
216
+ }
217
+ }
143
218
}
144
219
145
- fn cmp_keys ( & self , first : & [ u8 ] , second : & [ u8 ] ) -> bool {
146
- self . cmp_key ( first, |first_iter| {
147
- self . cmp_key ( second, |second_iter| first_iter. ne ( second_iter) )
148
- } )
220
/// Returns the length in bytes of the first `limit` characters of `text`.
///
/// If `text` has `limit` characters or fewer, the whole length of `text` is
/// returned.
fn char_prefix_len(text: &str, limit: usize) -> usize {
    // The character at index `limit` (if any) starts exactly where the
    // `limit`-character prefix ends.
    text.char_indices()
        .nth(limit)
        .map_or(text.len(), |(byte_idx, _)| byte_idx)
}
150
228
151
- fn cmp_key < F > ( & self , line : & [ u8 ] , mut closure : F ) -> bool
152
- where
153
- F : FnMut ( & mut dyn Iterator < Item = char > ) -> bool ,
154
- {
155
- let fields_to_check = self . skip_fields ( line) ;
156
-
157
- // Skip self.slice_start bytes (if -s was used).
158
- // self.slice_start is how many characters to skip, but historically
159
- // uniq's `-s N` means "skip N *bytes*," so do that literally:
160
- let skip_bytes = self . slice_start . unwrap_or ( 0 ) ;
161
- let fields_to_check = if skip_bytes < fields_to_check. len ( ) {
162
- & fields_to_check[ skip_bytes..]
163
- } else {
164
- // If skipping beyond end-of-line, leftover is empty => effectively ""
165
- & [ ]
166
- } ;
167
-
168
- // Convert the leftover bytes to UTF-8 for character-based -w
169
- // If invalid UTF-8, just compare them as individual bytes (fallback).
170
- let Ok ( string_after_skip) = std:: str:: from_utf8 ( fields_to_check) else {
171
- // Fallback: if invalid UTF-8, treat them as single-byte "chars"
172
- return closure ( & mut fields_to_check. iter ( ) . map ( |& b| b as char ) ) ;
173
- } ;
174
-
175
- let total_chars = string_after_skip. chars ( ) . count ( ) ;
176
-
177
- // `-w N` => Compare no more than N characters
178
- let slice_stop = self . slice_stop . unwrap_or ( total_chars) ;
179
- let slice_start = slice_stop. min ( total_chars) ;
229
+ fn build_meta ( & self , line : & [ u8 ] , meta : & mut LineMeta ) {
230
+ let ( key_start, key_end) = self . key_bounds ( line) ;
231
+ meta. key_start = key_start;
232
+ meta. key_end = key_end;
233
+
234
+ if self . ignore_case && key_start < key_end {
235
+ let slice = & line[ key_start..key_end] ;
236
+ if slice. iter ( ) . any ( |b| b. is_ascii_uppercase ( ) ) {
237
+ meta. lowercase . clear ( ) ;
238
+ meta. lowercase . reserve ( slice. len ( ) ) ;
239
+ meta. lowercase
240
+ . extend ( slice. iter ( ) . map ( |b| b. to_ascii_lowercase ( ) ) ) ;
241
+ meta. use_lowercase = true ;
242
+ return ;
243
+ }
244
+ }
180
245
181
- let mut iter = string_after_skip. chars ( ) . take ( slice_start) ;
246
+ meta. use_lowercase = false ;
247
+ }
182
248
183
- if self . ignore_case {
184
- // We can do ASCII-lowercase or full Unicode-lowercase. For minimal changes, do ASCII:
185
- closure ( & mut iter. map ( |c| c. to_ascii_lowercase ( ) ) )
186
- } else {
187
- closure ( & mut iter)
249
+ fn read_line (
250
+ reader : & mut impl BufRead ,
251
+ buffer : & mut Vec < u8 > ,
252
+ line_terminator : u8 ,
253
+ ) -> UResult < bool > {
254
+ buffer. clear ( ) ;
255
+ let bytes_read = reader
256
+ . read_until ( line_terminator, buffer)
257
+ . map_err_context ( || translate ! ( "uniq-error-read-error" ) ) ?;
258
+ if bytes_read == 0 {
259
+ return Ok ( false ) ;
188
260
}
261
+ if buffer. last ( ) . is_some_and ( |last| * last == line_terminator) {
262
+ buffer. pop ( ) ;
263
+ }
264
+ Ok ( true )
189
265
}
190
266
191
267
fn should_print_delimiter ( & self , group_count : usize , first_line_printed : bool ) -> bool {
@@ -214,22 +290,59 @@ impl Uniq {
214
290
write_line_terminator ! ( writer, line_terminator) ?;
215
291
}
216
292
293
+ let mut count_buf = [ 0u8 ; Self :: COUNT_PREFIX_BUF_SIZE ] ;
294
+
217
295
if self . show_counts {
218
- let prefix = format ! ( "{count:7} " ) ;
219
- let out = prefix
220
- . as_bytes ( )
221
- . iter ( )
222
- . chain ( line. iter ( ) )
223
- . copied ( )
224
- . collect :: < Vec < u8 > > ( ) ;
225
- writer. write_all ( out. as_slice ( ) )
226
- } else {
227
- writer. write_all ( line)
296
+ // Call the associated function (no &self) after the refactor above.
297
+ let prefix = Self :: build_count_prefix ( count, & mut count_buf) ;
298
+ writer
299
+ . write_all ( prefix)
300
+ . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
228
301
}
229
- . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
302
+
303
+ writer
304
+ . write_all ( line)
305
+ . map_err_context ( || translate ! ( "uniq-error-write-error" ) ) ?;
230
306
231
307
write_line_terminator ! ( writer, line_terminator)
232
308
}
309
+
310
+ const COUNT_PREFIX_WIDTH : usize = 7 ;
311
+ const COUNT_PREFIX_BUF_SIZE : usize = 32 ;
312
+
313
+ // This function does not use `self`, so make it an associated function.
314
+ // Also remove needless explicit lifetimes to satisfy clippy::needless-lifetimes.
315
+ fn build_count_prefix ( count : usize , buf : & mut [ u8 ; Self :: COUNT_PREFIX_BUF_SIZE ] ) -> & [ u8 ] {
316
+ let mut digits_buf = [ 0u8 ; 20 ] ;
317
+ let mut value = count;
318
+ let mut idx = digits_buf. len ( ) ;
319
+
320
+ if value == 0 {
321
+ idx -= 1 ;
322
+ digits_buf[ idx] = b'0' ;
323
+ } else {
324
+ while value > 0 {
325
+ idx -= 1 ;
326
+ digits_buf[ idx] = b'0' + ( value % 10 ) as u8 ;
327
+ value /= 10 ;
328
+ }
329
+ }
330
+
331
+ let digits = & digits_buf[ idx..] ;
332
+ let width = Self :: COUNT_PREFIX_WIDTH ;
333
+
334
+ if digits. len ( ) <= width {
335
+ let pad = width - digits. len ( ) ;
336
+ buf[ ..pad] . fill ( b' ' ) ;
337
+ buf[ pad..pad + digits. len ( ) ] . copy_from_slice ( digits) ;
338
+ buf[ width] = b' ' ;
339
+ & buf[ ..=width]
340
+ } else {
341
+ buf[ ..digits. len ( ) ] . copy_from_slice ( digits) ;
342
+ buf[ digits. len ( ) ] = b' ' ;
343
+ & buf[ ..=digits. len ( ) ]
344
+ }
345
+ }
233
346
}
234
347
235
348
fn opt_parsed ( opt_name : & str , matches : & ArgMatches ) -> UResult < Option < usize > > {
@@ -741,8 +854,11 @@ fn open_output_file(out_file_name: Option<&OsStr>) -> UResult<Box<dyn Write>> {
741
854
let out_file = File :: create ( path) . map_err_context (
742
855
|| translate ! ( "uniq-error-could-not-open" , "path" => path. maybe_quote( ) ) ,
743
856
) ?;
744
- Box :: new ( BufWriter :: new ( out_file) )
857
+ Box :: new ( BufWriter :: with_capacity ( OUTPUT_BUFFER_CAPACITY , out_file) )
745
858
}
746
- _ => Box :: new ( stdout ( ) . lock ( ) ) ,
859
+ _ => Box :: new ( BufWriter :: with_capacity (
860
+ OUTPUT_BUFFER_CAPACITY ,
861
+ stdout ( ) . lock ( ) ,
862
+ ) ) ,
747
863
} )
748
864
}
0 commit comments