@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
743743
/// Mutable cursor over the characters of a query string, tracking both the
/// human-readable position (line/column) and the raw byte offset into the
/// original source.
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    /// The original source string being tokenized; identifier/literal slices
    /// are borrowed from here to avoid per-token allocation.
    source: &'a str,
    /// Current line number.
    pub line: u64,
    /// Current column number.
    pub col: u64,
    /// Byte offset into `source` of the next character to be consumed
    /// (kept in sync by `next()`, since UTF-8 chars can span several bytes).
    pub byte_pos: usize,
}
749753
750754impl State < ' _ > {
@@ -759,6 +763,8 @@ impl State<'_> {
759763 } else {
760764 self . col += 1 ;
761765 }
766+ // Update byte position (characters can be multi-byte in UTF-8)
767+ self . byte_pos += s. len_utf8 ( ) ;
762768 Some ( s)
763769 }
764770 }
@@ -769,6 +775,13 @@ impl State<'_> {
769775 self . peekable . peek ( )
770776 }
771777
778+ /// return the character after the next character (lookahead by 2) without advancing the stream
779+ pub fn peek_next ( & mut self ) -> Option < char > {
780+ let mut clone = self . peekable . clone ( ) ;
781+ clone. next ( ) ; // skip the current peeked char
782+ clone. next ( ) // get the next one
783+ }
784+
772785 pub fn location ( & self ) -> Location {
773786 Location {
774787 line : self . line ,
@@ -893,8 +906,10 @@ impl<'a> Tokenizer<'a> {
893906 ) -> Result < ( ) , TokenizerError > {
894907 let mut state = State {
895908 peekable : self . query . chars ( ) . peekable ( ) ,
909+ source : self . query ,
896910 line : 1 ,
897911 col : 1 ,
912+ byte_pos : 0 ,
898913 } ;
899914
900915 let mut location = state. location ( ) ;
@@ -912,18 +927,21 @@ impl<'a> Tokenizer<'a> {
912927 fn tokenize_identifier_or_keyword (
913928 & self ,
914929 ch : impl IntoIterator < Item = char > ,
915- chars : & mut State ,
930+ chars : & mut State < ' a > ,
916931 ) -> Result < Option < Token > , TokenizerError > {
917932 chars. next ( ) ; // consume the first char
918- let ch: String = ch. into_iter ( ) . collect ( ) ;
919- let word = self . tokenize_word ( ch, chars) ;
933+ // Calculate total byte length without allocating a String
934+ let consumed_byte_len: usize = ch. into_iter ( ) . map ( |c| c. len_utf8 ( ) ) . sum ( ) ;
935+ let word = self . tokenize_word ( consumed_byte_len, chars) ;
920936
921937 // TODO: implement parsing of exponent here
922938 if word. chars ( ) . all ( |x| x. is_ascii_digit ( ) || x == '.' ) {
923939 let mut inner_state = State {
924940 peekable : word. chars ( ) . peekable ( ) ,
941+ source : & word,
925942 line : 0 ,
926943 col : 0 ,
944+ byte_pos : 0 ,
927945 } ;
928946 let mut s = peeking_take_while ( & mut inner_state, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
929947 let s2 = peeking_take_while ( chars, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
@@ -937,7 +955,7 @@ impl<'a> Tokenizer<'a> {
937955 /// Get the next token or return None
938956 fn next_token (
939957 & self ,
940- chars : & mut State ,
958+ chars : & mut State < ' a > ,
941959 prev_token : Option < & Token > ,
942960 ) -> Result < Option < Token > , TokenizerError > {
943961 match chars. peek ( ) {
@@ -988,7 +1006,7 @@ impl<'a> Tokenizer<'a> {
9881006 }
9891007 _ => {
9901008 // regular identifier starting with an "b" or "B"
991- let s = self . tokenize_word ( b, chars) ;
1009+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
9921010 Ok ( Some ( Token :: make_word ( & s, None ) ) )
9931011 }
9941012 }
@@ -1015,7 +1033,7 @@ impl<'a> Tokenizer<'a> {
10151033 ) ,
10161034 _ => {
10171035 // regular identifier starting with an "r" or "R"
1018- let s = self . tokenize_word ( b, chars) ;
1036+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
10191037 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10201038 }
10211039 }
@@ -1034,7 +1052,7 @@ impl<'a> Tokenizer<'a> {
10341052 }
10351053 _ => {
10361054 // regular identifier starting with an "N"
1037- let s = self . tokenize_word ( n, chars) ;
1055+ let s = self . tokenize_word ( n. len_utf8 ( ) , chars) ;
10381056 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10391057 }
10401058 }
@@ -1051,7 +1069,7 @@ impl<'a> Tokenizer<'a> {
10511069 }
10521070 _ => {
10531071 // regular identifier starting with an "E" or "e"
1054- let s = self . tokenize_word ( x, chars) ;
1072+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10551073 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10561074 }
10571075 }
@@ -1070,7 +1088,7 @@ impl<'a> Tokenizer<'a> {
10701088 }
10711089 }
10721090 // regular identifier starting with an "U" or "u"
1073- let s = self . tokenize_word ( x, chars) ;
1091+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10741092 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10751093 }
10761094 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1103,7 @@ impl<'a> Tokenizer<'a> {
10851103 }
10861104 _ => {
10871105 // regular identifier starting with an "X"
1088- let s = self . tokenize_word ( x, chars) ;
1106+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10891107 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10901108 }
10911109 }
@@ -1876,13 +1894,29 @@ impl<'a> Tokenizer<'a> {
18761894 comment
18771895 }
18781896
1879- /// Tokenize an identifier or keyword, after the first char is already consumed.
1880- fn tokenize_word ( & self , first_chars : impl Into < String > , chars : & mut State ) -> String {
1881- let mut s = first_chars. into ( ) ;
1882- s. push_str ( & peeking_take_while ( chars, |ch| {
1883- self . dialect . is_identifier_part ( ch)
1884- } ) ) ;
1885- s
1897+ /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
1898+ /// `consumed_byte_len` is the byte length of the consumed character(s).
1899+ fn tokenize_word ( & self , consumed_byte_len : usize , chars : & mut State < ' a > ) -> String {
1900+ // Calculate where the first character started
1901+ let first_char_byte_pos = chars. byte_pos - consumed_byte_len;
1902+
1903+ // Use the zero-copy version and convert to String
1904+ self . tokenize_word_borrowed ( first_char_byte_pos, chars) . to_string ( )
1905+ }
1906+
1907+ /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
1908+ /// The first character position must be provided (before it was consumed).
1909+ /// Returns a slice with the same lifetime as the State's source.
1910+ fn tokenize_word_borrowed (
1911+ & self ,
1912+ first_char_byte_pos : usize ,
1913+ chars : & mut State < ' a > ,
1914+ ) -> & ' a str {
1915+ // Consume the rest of the word
1916+ borrow_slice_until ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1917+
1918+ // Return a slice from the first char to the current position
1919+ & chars. source [ first_char_byte_pos..chars. byte_pos ]
18861920 }
18871921
18881922 /// Read a quoted identifier
@@ -2176,35 +2210,82 @@ impl<'a> Tokenizer<'a> {
21762210/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772211/// Return the characters read as String, and keep the first non-matching
21782212/// char available as `chars.next()`.
2179- fn peeking_take_while ( chars : & mut State , mut predicate : impl FnMut ( char ) -> bool ) -> String {
2180- let mut s = String :: new ( ) ;
2213+ fn peeking_take_while ( chars : & mut State , predicate : impl FnMut ( char ) -> bool ) -> String {
2214+ borrow_slice_until ( chars, predicate) . to_string ( )
2215+ }
2216+
2217+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2218+ ///
2219+ /// # Arguments
2220+ /// * `chars` - The character iterator state (contains reference to original source)
2221+ /// * `predicate` - Function that returns true while we should continue taking characters
2222+ ///
2223+ /// # Returns
2224+ /// A borrowed slice of the source string containing the matched characters
2225+ fn borrow_slice_until < ' a > (
2226+ chars : & mut State < ' a > ,
2227+ mut predicate : impl FnMut ( char ) -> bool ,
2228+ ) -> & ' a str {
2229+ // Record the starting byte position
2230+ let start_pos = chars. byte_pos ;
2231+
2232+ // Consume characters while predicate is true
21812233 while let Some ( & ch) = chars. peek ( ) {
21822234 if predicate ( ch) {
2183- chars. next ( ) ; // consume
2184- s. push ( ch) ;
2235+ chars. next ( ) ; // consume (this updates byte_pos)
21852236 } else {
21862237 break ;
21872238 }
21882239 }
2189- s
2240+
2241+ // Get the ending byte position
2242+ let end_pos = chars. byte_pos ;
2243+
2244+ // Return the slice from the original source
2245+ & chars. source [ start_pos..end_pos]
21902246}
21912247
2192- /// Same as peeking_take_while, but also passes the next character to the predicate.
2193- fn peeking_next_take_while (
2194- chars : & mut State ,
2248+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2249+ /// This version also passes the next character to the predicate for lookahead.
2250+ /// This is a zero-copy version of `peeking_next_take_while`.
2251+ ///
2252+ /// # Arguments
2253+ /// * `chars` - The character iterator state (contains reference to original source)
2254+ /// * `predicate` - Function that returns true while we should continue taking characters.
2255+ /// Takes current char and optional next char for lookahead.
2256+ ///
2257+ /// # Returns
2258+ /// A borrowed slice of the source string containing the matched characters
2259+ fn borrow_slice_until_next < ' a > (
2260+ chars : & mut State < ' a > ,
21952261 mut predicate : impl FnMut ( char , Option < char > ) -> bool ,
2196- ) -> String {
2197- let mut s = String :: new ( ) ;
2262+ ) -> & ' a str {
2263+ // Record the starting byte position
2264+ let start_pos = chars. byte_pos ;
2265+
2266+ // Consume characters while predicate is true
21982267 while let Some ( & ch) = chars. peek ( ) {
2199- let next_char = chars. peekable . clone ( ) . nth ( 1 ) ;
2268+ let next_char = chars. peek_next ( ) ;
22002269 if predicate ( ch, next_char) {
2201- chars. next ( ) ; // consume
2202- s. push ( ch) ;
2270+ chars. next ( ) ; // consume (this updates byte_pos)
22032271 } else {
22042272 break ;
22052273 }
22062274 }
2207- s
2275+
2276+ // Get the ending byte position
2277+ let end_pos = chars. byte_pos ;
2278+
2279+ // Return the slice from the original source
2280+ & chars. source [ start_pos..end_pos]
2281+ }
2282+
2283+ /// Same as peeking_take_while, but also passes the next character to the predicate.
2284+ fn peeking_next_take_while (
2285+ chars : & mut State ,
2286+ predicate : impl FnMut ( char , Option < char > ) -> bool ,
2287+ ) -> String {
2288+ borrow_slice_until_next ( chars, predicate) . to_string ( )
22082289}
22092290
22102291fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
@@ -3496,8 +3577,10 @@ mod tests {
34963577 let s = format ! ( "'{s}'" ) ;
34973578 let mut state = State {
34983579 peekable : s. chars ( ) . peekable ( ) ,
3580+ source : & s,
34993581 line : 0 ,
35003582 col : 0 ,
3583+ byte_pos : 0 ,
35013584 } ;
35023585
35033586 assert_eq ! (
0 commit comments