Commit 2780c15

eyalsatori authored and eyalleshem committed
Prepare tokenizer for using borrowed strings instead of allocations.

Key points for this commit:
- The `Peekable` iterator alone isn't sufficient for working with string slices: creating a slice requires the start/end byte indexes, so the current byte position was added to the `State` struct. (Note: in the long term we could potentially drop `Peekable` and use only the current position as an iterator.)
- Created internal functions that build slices of the original query instead of allocating strings, then converted these functions to return `String` to maintain compatibility. The idea is to keep the commit small and reviewable, without changing the `Token` struct or the parser.
1 parent c8531d4 · commit 2780c15
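
To make the commit message concrete, here is a minimal, self-contained sketch (not code from this commit; all names are hypothetical) of the core idea: track a byte offset while consuming characters, then slice the original query rather than collecting into a fresh String.

    fn main() {
        let query = "SELECT café FROM t";
        let start = 7; // byte offset where the identifier begins

        // Advance a byte offset the same way the tokenizer's State now does.
        let mut end = start;
        for ch in query[start..].chars() {
            if !ch.is_alphanumeric() {
                break;
            }
            end += ch.len_utf8(); // 'é' advances by 2 bytes, not 1
        }

        // Zero-copy: a slice of the original query, no allocation.
        let word: &str = &query[start..end];
        assert_eq!(word, "café");
    }
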

File tree

1 file changed (+114, -31 lines)


src/tokenizer.rs

Lines changed: 114 additions & 31 deletions
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}

 struct State<'a> {
     peekable: Peekable<Chars<'a>>,
+    /// Reference to the original source string being tokenized
+    source: &'a str,
     pub line: u64,
     pub col: u64,
+    /// Byte position in the source string
+    pub byte_pos: usize,
 }

 impl State<'_> {
@@ -759,6 +763,8 @@ impl State<'_> {
         } else {
             self.col += 1;
         }
+        // Update byte position (characters can be multi-byte in UTF-8)
+        self.byte_pos += s.len_utf8();
         Some(s)
     }
 }
@@ -769,6 +775,13 @@ impl State<'_> {
         self.peekable.peek()
     }

+    /// Return the character after the next character (lookahead by 2) without advancing the stream.
+    pub fn peek_next(&mut self) -> Option<char> {
+        let mut clone = self.peekable.clone();
+        clone.next(); // skip the current peeked char
+        clone.next() // get the next one
+    }
+
     pub fn location(&self) -> Location {
         Location {
             line: self.line,
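
The new `peek_next` relies on `Peekable<Chars>` being cheap to clone (it clones the underlying `Chars` iterator, essentially a pair of pointers). A freestanding sketch of the same two-character lookahead:

    fn main() {
        let mut it = "abc".chars().peekable();
        assert_eq!(it.peek(), Some(&'a'));

        // Clone the iterator to look two characters ahead...
        let mut lookahead = it.clone();
        lookahead.next(); // skip the char that `peek()` would return
        assert_eq!(lookahead.next(), Some('b'));

        // ...while the original stream has not advanced.
        assert_eq!(it.next(), Some('a'));
    }
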
@@ -893,8 +906,10 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<(), TokenizerError> {
         let mut state = State {
             peekable: self.query.chars().peekable(),
+            source: self.query,
             line: 1,
             col: 1,
+            byte_pos: 0,
         };

         let mut location = state.location();
@@ -912,18 +927,21 @@ impl<'a> Tokenizer<'a> {
     fn tokenize_identifier_or_keyword(
         &self,
         ch: impl IntoIterator<Item = char>,
-        chars: &mut State,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        // Calculate total byte length without allocating a String
+        let consumed_byte_len: usize = ch.into_iter().map(|c| c.len_utf8()).sum();
+        let word = self.tokenize_word(consumed_byte_len, chars);

         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
             let mut inner_state = State {
                 peekable: word.chars().peekable(),
+                source: &word,
                 line: 0,
                 col: 0,
+                byte_pos: 0,
             };
             let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
             let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +955,7 @@ impl<'a> Tokenizer<'a> {
     /// Get the next token or return None
     fn next_token(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
@@ -988,7 +1006,7 @@ impl<'a> Tokenizer<'a> {
                 }
                 _ => {
                     // regular identifier starting with an "b" or "B"
-                    let s = self.tokenize_word(b, chars);
+                    let s = self.tokenize_word(b.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1015,7 +1033,7 @@ impl<'a> Tokenizer<'a> {
                 ),
                 _ => {
                     // regular identifier starting with an "r" or "R"
-                    let s = self.tokenize_word(b, chars);
+                    let s = self.tokenize_word(b.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1034,7 +1052,7 @@ impl<'a> Tokenizer<'a> {
                 }
                 _ => {
                     // regular identifier starting with an "N"
-                    let s = self.tokenize_word(n, chars);
+                    let s = self.tokenize_word(n.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1051,7 +1069,7 @@ impl<'a> Tokenizer<'a> {
                 }
                 _ => {
                     // regular identifier starting with an "E" or "e"
-                    let s = self.tokenize_word(x, chars);
+                    let s = self.tokenize_word(x.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1070,7 +1088,7 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
                 // regular identifier starting with an "U" or "u"
-                let s = self.tokenize_word(x, chars);
+                let s = self.tokenize_word(x.len_utf8(), chars);
                 Ok(Some(Token::make_word(&s, None)))
             }
             // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1103,7 @@ impl<'a> Tokenizer<'a> {
                 }
                 _ => {
                     // regular identifier starting with an "X"
-                    let s = self.tokenize_word(x, chars);
+                    let s = self.tokenize_word(x.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
             }
@@ -1876,13 +1894,29 @@ impl<'a> Tokenizer<'a> {
         comment
     }

-    /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
-        let mut s = first_chars.into();
-        s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
-        }));
-        s
+    /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
+    /// `consumed_byte_len` is the byte length of the consumed character(s).
+    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+        // Calculate where the first character started
+        let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
+
+        // Use the zero-copy version and convert to String
+        self.tokenize_word_borrowed(first_char_byte_pos, chars).to_string()
+    }
+
+    /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
+    /// The first character position must be provided (before it was consumed).
+    /// Returns a slice with the same lifetime as the State's source.
+    fn tokenize_word_borrowed(
+        &self,
+        first_char_byte_pos: usize,
+        chars: &mut State<'a>,
+    ) -> &'a str {
+        // Consume the rest of the word
+        borrow_slice_until(chars, |ch| self.dialect.is_identifier_part(ch));
+
+        // Return a slice from the first char to the current position
+        &chars.source[first_char_byte_pos..chars.byte_pos]
     }

     /// Read a quoted identifier
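
The shape of `tokenize_word_borrowed`, reduced to a freestanding sketch (hypothetical names, with a simplified rule standing in for `dialect.is_identifier_part`): record where the word starts, consume to its end, and return one slice of the source.

    fn take_word<'a>(source: &'a str, start: usize) -> &'a str {
        let mut end = start;
        for ch in source[start..].chars() {
            // Stand-in for `dialect.is_identifier_part(ch)`.
            if !(ch.is_alphanumeric() || ch == '_') {
                break;
            }
            end += ch.len_utf8();
        }
        // One slice of the original source: no intermediate String.
        &source[start..end]
    }

    fn main() {
        assert_eq!(take_word("sélect_1 rest", 0), "sélect_1");
    }
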
@@ -2176,35 +2210,82 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
-    let mut s = String::new();
+fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
+    borrow_slice_until(chars, predicate).to_string()
+}
+
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (contains reference to original source)
+/// * `predicate` - Function that returns true while we should continue taking characters
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until<'a>(
+    chars: &mut State<'a>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
 }

-/// Same as peeking_take_while, but also passes the next character to the predicate.
-fn peeking_next_take_while(
-    chars: &mut State,
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// This version also passes the next character to the predicate for lookahead.
+/// This is a zero-copy version of `peeking_next_take_while`.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (contains reference to original source)
+/// * `predicate` - Function that returns true while we should continue taking characters.
+///   Takes current char and optional next char for lookahead.
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until_next<'a>(
+    chars: &mut State<'a>,
     mut predicate: impl FnMut(char, Option<char>) -> bool,
-) -> String {
-    let mut s = String::new();
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
-        let next_char = chars.peekable.clone().nth(1);
+        let next_char = chars.peek_next();
         if predicate(ch, next_char) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
+}
+
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    borrow_slice_until_next(chars, predicate).to_string()
 }

 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
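
The compatibility pattern used above, in miniature (hypothetical names, a sketch rather than the commit's code): the borrowing function does the real work, and the old String-returning signature survives as a `.to_string()` shim, so neither `Token` nor the parser needs to change in this commit.

    fn digits_borrowed<'a>(source: &'a str, pos: &mut usize) -> &'a str {
        let start = *pos;
        for ch in source[start..].chars() {
            if !ch.is_ascii_digit() {
                break;
            }
            *pos += ch.len_utf8();
        }
        &source[start..*pos]
    }

    // Thin wrapper keeping the old allocating signature, like `peeking_take_while`.
    fn digits_owned(source: &str, pos: &mut usize) -> String {
        digits_borrowed(source, pos).to_string()
    }

    fn main() {
        let mut pos = 0;
        assert_eq!(digits_owned("123abc", &mut pos), "123");
        assert_eq!(pos, 3);
    }
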
@@ -3496,8 +3577,10 @@ mod tests {
         let s = format!("'{s}'");
         let mut state = State {
             peekable: s.chars().peekable(),
+            source: &s,
             line: 0,
             col: 0,
+            byte_pos: 0,
         };

         assert_eq!(
