naga_oil/compose/
tokenizer.rs

1use std::collections::VecDeque;
2
/// A single lexical token, borrowing its text from the tokenized source string.
///
/// The `usize` carried by every variant is the position reported by [`Token::pos`];
/// `Tokenizer::new` fills it with the byte offset at which the token starts.
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
pub enum Token<'a> {
    /// An identifier and its position.
    Identifier(&'a str, usize),
    /// A single character that is neither part of an identifier nor whitespace, and its position.
    Other(char, usize),
    /// A run of whitespace and its position.
    Whitespace(&'a str, usize),
}

impl<'a> Token<'a> {
    /// Returns the position stored in the token, whatever its variant.
    pub fn pos(&self) -> usize {
        match *self {
            Token::Identifier(_, pos) | Token::Other(_, pos) | Token::Whitespace(_, pos) => pos,
        }
    }

    /// Returns the identifier text for [`Token::Identifier`], and `None` for every other variant.
    ///
    /// The returned slice borrows from the source string (lifetime `'a`), not from the token,
    /// so it may be kept after the token itself has been dropped.
    pub fn identifier(&self) -> Option<&'a str> {
        match *self {
            Token::Identifier(ident, _) => Some(ident),
            // listed explicitly so that adding a variant forces this method to be revisited
            Token::Other(..) | Token::Whitespace(..) => None,
        }
    }
}
24
// the kind of multi-character token `Tokenizer::new` is currently accumulating.
// `Token::Other` has no counterpart here because it is emitted one character at a time.
#[derive(Clone, Copy, PartialEq, Eq)]
enum TokenKind {
    // a run of characters that will become a `Token::Identifier`
    Identifier,
    // a run of whitespace that will become a `Token::Whitespace`
    Whitespace,
}
30
// a basic tokenizer that separates identifiers from non-identifiers, and optionally returns whitespace tokens.
// unicode XID rules apply, except that the additional characters '"' and '::' (sequences of two colons) are allowed in identifiers.
// quotes treat any further chars until the next quote as part of the identifier.
// note we don't support non-USV identifiers like 👩‍👩‍👧‍👧 which is apparently in XID_continue
pub struct Tokenizer<'a> {
    // every token of the source, produced up front by `Tokenizer::new` and
    // handed out front-to-back by the `Iterator` impl
    tokens: VecDeque<Token<'a>>,
}
38
39impl<'a> Tokenizer<'a> {
40    pub fn new(src: &'a str, emit_whitespace: bool) -> Self {
41        let mut tokens = VecDeque::default();
42        let mut current_token_start = 0;
43        let mut current_token = None;
44        let mut quoted_token = false;
45
46        let mut chars = src.char_indices().peekable();
47
48        while let Some((ix, char)) = chars.next() {
49            if char == '"' {
50                quoted_token = !quoted_token;
51                if !quoted_token {
52                    continue;
53                }
54            }
55
56            if let Some(tok) = current_token {
57                match tok {
58                    TokenKind::Identifier => {
59                        // accept anything within quotes, or XID_continues
60                        if quoted_token || unicode_ident::is_xid_continue(char) {
61                            continue;
62                        }
63                        // accept `::`
64                        if char == ':' && chars.peek() == Some(&(ix + 1, ':')) {
65                            chars.next();
66                            continue;
67                        }
68
69                        tokens.push_back(Token::Identifier(
70                            &src[current_token_start..ix],
71                            current_token_start,
72                        ));
73                    }
74                    TokenKind::Whitespace => {
75                        if char.is_whitespace() {
76                            continue;
77                        }
78                        tokens.push_back(Token::Whitespace(
79                            &src[current_token_start..ix],
80                            current_token_start,
81                        ));
82                    }
83                };
84
85                current_token_start = ix;
86                current_token = None;
87            }
88
89            if quoted_token || unicode_ident::is_xid_start(char) {
90                current_token = Some(TokenKind::Identifier);
91                current_token_start = ix;
92            } else if !char.is_whitespace() {
93                tokens.push_back(Token::Other(char, ix));
94            } else if char.is_whitespace() && emit_whitespace {
95                current_token = Some(TokenKind::Whitespace);
96                current_token_start = ix;
97            }
98        }
99
100        if let Some(tok) = current_token {
101            match tok {
102                TokenKind::Identifier => {
103                    tokens.push_back(Token::Identifier(
104                        &src[current_token_start..src.len()],
105                        current_token_start,
106                    ));
107                }
108                TokenKind::Whitespace => {
109                    tokens.push_back(Token::Whitespace(
110                        &src[current_token_start..src.len()],
111                        current_token_start,
112                    ));
113                }
114            };
115        }
116
117        Self { tokens }
118    }
119}
120
121impl<'a> Iterator for Tokenizer<'a> {
122    type Item = Token<'a>;
123
124    fn next(&mut self) -> Option<Self::Item> {
125        self.tokens.pop_front()
126    }
127}