lib/std/arch/rv64/asm/scanner.rad 9.0 KiB raw
1
//! Assembly-specific lexical scanner.
2
@test mod tests;
3
4
use std::char;
5
use std::lang::strings;
6
7
/// Kinds of tokens the assembler scanner can hand out.
export union TokenKind {
    /// Marks the end of the input; produced once the source is exhausted.
    Eof,
    /// Marks a scan error; the error message is carried in [`Token::source`].
    Invalid,

    /// Opening parenthesis `(`.
    LParen,
    /// Closing parenthesis `)`.
    RParen,
    /// Comma `,`.
    Comma,
    /// Single colon `:`.
    Colon,
    /// Double colon `::`.
    ColonColon,
    /// Semicolon `;`.
    Semicolon,
    /// Minus sign `-`.
    Minus,
    /// Plus sign `+`.
    Plus,
    /// Forward slash `/`.
    Slash,
    /// Asterisk `*`.
    Star,

    /// Bare identifier used for mnemonics, constants, CSR names, and symbol segments.
    Ident,
    /// Identifier-shaped label; the token text includes the leading `@`.
    Label,
    /// Quoted label; the token text includes the leading `@` and both quotes.
    QuotedLabel,
    /// Directive; the token text includes the leading `.`.
    Directive,
    /// Register; the token text includes the leading `%`.
    Register,

    /// String literal; the token text includes the surrounding double quotes.
    String,
    /// Character literal; the token text includes the surrounding single quotes.
    Char,
    /// Integer literal: decimal or `0x`-prefixed hexadecimal, optionally with a
    /// leading sign when the sign is immediately followed by a digit.
    Number,
}
43
44
/// Origin of a piece of assembler source text.
export union SourceKind {
    /// The source was loaded from the file at `path`.
    File { path: *[u8] },
    /// The source was supplied directly as an inline string.
    String,
}
51
52
/// State carried by the assembler's lexical scanner.
export record Scanner {
    /// Where the scanned source came from.
    sourceKind: SourceKind,
    /// The bytes being scanned.
    source: *[u8],
    /// Offset in `source` at which the token currently being scanned starts.
    token: u32,
    /// Offset in `source` of the next unread byte.
    cursor: u32,
    /// Token the parser is currently looking at.
    // NOTE(review): only initialized in this file; presumably maintained by the parser.
    current: Token,
    /// Token the parser consumed most recently.
    // NOTE(review): only initialized in this file; presumably maintained by the parser.
    previous: Token,
    /// Pool used to intern the text of identifier-shaped tokens.
    pool: *mut strings::Pool,
}
69
70
/// A single scanned token: its kind, its text, and where it starts.
export record Token {
    /// What kind of token this is.
    kind: TokenKind,
    /// Text of the token. For `TokenKind::Invalid` tokens this holds the error
    /// message rather than a slice of the input.
    source: *[u8],
    /// Byte offset into the input buffer at which the token starts.
    offset: u32,
}
79
80
/// Build a scanner positioned at the start of `source`.
///
/// No input is consumed here: `current` and `previous` both start out as an
/// invalid token with an empty message at offset zero.
export fn scanner(sourceKind: SourceKind, source: *[u8], pool: *mut strings::Pool) -> Scanner {
    let placeholder = invalid(0, "");

    return Scanner {
        sourceKind,
        source,
        token: 0,
        cursor: 0,
        current: placeholder,
        previous: placeholder,
        pool,
    };
}
93
94
/// Build a `TokenKind::Invalid` token located at `offset`.
///
/// The `message` is stored in the token's `source` field in place of input text.
export fn invalid(offset: u32, message: *[u8]) -> Token {
    return Token {
        kind: TokenKind::Invalid,
        source: message,
        offset,
    };
}
98
99
/// Report whether the cursor has reached or passed the end of the source buffer.
export fn isEof(s: *Scanner) -> bool {
    if s.cursor >= s.source.len {
        return true;
    }
    return false;
}
103
104
/// Look at the byte under the cursor without consuming it.
///
/// Returns `nil` once the input is exhausted.
fn current(s: *Scanner) -> ?u8 {
    if not isEof(s) {
        return s.source[s.cursor];
    }
    return nil;
}
111
112
/// Look at the byte one past the cursor without consuming anything.
///
/// Returns `nil` when there is no such byte.
fn peek(s: *Scanner) -> ?u8 {
    let nextIndex = s.cursor + 1;
    if nextIndex >= s.source.len {
        return nil;
    }
    return s.source[nextIndex];
}
119
120
/// Consume the byte under the cursor and return it.
///
/// Performs no end-of-input check of its own; callers are expected to have
/// established that a byte is available.
fn advance(s: *mut Scanner) -> u8 {
    let consumed = s.source[s.cursor];
    set s.cursor += 1;
    return consumed;
}
125
126
/// Consume the byte under the cursor if, and only if, it equals `expected`.
///
/// Returns `true` when a byte was consumed, `false` otherwise (including at
/// end of input).
fn consume(s: *mut Scanner, expected: u8) -> bool {
    let ch = current(s) else {
        return false;
    };
    if ch <> expected {
        return false;
    }
    advance(s);
    return true;
}
134
135
/// Skip spaces, tabs, carriage returns, newlines, and `//` line comments.
///
/// Stops at the first byte that is none of these, or at end of input.
fn skipWhitespace(s: *mut Scanner) {
    while let ch = current(s) {
        match ch {
            case ' ', '\t', '\r', '\n' => advance(s),
            case '/' => {
                // A `/` that does not open a `//` comment is left for the caller.
                let following = peek(s) else {
                    return;
                };
                if following <> '/' {
                    return;
                }
                // Drop the rest of the line. The newline itself is left in place
                // and handled by the whitespace arm on the next iteration.
                while let lineCh = current(s); lineCh <> '\n' {
                    advance(s);
                }
            }
            else => return,
        }
    }
}
153
154
/// Scan and return the next token from the input.
///
/// Leading whitespace and `//` comments are skipped first, then `s.token` is
/// set to the offset where the token starts. Returns an `Eof` token once the
/// input is exhausted and an `Invalid` token when the input does not start
/// any known token.
export fn next(s: *mut Scanner) -> Token {
    skipWhitespace(s);
    set s.token = s.cursor;

    if isEof(s) {
        return tok(s, TokenKind::Eof);
    }
    let first = advance(s);

    // Numbers and bare identifiers are recognized by their first character.
    if char::isDigit(first) {
        return scanNumber(s);
    }
    if first == '_' or char::isAlpha(first) {
        return scanIdentToken(s, TokenKind::Ident);
    }

    match first {
        // Single-character punctuation.
        case '(' => return tok(s, TokenKind::LParen),
        case ')' => return tok(s, TokenKind::RParen),
        case ',' => return tok(s, TokenKind::Comma),
        case ';' => return tok(s, TokenKind::Semicolon),
        case '/' => return tok(s, TokenKind::Slash),
        case '*' => return tok(s, TokenKind::Star),
        // A sign may begin a number or stand alone as an operator.
        case '+' => return scanSignedNumberOrToken(s, TokenKind::Plus),
        case '-' => return scanSignedNumberOrToken(s, TokenKind::Minus),
        case ':' => {
            // NOTE(review): `TokenKind::Colon` exists, but a lone `:` is reported
            // as invalid here -- confirm that this is intended.
            if not consume(s, ':') {
                return invalid(s.token, "unexpected `:`");
            }
            return tok(s, TokenKind::ColonColon);
        }
        // Quoted literals.
        case '"' => return scanString(s),
        case '\'' => return scanChar(s),
        // Sigil-prefixed tokens.
        case '.' => return scanPrefixedToken(s, TokenKind::Directive, "expected directive name after `.`"),
        case '%' => return scanPrefixedToken(s, TokenKind::Register, "expected register after `%`"),
        case '@' => return scanLabelToken(s),
        else => return invalid(s.token, "unexpected character"),
    }
}
194
195
/// Build a token of `kind` whose text is the input from `s.token` up to, but
/// not including, `s.cursor`.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
    return Token {
        kind,
        source: &s.source[s.token..s.cursor],
        offset: s.token,
    };
}
199
200
/// Consume a run of identifier continuation characters (letters, digits, `_`).
///
/// Stops at the first byte that is not one of these, or at end of input.
fn scanIdentifierBody(s: *mut Scanner) {
    while let ch = current(s) {
        if not char::isAlpha(ch) and not char::isDigit(ch) and ch <> '_' {
            return;
        }
        advance(s);
    }
}
206
207
/// Decide what an already-consumed `+` or `-` means.
///
/// When a digit follows directly, the sign belongs to a number and the whole
/// literal is scanned; otherwise the sign is returned as a token of `kind`.
fn scanSignedNumberOrToken(s: *mut Scanner, kind: TokenKind) -> Token {
    let following = current(s) else {
        return tok(s, kind);
    };
    if char::isDigit(following) {
        return scanNumber(s);
    }
    return tok(s, kind);
}
214
215
/// Scan a numeric literal.
///
/// The caller has already consumed the first character of the literal: either
/// its first digit, or a `+`/`-` sign that is directly followed by a digit.
/// Accepts decimal digits, or a `0x`/`0X` prefix followed by at least one
/// hexadecimal digit. Returns an `Invalid` token when the hex prefix is not
/// followed by a hex digit, including when the input ends right after it.
fn scanNumber(s: *mut Scanner) -> Token {
    let first = s.source[s.cursor - 1];
    if first == '-' or first == '+' {
        // Step over the first digit so the check below sees it, not the sign.
        advance(s);
    }
    if s.source[s.cursor - 1] == '0' {
        if let ch = current(s); ch == 'x' or ch == 'X' {
            advance(s);
            // At least one hex digit must follow the prefix. End of input is
            // treated the same as any other non-hex character.
            let firstDigit = current(s) else {
                return invalid(s.token, "invalid hex literal");
            };
            if not char::isHexDigit(firstDigit) {
                return invalid(s.token, "invalid hex literal");
            }
            while let hexDigit = current(s); char::isHexDigit(hexDigit) {
                advance(s);
            }
            return tok(s, TokenKind::Number);
        }
    }
    while let digit = current(s); char::isDigit(digit) {
        advance(s);
    }
    return tok(s, TokenKind::Number);
}
238
239
/// Scan the body of a quoted token up to and including the closing `delim`.
///
/// The opening delimiter has already been consumed by the caller. A backslash
/// escapes the byte that follows it, so an escaped delimiter does not end the
/// token. Returns `nil` when the input ends before the closing delimiter, an
/// `Invalid` token when a non-printable byte is found, and otherwise a token
/// of `kind` spanning from `s.token` through the closing delimiter.
fn scanCharsUntil(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
    while let ch = current(s); ch <> delim {
        if not char::isPrint(ch) {
            return invalid(s.token, "invalid character");
        }
        advance(s);
        if ch == '\\' {
            // The escaped byte is skipped without being inspected.
            if isEof(s) {
                return nil;
            }
            advance(s);
        }
    }
    if consume(s, delim) {
        return tok(s, kind);
    }
    return nil;
}
257
258
/// Scan a string literal whose opening `"` has already been consumed.
///
/// Returns an `Invalid` token when the closing quote is missing.
fn scanString(s: *mut Scanner) -> Token {
    let token = scanCharsUntil(s, '"', TokenKind::String) else {
        return invalid(s.token, "unterminated string");
    };
    return token;
}
265
266
/// Scan a character literal whose opening `'` has already been consumed.
///
/// Returns an `Invalid` token when the closing quote is missing.
fn scanChar(s: *mut Scanner) -> Token {
    let token = scanCharsUntil(s, '\'', TokenKind::Char) else {
        return invalid(s.token, "unterminated character");
    };
    return token;
}
273
274
/// Finish scanning an identifier-shaped token and return it as `kind`.
///
/// Consumes the remaining identifier characters; the token text, taken from
/// `s.token` to the cursor, is interned in the scanner's string pool.
fn scanIdentToken(s: *mut Scanner, kind: TokenKind) -> Token {
    scanIdentifierBody(s);

    let text = strings::intern(s.pool, &s.source[s.token..s.cursor]);
    return Token { kind, source: text, offset: s.token };
}
284
285
/// Scan an identifier-shaped token that follows an already-consumed sigil.
///
/// The byte after the sigil must be a letter or `_`; otherwise an `Invalid`
/// token carrying `message` is returned. On success the token text includes
/// the sigil and is interned in the scanner's string pool.
fn scanPrefixedToken(s: *mut Scanner, kind: TokenKind, message: *[u8]) -> Token {
    if let ch = current(s); char::isAlpha(ch) or ch == '_' {
        return scanIdentToken(s, kind);
    }
    return invalid(s.token, message);
}
301
302
/// Scan a label whose leading `@` has already been consumed.
///
/// Accepts either `@name` or `@"quoted"`. Returns an `Invalid` token when
/// nothing label-like follows the `@` or when a quoted label is not closed.
fn scanLabelToken(s: *mut Scanner) -> Token {
    if consume(s, '"') {
        let token = scanCharsUntil(s, '"', TokenKind::QuotedLabel) else {
            return invalid(s.token, "unterminated quoted label");
        };
        return token;
    }
    // Covers both the identifier form and the "nothing follows `@`" error.
    return scanPrefixedToken(s, TokenKind::Label, "expected label after `@`");
}