lib/std/lang/scanner.rad 18.1 KiB raw
1
//! Lexical scanner for the Radiance programming language.
2
//!
3
//! This module implements a hand-written scanner that tokenizes Radiance
4
//! source code into a stream of tokens for consumption by the parser.
5
@test mod tests;
6
7
use std::mem;
8
use std::lang::strings;
9
10
/// Every kind of token the scanner can produce.
///
/// Covers punctuation and operators, reserved words, literals, and the two
/// special markers `Eof` and `Invalid`.
pub union TokenKind {
    /// Returned once the input is exhausted.
    Eof,
    /// Returned for input that could not be tokenized.
    Invalid,

    // Punctuation and operators.
    LParen,     // (
    RParen,     // )
    LBrace,     // {
    RBrace,     // }
    LBracket,   // [
    RBracket,   // ]
    Comma,      // ,
    Dot,        // .
    DotDot,     // ..
    Minus,      // -
    Plus,       // +
    Colon,      // :
    ColonColon, // ::
    Semicolon,  // ;
    Slash,      // /
    Star,       // *
    Percent,    // %
    Amp,        // &
    Pipe,       // |
    Caret,      // ^
    Tilde,      // ~
    Underscore, // _
    Question,   // ?
    Bang,       // !
    BangEqual,  // !=
    Equal,      // =
    EqualEqual, // ==
    Gt,         // >
    GtEqual,    // >=
    Lt,         // <
    LtEqual,    // <=
    LtLt,       // <<
    GtGt,       // >>
    Arrow,      // ->
    FatArrow,   // =>

    // Compound assignment operators.
    PlusEqual,    // +=
    MinusEqual,   // -=
    StarEqual,    // *=
    SlashEqual,   // /=
    PercentEqual, // %=
    AmpEqual,     // &=
    PipeEqual,    // |=
    CaretEqual,   // ^=
    LtLtEqual,    // <<=
    GtGtEqual,    // >>=

    // Boolean operators.
    Not,
    And,
    Or,

    /// A plain identifier, e.g. `fnord`.
    Ident,
    /// An `@`-prefixed identifier, e.g. `@default`.
    AtIdent,
    /// The `log` keyword.
    Log,

    // Literals.
    String,     // "fnord"
    Char,       // 'f'
    Number,     // 42
    True,       // true
    False,      // false
    Nil,        // nil
    Undefined,  // undefined

    // Control flow keywords.
    If,
    Else,
    Return,
    Break,
    Continue,
    While,
    For,
    In,
    Loop,
    Match,
    Case,
    Try,
    Catch,
    Throw,
    Throws,
    Panic,
    Assert,

    // Variable binding keywords.
    Let,
    Mut,
    Const,
    Align,

    // Module-related keywords.
    Mod,
    Use,
    Super,

    // Type or function attributes.
    Pub,
    Extern,
    Static,

    // Trait-related keywords.
    Trait,
    Instance,

    // Type-related keywords.
    I8,
    I16,
    I32,
    I64,
    U8,
    U16,
    U32,
    U64,
    Opaque,
    Fn,
    Bool,
    Union,
    Record,
    As
}
109
110
/// One entry of the reserved-word table: a keyword's spelling paired with
/// the token kind the scanner emits for it.
record Keyword {
    /// Spelling of the keyword as it appears in source text.
    name: *[u8],
    /// Token kind produced when an identifier matches `name`.
    tok: TokenKind,
}
117
118
/// Reserved words and the token kind of each.
///
/// Entries are kept sorted by `name` because `keywordOrIdent` looks words up
/// with a binary search; new entries must be inserted in order.
const KEYWORDS: [Keyword; 51] = [
    { name: "align",     tok: TokenKind::Align },
    { name: "and",       tok: TokenKind::And },
    { name: "as",        tok: TokenKind::As },
    { name: "assert",    tok: TokenKind::Assert },
    { name: "bool",      tok: TokenKind::Bool },
    { name: "break",     tok: TokenKind::Break },
    { name: "case",      tok: TokenKind::Case },
    { name: "catch",     tok: TokenKind::Catch },
    { name: "const",     tok: TokenKind::Const },
    { name: "continue",  tok: TokenKind::Continue },
    { name: "else",      tok: TokenKind::Else },
    { name: "extern",    tok: TokenKind::Extern },
    { name: "false",     tok: TokenKind::False },
    { name: "fn",        tok: TokenKind::Fn },
    { name: "for",       tok: TokenKind::For },
    { name: "i16",       tok: TokenKind::I16 },
    { name: "i32",       tok: TokenKind::I32 },
    { name: "i64",       tok: TokenKind::I64 },
    { name: "i8",        tok: TokenKind::I8 },
    { name: "if",        tok: TokenKind::If },
    { name: "in",        tok: TokenKind::In },
    { name: "instance",  tok: TokenKind::Instance },
    { name: "let",       tok: TokenKind::Let },
    { name: "log",       tok: TokenKind::Log },
    { name: "loop",      tok: TokenKind::Loop },
    { name: "match",     tok: TokenKind::Match },
    { name: "mod",       tok: TokenKind::Mod },
    { name: "mut",       tok: TokenKind::Mut },
    { name: "nil",       tok: TokenKind::Nil },
    { name: "not",       tok: TokenKind::Not },
    { name: "opaque",    tok: TokenKind::Opaque },
    { name: "or",        tok: TokenKind::Or },
    { name: "panic",     tok: TokenKind::Panic },
    { name: "pub",       tok: TokenKind::Pub },
    { name: "record",    tok: TokenKind::Record },
    { name: "return",    tok: TokenKind::Return },
    { name: "static",    tok: TokenKind::Static },
    { name: "super",     tok: TokenKind::Super },
    { name: "throw",     tok: TokenKind::Throw },
    { name: "throws",    tok: TokenKind::Throws },
    { name: "trait",     tok: TokenKind::Trait },
    { name: "true",      tok: TokenKind::True },
    { name: "try",       tok: TokenKind::Try },
    { name: "u16",       tok: TokenKind::U16 },
    { name: "u32",       tok: TokenKind::U32 },
    { name: "u64",       tok: TokenKind::U64 },
    { name: "u8",        tok: TokenKind::U8 },
    { name: "undefined", tok: TokenKind::Undefined },
    { name: "union",     tok: TokenKind::Union },
    { name: "use",       tok: TokenKind::Use },
    { name: "while",     tok: TokenKind::While },
];
172
173
/// Origin of a piece of source code.
pub union SourceLoc {
    /// The source was loaded from a file; the payload is its path.
    File(*[u8]),
    /// The source was supplied as an inline string and has no path.
    String,
}
180
181
/// State of a scan over one source buffer.
///
/// Holds the buffer, the two offsets that delimit the token currently being
/// scanned, and the string pool used for interning.
pub record Scanner {
    /// Where the source being scanned came from.
    sourceLoc: SourceLoc,
    /// The bytes being scanned.
    source: *[u8],
    /// Offset in `source` at which the current token starts.
    token: u32,
    /// Offset in `source` of the next byte to be scanned.
    cursor: u32,
    /// Pool into which identifier text is interned.
    pool: *mut strings::Pool,
}
196
197
/// A single lexical element produced by the scanner.
///
/// Carries the token's kind, its text, and the byte offset at which it
/// starts so that errors can be reported against the input.
pub record Token {
    /// What kind of token this is.
    kind: TokenKind,
    /// Text of the token. For `Ident` and `AtIdent` tokens this is the
    /// interned string; for `Invalid` tokens it is the error message rather
    /// than source text; otherwise it is a slice of the input buffer.
    source: *[u8],
    /// Byte offset in the input buffer at which the token starts.
    offset: u32,
}
209
210
/// A human-readable position in source code, for diagnostics.
pub record Location {
    /// Where the source came from.
    source: SourceLoc,
    /// Line number; `getLocation` counts lines starting from 1.
    line: u16,
    /// Column number; `getLocation` counts columns starting from 1.
    col: u16,
}
221
222
/// Build a scanner positioned at the start of `source`.
///
/// Before returning, the names of the built-in `@` functions and attributes
/// and of the built-in slice methods are interned into `pool`.
pub fn scanner(sourceLoc: SourceLoc, source: *[u8], pool: *mut strings::Pool) -> Scanner {
    // Built-in functions and attributes.
    strings::intern(pool, "@sizeOf");
    strings::intern(pool, "@alignOf");
    strings::intern(pool, "@sliceOf");
    strings::intern(pool, "@default");
    strings::intern(pool, "@intrinsic");
    strings::intern(pool, "@test");
    // Built-in slice methods.
    strings::intern(pool, "append");
    strings::intern(pool, "delete");

    // Both offsets start at the beginning of the buffer.
    return Scanner {
        sourceLoc: sourceLoc,
        source: source,
        token: 0,
        cursor: 0,
        pool: pool,
    };
}
237
238
/// Report whether the cursor has moved past the last byte of the input.
pub fn isEof(s: *Scanner) -> bool {
    return s.source.len <= s.cursor;
}
242
243
/// Return the byte under the cursor, or `nil` at end of input.
pub fn current(s: *Scanner) -> ?u8 {
    if not isEof(s) {
        return s.source[s.cursor];
    }
    return nil;
}
250
251
/// Return the byte one past the cursor without moving the cursor, or `nil`
/// when there is no such byte.
fn peek(s: *Scanner) -> ?u8 {
    let ahead = s.cursor + 1;
    if ahead < s.source.len {
        return s.source[ahead];
    }
    return nil;
}
258
259
/// Move the cursor forward by one byte and return the byte it stepped over.
///
/// No end-of-input check is made here; callers are expected to have
/// established that a byte is available.
fn advance(s: *mut Scanner) -> u8 {
    let consumed = s.cursor;
    s.cursor = consumed + 1;
    return s.source[consumed];
}
264
265
/// Step over the byte under the cursor if it equals `expected`.
///
/// Returns `true` when the byte was consumed, and `false` when it differs
/// or the input is exhausted (the cursor is then left untouched).
fn consume(s: *mut Scanner, expected: u8) -> bool {
    if let ch = current(s); ch == expected {
        advance(s);
        return true;
    }
    return false;
}
273
274
/// Build a token of the given kind whose text is the input between the
/// token start and the cursor.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
    let text = &s.source[s.token..s.cursor];
    return Token { kind: kind, source: text, offset: s.token };
}
278
279
/// Build an `Invalid` token at `offset`.
///
/// The token's `source` field holds `message` rather than a slice of the
/// input.
pub fn invalid(offset: u32, message: *[u8]) -> Token {
    return Token {
        kind: TokenKind::Invalid,
        source: message,
        offset: offset,
    };
}
283
284
/// Move the cursor past spaces, tabs, carriage returns, newlines, and `//`
/// line comments, stopping at the first byte that starts a token.
fn skipWhitespace(s: *mut Scanner) {
    while let head = current(s) {
        match head {
            case ' ', '\t', '\r', '\n' => advance(s),
            case '/' => {
                if let follow = peek(s); follow == '/' {
                    // Line comment: skip up to the newline (or end of input).
                    // The newline itself is consumed by the outer loop.
                    while let body = current(s); body != '\n' {
                        advance(s);
                    }
                } else {
                    // A lone `/` begins a token.
                    return;
                }
            }
            else => return,
        }
    }
}
302
303
/// Report whether `c` is one of the ASCII digits `0` through `9`.
fn isDigit(c: u8) -> bool {
    return '0' <= c and c <= '9';
}
307
308
/// Report whether `c` is a hexadecimal digit: `0`-`9`, `a`-`f`, or `A`-`F`.
fn isHexDigit(c: u8) -> bool {
    if c >= '0' and c <= '9' {
        return true;
    }
    if c >= 'a' and c <= 'f' {
        return true;
    }
    return c >= 'A' and c <= 'F';
}
314
315
/// Report whether `c` is a binary digit, i.e. `0` or `1`.
fn isBinDigit(c: u8) -> bool {
    match c {
        case '0', '1' => return true,
        else => return false,
    }
}
319
320
/// Report whether `c` is an ASCII letter, lower or upper case.
fn isAlpha(c: u8) -> bool {
    if c >= 'a' and c <= 'z' {
        return true;
    }
    return c >= 'A' and c <= 'Z';
}
325
326
/// Report whether `c` is printable ASCII, i.e. between space and `~`
/// inclusive.
fn isPrint(c: u8) -> bool {
    return ' ' <= c and c <= '~';
}
330
331
/// Scan a numeric literal: decimal with an optional fractional part,
/// hexadecimal (`0x`/`0X` prefix), or binary (`0b`/`0B` prefix).
///
/// Expects the caller to have already consumed the first character of the
/// literal: either its first digit, or a leading `-`/`+` sign that is
/// directly followed by a digit.
///
/// Returns a `Number` token, or an `Invalid` token when a hex or binary
/// prefix is not followed by at least one digit of that base. This includes
/// a prefix that is the last thing in the input.
fn scanNumber(s: *mut Scanner) -> Token {
    // When the caller consumed a sign, step onto the first digit.
    let first = s.source[s.cursor - 1];
    if first == '-' or first == '+' {
        advance(s);
    }
    // A leading `0` may introduce a hex or binary literal.
    if s.source[s.cursor - 1] == '0' {
        // Hex literal (`0x` or `0X` prefix).
        if let ch = current(s); ch == 'x' or ch == 'X' {
            advance(s);
            let digits = s.cursor;
            while let ch = current(s); isHexDigit(ch) {
                advance(s);
            }
            // Counting consumed digits also rejects a prefix at end of input,
            // which a check on the current character would silently skip.
            if s.cursor == digits {
                return invalid(s.token, "invalid hex literal");
            }
            return tok(s, TokenKind::Number);
        }
        // Binary literal (`0b` or `0B` prefix).
        if let ch = current(s); ch == 'b' or ch == 'B' {
            advance(s);
            let digits = s.cursor;
            while let ch = current(s); isBinDigit(ch) {
                advance(s);
            }
            // Must have at least one binary digit after `0b`.
            if s.cursor == digits {
                return invalid(s.token, "invalid binary literal");
            }
            return tok(s, TokenKind::Number);
        }
    }

    // Regular decimal number.
    while let ch = current(s); isDigit(ch) {
        advance(s);
    }

    // Fractional part: the `.` is only consumed when a digit follows it, so
    // that `1..2` and `1.foo` leave the dot for the next token.
    if let ch = current(s); ch == '.' {
        if let p = peek(s); isDigit(p) {
            advance(s); // Consume the "."
            while let ch = current(s); isDigit(ch) {
                advance(s);
            }
        }
    }
    return tok(s, TokenKind::Number);
}
380
381
/// Scan the remainder of a literal that is closed by `delim`, producing a
/// token of the given `kind`.
///
/// Expects the opening delimiter to have been consumed already. A backslash
/// escapes the character that follows it, so an escaped delimiter does not
/// end the literal.
///
/// Returns `nil` when the input ends before the closing delimiter is found,
/// and an `Invalid` token when an unescaped non-printable character is
/// encountered.
fn scanDelimited(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
    while let ch = current(s); ch != delim {
        if not isPrint(ch) {
            return invalid(s.token, "invalid character");
        }
        if consume(s, '\\') {
            // The input ended right after the backslash: report the literal
            // as unterminated instead of advancing past the end of the buffer.
            if isEof(s) {
                return nil;
            }
        }
        // Consume the character itself, or the escaped character.
        advance(s);
    }
    if not consume(s, delim) {
        return nil;
    }
    return tok(s, kind);
}
394
395
/// Scan the rest of a double-quoted string literal.
///
/// Returns the token produced by `scanDelimited`, or an `Invalid` token with
/// the message "unterminated string" when no closing quote is found.
fn scanString(s: *mut Scanner) -> Token {
    if let literal = scanDelimited(s, '"', TokenKind::String) {
        return literal;
    }
    return invalid(s.token, "unterminated string");
}
402
403
/// Scan the rest of a single-quoted character literal.
///
/// Returns the token produced by `scanDelimited`, or an `Invalid` token with
/// the message "unterminated character" when no closing quote is found.
fn scanChar(s: *mut Scanner) -> Token {
    if let literal = scanDelimited(s, '\'', TokenKind::Char) {
        return literal;
    }
    return invalid(s.token, "unterminated character");
}
410
411
/// Classify a word: return the keyword's token kind when `src` is a
/// reserved word, and `TokenKind::Ident` otherwise.
///
/// Performs a binary search over the sorted `KEYWORDS` table.
fn keywordOrIdent(src: *[u8]) -> TokenKind {
    // Search the half-open range `lo..hi`.
    let mut lo: u32 = 0;
    let mut hi: u32 = KEYWORDS.len;

    while lo < hi {
        let mid = lo + ((hi - lo) / 2);

        match mem::cmp(src, KEYWORDS[mid].name) {
            case -1 => hi = mid,
            case 1 => lo = mid + 1,
            else => return KEYWORDS[mid].tok,
        }
    }
    return TokenKind::Ident;
}
429
430
/// Scan the rest of an identifier, keyword, or label.
///
/// Consumes letters, digits, `_` and `#`, then classifies the word. Keywords
/// keep a slice of the input as their text; identifiers carry the interned
/// string instead.
fn scanIdentifier(s: *mut Scanner) -> Token {
    while let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' or ch == '#' {
        advance(s);
    }
    let word = &s.source[s.token..s.cursor];
    let kind = keywordOrIdent(word);

    // Keywords are not interned.
    if kind != TokenKind::Ident {
        return tok(s, kind);
    }
    return Token { kind, source: strings::intern(s.pool, word), offset: s.token };
}
444
445
/// Finish an operator that may be followed by `=`: return a `compound`
/// token when an `=` is consumed, and a `plain` token otherwise.
fn opOrAssign(s: *mut Scanner, plain: TokenKind, compound: TokenKind) -> Token {
    if consume(s, '=') {
        return tok(s, compound);
    }
    return tok(s, plain);
}

/// Scan and return the next token.
///
/// Skips leading whitespace and line comments first. Returns an `Eof` token
/// at end of input and an `Invalid` token for input that cannot be
/// tokenized.
pub fn next(s: *mut Scanner) -> Token {
    skipWhitespace(s);
    // The token begins at the first byte that is not whitespace.
    s.token = s.cursor;

    if isEof(s) {
        return tok(s, TokenKind::Eof);
    }
    let first: u8 = advance(s);

    if isDigit(first) {
        return scanNumber(s);
    }
    if isAlpha(first) {
        return scanIdentifier(s);
    }
    match first {
        // Literals.
        case '\'' => return scanChar(s),
        case '"'  => return scanString(s),

        // Single-character punctuation.
        case '('  => return tok(s, TokenKind::LParen),
        case ')'  => return tok(s, TokenKind::RParen),
        case '{'  => return tok(s, TokenKind::LBrace),
        case '}'  => return tok(s, TokenKind::RBrace),
        case '['  => return tok(s, TokenKind::LBracket),
        case ']'  => return tok(s, TokenKind::RBracket),
        case ';'  => return tok(s, TokenKind::Semicolon),
        case ','  => return tok(s, TokenKind::Comma),
        case '?'  => return tok(s, TokenKind::Question),
        case '~'  => return tok(s, TokenKind::Tilde),

        // Punctuation that may be doubled.
        case '.'  => {
            if consume(s, '.') {
                return tok(s, TokenKind::DotDot);
            }
            return tok(s, TokenKind::Dot);
        }
        case ':'  => {
            if consume(s, ':') {
                return tok(s, TokenKind::ColonColon);
            }
            return tok(s, TokenKind::Colon);
        }

        // Signs: arrow and compound assignment take precedence over a
        // signed number literal.
        case '-'  => {
            if consume(s, '>') {
                return tok(s, TokenKind::Arrow);
            }
            if consume(s, '=') {
                return tok(s, TokenKind::MinusEqual);
            }
            // A digit directly after the sign makes this a negative number.
            if let digit = current(s); isDigit(digit) {
                return scanNumber(s);
            }
            return tok(s, TokenKind::Minus);
        }
        case '+'  => {
            if consume(s, '=') {
                return tok(s, TokenKind::PlusEqual);
            }
            if let digit = current(s); isDigit(digit) {
                return scanNumber(s);
            }
            return tok(s, TokenKind::Plus);
        }

        // Operators with an optional trailing `=`.
        case '/'  => return opOrAssign(s, TokenKind::Slash, TokenKind::SlashEqual),
        case '*'  => return opOrAssign(s, TokenKind::Star, TokenKind::StarEqual),
        case '%'  => return opOrAssign(s, TokenKind::Percent, TokenKind::PercentEqual),
        case '&'  => return opOrAssign(s, TokenKind::Amp, TokenKind::AmpEqual),
        case '|'  => return opOrAssign(s, TokenKind::Pipe, TokenKind::PipeEqual),
        case '^'  => return opOrAssign(s, TokenKind::Caret, TokenKind::CaretEqual),
        case '!'  => return opOrAssign(s, TokenKind::Bang, TokenKind::BangEqual),
        case '='  => {
            if consume(s, '>') {
                return tok(s, TokenKind::FatArrow);
            }
            return opOrAssign(s, TokenKind::Equal, TokenKind::EqualEqual);
        }

        // Comparisons and shifts.
        case '<'  => {
            if consume(s, '<') {
                return opOrAssign(s, TokenKind::LtLt, TokenKind::LtLtEqual);
            }
            return opOrAssign(s, TokenKind::Lt, TokenKind::LtEqual);
        }
        case '>'  => {
            if consume(s, '>') {
                return opOrAssign(s, TokenKind::GtGt, TokenKind::GtGtEqual);
            }
            return opOrAssign(s, TokenKind::Gt, TokenKind::GtEqual);
        }

        case '@'  => {
            // `@` followed by letters forms a single token.
            while let ch = current(s); isAlpha(ch) {
                advance(s);
            }
            // A width of one means nothing followed the `@`.
            let width = s.cursor - s.token;
            if width <= 1 {
                return invalid(s.token, "expected identifier after `@`");
            }
            let name = &s.source[s.token..s.cursor];
            return Token {
                kind: TokenKind::AtIdent,
                source: strings::intern(s.pool, name),
                offset: s.token,
            };
        }
        case '_'  => {
            // `_` that continues into a word, such as `_foo` or `__start`,
            // is an identifier; on its own it is the underscore token.
            if let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' {
                return scanIdentifier(s);
            }
            return tok(s, TokenKind::Underscore);
        }
        else => return invalid(s.token, "unexpected character"),
    }
}
609
610
/// Translate a byte offset in `source` into a line and column.
///
/// Lines and columns both start at 1, and columns count bytes. Returns
/// `nil` when `offset` is not a valid index into `source`; this includes
/// `offset == source.len`, so an offset at end of input has no location.
///
/// NOTE(review): the counters are `u16`, so inputs with more than 65535
/// lines, or lines longer than 65535 bytes, exceed their range. Confirm how
/// the language handles that overflow.
pub fn getLocation(sourceLoc: SourceLoc, source: *[u8], offset: u32) -> ?Location {
    if offset >= source.len {
        return nil;
    }
    let mut line: u16 = 1;
    let mut col: u16 = 1;

    // Walk every byte before `offset`, tracking the position it leads to.
    for byte in &source[..offset] {
        if byte != '\n' {
            col += 1;
        } else {
            line += 1;
            col = 1;
        }
    }
    return Location { source: sourceLoc, line, col };
}