radiance · Radiance self-hosting compiler

compiler/ lib/ examples/ std/ arch/ collections/ lang/ alloc/ ast/ gen/ il/ lower/ module/ parser/ resolver/ scanner/ alloc.rad 4.2 KiB ast.rad 22.6 KiB gen.rad 265 B il.rad 15.1 KiB lower.rad 258.3 KiB module.rad 14.1 KiB package.rad 1.4 KiB parser.rad 78.7 KiB resolver.rad 227.9 KiB scanner.rad 23.4 KiB sexpr.rad 7.0 KiB strings.rad 2.2 KiB sys/ arch.rad 65 B collections.rad 36 B fmt.rad 3.8 KiB intrinsics.rad 206 B io.rad 1.2 KiB lang.rad 222 B mem.rad 2.2 KiB sys.rad 167 B testing.rad 2.4 KiB tests.rad 11.6 KiB vec.rad 3.1 KiB std.rad 231 B scripts/ seed/ test/ vim/ .gitignore 353 B .gitsigners 112 B LICENSE 1.1 KiB Makefile 3.1 KiB README 2.5 KiB std.lib 987 B std.lib.test 252 B
lib/std/lang/scanner.rad 23.4 KiB raw
//! Lexical scanner for the Radiance programming language.
//!
//! This module implements a hand-written scanner that tokenizes Radiance
//! source code into a stream of tokens for consumption by the parser.
@test mod tests;

use std::mem;
use std::lang::strings;

/// Token kinds representing all lexical elements in Radiance.
///
/// This enum covers operators, keywords, literals, and structural
/// elements used by the parser to build the AST.
///
/// Every variant has a matching arm in `tokenKindToString`, and every
/// keyword variant has a matching entry in the `KEYWORDS` table. Keep all
/// three in sync when adding a variant.
pub union TokenKind {
    /// Special end of file token generated when the input is exhausted.
    Eof,
    /// Special invalid token.
    Invalid,

    LParen,     // (
    RParen,     // )
    LBrace,     // {
    RBrace,     // }
    LBracket,   // [
    RBracket,   // ]
    Comma,      // ,
    Dot,        // .
    DotDot,     // ..
    Minus,      // -
    Plus,       // +
    Colon,      // :
    ColonColon, // ::
    Semicolon,  // ;
    Slash,      // /
    Star,       // *
    Percent,    // %
    Amp,        // &
    Pipe,       // |
    Caret,      // ^
    Tilde,      // ~
    Underscore, // _
    Question,   // ?
    Bang,       // !
    BangEqual,  // !=
    Equal,      // =
    EqualEqual, // ==
    Gt,         // >
    GtEqual,    // >=
    Lt,         // <
    LtEqual,    // <=
    LtLt,       // <<
    GtGt,       // >>
    Arrow,      // ->
    FatArrow,   // =>

    // Compound assignment operators.
    PlusEqual,    // +=
    MinusEqual,   // -=
    StarEqual,    // *=
    SlashEqual,   // /=
    PercentEqual, // %=
    AmpEqual,     // &=
    PipeEqual,    // |=
    CaretEqual,   // ^=
    LtLtEqual,    // <<=
    GtGtEqual,    // >>=

    // Boolean operators, spelled as the keywords `not`, `and` and `or`.
    Not, And, Or,

    /// Eg. `input:`
    // NOTE(review): no code path in this file's `next` appears to produce
    // this kind; confirm whether it is emitted elsewhere.
    Label,
    /// Eg. `fnord`
    Ident,
    /// Eg. `@default`
    AtIdent,
    /// The `log` keyword.
    Log,

    // Literals.
    String,     // "fnord"
    Char,       // 'f'
    Number,     // 42
    True,       // true
    False,      // false
    Nil,        // nil
    Undefined,  // undefined

    // Control flow tokens.
    If, Else, Return, Break,
    Continue, While, For, In,
    Loop, Match, Case, Try, Catch,
    Throw, Throws, Panic, Assert,

    // Variable binding tokens.
    Let, Mut, Const, Align,

    // Module-related tokens.
    Mod, Use, Super,

    // Type or function attributes.
    Pub, Extern, Static,

    // Trait-related tokens.
    Trait, Instance,

    // Type-related tokens.
    I8, I16, I32, I64, U8, U16, U32, U64,
    Void, Opaque, Fn, Bool, Union, Record, As
}

/// Convert a token kind to its string representation.
///
/// The returned string is the variant's name as declared in `TokenKind`
/// (e.g. `"LParen"`), not the lexeme it stands for (e.g. `(`).
/// The match has one arm per variant and no fallback arm.
pub fn tokenKindToString(kind: TokenKind) -> *[u8] {
    match kind {
        // Special tokens.
        case TokenKind::Eof => return "Eof",
        case TokenKind::Invalid => return "Invalid",
        // Punctuation and operators.
        case TokenKind::LParen => return "LParen",
        case TokenKind::RParen => return "RParen",
        case TokenKind::LBrace => return "LBrace",
        case TokenKind::RBrace => return "RBrace",
        case TokenKind::LBracket => return "LBracket",
        case TokenKind::RBracket => return "RBracket",
        case TokenKind::Comma => return "Comma",
        case TokenKind::Dot => return "Dot",
        case TokenKind::DotDot => return "DotDot",
        case TokenKind::Minus => return "Minus",
        case TokenKind::Plus => return "Plus",
        case TokenKind::Colon => return "Colon",
        case TokenKind::ColonColon => return "ColonColon",
        case TokenKind::Semicolon => return "Semicolon",
        case TokenKind::Slash => return "Slash",
        case TokenKind::Star => return "Star",
        case TokenKind::Percent => return "Percent",
        case TokenKind::Amp => return "Amp",
        case TokenKind::Pipe => return "Pipe",
        case TokenKind::Caret => return "Caret",
        case TokenKind::Tilde => return "Tilde",
        case TokenKind::Underscore => return "Underscore",
        // Listed here rather than next to `Ident`, unlike the declaration order.
        case TokenKind::AtIdent => return "AtIdent",
        case TokenKind::Question => return "Question",
        case TokenKind::Bang => return "Bang",
        case TokenKind::BangEqual => return "BangEqual",
        case TokenKind::Equal => return "Equal",
        case TokenKind::EqualEqual => return "EqualEqual",
        case TokenKind::Gt => return "Gt",
        case TokenKind::GtEqual => return "GtEqual",
        case TokenKind::Lt => return "Lt",
        case TokenKind::LtEqual => return "LtEqual",
        case TokenKind::LtLt => return "LtLt",
        case TokenKind::GtGt => return "GtGt",
        case TokenKind::Arrow => return "Arrow",
        case TokenKind::FatArrow => return "FatArrow",
        // Compound assignment operators.
        case TokenKind::PlusEqual => return "PlusEqual",
        case TokenKind::MinusEqual => return "MinusEqual",
        case TokenKind::StarEqual => return "StarEqual",
        case TokenKind::SlashEqual => return "SlashEqual",
        case TokenKind::PercentEqual => return "PercentEqual",
        case TokenKind::AmpEqual => return "AmpEqual",
        case TokenKind::PipeEqual => return "PipeEqual",
        case TokenKind::CaretEqual => return "CaretEqual",
        case TokenKind::LtLtEqual => return "LtLtEqual",
        case TokenKind::GtGtEqual => return "GtGtEqual",
        // Boolean operators.
        case TokenKind::Not => return "Not",
        case TokenKind::And => return "And",
        case TokenKind::Or => return "Or",
        // Labels, identifiers and the `log` keyword.
        case TokenKind::Label => return "Label",
        case TokenKind::Ident => return "Ident",
        case TokenKind::Log => return "Log",
        // Literals.
        case TokenKind::String => return "String",
        case TokenKind::Char => return "Char",
        case TokenKind::Number => return "Number",
        case TokenKind::True => return "True",
        case TokenKind::False => return "False",
        case TokenKind::Nil => return "Nil",
        case TokenKind::Undefined => return "Undefined",
        // Control flow keywords.
        case TokenKind::If => return "If",
        case TokenKind::Else => return "Else",
        case TokenKind::Return => return "Return",
        case TokenKind::Break => return "Break",
        case TokenKind::Continue => return "Continue",
        case TokenKind::While => return "While",
        case TokenKind::For => return "For",
        case TokenKind::In => return "In",
        case TokenKind::Loop => return "Loop",
        case TokenKind::Match => return "Match",
        case TokenKind::Case => return "Case",
        case TokenKind::Try => return "Try",
        case TokenKind::Catch => return "Catch",
        case TokenKind::Throw => return "Throw",
        case TokenKind::Throws => return "Throws",
        case TokenKind::Panic => return "Panic",
        case TokenKind::Assert => return "Assert",
        // Variable binding keywords.
        case TokenKind::Let => return "Let",
        case TokenKind::Mut => return "Mut",
        case TokenKind::Const => return "Const",
        case TokenKind::Align => return "Align",
        // Module-related keywords.
        case TokenKind::Mod => return "Mod",
        case TokenKind::Use => return "Use",
        case TokenKind::Super => return "Super",
        // Type or function attributes.
        case TokenKind::Pub => return "Pub",
        case TokenKind::Extern => return "Extern",
        case TokenKind::Static => return "Static",
        // Trait-related keywords.
        case TokenKind::Trait => return "Trait",
        case TokenKind::Instance => return "Instance",
        // Type-related keywords.
        case TokenKind::I8 => return "I8",
        case TokenKind::I16 => return "I16",
        case TokenKind::I32 => return "I32",
        case TokenKind::I64 => return "I64",
        case TokenKind::U8 => return "U8",
        case TokenKind::U16 => return "U16",
        case TokenKind::U32 => return "U32",
        case TokenKind::U64 => return "U64",
        case TokenKind::Void => return "Void",
        case TokenKind::Opaque => return "Opaque",
        case TokenKind::Fn => return "Fn",
        case TokenKind::Bool => return "Bool",
        case TokenKind::Union => return "Union",
        case TokenKind::Record => return "Record",
        case TokenKind::As => return "As",
    }
}

/// A reserved keyword.
///
/// One entry of the `KEYWORDS` lookup table: the keyword's spelling paired
/// with the token kind the scanner reports for it.
record Keyword {
    /// Keyword string.
    name: *[u8],
    /// Corresponding token.
    tok: TokenKind,
}

/// Sorted keyword table for binary search.
///
/// `keywordOrIdent` binary-searches this table using `mem::cmp` on the
/// `name` field, so entries must stay sorted in the order `mem::cmp`
/// defines (presumably bytewise lexicographic, which is how the entries
/// below are arranged; confirm against `std::mem`). The array length in
/// the type must equal the number of entries.
const KEYWORDS: [Keyword; 52] = [
    { name: "align", tok: TokenKind::Align },
    { name: "and", tok: TokenKind::And },
    { name: "as", tok: TokenKind::As },
    { name: "assert", tok: TokenKind::Assert },
    { name: "bool", tok: TokenKind::Bool },
    { name: "break", tok: TokenKind::Break },
    { name: "case", tok: TokenKind::Case },
    { name: "catch", tok: TokenKind::Catch },
    { name: "const", tok: TokenKind::Const },
    { name: "continue", tok: TokenKind::Continue },
    { name: "else", tok: TokenKind::Else },
    { name: "extern", tok: TokenKind::Extern },
    { name: "false", tok: TokenKind::False },
    { name: "fn", tok: TokenKind::Fn },
    { name: "for", tok: TokenKind::For },
    // Digits sort before letters, so the integer types precede `if`.
    { name: "i16", tok: TokenKind::I16 },
    { name: "i32", tok: TokenKind::I32 },
    { name: "i64", tok: TokenKind::I64 },
    { name: "i8", tok: TokenKind::I8 },
    { name: "if", tok: TokenKind::If },
    { name: "in", tok: TokenKind::In },
    { name: "instance", tok: TokenKind::Instance },
    { name: "let", tok: TokenKind::Let },
    { name: "log", tok: TokenKind::Log },
    { name: "loop", tok: TokenKind::Loop },
    { name: "match", tok: TokenKind::Match },
    { name: "mod", tok: TokenKind::Mod },
    { name: "mut", tok: TokenKind::Mut },
    { name: "nil", tok: TokenKind::Nil },
    { name: "not", tok: TokenKind::Not },
    { name: "opaque", tok: TokenKind::Opaque },
    { name: "or", tok: TokenKind::Or },
    { name: "panic", tok: TokenKind::Panic },
    { name: "pub", tok: TokenKind::Pub },
    { name: "record", tok: TokenKind::Record },
    { name: "return", tok: TokenKind::Return },
    { name: "static", tok: TokenKind::Static },
    { name: "super", tok: TokenKind::Super },
    { name: "throw", tok: TokenKind::Throw },
    { name: "throws", tok: TokenKind::Throws },
    { name: "trait", tok: TokenKind::Trait },
    { name: "true", tok: TokenKind::True },
    { name: "try", tok: TokenKind::Try },
    { name: "u16", tok: TokenKind::U16 },
    { name: "u32", tok: TokenKind::U32 },
    { name: "u64", tok: TokenKind::U64 },
    { name: "u8", tok: TokenKind::U8 },
    { name: "undefined", tok: TokenKind::Undefined },
    { name: "union", tok: TokenKind::Union },
    { name: "use", tok: TokenKind::Use },
    { name: "void", tok: TokenKind::Void },
    { name: "while", tok: TokenKind::While },
];

/// Describes where source code originated from.
///
/// Stored in the `Scanner` and copied into every `Location` produced by
/// `getLocation`.
pub union SourceLoc {
    /// Source loaded from a file at the given path.
    File(*[u8]),
    /// Source provided as an inline string (no file path).
    String,
}

/// Lexical scanner state for tokenizing Radiance source code.
///
/// Maintains position information and source buffer reference.
/// Created by `scanner` and advanced by repeated calls to `next`.
pub record Scanner {
    /// Origin of the source being scanned.
    sourceLoc: SourceLoc,
    /// Source buffer.
    source: *[u8],
    /// Offset of current token into buffer.
    /// `next` sets this to `cursor` after skipping whitespace and comments.
    token: u32,
    /// Offset of current character being scanned.
    /// This is the next byte that has not been consumed yet.
    cursor: u32,
    /// Interned string pool.
    /// Identifier and `@`-identifier token text is interned here.
    pool: *mut strings::Pool,
}

/// Individual token with kind, source text, and position.
///
/// Represents a single lexical element extracted from source,
/// including its original text and byte offset for error reporting.
pub record Token {
    /// Token kind.
    kind: TokenKind,
    /// Token source string.
    ///
    /// For most tokens this is a slice of the scanner's input buffer. For
    /// `Ident` and `AtIdent` tokens it is the value returned by
    /// `strings::intern`, and for tokens built by `invalid` it is the error
    /// message rather than source text.
    source: *[u8],
    /// Byte offset of `source` in input buffer.
    /// For invalid tokens this is the offset passed to `invalid`.
    offset: u32,
}

/// Source code location with line/column information.
///
/// Used for error reporting and debugging.
/// Produced by `getLocation` from a byte offset.
pub record Location {
    /// Origin of the source.
    source: SourceLoc,
    /// Line number.
    /// `getLocation` starts counting at 1 and adds one per `\n` byte.
    line: u16,
    /// Column number.
    /// `getLocation` starts counting at 1 and adds one per byte (not per
    /// character) since the last `\n`.
    col: u16,
}

/// Create a new scanner object.
///
/// Interns a fixed set of built-in names into `pool`, then returns a
/// scanner over `source` with both `token` and `cursor` at offset 0.
/// The interning happens on every call.
pub fn scanner(sourceLoc: SourceLoc, source: *[u8], pool: *mut strings::Pool) -> Scanner {
    // Intern built-in functions and attributes.
    strings::intern(pool, "@sizeOf");
    strings::intern(pool, "@alignOf");
    strings::intern(pool, "@sliceOf");
    strings::intern(pool, "@default");
    strings::intern(pool, "@intrinsic");
    strings::intern(pool, "@test");
    // Intern built-in slice methods.
    strings::intern(pool, "append");
    strings::intern(pool, "delete");

    return Scanner { sourceLoc, source, token: 0, cursor: 0, pool };
}

/// Report whether the cursor has moved past the last byte of the input.
pub fn isEof(s: *Scanner) -> bool {
    if s.cursor < s.source.len {
        return false;
    }
    return true;
}

/// Return the byte under the cursor, or `nil` once the input is exhausted.
/// The scanner is not advanced.
pub fn current(s: *Scanner) -> ?u8 {
    if not isEof(s) {
        return s.source[s.cursor];
    }
    return nil;
}

/// Look at the byte one position after the cursor without consuming
/// anything. Returns `nil` when that position is outside the input.
fn peek(s: *Scanner) -> ?u8 {
    let ahead = s.cursor + 1;
    if ahead < s.source.len {
        return s.source[ahead];
    }
    return nil;
}

/// Move the cursor forward by one byte and return the byte it stepped over.
///
/// No end-of-input check is made here; callers are expected to have
/// established that a byte is available.
fn advance(s: *mut Scanner) -> u8 {
    let at = s.cursor;
    s.cursor += 1;
    return s.source[at];
}

/// Step over the byte under the cursor only if it equals `expected`.
///
/// Returns `true` when a byte was consumed, and `false` on a mismatch or
/// at end of input (the cursor is left where it was).
fn consume(s: *mut Scanner, expected: u8) -> bool {
    if let c = current(s) {
        if c == expected {
            advance(s);
            return true;
        }
    }
    return false;
}

/// Build a token of the given kind whose text is the input between the
/// token start and the cursor, and whose offset is the token start.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
    let text = &s.source[s.token..s.cursor];
    return Token { kind, source: text, offset: s.token };
}

/// Build an `Invalid` token located at `offset`.
///
/// The token's `source` field carries `message` instead of source text.
pub fn invalid(offset: u32, message: *[u8]) -> Token {
    let kind = TokenKind::Invalid;
    let source = message;
    return Token { kind, source, offset };
}

/// Move the cursor past any run of spaces, tabs, carriage returns,
/// newlines and `//` line comments.
///
/// Stops at the first byte that starts a token, or at end of input.
fn skipWhitespace(s: *mut Scanner) {
    while let ch = current(s) {
        if ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t' {
            advance(s);
        } else {
            if ch != '/' {
                return;
            }
            // A `/` only starts a comment when another `/` follows;
            // otherwise it is left for the token scanner.
            if let following = peek(s); following == '/' {
                // Consume up to, but not including, the newline; the outer
                // loop then skips the newline as ordinary whitespace.
                while let c = current(s); c != '\n' {
                    advance(s);
                }
            } else {
                return;
            }
        }
    }
}

/// Report whether `c` lies in the ASCII range `'0'` through `'9'`.
fn isDigit(c: u8) -> bool {
    return '0' <= c and c <= '9';
}

/// Report whether `c` is a hexadecimal digit: `0-9`, `a-f` or `A-F`.
fn isHexDigit(c: u8) -> bool {
    if c >= '0' and c <= '9' {
        return true;
    }
    if c >= 'a' and c <= 'f' {
        return true;
    }
    return c >= 'A' and c <= 'F';
}

/// Report whether `c` is one of the two binary digits `'0'` and `'1'`.
fn isBinDigit(c: u8) -> bool {
    match c {
        case '0', '1' => return true,
        else => return false,
    }
}

/// Report whether `c` is an ASCII letter, lower or upper case.
/// Underscore and digits do not count.
fn isAlpha(c: u8) -> bool {
    if c >= 'a' and c <= 'z' {
        return true;
    }
    return c >= 'A' and c <= 'Z';
}

/// Report whether `c` is printable ASCII, i.e. within `' '` through `'~'`.
fn isPrint(c: u8) -> bool {
    return not (c < ' ' or c > '~');
}

/// Scan numeric literal (decimal, hex, or binary).
///
/// Expects the caller to have consumed the literal's first character:
/// either its leading digit, or a `-`/`+` sign that is immediately followed
/// by a digit. Returns a `Number` token covering the whole literal, or an
/// invalid token when a `0x`/`0b` prefix is not followed by at least one
/// digit of that radix, including when the input ends right after the
/// prefix.
fn scanNumber(s: *mut Scanner) -> Token {
    let first = s.source[s.cursor - 1];
    if first == '-' or first == '+' {
        // Only the sign has been consumed so far; step over the first digit
        // so that `s.source[s.cursor - 1]` below refers to that digit.
        advance(s);
    }
    // A radix prefix is only possible after a leading `0`.
    if s.source[s.cursor - 1] == '0' {
        // Check for hex literal (`0x` or `0X` prefix).
        if let ch = current(s); ch == 'x' or ch == 'X' {
            advance(s);
            let digits = s.cursor;
            while let ch = current(s); isHexDigit(ch) {
                advance(s);
            }
            // Must have at least one hex digit after `0x`. Comparing cursor
            // positions also rejects a prefix at the very end of the input,
            // where there is no current character to inspect.
            if s.cursor == digits {
                return invalid(s.token, "invalid hex literal");
            }
            return tok(s, TokenKind::Number);
        }
        // Check for binary literal (`0b` or `0B` prefix).
        if let ch = current(s); ch == 'b' or ch == 'B' {
            advance(s);
            let digits = s.cursor;
            while let ch = current(s); isBinDigit(ch) {
                advance(s);
            }
            // Must have at least one binary digit after `0b`, same as above.
            if s.cursor == digits {
                return invalid(s.token, "invalid binary literal");
            }
            return tok(s, TokenKind::Number);
        }
    }

    // Regular decimal number.
    while let ch = current(s); isDigit(ch) {
        advance(s);
    }

    // Look for decimal part. The `.` is only consumed when a digit follows
    // it, so `1..2` and `1.foo` leave the dot for the next token.
    if let ch = current(s); ch == '.' {
        if let p = peek(s); isDigit(p) {
            advance(s); // Consume the "."
            while let ch = current(s); isDigit(ch) {
                advance(s);
            }
        }
    }
    return tok(s, TokenKind::Number);
}

/// Scan the remainder of a literal that is closed by `delim`.
///
/// Expects the opening delimiter to have been consumed already. Returns a
/// token of the given `kind` covering the literal including both
/// delimiters, an invalid token if a non-printable character is found
/// before the closing delimiter, or `nil` if the input ends before the
/// literal is closed.
fn scanDelimited(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
    while let ch = current(s); ch != delim {
        if not isPrint(ch) {
            return invalid(s.token, "invalid character");
        }
        // A backslash escapes the character after it, so that an escaped
        // delimiter does not end the literal.
        if consume(s, '\\') {
            // The input ended right after the backslash: there is nothing
            // left to escape, and advancing would read past the buffer.
            if isEof(s) {
                return nil;
            }
        }
        advance(s);
    }
    if not consume(s, delim) {
        return nil;
    }
    return tok(s, kind);
}

/// Scan the rest of a double-quoted string literal.
///
/// Returns whatever `scanDelimited` produced, or an invalid token when
/// the closing quote is missing.
fn scanString(s: *mut Scanner) -> Token {
    let scanned = scanDelimited(s, '"', TokenKind::String);
    if let t = scanned {
        return t;
    }
    return invalid(s.token, "unterminated string");
}

/// Scan the rest of a single-quoted character literal.
///
/// Returns whatever `scanDelimited` produced, or an invalid token when
/// the closing quote is missing.
fn scanChar(s: *mut Scanner) -> Token {
    let scanned = scanDelimited(s, '\'', TokenKind::Char);
    if let t = scanned {
        return t;
    }
    return invalid(s.token, "unterminated character");
}

/// Classify an identifier-shaped lexeme as a keyword or a plain identifier.
///
/// Binary-searches the `KEYWORDS` table for `src`. Returns the keyword's
/// token kind on a match, and `TokenKind::Ident` otherwise.
fn keywordOrIdent(src: *[u8]) -> TokenKind {
    // Half-open search window over the table.
    let mut lo: u32 = 0;
    let mut hi: u32 = KEYWORDS.len;

    while lo < hi {
        let mid = lo + ((hi - lo) / 2);
        let entry = KEYWORDS[mid];
        let order = mem::cmp(src, entry.name);

        if order == -1 {
            // `src` sorts before this entry: continue in the lower half.
            hi = mid;
        } else {
            if order != 1 {
                return entry.tok;
            }
            // `src` sorts after this entry: continue in the upper half.
            lo = mid + 1;
        }
    }
    return TokenKind::Ident;
}

/// Scan the rest of an identifier or keyword.
///
/// Expects the first character to have been consumed by the caller.
/// Continues over letters, digits, `_` and `#`, then classifies the text
/// with `keywordOrIdent`.
fn scanIdentifier(s: *mut Scanner) -> Token {
    while let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' or ch == '#' {
        advance(s);
    }
    let text = &s.source[s.token..s.cursor];
    let kind = keywordOrIdent(text);

    // Keywords keep a slice of the input as their text.
    if kind != TokenKind::Ident {
        return tok(s, kind);
    }
    // Plain identifiers carry the interned string instead.
    return Token { kind, source: strings::intern(s.pool, text), offset: s.token };
}

/// Scan the next token.
///
/// Skips leading whitespace and line comments, marks the token start, then
/// dispatches on the first character. At end of input an `Eof` token is
/// returned and the cursor is not moved, so further calls keep returning
/// `Eof`. Malformed input is reported as a token of kind `Invalid` rather
/// than by any other error mechanism.
///
/// Multi-character operators are matched longest-first (e.g. `<<=` before
/// `<<` before `<`). A `-` or `+` that is immediately followed by a digit
/// is scanned as part of a number literal, whatever token came before it.
pub fn next(s: *mut Scanner) -> Token {
    skipWhitespace(s);  // Skip any whitespace between tokens.
    s.token = s.cursor; // Token starts at current position.

    if isEof(s) {
        return tok(s, TokenKind::Eof);
    }
    // From here on the first character of the token is already consumed;
    // the scan helpers below rely on that.
    let c: u8 = advance(s);

    if isDigit(c) {
        return scanNumber(s);
    }
    if isAlpha(c) {
        return scanIdentifier(s);
    }
    match c {
        case '\'' => return scanChar(s),
        case '"'  => return scanString(s),
        case '('  => return tok(s, TokenKind::LParen),
        case ')'  => return tok(s, TokenKind::RParen),
        case '{'  => return tok(s, TokenKind::LBrace),
        case '}'  => return tok(s, TokenKind::RBrace),
        case '['  => return tok(s, TokenKind::LBracket),
        case ']'  => return tok(s, TokenKind::RBracket),
        case ';'  => return tok(s, TokenKind::Semicolon),
        case ','  => return tok(s, TokenKind::Comma),
        case '.'  => {
            if consume(s, '.') {
                return tok(s, TokenKind::DotDot);
            }
            return tok(s, TokenKind::Dot);
        }
        case ':'  => {
            if consume(s, ':') {
                return tok(s, TokenKind::ColonColon);
            }
            return tok(s, TokenKind::Colon);
        }
        case '-'  => {
            if consume(s, '>') {
                return tok(s, TokenKind::Arrow);
            }
            if consume(s, '=') {
                return tok(s, TokenKind::MinusEqual);
            }
            // If followed by a digit, scan as negative number
            if let ch = current(s); isDigit(ch) {
                return scanNumber(s);
            }
            return tok(s, TokenKind::Minus);
        }
        case '+' => {
            if consume(s, '=') {
                return tok(s, TokenKind::PlusEqual);
            }
            // If followed by a digit, scan as an explicitly signed number.
            if let ch = current(s); isDigit(ch) {
                return scanNumber(s);
            }
            return tok(s, TokenKind::Plus);
        }
        // A `//` comment never reaches this arm: `skipWhitespace` has
        // already consumed it.
        case '/' => {
            if consume(s, '=') {
                return tok(s, TokenKind::SlashEqual);
            }
            return tok(s, TokenKind::Slash);
        }
        case '*' => {
            if consume(s, '=') {
                return tok(s, TokenKind::StarEqual);
            }
            return tok(s, TokenKind::Star);
        }
        case '%' => {
            if consume(s, '=') {
                return tok(s, TokenKind::PercentEqual);
            }
            return tok(s, TokenKind::Percent);
        }
        case '&' => {
            if consume(s, '=') {
                return tok(s, TokenKind::AmpEqual);
            }
            return tok(s, TokenKind::Amp);
        }
        case '?' => return tok(s, TokenKind::Question),
        case '|' => {
            if consume(s, '=') {
                return tok(s, TokenKind::PipeEqual);
            }
            return tok(s, TokenKind::Pipe);
        }
        case '^' => {
            if consume(s, '=') {
                return tok(s, TokenKind::CaretEqual);
            }
            return tok(s, TokenKind::Caret);
        }
        case '~' => return tok(s, TokenKind::Tilde),
        case '!' => {
            if consume(s, '=') {
                return tok(s, TokenKind::BangEqual);
            }
            return tok(s, TokenKind::Bang);
        }
        case '=' => {
            if consume(s, '>') {
                return tok(s, TokenKind::FatArrow);
            }
            if consume(s, '=') {
                return tok(s, TokenKind::EqualEqual);
            }
            return tok(s, TokenKind::Equal);
        }
        case '<' => {
            if consume(s, '<') {
                if consume(s, '=') {
                    return tok(s, TokenKind::LtLtEqual);
                }
                return tok(s, TokenKind::LtLt);
            }
            if consume(s, '=') {
                return tok(s, TokenKind::LtEqual);
            }
            return tok(s, TokenKind::Lt);
        }
        case '>' => {
            if consume(s, '>') {
                if consume(s, '=') {
                    return tok(s, TokenKind::GtGtEqual);
                }
                return tok(s, TokenKind::GtGt);
            }
            if consume(s, '=') {
                return tok(s, TokenKind::GtEqual);
            }
            return tok(s, TokenKind::Gt);
        }
        case '@' => {
            // Scan `@identifier` as a single token.
            // Only letters are accepted after the `@`: digits and `_` end
            // the token.
            while let ch = current(s); isAlpha(ch) {
                advance(s);
            }
            // Must have at least one character after `@`.
            if s.cursor - s.token <= 1 {
                return invalid(s.token, "expected identifier after `@`");
            }
            // The interned text includes the leading `@`.
            let name = &s.source[s.token..s.cursor];
            return Token {
                kind: TokenKind::AtIdent,
                source: strings::intern(s.pool, name),
                offset: s.token,
            };
        }
        case '_' => {
            if let ch = current(s); isAlpha(ch) or isDigit(ch) or ch == '_' {
                // This is part of an identifier like `_foo` or `__start`
                return scanIdentifier(s);
            }
            // A lone `_` is its own token.
            return tok(s, TokenKind::Underscore);
        }
        else => return invalid(s.token, "unexpected character"),
    }
}

/// Translate a byte offset into a 1-based line and column.
///
/// Walks `source` up to (not including) `offset`, counting `\n` bytes for
/// the line and bytes since the last `\n` for the column. Returns `nil`
/// when `offset` is not a valid index into `source`.
///
/// NOTE(review): `offset == source.len`, which is the offset carried by an
/// `Eof` token, also yields `nil`; confirm that callers expect this.
pub fn getLocation(sourceLoc: SourceLoc, source: *[u8], offset: u32) -> ?Location {
    if offset >= source.len {
        return nil;
    }
    let mut line: u16 = 1;
    let mut col: u16 = 1;

    for byte in &source[..offset] {
        if byte != '\n' {
            col += 1;
        } else {
            line += 1;
            col = 1;
        }
    }
    return Location { source: sourceLoc, line, col };
}