#include #include #include "scanner.h" #include "types.h" /* Keyword lookup table. */ static const struct { const char *name; usize length; tokenclass_t tok; } keywords[] = { { "fn", 2, T_FN }, { "pub", 3, T_PUB }, { "return", 6, T_RETURN }, { "while", 5, T_WHILE }, { "mut", 3, T_MUT }, { "let", 3, T_LET }, { "static", 6, T_STATIC }, { "if", 2, T_IF }, { "else", 4, T_ELSE }, { "i8", 2, T_I8 }, { "i16", 3, T_I16 }, { "i32", 3, T_I32 }, { "i64", 3, T_I64 }, { "u8", 2, T_U8 }, { "u16", 3, T_U16 }, { "u32", 3, T_U32 }, { "u64", 3, T_U64 }, { "f32", 3, T_F32 }, { "bool", 4, T_BOOL }, { "void", 4, T_VOID }, { "true", 4, T_TRUE }, { "false", 5, T_FALSE }, { "nil", 3, T_NIL }, { "loop", 4, T_LOOP }, { "try", 3, T_TRY }, { "catch", 5, T_CATCH }, { "for", 3, T_FOR }, { "in", 2, T_IN }, { "const", 5, T_CONST }, { "break", 5, T_BREAK }, { "throw", 5, T_THROW }, { "union", 5, T_UNION }, { "and", 3, T_AND }, { "or", 2, T_OR }, { "not", 3, T_NOT }, { "match", 5, T_MATCH }, { "use", 3, T_USE }, { "case", 4, T_CASE }, { "extern", 6, T_EXTERN }, { "mod", 3, T_MOD }, { "as", 2, T_AS }, { "record", 6, T_RECORD }, { "undefined", 9, T_UNDEF }, { "align", 5, T_ALIGN }, { "throws", 6, T_THROWS }, { "super", 5, T_SUPER }, { "panic", 5, T_PANIC }, { "opaque", 6, T_OPAQUE }, }; /* Initialize scanner with source text. */ void scanner_init(scanner_t *s, const char *file, const char *source) { s->file = file; s->source = source; s->token = source; s->cursor = source; } /* Check if we've reached the end. */ static bool is_eof(scanner_t *s) { return *s->cursor == '\0'; } /* Peek at next character. */ static char peek(scanner_t *s) { if (is_eof(s)) return '\0'; return s->cursor[1]; } /* Advance current position and return previous char. */ static char advance(scanner_t *s) { s->cursor++; return s->cursor[-1]; } /* Match expected character. */ static bool consume(scanner_t *s, char expected) { if (is_eof(s)) return false; if (*s->cursor != expected) return false; s->cursor++; return true; } /* Create a token of given class. */ static token_t tok(scanner_t *s, tokenclass_t cls) { token_t t = { .cls = cls, .start = s->token, .length = (usize)(s->cursor - s->token), .position = (usize)(s->token - s->source) }; return t; } /* Create an error token. */ static token_t error_tok( scanner_t *s, const char *offset, const char *message ) { token_t t = { .cls = T_INVALID, .start = message, .length = strlen(message), .position = (usize)(offset - s->source) }; return t; } /* Skip whitespace and comments. */ static void skip_whitespace(scanner_t *s) { for (;;) { switch (*s->cursor) { case ' ': case '\r': case '\t': advance(s); break; case '\n': advance(s); break; case '/': if (peek(s) == '/') { /* Comment goes until end of line. */ while (*s->cursor != '\n' && !is_eof(s)) advance(s); } else { return; } break; default: return; } } } /* Check if character is digit. */ static bool is_digit(char c) { return c >= '0' && c <= '9'; } /* Check if character is hex digit. */ static bool is_hex_digit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } /* Check if character is binary digit. */ static bool is_bin_digit(char c) { return c == '0' || c == '1'; } /* Check if character is letter. */ static bool is_alpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } /* Scan a number token. */ static token_t scan_number(scanner_t *s) { bool signed_token = (s->token[0] == '-') || (s->token[0] == '+'); if (signed_token) advance(s); /* Consume the leading sign. */ /* Check for hex literal (0x or 0X prefix) */ if (s->cursor[-1] == '0' && (*s->cursor == 'x' || *s->cursor == 'X')) { advance(s); /* Consume the 'x' or 'X' */ /* Must have at least one hex digit after 0x */ if (!is_hex_digit(*s->cursor)) return error_tok(s, s->token, "invalid hex literal"); while (is_hex_digit(*s->cursor)) advance(s); return tok(s, T_NUMBER); } /* Check for binary literal (0b or 0B prefix) */ if (s->cursor[-1] == '0' && (*s->cursor == 'b' || *s->cursor == 'B')) { advance(s); /* Consume the 'b' or 'B' */ /* Must have at least one binary digit after 0b */ if (!is_bin_digit(*s->cursor)) return error_tok(s, s->token, "invalid binary literal"); while (is_bin_digit(*s->cursor)) advance(s); return tok(s, T_NUMBER); } /* Regular decimal number */ while (is_digit(*s->cursor)) advance(s); /* Look for decimal part. */ if (*s->cursor == '.' && is_digit(peek(s))) { advance(s); /* Consume the "." */ while (is_digit(*s->cursor)) advance(s); } return tok(s, T_NUMBER); } /* Scan a string. */ static token_t scan_string(scanner_t *s) { while (*s->cursor != '"' && !is_eof(s)) { consume(s, '\\'); // Consume escapes. advance(s); } if (!consume(s, '"')) return error_tok(s, s->token, "unterminated string"); return tok(s, T_STRING); } /* Scan a character, such as: 'z' */ static token_t scan_char(scanner_t *s) { while (*s->cursor != '\'' && !is_eof(s)) { if (!isprint(*s->cursor)) return error_tok(s, s->token, "invalid character"); consume(s, '\\'); advance(s); } if (!consume(s, '\'')) return error_tok(s, s->token, "unterminated character"); return tok(s, T_CHAR); } /* Return a keyword or identifier token. */ static tokenclass_t keyword_or_ident(const char *start, usize length) { for (usize i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) { if (length == keywords[i].length && memcmp(start, keywords[i].name, length) == 0) { return keywords[i].tok; } } return T_IDENT; } /* Scan an identifier, label or keyword. */ static token_t scan_identifier(scanner_t *s) { while (is_alpha(*s->cursor) || is_digit(*s->cursor) || *s->cursor == '_' || *s->cursor == '#') advance(s); return tok(s, keyword_or_ident(s->token, (usize)(s->cursor - s->token))); } /* Scan the next token. */ token_t scanner_next(scanner_t *s) { skip_whitespace(s); s->token = s->cursor; if (is_eof(s)) return tok(s, T_EOF); char c = advance(s); if (is_digit(c)) return scan_number(s); if (is_alpha(c)) return scan_identifier(s); switch (c) { case '\'': return scan_char(s); case '"': return scan_string(s); case '(': return tok(s, T_LPAREN); case ')': return tok(s, T_RPAREN); case '{': return tok(s, T_LBRACE); case '}': return tok(s, T_RBRACE); case '[': return tok(s, T_LBRACKET); case ']': return tok(s, T_RBRACKET); case ';': return tok(s, T_SEMICOLON); case ',': return tok(s, T_COMMA); case '.': if (*s->cursor == '.') { advance(s); return tok(s, T_DOT_DOT); } return tok(s, T_DOT); case ':': if (*s->cursor == ':') { advance(s); return tok(s, T_COLON_COLON); } return tok(s, T_COLON); case '-': if (*s->cursor == '>') { advance(s); return tok(s, T_ARROW); } /* If followed by a digit, scan as negative number */ if (is_digit(*s->cursor)) { return scan_number(s); } return tok(s, T_MINUS); case '+': if (is_digit(*s->cursor)) { return scan_number(s); } return tok(s, T_PLUS); case '/': return tok(s, T_SLASH); case '*': return tok(s, T_STAR); case '%': return tok(s, T_PERCENT); case '&': return tok(s, T_AMP); case '?': return tok(s, T_QUESTION); case '!': return tok(s, consume(s, '=') ? T_BANG_EQ : T_BANG); case '=': if (*s->cursor == '>') { advance(s); return tok(s, T_FAT_ARROW); } return tok(s, consume(s, '=') ? T_EQ_EQ : T_EQ); case '<': if (*s->cursor == '<') { advance(s); return tok(s, T_LSHIFT); } return tok(s, consume(s, '=') ? T_LT_EQ : T_LT); case '>': if (*s->cursor == '>') { advance(s); return tok(s, T_RSHIFT); } return tok(s, consume(s, '=') ? T_GT_EQ : T_GT); case '|': return tok(s, T_PIPE); case '^': return tok(s, T_CARET); case '~': return tok(s, T_TILDE); case '@': /* Scan @identifier as a single token. */ if (!is_alpha(*s->cursor)) return error_tok(s, s->token, "expected identifier after `@`"); while (is_alpha(*s->cursor)) advance(s); return tok(s, T_AT_IDENT); case '_': if (is_alpha(*s->cursor) || is_digit(*s->cursor) || *s->cursor == '_') { /* This is part of an identifier like `_foo` or `__start` */ return scan_identifier(s); } return tok(s, T_UNDERSCORE); } return error_tok(s, s->token, "unexpected character"); } /* Get the source code location from a byte offset. */ location_t scanner_get_location(scanner_t *s, u32 position) { u32 l = 1; u32 c = 1; for (u32 i = 0; i < position; i++) { if (s->source[i] == '\n') { l++; c = 1; } else { c++; } } return (location_t){ .file = s->file, .src = s->source + position, .line = l, .col = c, }; }