scanner.c 10.2 KiB raw
1
#include <ctype.h>
2
#include <string.h>
3
4
#include "scanner.h"
5
#include "types.h"
6
7
/* Keyword lookup table. */
8
static const struct {
9
    const char  *name;
10
    usize        length;
11
    tokenclass_t tok;
12
} keywords[] = {
13
    { "fn", 2, T_FN },           { "pub", 3, T_PUB },
14
    { "return", 6, T_RETURN },   { "while", 5, T_WHILE },
15
    { "mut", 3, T_MUT },         { "let", 3, T_LET },
16
    { "static", 6, T_STATIC },   { "if", 2, T_IF },
17
    { "else", 4, T_ELSE },       { "i8", 2, T_I8 },
18
    { "i16", 3, T_I16 },         { "i32", 3, T_I32 },
19
    { "i64", 3, T_I64 },         { "u8", 2, T_U8 },
20
    { "u16", 3, T_U16 },         { "u32", 3, T_U32 },
21
    { "u64", 3, T_U64 },         { "f32", 3, T_F32 },
22
    { "bool", 4, T_BOOL },       { "void", 4, T_VOID },
23
    { "true", 4, T_TRUE },       { "false", 5, T_FALSE },
24
    { "nil", 3, T_NIL },         { "loop", 4, T_LOOP },
25
    { "try", 3, T_TRY },         { "catch", 5, T_CATCH },
26
    { "for", 3, T_FOR },         { "in", 2, T_IN },
27
    { "const", 5, T_CONST },     { "break", 5, T_BREAK },
28
    { "throw", 5, T_THROW },     { "union", 5, T_UNION },
29
    { "and", 3, T_AND },         { "or", 2, T_OR },
30
    { "not", 3, T_NOT },         { "match", 5, T_MATCH },
31
    { "use", 3, T_USE },         { "case", 4, T_CASE },
32
    { "extern", 6, T_EXTERN },   { "mod", 3, T_MOD },
33
    { "as", 2, T_AS },           { "record", 6, T_RECORD },
34
    { "undefined", 9, T_UNDEF }, { "align", 5, T_ALIGN },
35
    { "throws", 6, T_THROWS },   { "super", 5, T_SUPER },
36
    { "panic", 5, T_PANIC },     { "opaque", 6, T_OPAQUE },
37
};
38
39
/* Initialize scanner with source text. */
40
void scanner_init(scanner_t *s, const char *file, const char *source) {
41
    s->file   = file;
42
    s->source = source;
43
    s->token  = source;
44
    s->cursor = source;
45
}
46
47
/* Check if we've reached the end. */
48
static bool is_eof(scanner_t *s) {
49
    return *s->cursor == '\0';
50
}
51
52
/* Peek at next character. */
53
static char peek(scanner_t *s) {
54
    if (is_eof(s))
55
        return '\0';
56
    return s->cursor[1];
57
}
58
59
/* Advance current position and return previous char. */
60
static char advance(scanner_t *s) {
61
    s->cursor++;
62
    return s->cursor[-1];
63
}
64
65
/* Match expected character. */
66
static bool consume(scanner_t *s, char expected) {
67
    if (is_eof(s))
68
        return false;
69
    if (*s->cursor != expected)
70
        return false;
71
    s->cursor++;
72
73
    return true;
74
}
75
76
/* Create a token of given class. */
77
static token_t tok(scanner_t *s, tokenclass_t cls) {
78
    token_t t = { .cls      = cls,
79
                  .start    = s->token,
80
                  .length   = (usize)(s->cursor - s->token),
81
                  .position = (usize)(s->token - s->source) };
82
    return t;
83
}
84
85
/* Create an error token. */
86
static token_t error_tok(
87
    scanner_t *s, const char *offset, const char *message
88
) {
89
    token_t t = { .cls      = T_INVALID,
90
                  .start    = message,
91
                  .length   = strlen(message),
92
                  .position = (usize)(offset - s->source) };
93
    return t;
94
}
95
96
/* Skip whitespace and comments. */
97
static void skip_whitespace(scanner_t *s) {
98
    for (;;) {
99
        switch (*s->cursor) {
100
        case ' ':
101
        case '\r':
102
        case '\t':
103
            advance(s);
104
            break;
105
        case '\n':
106
            advance(s);
107
            break;
108
        case '/':
109
            if (peek(s) == '/') {
110
                /* Comment goes until end of line. */
111
                while (*s->cursor != '\n' && !is_eof(s))
112
                    advance(s);
113
            } else {
114
                return;
115
            }
116
            break;
117
        default:
118
            return;
119
        }
120
    }
121
}
122
123
/* True for the ASCII decimal digits '0' through '9'. */
static bool is_digit(char c) {
    if (c < '0')
        return false;
    return c <= '9';
}
127
128
/* True for ASCII hexadecimal digits: 0-9, a-f and A-F. */
static bool is_hex_digit(char c) {
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
133
134
/* True for the two binary digits '0' and '1'. */
static bool is_bin_digit(char c) {
    switch (c) {
    case '0':
    case '1':
        return true;
    default:
        return false;
    }
}
138
139
/* True for ASCII letters a-z and A-Z. The underscore is not a letter here. */
static bool is_alpha(char c) {
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
143
144
/* Scan a number token. */
145
static token_t scan_number(scanner_t *s) {
146
    bool signed_token = (s->token[0] == '-') || (s->token[0] == '+');
147
148
    if (signed_token)
149
        advance(s); /* Consume the leading sign. */
150
151
    /* Check for hex literal (0x or 0X prefix) */
152
    if (s->cursor[-1] == '0' && (*s->cursor == 'x' || *s->cursor == 'X')) {
153
        advance(s); /* Consume the 'x' or 'X' */
154
        /* Must have at least one hex digit after 0x */
155
        if (!is_hex_digit(*s->cursor))
156
            return error_tok(s, s->token, "invalid hex literal");
157
158
        while (is_hex_digit(*s->cursor))
159
            advance(s);
160
161
        return tok(s, T_NUMBER);
162
    }
163
164
    /* Check for binary literal (0b or 0B prefix) */
165
    if (s->cursor[-1] == '0' && (*s->cursor == 'b' || *s->cursor == 'B')) {
166
        advance(s); /* Consume the 'b' or 'B' */
167
        /* Must have at least one binary digit after 0b */
168
        if (!is_bin_digit(*s->cursor))
169
            return error_tok(s, s->token, "invalid binary literal");
170
171
        while (is_bin_digit(*s->cursor))
172
            advance(s);
173
174
        return tok(s, T_NUMBER);
175
    }
176
177
    /* Regular decimal number */
178
    while (is_digit(*s->cursor))
179
        advance(s);
180
181
    /* Look for decimal part. */
182
    if (*s->cursor == '.' && is_digit(peek(s))) {
183
        advance(s); /* Consume the "." */
184
        while (is_digit(*s->cursor))
185
            advance(s);
186
    }
187
    return tok(s, T_NUMBER);
188
}
189
190
/* Scan a string. */
191
static token_t scan_string(scanner_t *s) {
192
    while (*s->cursor != '"' && !is_eof(s)) {
193
        consume(s, '\\'); // Consume escapes.
194
        advance(s);
195
    }
196
    if (!consume(s, '"'))
197
        return error_tok(s, s->token, "unterminated string");
198
199
    return tok(s, T_STRING);
200
}
201
202
/* Scan a character, such as: 'z' */
203
static token_t scan_char(scanner_t *s) {
204
    while (*s->cursor != '\'' && !is_eof(s)) {
205
        if (!isprint(*s->cursor))
206
            return error_tok(s, s->token, "invalid character");
207
208
        consume(s, '\\');
209
        advance(s);
210
    }
211
    if (!consume(s, '\''))
212
        return error_tok(s, s->token, "unterminated character");
213
214
    return tok(s, T_CHAR);
215
}
216
217
/* Return a keyword or identifier token. */
218
static tokenclass_t keyword_or_ident(const char *start, usize length) {
219
    for (usize i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++) {
220
        if (length == keywords[i].length &&
221
            memcmp(start, keywords[i].name, length) == 0) {
222
            return keywords[i].tok;
223
        }
224
    }
225
    return T_IDENT;
226
}
227
228
/* Scan an identifier, label or keyword. */
229
static token_t scan_identifier(scanner_t *s) {
230
    while (is_alpha(*s->cursor) || is_digit(*s->cursor) || *s->cursor == '_' ||
231
           *s->cursor == '#')
232
        advance(s);
233
234
    return tok(s, keyword_or_ident(s->token, (usize)(s->cursor - s->token)));
235
}
236
237
/* Scan the next token. */
238
token_t scanner_next(scanner_t *s) {
239
    skip_whitespace(s);
240
    s->token = s->cursor;
241
242
    if (is_eof(s))
243
        return tok(s, T_EOF);
244
245
    char c = advance(s);
246
247
    if (is_digit(c))
248
        return scan_number(s);
249
    if (is_alpha(c))
250
        return scan_identifier(s);
251
252
    switch (c) {
253
    case '\'':
254
        return scan_char(s);
255
    case '"':
256
        return scan_string(s);
257
    case '(':
258
        return tok(s, T_LPAREN);
259
    case ')':
260
        return tok(s, T_RPAREN);
261
    case '{':
262
        return tok(s, T_LBRACE);
263
    case '}':
264
        return tok(s, T_RBRACE);
265
    case '[':
266
        return tok(s, T_LBRACKET);
267
    case ']':
268
        return tok(s, T_RBRACKET);
269
    case ';':
270
        return tok(s, T_SEMICOLON);
271
    case ',':
272
        return tok(s, T_COMMA);
273
    case '.':
274
        if (*s->cursor == '.') {
275
            advance(s);
276
            return tok(s, T_DOT_DOT);
277
        }
278
        return tok(s, T_DOT);
279
    case ':':
280
        if (*s->cursor == ':') {
281
            advance(s);
282
            return tok(s, T_COLON_COLON);
283
        }
284
        return tok(s, T_COLON);
285
    case '-':
286
        if (*s->cursor == '>') {
287
            advance(s);
288
            return tok(s, T_ARROW);
289
        }
290
        /* If followed by a digit, scan as negative number */
291
        if (is_digit(*s->cursor)) {
292
            return scan_number(s);
293
        }
294
        return tok(s, T_MINUS);
295
    case '+':
296
        if (is_digit(*s->cursor)) {
297
            return scan_number(s);
298
        }
299
        return tok(s, T_PLUS);
300
    case '/':
301
        return tok(s, T_SLASH);
302
    case '*':
303
        return tok(s, T_STAR);
304
    case '%':
305
        return tok(s, T_PERCENT);
306
    case '&':
307
        return tok(s, T_AMP);
308
    case '?':
309
        return tok(s, T_QUESTION);
310
    case '!':
311
        return tok(s, consume(s, '=') ? T_BANG_EQ : T_BANG);
312
    case '=':
313
        if (*s->cursor == '>') {
314
            advance(s);
315
            return tok(s, T_FAT_ARROW);
316
        }
317
        return tok(s, consume(s, '=') ? T_EQ_EQ : T_EQ);
318
    case '<':
319
        if (*s->cursor == '<') {
320
            advance(s);
321
            return tok(s, T_LSHIFT);
322
        }
323
        return tok(s, consume(s, '=') ? T_LT_EQ : T_LT);
324
    case '>':
325
        if (*s->cursor == '>') {
326
            advance(s);
327
            return tok(s, T_RSHIFT);
328
        }
329
        return tok(s, consume(s, '=') ? T_GT_EQ : T_GT);
330
    case '|':
331
        return tok(s, T_PIPE);
332
    case '^':
333
        return tok(s, T_CARET);
334
    case '~':
335
        return tok(s, T_TILDE);
336
    case '@':
337
        /* Scan @identifier as a single token. */
338
        if (!is_alpha(*s->cursor))
339
            return error_tok(s, s->token, "expected identifier after `@`");
340
        while (is_alpha(*s->cursor))
341
            advance(s);
342
        return tok(s, T_AT_IDENT);
343
    case '_':
344
        if (is_alpha(*s->cursor) || is_digit(*s->cursor) || *s->cursor == '_') {
345
            /* This is part of an identifier like `_foo` or `__start` */
346
            return scan_identifier(s);
347
        }
348
        return tok(s, T_UNDERSCORE);
349
    }
350
    return error_tok(s, s->token, "unexpected character");
351
}
352
353
/* Get the source code location from a byte offset. */
354
location_t scanner_get_location(scanner_t *s, u32 position) {
355
    u32 l = 1;
356
    u32 c = 1;
357
358
    for (u32 i = 0; i < position; i++) {
359
        if (s->source[i] == '\n') {
360
            l++;
361
            c = 1;
362
        } else {
363
            c++;
364
        }
365
    }
366
    return (location_t){
367
        .file = s->file,
368
        .src  = s->source + position,
369
        .line = l,
370
        .col  = c,
371
    };
372
}