compiler/
lib/
examples/
std/
arch/
rv64/
asm/
scanner/
emit.rad
8.2 KiB
parser.rad
29.8 KiB
scanner.rad
9.0 KiB
tests.rad
6.3 KiB
asm.rad
22.6 KiB
decode.rad
14.3 KiB
emit.rad
26.0 KiB
encode.rad
21.5 KiB
isel.rad
47.0 KiB
printer.rad
12.6 KiB
tests.rad
17.1 KiB
rv64.rad
11.9 KiB
char/
collections/
lang/
sys/
arch.rad
68 B
char.rad
855 B
collections.rad
39 B
fmt.rad
8.5 KiB
intrinsics.rad
391 B
io.rad
1.3 KiB
lang.rad
258 B
mem.rad
2.2 KiB
sys.rad
173 B
testing.rad
2.4 KiB
tests.rad
14.8 KiB
vec.rad
3.2 KiB
std.rad
281 B
scripts/
seed/
sublime/
test/
vim/
.gitignore
366 B
.gitsigners
112 B
LICENSE
1.1 KiB
Makefile
3.6 KiB
README
2.5 KiB
STYLE
2.5 KiB
std.lib
1.2 KiB
std.lib.test
347 B
lib/std/arch/rv64/asm/scanner.rad
raw
| 1 | //! Assembly-specific lexical scanner. |
| 2 | @test mod tests; |
| 3 | |
| 4 | use std::char; |
| 5 | use std::lang::strings; |
| 6 | |
/// Token kinds recognized by the assembler scanner.
///
/// Every token returned by `next` carries exactly one of these kinds. The
/// payload-free variants fall into three groups: control tokens, punctuation,
/// and tokens whose source text is significant (names and literals).
export union TokenKind {
    /// Special end-of-file token generated when the input is exhausted.
    Eof,
    /// Special invalid token carrying an error message in [`Token::source`].
    /// The message replaces the source text; only the offset refers to the input.
    Invalid,

    // Punctuation. The trailing comment shows the exact source spelling.
    LParen, // (
    RParen, // )
    Comma, // ,
    // NOTE(review): `next` in this file reports a lone `:` as an invalid token,
    // so the scanner itself never produces `Colon`.
    Colon, // :
    ColonColon, // ::
    Semicolon, // ;
    // `-` and `+` are only emitted as punctuation when not directly followed by
    // a digit; otherwise they become part of a `Number` token.
    Minus, // -
    Plus, // +
    // A `/` directly followed by another `/` starts a line comment and is
    // skipped as whitespace instead of producing `Slash`.
    Slash, // /
    Star, // *

    /// Bare identifier used for mnemonics, constants, CSR names, and symbol segments.
    Ident,
    /// Identifier-shaped label token including the leading `@`.
    Label,
    /// Quoted label token including the leading `@` and quote delimiters.
    QuotedLabel,
    /// Directive token including the leading `.`.
    Directive,
    /// Register token including the leading `%`.
    Register,

    /// String literal token including delimiters.
    String,
    /// Character literal token including delimiters.
    Char,
    /// Integer literal token: decimal digits or a `0x`/`0X` hexadecimal
    /// literal, including a leading `+` or `-` when the sign is directly
    /// followed by a digit.
    Number,
}
| 43 | |
/// Describes where assembler source originated from.
///
/// The scanner only stores this value (see `Scanner::sourceKind`); nothing in
/// the scanning functions of this file branches on it.
export union SourceKind {
    /// Source loaded from a file at the given path.
    File { path: *[u8] },
    /// Source provided as an inline string.
    String,
}
| 51 | |
/// Lexical scanner state for assembler source.
///
/// `token` and `cursor` delimit the text of the token currently being
/// scanned: `next` sets `token` to `cursor` after skipping whitespace, and the
/// scan helpers then move `cursor` forward.
export record Scanner {
    /// Origin of the source being scanned.
    sourceKind: SourceKind,
    /// Source buffer.
    source: *[u8],
    /// Offset of the current token in `source`.
    token: u32,
    /// Offset of the current cursor in `source`.
    cursor: u32,
    /// Current token observed by the parser.
    /// Initialised by `scanner` to an invalid token with an empty message; the
    /// scanning functions in this file never write it.
    /// NOTE(review): presumably maintained by the parser — confirm.
    current: Token,
    /// Previously consumed token observed by the parser.
    /// Initialised like `current`; not written by this file.
    previous: Token,
    /// Intern pool for identifier-shaped token text.
    pool: *mut strings::Pool,
}
| 69 | |
/// Individual token with kind, source text, and byte offset.
export record Token {
    /// Token kind.
    kind: TokenKind,
    /// Token source text.
    /// For `TokenKind::Invalid` this holds the error message instead. For
    /// identifier-shaped tokens it is the value returned by `strings::intern`;
    /// for other tokens it is a slice of the scanner's source buffer.
    source: *[u8],
    /// Byte offset of `source` in the input buffer.
    /// For invalid tokens this is the offset at which the failed token started.
    offset: u32,
}
| 79 | |
/// Build a scanner positioned at the start of `source`.
///
/// Both the token start and the cursor begin at offset zero. Until a real
/// token is stored, `current` and `previous` hold the same placeholder: an
/// invalid token at offset zero with an empty message.
export fn scanner(sourceKind: SourceKind, source: *[u8], pool: *mut strings::Pool) -> Scanner {
    let placeholder = invalid(0, "");
    return Scanner {
        sourceKind,
        source,
        token: 0,
        cursor: 0,
        current: placeholder,
        previous: placeholder,
        pool,
    };
}
| 93 | |
/// Build a `TokenKind::Invalid` token.
///
/// `message` is stored in the token's `source` field in place of input text,
/// and `offset` records where in the input the problem was found.
export fn invalid(offset: u32, message: *[u8]) -> Token {
    return Token {
        kind: TokenKind::Invalid,
        source: message,
        offset,
    };
}
| 98 | |
/// Report whether the cursor has reached or passed the end of the source buffer.
export fn isEof(s: *Scanner) -> bool {
    if s.cursor >= s.source.len {
        return true;
    }
    return false;
}
| 103 | |
/// Look at the byte under the cursor without consuming it.
///
/// Yields `nil` once the input is exhausted.
fn current(s: *Scanner) -> ?u8 {
    if not isEof(s) {
        return s.source[s.cursor];
    }
    return nil;
}
| 111 | |
/// Look at the byte one position past the cursor without consuming anything.
///
/// Yields `nil` when that position lies outside the source buffer.
fn peek(s: *Scanner) -> ?u8 {
    let ahead = s.cursor + 1;
    if ahead >= s.source.len {
        return nil;
    }
    return s.source[ahead];
}
| 119 | |
/// Consume the byte under the cursor and return it.
///
/// No end-of-input check is performed here; callers are expected to have
/// established that a byte is available before calling.
fn advance(s: *mut Scanner) -> u8 {
    let consumed = s.source[s.cursor];
    set s.cursor += 1;
    return consumed;
}
| 125 | |
/// Consume the byte under the cursor only if it equals `expected`.
///
/// Returns `true` when a byte was consumed. At end of input, or when the byte
/// differs, the cursor is left untouched and `false` is returned.
fn consume(s: *mut Scanner, expected: u8) -> bool {
    let ch = current(s) else {
        return false;
    };
    if ch <> expected {
        return false;
    }
    advance(s);
    return true;
}
| 134 | |
/// Skip spaces, newlines, carriage returns, tabs, and `//` line comments.
///
/// Stops with the cursor on the first byte that is neither whitespace nor the
/// start of a line comment, or at end of input. A single `/` that is not
/// followed by another `/` is left in place for `next` to tokenize.
fn skipWhitespace(s: *mut Scanner) {
    while let ch = current(s) {
        if ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t' {
            advance(s);
        } else {
            // Anything other than the start of a `//` comment ends the skip.
            if ch <> '/' {
                return;
            }
            let nextCh = peek(s) else {
                return;
            };
            if nextCh <> '/' {
                return;
            }
            // Drop the comment body; the terminating newline is handled by
            // the next iteration of the outer loop.
            while let lineCh = current(s); lineCh <> '\n' {
                advance(s);
            }
        }
    }
}
| 153 | |
/// Scan and return the next token from the assembler source.
///
/// Leading whitespace and line comments are skipped first, then the token
/// start is pinned to the cursor. End of input yields `TokenKind::Eof`.
/// Otherwise the first byte selects the scan routine: digits start numbers,
/// letters and `_` start identifiers, and the remaining bytes are matched as
/// punctuation, sigils, or literal delimiters. Unrecognized bytes, and a `:`
/// that is not part of `::`, produce an invalid token.
export fn next(s: *mut Scanner) -> Token {
    skipWhitespace(s);
    set s.token = s.cursor;

    let ch = current(s) else {
        return tok(s, TokenKind::Eof);
    };
    advance(s);

    if char::isDigit(ch) {
        return scanNumber(s);
    }
    if ch == '_' or char::isAlpha(ch) {
        return scanIdentToken(s, TokenKind::Ident);
    }

    match ch {
        // Single-byte punctuation.
        case '(' => return tok(s, TokenKind::LParen),
        case ')' => return tok(s, TokenKind::RParen),
        case ',' => return tok(s, TokenKind::Comma),
        case ';' => return tok(s, TokenKind::Semicolon),
        case '/' => return tok(s, TokenKind::Slash),
        case '*' => return tok(s, TokenKind::Star),
        // Only the doubled form `::` is accepted.
        case ':' => {
            if not consume(s, ':') {
                return invalid(s.token, "unexpected `:`");
            }
            return tok(s, TokenKind::ColonColon);
        }
        // A sign may begin a number or stand alone.
        case '-' => return scanSignedNumberOrToken(s, TokenKind::Minus),
        case '+' => return scanSignedNumberOrToken(s, TokenKind::Plus),
        // Literals.
        case '"' => return scanString(s),
        case '\'' => return scanChar(s),
        // Sigil-prefixed names.
        case '.' => return scanPrefixedToken(s, TokenKind::Directive, "expected directive name after `.`"),
        case '@' => return scanLabelToken(s),
        case '%' => return scanPrefixedToken(s, TokenKind::Register, "expected register after `%`"),
        else => return invalid(s.token, "unexpected character"),
    }
}
| 194 | |
/// Build a token of `kind` whose text is the source range from the token
/// start up to, but not including, the cursor.
fn tok(s: *Scanner, kind: TokenKind) -> Token {
    return Token {
        kind,
        source: &s.source[s.token..s.cursor],
        offset: s.token,
    };
}
| 199 | |
/// Consume identifier continuation bytes (letters, digits, and `_`) starting
/// at the cursor, stopping at the first other byte or at end of input.
fn scanIdentifierBody(s: *mut Scanner) {
    while let ch = current(s) {
        if not char::isAlpha(ch) and not char::isDigit(ch) and ch <> '_' {
            return;
        }
        advance(s);
    }
}
| 206 | |
/// Decide whether an already-consumed `+` or `-` starts a signed number.
///
/// When the byte under the cursor is a digit, the sign and digits are scanned
/// together as a number. Otherwise the sign alone becomes a token of `kind`.
fn scanSignedNumberOrToken(s: *mut Scanner, kind: TokenKind) -> Token {
    let nextCh = current(s) else {
        return tok(s, kind);
    };
    if not char::isDigit(nextCh) {
        return tok(s, kind);
    }
    return scanNumber(s);
}
| 214 | |
/// Scan a numeric literal.
///
/// Must be called with at least one byte of the token already consumed: either
/// the first digit, or a `+`/`-` sign with the cursor resting on the first
/// digit. Accepts decimal digits, or a `0x`/`0X` prefix followed by one or
/// more hexadecimal digits.
///
/// Returns a `TokenKind::Number` token spanning the whole literal including
/// any sign, or an invalid token with the message "invalid hex literal" when
/// the hex prefix is not followed by a hex digit — including when the input
/// ends right after the prefix.
fn scanNumber(s: *mut Scanner) -> Token {
    // When entered through a sign, consume the first digit so that
    // `s.cursor - 1` addresses the leading digit in both entry paths.
    let first = s.source[s.cursor - 1];
    if first == '-' or first == '+' {
        advance(s);
    }
    if s.source[s.cursor - 1] == '0' {
        if let ch = current(s); ch == 'x' or ch == 'X' {
            advance(s);
            // The prefix needs at least one digit. Reaching end of input here
            // must be rejected too, otherwise a bare `0x` would be accepted.
            let digit = current(s) else {
                return invalid(s.token, "invalid hex literal");
            };
            if not char::isHexDigit(digit) {
                return invalid(s.token, "invalid hex literal");
            }
            while let hexDigit = current(s); char::isHexDigit(hexDigit) {
                advance(s);
            }
            return tok(s, TokenKind::Number);
        }
    }
    while let digit = current(s); char::isDigit(digit) {
        advance(s);
    }
    return tok(s, TokenKind::Number);
}
| 238 | |
/// Scan the body of a delimited token up to and including the closing `delim`.
///
/// The opening delimiter must already be consumed. A backslash consumes the
/// byte that follows it unconditionally, so an escaped delimiter does not end
/// the token. Returns:
/// - a token of `kind` spanning the whole token when the closing delimiter is found,
/// - an invalid token when an unescaped non-printable byte is encountered,
/// - `nil` when the input ends before the closing delimiter.
fn scanCharsUntil(s: *mut Scanner, delim: u8, kind: TokenKind) -> ?Token {
    while let ch = current(s); ch <> delim {
        if not char::isPrint(ch) {
            return invalid(s.token, "invalid character");
        }
        advance(s);
        if ch == '\\' {
            // An escape needs a following byte; take it without inspection.
            if isEof(s) {
                return nil;
            }
            advance(s);
        }
    }
    if consume(s, delim) {
        return tok(s, kind);
    }
    return nil;
}
| 257 | |
/// Scan a string literal whose opening `"` has already been consumed.
///
/// Yields the token produced by `scanCharsUntil`, or an invalid token with
/// the message "unterminated string" when the closing quote is missing.
fn scanString(s: *mut Scanner) -> Token {
    let token = scanCharsUntil(s, '"', TokenKind::String) else {
        return invalid(s.token, "unterminated string");
    };
    return token;
}
| 265 | |
/// Scan a character literal whose opening `'` has already been consumed.
///
/// Yields the token produced by `scanCharsUntil`, or an invalid token with
/// the message "unterminated character" when the closing quote is missing.
fn scanChar(s: *mut Scanner) -> Token {
    let token = scanCharsUntil(s, '\'', TokenKind::Char) else {
        return invalid(s.token, "unterminated character");
    };
    return token;
}
| 273 | |
/// Finish scanning an identifier-shaped token and tag it with `kind`.
///
/// Consumes the remaining identifier bytes, then passes the full token text
/// to `strings::intern` and stores the result as the token's source.
fn scanIdentToken(s: *mut Scanner, kind: TokenKind) -> Token {
    scanIdentifierBody(s);

    let text = strings::intern(s.pool, &s.source[s.token..s.cursor]);
    return Token { kind, source: text, offset: s.token };
}
| 284 | |
/// Scan an identifier-shaped token that follows an already-consumed sigil.
///
/// The byte after the sigil must be a letter or `_`. If it is not, or the
/// input has ended, an invalid token carrying `message` is returned and the
/// cursor stays just after the sigil. On success the token text includes the
/// sigil and is interned.
fn scanPrefixedToken(s: *mut Scanner, kind: TokenKind, message: *[u8]) -> Token {
    if let ch = current(s); char::isAlpha(ch) or ch == '_' {
        // Same tail as a bare identifier: consume the body, intern the text.
        return scanIdentToken(s, kind);
    }
    return invalid(s.token, message);
}
| 301 | |
/// Scan a label after its leading `@` has been consumed.
///
/// Two spellings are accepted: `@name`, an identifier-shaped label, and
/// `@"quoted"`, a quoted label scanned like a string body. Missing or
/// malformed names yield an invalid token.
fn scanLabelToken(s: *mut Scanner) -> Token {
    let ch = current(s) else {
        return invalid(s.token, "expected label after `@`");
    };
    if ch <> '"' {
        return scanPrefixedToken(s, TokenKind::Label, "expected label after `@`");
    }
    // Quoted form: step over the opening quote, then read up to the closing one.
    advance(s);
    let token = scanCharsUntil(s, '"', TokenKind::QuotedLabel) else {
        return invalid(s.token, "unterminated quoted label");
    };
    return token;
}