radiance · Radiance self-hosting compiler

compiler/ lib/ examples/ std/ arch/ rv64/ tests/ decode.rad 14.6 KiB emit.rad 24.4 KiB encode.rad 19.9 KiB isel.rad 41.1 KiB printer.rad 13.0 KiB tests.rad 15.7 KiB rv64.rad 13.0 KiB collections/ lang/ sys/ arch.rad 65 B collections.rad 36 B fmt.rad 3.8 KiB intrinsics.rad 206 B io.rad 1.2 KiB lang.rad 222 B mem.rad 2.2 KiB sys.rad 167 B testing.rad 2.4 KiB tests.rad 11.6 KiB vec.rad 3.1 KiB std.rad 231 B scripts/ seed/ test/ vim/ .gitignore 353 B .gitsigners 112 B LICENSE 1.1 KiB Makefile 3.1 KiB README 2.5 KiB std.lib 987 B std.lib.test 252 B
lib/std/arch/rv64/isel.rad 41.1 KiB raw
//! RV64 instruction selection.
//!
//! Walks IL and selects RV64 instructions for each operation.
//!
//! *Register resolution hierarchy*
//!
//!   getReg(ssa) -> Reg
//!     Primitive physical register lookup. Panics if the register is spilled.
//!     Used as a building block by the functions below.
//!
//!   getSrcReg(ssa, scratch) -> Reg
//!     Source register for an [`il::Reg`] operand. Returns the physical register,
//!     or loads a spilled value into `scratch`. Used for instruction fields
//!     typed as [`il::Reg`] (e.g. base addresses in Load/Store/Blit).
//!
//!   getDstReg(ssa, scratch) -> Reg
//!     Destination register for an instruction result. Returns the physical
//!     register, or records a pending spill and returns `scratch`. The pending
//!     spill is flushed by [`selectBlock`] after each instruction.
//!
//!   resolveVal(scratch, val) -> Reg
//!     Resolve an [`il::Val`] to whatever register holds it. Delegates to [`getSrcReg`]
//!     for register values; materializes immediates and symbols into `scratch`.
//!     Used for operands that can be consumed from any register.
//!
//!   loadVal(rd, val) -> Reg
//!     Force an [`il::Val`] into a specific register `rd`. Built on [`resolveVal`] + [`emitMv`].
//!     Used when the instruction requires the value in `rd` (e.g. `sub rd, rd, rs2`).

use std::mem;
use std::lang::il;
use std::lang::gen::regalloc;
use std::lang::gen::labels;

use super::encode;
use super::emit;

///////////////
// Constants //
///////////////

/// Shift amount for byte sign/zero extension (64 - 8).
const SHIFT_W8: i32 = 56;
/// Shift amount for halfword sign/zero extension (64 - 16).
const SHIFT_W16: i32 = 48;
/// Shift amount for word sign/zero extension (64 - 32).
const SHIFT_W32: i32 = 32;
/// Mask for extracting byte value.
const MASK_W8: i32 = 0xFF;
/// Maximum number of block arguments supported.
const MAX_BLOCK_ARGS: u32 = 16;

/// Binary operation.
union BinOp { Add, And, Or, Xor }
/// Shift operation.
union ShiftOp { Sll, Srl, Sra }
/// Compare operation.
union CmpOp { Slt, Ult }

/// Selector errors.
pub union SelectorError {
    Internal,
}

/// A pending spill store to be flushed after instruction selection.
record PendingSpill {
    /// The SSA register that was spilled.
    ssa: il::Reg,
    /// The physical register holding the value to store.
    rd: super::Reg,
}

////////////////////
// Selector State //
////////////////////

/// Instruction selector state.
pub record Selector {
    /// Emitter for outputting instructions.
    e: *mut emit::Emitter,
    /// Register allocation result.
    ralloc: *regalloc::AllocResult,
    /// Data symbols for resolving symbol addresses.
    dataSyms: *[super::DataSym],
    /// Total stack frame size.
    frameSize: i32,
    /// Running offset into the reserve region of the frame.
    /// Tracks current position within the pre-allocated reserve slots.
    reserveOffset: i32,
    /// Pending spill store, auto-committed after each instruction.
    pendingSpill: ?PendingSpill,
    /// Next synthetic block index for skip-branch targets.
    nextSynthBlock: u32,
}

/////////////////////////
// Register Allocation //
/////////////////////////

/// Get the physical register for an already-allocated SSA register.
fn getReg(s: *Selector, ssa: il::Reg) -> super::Reg {
    let phys = s.ralloc.assignments[ssa.n] else {
        panic "getReg: spilled register has no physical assignment";
    };
    return super::Reg { n: phys };
}

/// Compute the FP-relative offset for a spill slot.
/// Spill slots live at the bottom of the frame.
/// Since `FP = SP + totalFrameSize`, the offset from FP is `slot - totalSize`.
fn spillOffset(s: *Selector, slot: i32) -> i32 {
    return slot - s.frameSize;
}

/// Get the destination register for an SSA register.
/// If the register is spilled, records a pending spill and returns the scratch
/// register. The pending spill is auto-committed by [`selectBlock`] after each
/// instruction. If not spilled, returns the physical register.
fn getDstReg(s: *mut Selector, ssa: il::Reg, scratch: super::Reg) -> super::Reg {
    if let _ = regalloc::spill::spillSlot(&s.ralloc.spill, ssa) {
        s.pendingSpill = PendingSpill { ssa, rd: scratch };
        return scratch;
    }
    return getReg(s, ssa);
}

/// Get the source register for an SSA register.
/// If the register is spilled, loads the value from the spill slot into the
/// scratch register and returns it. Otherwise returns the physical register.
fn getSrcReg(s: *mut Selector, ssa: il::Reg, scratch: super::Reg) -> super::Reg {
    if let slot = regalloc::spill::spillSlot(&s.ralloc.spill, ssa) {
        emit::emitLd(s.e, scratch, super::FP, spillOffset(s, slot));
        return scratch;
    }
    return getReg(s, ssa);
}

/// Look up symbol address in data map.
fn lookupDataSym(s: *Selector, name: *[u8]) -> u32 throws (SelectorError) {
    for sym in s.dataSyms {
        if mem::eq(sym.name, name) {
            return sym.addr;
        }
    }
    throw SelectorError::Internal;
}

/// Resolve an IL value to the physical register holding it.
/// For non-spilled register values, returns the physical register directly.
/// For immediates, symbols, and spilled registers, materializes into `scratch`.
fn resolveVal(s: *mut Selector, scratch: super::Reg, val: il::Val) -> super::Reg {
    match val {
        case il::Val::Reg(r) => {
            return getSrcReg(s, r, scratch);
        },
        case il::Val::Imm(imm) => {
            emit::loadImm(s.e, scratch, imm);
            return scratch;
        },
        case il::Val::DataSym(name) => {
            let addr = try lookupDataSym(s, name) catch {
                panic "resolveVal: data symbol not found";
            };
            emit::loadImm(s.e, scratch, addr as i64);
            return scratch;
        },
        case il::Val::FnAddr(name) => {
            emit::recordAddrLoad(s.e, name, scratch);
            return scratch;
        },
        case il::Val::Undef => {
            return scratch;
        }
    }
}

/// Load an IL value into a specific physical register.
/// Like [`resolveVal`], but ensures the value ends up in `rd`.
fn loadVal(s: *mut Selector, rd: super::Reg, val: il::Val) -> super::Reg {
    let rs = resolveVal(s, rd, val);
    emitMv(s, rd, rs);
    return rd;
}

/// Emit a move instruction if source and destination differ.
fn emitMv(s: *mut Selector, rd: super::Reg, rs: super::Reg) {
    if rd.n != rs.n {
        emit::emit(s.e, encode::mv(rd, rs));
    }
}

/// Emit zero-extension from a sub-word type to the full register width.
fn emitZext(e: *mut emit::Emitter, rd: super::Reg, rs: super::Reg, typ: il::Type) {
    match typ {
        case il::Type::W8 => emit::emit(e, encode::andi(rd, rs, MASK_W8)),
        case il::Type::W16 => {
            emit::emit(e, encode::slli(rd, rs, SHIFT_W16));
            emit::emit(e, encode::srli(rd, rd, SHIFT_W16));
        },
        case il::Type::W32 => {
            emit::emit(e, encode::slli(rd, rs, SHIFT_W32));
            emit::emit(e, encode::srli(rd, rd, SHIFT_W32));
        },
        case il::Type::W64 => {}
    }
}

/// Emit sign-extension from a sub-word type to the full register width.
fn emitSext(e: *mut emit::Emitter, rd: super::Reg, rs: super::Reg, typ: il::Type) {
    match typ {
        case il::Type::W8 => {
            emit::emit(e, encode::slli(rd, rs, SHIFT_W8));
            emit::emit(e, encode::srai(rd, rd, SHIFT_W8));
        },
        case il::Type::W16 => {
            emit::emit(e, encode::slli(rd, rs, SHIFT_W16));
            emit::emit(e, encode::srai(rd, rd, SHIFT_W16));
        },
        case il::Type::W32 => {
            emit::emit(e, encode::addiw(rd, rs, 0));
        },
        case il::Type::W64 => {}
    }
}

////////////////////////
// Instruction Select //
////////////////////////

/// Pre-scan all blocks for constant-sized reserve instructions.
/// Returns the total size needed for all static reserves, respecting alignment.
fn computeReserveSize(func: *il::Fn) -> i32 {
    let mut offset: i32 = 0;
    for b in 0..func.blocks.len {
        let block = &func.blocks[b];
        for instr in block.instrs {
            match instr {
                case il::Instr::Reserve { size, alignment, .. } => {
                    if let case il::Val::Imm(sz) = size {
                        offset = mem::alignUpI32(offset, alignment as i32);
                        offset += (sz as i32);
                    }
                },
                else => {},
            }
        }
    }
    return offset;
}

/// Select instructions for a function.
pub fn selectFn(
    e: *mut emit::Emitter,
    dataSyms: *[super::DataSym],
    ralloc: *regalloc::AllocResult,
    func: *il::Fn
) {
    // Reset block offsets for this function.
    labels::resetBlocks(&mut e.labels);
    // Pre-scan for constant-sized reserves to promote to fixed frame slots.
    let reserveSize = computeReserveSize(func);
    // Compute frame layout from spill slots, reserve slots, and used callee-saved registers.
    let frame = emit::computeFrame(
        ralloc.spill.frameSize + reserveSize,
        ralloc.usedCalleeSaved,
        func.blocks.len
    );
    // Synthetic block indices start after real blocks and the epilogue block.
    let mut s = Selector {
        e, ralloc, dataSyms, frameSize: frame.totalSize,
        reserveOffset: 0, pendingSpill: nil,
        nextSynthBlock: func.blocks.len + 1,
    };
    // Record function name for printing.
    emit::recordFunc(s.e, func.name);
    // Record function code offset for call patching.
    emit::recordFuncOffset(s.e, func.name);
    // Emit prologue.
    emit::emitPrologue(s.e, &frame);

    // Move function params from arg registers to assigned registers.
    // Cross-call params may have been assigned to callee-saved registers
    // instead of their natural arg registers. Spilled params are stored
    // directly to their spill slots.
    for funcParam, i in func.params {
        if i < super::ARG_REGS.len {
            let param = funcParam.value;
            let argReg = super::ARG_REGS[i];

            if let slot = regalloc::spill::spillSlot(&ralloc.spill, param) {
                // Spilled parameter: store arg register to spill slot.
                emit::emitSd(s.e, argReg, super::FP, spillOffset(&s, slot));
            } else if let assigned = ralloc.assignments[param.n] {
                emitMv(&mut s, super::Reg { n: assigned }, argReg);
            }
        }
    }

    // Emit each block.
    for i in 0..func.blocks.len {
        selectBlock(&mut s, i, &func.blocks[i], &frame, func);
    }
    // Emit epilogue.
    emit::emitEpilogue(s.e, &frame);
    // Patch local branches now that all blocks are emitted.
    emit::patchLocalBranches(s.e);
}

/// Select instructions for a block.
fn selectBlock(s: *mut Selector, blockIdx: u32, block: *il::Block, frame: *emit::Frame, func: *il::Fn) {
    // Record block address for branch patching.
    emit::recordBlock(s.e, blockIdx);

    // Block parameters are handled at jump sites (in `Jmp`/`Br`).
    // By the time we enter the block, the arguments have already been
    // moved to the parameter registers by the predecessor's terminator.

    // Process each instruction, auto-committing any pending spill after each.
    let hasLocs = block.locs.len > 0;
    for instr, i in block.instrs {
        // Record debug location before emitting machine instructions.
        if hasLocs {
            emit::recordSrcLoc(s.e, block.locs[i]);
        }
        s.pendingSpill = nil;
        selectInstr(s, blockIdx, instr, frame, func);

        // Flush the pending spill store, if any.
        if let p = s.pendingSpill {
            if let slot = regalloc::spill::spillSlot(&s.ralloc.spill, p.ssa) {
                emit::emitSd(s.e, p.rd, super::FP, spillOffset(s, slot));
            }
            s.pendingSpill = nil;
        }
    }
}

/// Select instructions for a single IL instruction.
fn selectInstr(s: *mut Selector, blockIdx: u32, instr: il::Instr, frame: *emit::Frame, func: *il::Fn) {
    match instr {
        case il::Instr::BinOp { op, typ, dst, a, b } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let rs1 = resolveVal(s, super::SCRATCH1, a);
            selectAluBinOp(s, op, typ, rd, rs1, b);
        },
        case il::Instr::UnOp { op, typ, dst, a } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let rs = resolveVal(s, super::SCRATCH1, a);
            selectAluUnOp(s, op, typ, rd, rs);
        },
        case il::Instr::Load { typ, dst, src, offset } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let base = getSrcReg(s, src, super::SCRATCH2);
            emit::emitLoad(s.e, rd, base, offset, typ);
        },
        case il::Instr::Sload { typ, dst, src, offset } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let base = getSrcReg(s, src, super::SCRATCH2);
            emit::emitSload(s.e, rd, base, offset, typ);
        },
        case il::Instr::Store { typ, src, dst, offset } => {
            let base = getSrcReg(s, dst, super::SCRATCH2);
            let rs = resolveVal(s, super::SCRATCH1, src);
            emit::emitStore(s.e, rs, base, offset, typ);
        },
        case il::Instr::Copy { dst, val } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let rs = resolveVal(s, super::SCRATCH1, val);
            emitMv(s, rd, rs);
        },
        case il::Instr::Reserve { dst, size, alignment } => {
            match size {
                case il::Val::Imm(sz) => {
                    // Constant-sized reserve: use pre-allocated frame slot.
                    let rd = getDstReg(s, dst, super::SCRATCH1);
                    let aligned: i32 = mem::alignUpI32(s.reserveOffset, alignment as i32);
                    let fpOffset: i32 = s.ralloc.spill.frameSize + aligned - s.frameSize;

                    emit::emitAddImm(s.e, rd, super::FP, fpOffset);
                    s.reserveOffset = aligned + (sz as i32);
                },
                case il::Val::Reg(r) => {
                    // Dynamic-sized reserve: runtime SP adjustment.
                    let rd = getDstReg(s, dst, super::SCRATCH1);
                    let rs = getSrcReg(s, r, super::SCRATCH2);

                    emit::emit(s.e, encode::sub(super::SP, super::SP, rs));

                    if alignment > 1 {
                        let mask = 0 - alignment as i32;
                        assert encode::isSmallImm(mask);

                        emit::emit(s.e, encode::andi(super::SP, super::SP, mask));
                    }
                    emit::emit(s.e, encode::mv(rd, super::SP));
                },
                else =>
                    panic "selectInstr: invalid reserve operand",
            }
        },
        case il::Instr::Blit { dst, src, size } => {
            let case il::Val::Imm(staticSize) = size
                else panic "selectInstr: blit requires immediate size";

            let bothSpilled = regalloc::spill::isSpilled(&s.ralloc.spill, dst)
                and regalloc::spill::isSpilled(&s.ralloc.spill, src);

            // When both are spilled, offsets must fit 12-bit immediates
            // since we can't advance base registers (they live in spill
            // slots, not real registers we can mutate).
            if bothSpilled and staticSize as i32 > super::MAX_IMM {
                panic "selectInstr: blit both-spilled with large size";
            }

            // Resolve dst/src base registers.
            let mut rdst = super::SCRATCH2;
            let mut rsrc = super::SCRATCH1;
            let mut srcReload: ?i32 = nil;

            if bothSpilled {
                let dstSlot = regalloc::spill::spillSlot(&s.ralloc.spill, dst) else {
                    panic "selectInstr: blit dst not spilled";
                };
                let srcSlot = regalloc::spill::spillSlot(&s.ralloc.spill, src) else {
                    panic "selectInstr: blit src not spilled";
                };
                emit::emitLd(s.e, super::SCRATCH2, super::FP, spillOffset(s, dstSlot));
                srcReload = spillOffset(s, srcSlot);
            } else {
                rdst = getSrcReg(s, dst, super::SCRATCH2);
                rsrc = getSrcReg(s, src, super::SCRATCH2);
            }
            let mut offset: i32 = 0;
            let mut remaining = staticSize as i32;

            // Copy loop: 8 bytes, then 4 bytes, then 1 byte at a time.
            // Before each load/store pair, check whether the offset is
            // about to exceed the 12-bit signed immediate range. When
            // it does, advance the base registers by the accumulated
            // offset and reset to zero.
            while remaining >= super::DWORD_SIZE {
                if offset > super::MAX_IMM - super::DWORD_SIZE {
                    emit::emitAddImm(s.e, rsrc, rsrc, offset);
                    if rdst.n != rsrc.n {
                        emit::emitAddImm(s.e, rdst, rdst, offset);
                    }
                    offset = 0;
                }
                if let off = srcReload {
                    emit::emitLd(s.e, super::SCRATCH1, super::FP, off);
                    emit::emitLd(s.e, super::SCRATCH1, super::SCRATCH1, offset);
                } else {
                    emit::emitLd(s.e, super::SCRATCH1, rsrc, offset);
                }
                emit::emitSd(s.e, super::SCRATCH1, rdst, offset);
                offset += super::DWORD_SIZE;
                remaining -= super::DWORD_SIZE;
            }
            if remaining >= super::WORD_SIZE {
                if offset > super::MAX_IMM - super::WORD_SIZE {
                    emit::emitAddImm(s.e, rsrc, rsrc, offset);
                    if rdst.n != rsrc.n {
                        emit::emitAddImm(s.e, rdst, rdst, offset);
                    }
                    offset = 0;
                }
                if let off = srcReload {
                    emit::emitLd(s.e, super::SCRATCH1, super::FP, off);
                    emit::emitLw(s.e, super::SCRATCH1, super::SCRATCH1, offset);
                } else {
                    emit::emitLw(s.e, super::SCRATCH1, rsrc, offset);
                }
                emit::emitSw(s.e, super::SCRATCH1, rdst, offset);
                offset += super::WORD_SIZE;
                remaining -= super::WORD_SIZE;
            }
            while remaining > 0 {
                if offset > super::MAX_IMM - 1 {
                    emit::emitAddImm(s.e, rsrc, rsrc, offset);
                    if rdst.n != rsrc.n {
                        emit::emitAddImm(s.e, rdst, rdst, offset);
                    }
                    offset = 0;
                }
                if let off = srcReload {
                    emit::emitLd(s.e, super::SCRATCH1, super::FP, off);
                    emit::emitLb(s.e, super::SCRATCH1, super::SCRATCH1, offset);
                } else {
                    emit::emitLb(s.e, super::SCRATCH1, rsrc, offset);
                }
                emit::emitSb(s.e, super::SCRATCH1, rdst, offset);
                offset += 1;
                remaining -= 1;
            }
            // Restore base registers if they were advanced (never happens
            // in the both-spilled case since size <= MAX_IMM).
            if not bothSpilled {
                let advanced = staticSize as i32 - offset;
                if advanced != 0 {
                    emit::emitAddImm(s.e, rsrc, rsrc, 0 - advanced);
                    if rdst.n != rsrc.n {
                        emit::emitAddImm(s.e, rdst, rdst, 0 - advanced);
                    }
                }
            }
        },
        case il::Instr::Zext { typ, dst, val } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let rs = resolveVal(s, super::SCRATCH1, val);
            emitZext(s.e, rd, rs, typ);
        },
        case il::Instr::Sext { typ, dst, val } => {
            let rd = getDstReg(s, dst, super::SCRATCH1);
            let rs = resolveVal(s, super::SCRATCH1, val);
            emitSext(s.e, rd, rs, typ);
        },
        case il::Instr::Ret { val } => {
            if let v = val {
                let rs = resolveVal(s, super::SCRATCH1, v);
                emitMv(s, super::A0, rs);
            }
            emit::emitReturn(s.e, frame);
        },
        case il::Instr::Jmp { target, args } => {
            // Move arguments to target block's parameter registers.
            emitBlockArgs(s, func, target, args);
            // Skip branch if target is the next block (fallthrough).
            if target != blockIdx + 1 {
                emit::recordBranch(s.e, target, emit::BranchKind::Jump);
            }
        },
        case il::Instr::Br { op, typ, a, b, thenTarget, thenArgs, elseTarget, elseArgs } => {
            let rs1 = resolveVal(s, super::SCRATCH1, a);
            let rs2 = resolveVal(s, super::SCRATCH2, b);

            // Normalize sub-word operands so that both registers have the same
            // canonical representation. Without this, eg. `-1 : i8 ` loaded as
            // `0xFFFFFFFFFFFFFFFF` and `255 : i8` loaded as `0xFF` would compare
            // unequal even though they are the same 8-bit pattern.
            if let case il::CmpOp::Slt = op {
                emitSext(s.e, rs1, rs1, typ);
                emitSext(s.e, rs2, rs2, typ);
            } else {
                emitZext(s.e, rs1, rs1, typ);
                emitZext(s.e, rs2, rs2, typ);
            }
            // Block-argument moves must only execute on the taken path.
            // When `thenArgs` is non-empty, invert the branch so that the
            // then-moves land on the fall-through (taken) side.
            //
            // When one target is the next block in layout order, we can
            // eliminate the trailing unconditional jump by arranging the
            // conditional branch to skip to the *other* target and letting
            // execution fall through.
            if thenArgs.len > 0 and elseArgs.len > 0 {
                panic "selectInstr: both `then` and `else` have block arguments";
            } else if thenArgs.len > 0 {
                emit::recordBranch(s.e, elseTarget, emit::BranchKind::InvertedCond { op, rs1, rs2 });
                emitBlockArgs(s, func, thenTarget, thenArgs);
                // Skip trailing jump if then is the next block (fallthrough).
                if thenTarget != blockIdx + 1 {
                    emit::recordBranch(s.e, thenTarget, emit::BranchKind::Jump);
                }
            } else if thenTarget == blockIdx + 1 and elseArgs.len == 0 {
                // Then is the next block and no else args: invert the
                // condition to branch to else and fall through to then.
                emit::recordBranch(s.e, elseTarget, emit::BranchKind::InvertedCond { op, rs1, rs2 });
            } else {
                emit::recordBranch(s.e, thenTarget, emit::BranchKind::Cond { op, rs1, rs2 });
                emitBlockArgs(s, func, elseTarget, elseArgs);
                // Skip trailing jump if else is the next block (fallthrough).
                if elseTarget != blockIdx + 1 {
                    emit::recordBranch(s.e, elseTarget, emit::BranchKind::Jump);
                }
            }
        },
        case il::Instr::Switch { val, defaultTarget, defaultArgs, cases } => {
            let rs1 = resolveVal(s, super::SCRATCH1, val);
            // When a case has block args, invert the branch to skip past
            // the arg moves.
            for c in cases {
                emit::loadImm(s.e, super::SCRATCH2, c.value);

                if c.args.len > 0 {
                    let skip = s.nextSynthBlock;
                    s.nextSynthBlock = skip + 1;

                    emit::recordBranch(s.e, skip, emit::BranchKind::InvertedCond {
                        op: il::CmpOp::Eq, rs1, rs2: super::SCRATCH2,
                    });
                    emitBlockArgs(s, func, c.target, c.args);
                    emit::recordBranch(s.e, c.target, emit::BranchKind::Jump);
                    emit::recordBlock(s.e, skip);
                } else {
                    emit::recordBranch(s.e, c.target, emit::BranchKind::Cond {
                        op: il::CmpOp::Eq, rs1, rs2: super::SCRATCH2,
                    });
                }
            }
            // Fall through to default.
            emitBlockArgs(s, func, defaultTarget, defaultArgs);
            emit::recordBranch(s.e, defaultTarget, emit::BranchKind::Jump);
        },
        case il::Instr::Unreachable => {
            emit::emit(s.e, encode::ebreak());
        },
        case il::Instr::Call { retTy, dst, func, args } => {
            // For indirect calls, save target to scratch register before arg
            // setup can clobber it.
            if let case il::Val::Reg(r) = func {
                let target = getSrcReg(s, r, super::SCRATCH2);
                emitMv(s, super::SCRATCH2, target);
            }
            // Move arguments to A0-A7 using parallel move resolution.
            if args.len > super::ARG_REGS.len {
                panic "selectInstr: too many call arguments";
            }
            emitParallelMoves(s, &super::ARG_REGS[..], args);

            // Emit call.
            match func {
                case il::Val::FnAddr(name) => {
                    emit::recordCall(s.e, name);
                },
                case il::Val::Reg(_) => {
                    emit::emit(s.e, encode::jalr(super::RA, super::SCRATCH2, 0));
                },
                else => {
                    panic "selectInstr: invalid call target";
                }
            }
            // Move result from A0.
            if let d = dst {
                let rd = getDstReg(s, d, super::SCRATCH1);
                emitMv(s, rd, super::A0);
            }
        },
        case il::Instr::Ecall { dst, num, a0, a1, a2, a3 } => {
            // Move arguments using parallel move.
            // TODO: Can't use slice literals here because the lowerer doesn't
            // support const-evaluating struct/union values in them.
            let ecallDsts: [super::Reg; 5] = [super::A7, super::A0, super::A1, super::A2, super::A3];
            let ecallArgs: [il::Val; 5] = [num, a0, a1, a2, a3];

            emitParallelMoves(s, &ecallDsts[..], &ecallArgs[..]);
            emit::emit(s.e, encode::ecall());

            // Result in A0.
            let ecallRd = getDstReg(s, dst, super::SCRATCH1);
            emitMv(s, ecallRd, super::A0);
        },
        case il::Instr::Ebreak => {
            emit::emit(s.e, encode::ebreak());
        },
    }
}

/// Emit runtime trap for division/modulo by zero.
fn emitTrapIfZero(s: *mut Selector, rs: super::Reg) {
    emit::emit(s.e, encode::bne(rs, super::ZERO, super::INSTR_SIZE * 2));
    emit::emit(s.e, encode::ebreak());
}

/// Select a binary ALU operation, dispatching to the appropriate
/// instruction pattern based on the operation kind and type.
fn selectAluBinOp(s: *mut Selector, op: il::BinOp, typ: il::Type, rd: super::Reg, rs1: super::Reg, b: il::Val) {
    match op {
        case il::BinOp::Add => {
            if typ == il::Type::W32 {
                selectBinOpW(s, rd, rs1, b, super::SCRATCH2);
            } else {
                selectBinOp(s, rd, rs1, b, BinOp::Add, super::SCRATCH2);
            }
        }
        case il::BinOp::Sub => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emit::emit(s.e,
                encode::subw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::sub(rd, rs1, rs2));
        }
        case il::BinOp::Mul => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emit::emit(s.e,
                encode::mulw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::mul(rd, rs1, rs2));
        }
        case il::BinOp::Sdiv => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emitTrapIfZero(s, rs2);
            emit::emit(s.e,
                encode::divw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::div(rd, rs1, rs2));
        }
        case il::BinOp::Udiv => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emitTrapIfZero(s, rs2);
            emit::emit(s.e,
                encode::divuw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::divu(rd, rs1, rs2));
        }
        case il::BinOp::Srem => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emitTrapIfZero(s, rs2);
            emit::emit(s.e,
                encode::remw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::rem(rd, rs1, rs2));
        }
        case il::BinOp::Urem => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emitTrapIfZero(s, rs2);
            emit::emit(s.e,
                encode::remuw(rd, rs1, rs2)
                    if typ == il::Type::W32 else
                encode::remu(rd, rs1, rs2));
        }
        case il::BinOp::And =>
            selectBinOp(s, rd, rs1, b, BinOp::And, super::SCRATCH2),
        case il::BinOp::Or =>
            selectBinOp(s, rd, rs1, b, BinOp::Or, super::SCRATCH2),
        case il::BinOp::Xor =>
            selectBinOp(s, rd, rs1, b, BinOp::Xor, super::SCRATCH2),
        case il::BinOp::Shl =>
            selectShift(s, rd, rs1, b, ShiftOp::Sll, typ, super::SCRATCH2),
        case il::BinOp::Sshr =>
            selectShift(s, rd, rs1, b, ShiftOp::Sra, typ, super::SCRATCH2),
        case il::BinOp::Ushr =>
            selectShift(s, rd, rs1, b, ShiftOp::Srl, typ, super::SCRATCH2),
        case il::BinOp::Eq => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            // Canonicalize both operands to the declared width
            // so high-bit junk doesn't affect the result.
            emitZext(s.e, rs1, rs1, typ);
            emitZext(s.e, rs2, rs2, typ);
            emit::emit(s.e, encode::xor(rd, rs1, rs2));
            emit::emit(s.e, encode::sltiu(rd, rd, 1));
        }
        case il::BinOp::Ne => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            // Canonicalize both operands to the declared width
            // so high-bit junk doesn't affect the result.
            emitZext(s.e, rs1, rs1, typ);
            emitZext(s.e, rs2, rs2, typ);
            emit::emit(s.e, encode::xor(rd, rs1, rs2));
            emit::emit(s.e, encode::sltu(rd, super::ZERO, rd));
        }
        case il::BinOp::Slt =>
            selectCmp(s, rd, rs1, b, CmpOp::Slt, super::SCRATCH2),
        case il::BinOp::Ult =>
            selectCmp(s, rd, rs1, b, CmpOp::Ult, super::SCRATCH2),
        case il::BinOp::Sge => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emit::emit(s.e, encode::slt(rd, rs1, rs2));
            emit::emit(s.e, encode::xori(rd, rd, 1)); // flip.
        }
        case il::BinOp::Uge => {
            let rs2 = resolveVal(s, super::SCRATCH2, b);
            emit::emit(s.e, encode::sltu(rd, rs1, rs2));
            emit::emit(s.e, encode::xori(rd, rd, 1)); // flip.
        }
    }
}

/// Select a unary ALU operation.
fn selectAluUnOp(s: *mut Selector, op: il::UnOp, typ: il::Type, rd: super::Reg, rs: super::Reg) {
    match op {
        case il::UnOp::Neg => {
            if typ == il::Type::W32 {
                emit::emit(s.e, encode::subw(rd, super::ZERO, rs));
            } else {
                emit::emit(s.e, encode::neg(rd, rs));
            }
        }
        case il::UnOp::Not =>
            emit::emit(s.e, encode::not_(rd, rs)),
    }
}

/// Select 32-bit addition with immediate optimization.
/// Uses `addiw`/`addw` to operate on 32 bits and sign-extend the result.
fn selectBinOpW(s: *mut Selector, rd: super::Reg, rs1: super::Reg, b: il::Val, scratch: super::Reg) {
    if let case il::Val::Imm(imm) = b {
        if encode::isSmallImm64(imm) {
            emit::emit(s.e, encode::addiw(rd, rs1, imm as i32));
            return;
        }
    }
    let rs2 = resolveVal(s, scratch, b);
    emit::emit(s.e, encode::addw(rd, rs1, rs2));
}

/// Select binary operation with immediate optimization.
fn selectBinOp(s: *mut Selector, rd: super::Reg, rs1: super::Reg, b: il::Val, op: BinOp, scratch: super::Reg) {
    // Try immediate optimization first.
    if let case il::Val::Imm(imm) = b {
        if encode::isSmallImm64(imm) {
            let simm = imm as i32;
            match op {
                case BinOp::Add => emit::emit(s.e, encode::addi(rd, rs1, simm)),
                case BinOp::And => emit::emit(s.e, encode::andi(rd, rs1, simm)),
                case BinOp::Or  => emit::emit(s.e, encode::ori(rd, rs1, simm)),
                case BinOp::Xor => emit::emit(s.e, encode::xori(rd, rs1, simm)),
            }
            return;
        }
    }
    // Fallback: load into register.
    let rs2 = resolveVal(s, scratch, b);
    match op {
        case BinOp::Add => emit::emit(s.e, encode::add(rd, rs1, rs2)),
        case BinOp::And => emit::emit(s.e, encode::and_(rd, rs1, rs2)),
        case BinOp::Or  => emit::emit(s.e, encode::or_(rd, rs1, rs2)),
        case BinOp::Xor => emit::emit(s.e, encode::xor(rd, rs1, rs2)),
    }
}

/// Select shift operation with immediate optimization.
/// For 32-bit operations, uses the `*w` variants that operate on the lower 32 bits
/// and sign-extend the result.
fn selectShift(s: *mut Selector, rd: super::Reg, rs1: super::Reg, b: il::Val, op: ShiftOp, typ: il::Type, scratch: super::Reg) {
    let isW32: bool = typ == il::Type::W32;

    // Try immediate optimization first.
    if let case il::Val::Imm(shamt) = b {
        // Keep immediate forms only for encodable shift amounts.
        // Otherwise fall back to register shifts, which naturally mask the count.
        if shamt >= 0 and ((isW32 and shamt < 32) or (not isW32 and shamt < 64)) {
            let sa = shamt as i32;
            if isW32 {
                match op {
                    case ShiftOp::Sll => emit::emit(s.e, encode::slliw(rd, rs1, sa)),
                    case ShiftOp::Srl => emit::emit(s.e, encode::srliw(rd, rs1, sa)),
                    case ShiftOp::Sra => emit::emit(s.e, encode::sraiw(rd, rs1, sa)),
                }
            } else {
                match op {
                    case ShiftOp::Sll => emit::emit(s.e, encode::slli(rd, rs1, sa)),
                    case ShiftOp::Srl => emit::emit(s.e, encode::srli(rd, rs1, sa)),
                    case ShiftOp::Sra => emit::emit(s.e, encode::srai(rd, rs1, sa)),
                }
            }
            return;
        }
    }
    // Fallback: load into register.
    let rs2 = resolveVal(s, scratch, b);
    if isW32 {
        match op {
            case ShiftOp::Sll => emit::emit(s.e, encode::sllw(rd, rs1, rs2)),
            case ShiftOp::Srl => emit::emit(s.e, encode::srlw(rd, rs1, rs2)),
            case ShiftOp::Sra => emit::emit(s.e, encode::sraw(rd, rs1, rs2)),
        }
    } else {
        match op {
            case ShiftOp::Sll => emit::emit(s.e, encode::sll(rd, rs1, rs2)),
            case ShiftOp::Srl => emit::emit(s.e, encode::srl(rd, rs1, rs2)),
            case ShiftOp::Sra => emit::emit(s.e, encode::sra(rd, rs1, rs2)),
        }
    }
}

/// Resolve parallel moves from IL values to physical destination registers.
///
/// The parallel move problem arises when moving values between registers where
/// there may be dependencies (e.g. moving A0 to A1 and A1 to A0 simultaneously).
///
/// This algorithm:
/// 1. Identifies "ready" moves.
/// 2. Executes ready moves.
/// 3. Breaks cycles using scratch register.
///
/// Entries with `ZERO` destination are skipped, as they are handled by caller.
fn emitParallelMoves(s: *mut Selector, dsts: *[super::Reg], args: *[il::Val]) {
    let n: u32 = args.len;
    if n == 0 {
        return;
    }
    if n > MAX_BLOCK_ARGS {
        panic "emitParallelMoves: too many arguments";
    }
    // Source registers for each arg.
    let mut srcRegs: [super::Reg; MAX_BLOCK_ARGS] = [super::ZERO; MAX_BLOCK_ARGS];
    // If this is a register-to-register move.
    let mut isRegMove: [bool; MAX_BLOCK_ARGS] = [false; MAX_BLOCK_ARGS];
    // If this move still needs to be executed.
    let mut pending: [bool; MAX_BLOCK_ARGS] = [false; MAX_BLOCK_ARGS];
    // Number of pending moves.
    let mut numPending: u32 = 0;

    for i in 0..n {
        let dst = dsts[i];
        if dst != super::ZERO { // Skip entries with no destination.
            match args[i] {
                case il::Val::Reg(r) => {
                    if let _ = regalloc::spill::spillSlot(&s.ralloc.spill, r) {
                        // Spilled value needs load, not a register move.
                        pending[i] = true;
                        numPending += 1;
                    } else {
                        let src = getReg(s, r);
                        if src != dst {
                            // Register-to-register move needed.
                            srcRegs[i] = src;
                            isRegMove[i] = true;
                            pending[i] = true;
                            numPending += 1;
                        } else {
                            // No move needed.
                        }
                    }
                },
                case il::Val::Imm(_), il::Val::DataSym(_), il::Val::FnAddr(_) => {
                    pending[i] = true;
                    numPending += 1;
                },
                case il::Val::Undef => {
                    // Undefined values don't need any move.
                }
            }
        } else {
            // Nothing to do.
        }
    }

    // Execute parallel move algorithm.
    while numPending > 0 {
        let mut found = false;

        // Find a ready move: one whose destination is not a source of any
        // pending register move.
        for i in 0..n {
            if pending[i] {
                let dst = dsts[i];
                let mut isReady = true;

                // Check if `dst` is used as source by any other pending register move.
                for j in 0..n {
                    if j != i and pending[j] and isRegMove[j] and srcRegs[j] == dst {
                        isReady = false;
                        break;
                    }
                }
                if isReady {
                    // Execute this move.
                    if isRegMove[i] {
                        emitMv(s, dst, srcRegs[i]);
                    } else {
                        // Load immediate, symbol, or spilled value.
                        loadVal(s, dst, args[i]);
                    }
                    found = true;
                    pending[i] = false;
                    numPending -= 1;

                    break;
                }
            }
        }

        if not found {
            // No ready move, we have a cycle among register moves.
            // Break it by saving one source to scratch.
            for i in 0..n {
                if pending[i] and isRegMove[i] {
                    let src = srcRegs[i];
                    // Save this source to scratch.
                    emitMv(s, super::SCRATCH1, src);
                    // Update all pending moves that use this source.
                    for j in 0..n {
                        if pending[j] and isRegMove[j] and srcRegs[j] == src {
                            srcRegs[j] = super::SCRATCH1;
                        }
                    }
                    break;
                }
            }
        }
    }
}

/// Emit moves from block arguments to target block's parameter registers.
///
/// Handles spilled destinations directly, then delegates to [`emitParallelMoves`]
/// for the remaining register-to-register parallel move resolution.
fn emitBlockArgs(s: *mut Selector, func: *il::Fn, target: u32, args: *mut [il::Val]) {
    if args.len == 0 {
        return;
    }
    let block = &func.blocks[target];
    if args.len != block.params.len {
        panic "emitBlockArgs: argument/parameter count mismatch";
    }
    if args.len > MAX_BLOCK_ARGS {
        panic "emitBlockArgs: too many block arguments";
    }

    // Destination registers for each arg.
    // Zero means the destination is spilled or skipped.
    let mut dsts: [super::Reg; MAX_BLOCK_ARGS] = [super::ZERO; MAX_BLOCK_ARGS];

    for arg, i in args {
        let param = block.params[i].value;

        // Spilled destinations: store directly to spill slot.
        // These don't participate in the parallel move algorithm.
        if let slot = regalloc::spill::spillSlot(&s.ralloc.spill, param) {
            if let case il::Val::Undef = arg {
                // Undefined values don't need any move.
            } else {
                let rs = resolveVal(s, super::SCRATCH1, arg);
                emit::emitSd(s.e, rs, super::FP, spillOffset(s, slot));
            }
        } else {
            dsts[i] = getReg(s, param);
        }
    }
    emitParallelMoves(s, &dsts[..], args);
}

/// Select comparison with immediate optimization.
fn selectCmp(s: *mut Selector, rd: super::Reg, rs1: super::Reg, b: il::Val, op: CmpOp, scratch: super::Reg) {
    // Try immediate optimization first.
    if let case il::Val::Imm(imm) = b {
        if encode::isSmallImm64(imm) {
            let simm = imm as i32;
            match op {
                case CmpOp::Slt => emit::emit(s.e, encode::slti(rd, rs1, simm)),
                case CmpOp::Ult => emit::emit(s.e, encode::sltiu(rd, rs1, simm)),
            }
            return;
        }
    }
    // Fallback: load into register.
    let rs2 = resolveVal(s, scratch, b);
    match op {
        case CmpOp::Slt => emit::emit(s.e, encode::slt(rd, rs1, rs2)),
        case CmpOp::Ult => emit::emit(s.e, encode::sltu(rd, rs1, rs2)),
    }
}