Use loop for large BLITs

043ee151f8c73233bc23b06a5a0d03efd6dd832158c4ba89506bb8bc4b69052b
This reduces binary size quite a bit, since unrolled loops for large
records are quite inefficient in terms of code size.
Alexis Sellier committed · 1 parent ab23f64b
lib/std/arch/rv64.rad +4 -0
87 87
/// Instruction size in bytes.
88 88
pub const INSTR_SIZE: i32 = 4;
89 89
/// Stack alignment requirement in bytes.
90 90
pub const STACK_ALIGNMENT: i32 = 16;
91 91
92 +
/// Minimum blit size (in bytes) to use a loop instead of inline copy.
93 +
/// Blits below this threshold are fully unrolled as LD/SD pairs.
94 +
pub const BLIT_LOOP_THRESHOLD: i32 = 256;
95 +
92 96
/////////////////////////
93 97
// Codegen Allocation  //
94 98
/////////////////////////
95 99
96 100
/// Argument registers for function calls.
lib/std/arch/rv64/isel.rad +35 -15
490 490
                rsrc = getSrcReg(s, src, super::SCRATCH2);
491 491
            }
492 492
            let mut offset: i32 = 0;
493 493
            let mut remaining = staticSize as i32;
494 494
495 -
            // Copy loop: 8 bytes, then 4 bytes, then 1 byte at a time.
495 +
            // For large blits where both pointers are in real registers,
496 +
            // use an inline loop instead of unrolled LD/SD pairs.
497 +
            let dwordBytes = remaining & ~(super::DWORD_SIZE - 1);
498 +
            let canLoop = not bothSpilled
499 +
                and *rsrc != *super::SCRATCH1 and *rsrc != *super::SCRATCH2
500 +
                and *rdst != *super::SCRATCH1 and *rdst != *super::SCRATCH2;
501 +
502 +
            if canLoop and dwordBytes >= super::BLIT_LOOP_THRESHOLD {
503 +
                emit::emitAddImm(s.e, super::SCRATCH1, rsrc, dwordBytes);
504 +
505 +
                let loopStart = s.e.codeLen;
506 +
507 +
                emit::emitLd(s.e, super::SCRATCH2, rsrc, 0);
508 +
                emit::emitSd(s.e, super::SCRATCH2, rdst, 0);
509 +
                emit::emit(s.e, encode::addi(rsrc, rsrc, super::DWORD_SIZE));
510 +
511 +
                if *rdst != *rsrc {
512 +
                    emit::emit(s.e, encode::addi(rdst, rdst, super::DWORD_SIZE));
513 +
                }
514 +
                let brOff = (loopStart as i32 - s.e.codeLen as i32) * super::INSTR_SIZE;
515 +
516 +
                emit::emit(s.e, encode::bne(rsrc, super::SCRATCH1, brOff));
517 +
                remaining -= dwordBytes;
518 +
            }
519 +
520 +
            // Copy remaining: 8 bytes, then 4 bytes, then 1 byte at a time.
496 521
            // Before each load/store pair, check whether the offset is
497 522
            // about to exceed the 12-bit signed immediate range. When
498 523
            // it does, advance the base registers by the accumulated
499 524
            // offset and reset to zero.
500 525
            while remaining >= super::DWORD_SIZE {
610 635
            // For SLT: sign-extension needed (signed comparison).
611 636
            // For ULT: zero-extension needed (unsigned magnitude comparison).
612 637
            // For EQ/NE with W32: sign-extension is cheaper.
613 638
            // For EQ/NE with W8/W16: keep zero-extension.
614 639
            // Skip extension for zero register.
615 -
            if typ == il::Type::W32 {
616 -
                // Sign-extension suffices for *all* comparison types on RV64.
617 -
                // For SLT: sign-extension is semantically correct.
618 -
                // For ULT: bltu gives identical results on sign- vs zero-
619 -
                // extended W32 values (the relative ordering is preserved
620 -
                // because the sign bit maps to the same half of 64-bit space).
621 -
                // For EQ/NE: both extensions produce identical equality results.
622 -
                if not aIsZero and not isExtendedImm(a, typ, true) {
623 -
                    emitSext(s.e, rs1, rs1, typ);
624 -
                }
625 -
                if not bIsZero and not isExtendedImm(b, typ, true) {
626 -
                    emitSext(s.e, rs2, rs2, typ);
627 -
                }
628 -
            } else if let case il::CmpOp::Slt = op {
640 +
            // Determine extension mode: sign-extend for W32 or SLT,
641 +
            // zero-extend otherwise.
642 +
            let mut useSext: bool = undefined;
643 +
            if let case il::CmpOp::Slt = op {
644 +
                useSext = true;
645 +
            } else {
646 +
                useSext = typ == il::Type::W32;
647 +
            }
648 +
            if useSext {
629 649
                if not aIsZero and not isExtendedImm(a, typ, true) {
630 650
                    emitSext(s.e, rs1, rs1, typ);
631 651
                }
632 652
                if not bIsZero and not isExtendedImm(b, typ, true) {
633 653
                    emitSext(s.e, rs2, rs2, typ);