Use loop for large BLITs

043ee151f8c73233bc23b06a5a0d03efd6dd832158c4ba89506bb8bc4b69052b
This reduces binary size quite a bit, since unrolled loops for large
records are quite inefficient in terms of code size.
Alexis Sellier committed · 1 parent ab23f64b
lib/std/arch/rv64.rad +4 -0
87 87
/// Instruction size in bytes.
88 88
pub const INSTR_SIZE: i32 = 4;
89 89
/// Stack alignment requirement in bytes.
90 90
pub const STACK_ALIGNMENT: i32 = 16;
91 91
92 +
/// Minimum blit size (in bytes) to use a loop instead of inline copy.
93 +
/// Blits below this threshold are fully unrolled as LD/SD pairs.
94 +
pub const BLIT_LOOP_THRESHOLD: i32 = 256;
95 +
92 96
/////////////////////////
93 97
// Codegen Allocation  //
94 98
/////////////////////////
95 99
96 100
/// Argument registers for function calls.
lib/std/arch/rv64/isel.rad +35 -15
490 490
                rsrc = getSrcReg(s, src, super::SCRATCH2);
491 491
            }
492 492
            let mut offset: i32 = 0;
493 493
            let mut remaining = staticSize as i32;
494 494
495 -
            // Copy loop: 8 bytes, then 4 bytes, then 1 byte at a time.
495 +
            // For large blits where both pointers are in real registers,
496 +
            // use an inline loop instead of unrolled LD/SD pairs.
497 +
            let dwordBytes = remaining & ~(super::DWORD_SIZE - 1);
498 +
            let canLoop = not bothSpilled
499 +
                and *rsrc != *super::SCRATCH1 and *rsrc != *super::SCRATCH2
500 +
                and *rdst != *super::SCRATCH1 and *rdst != *super::SCRATCH2;
501 +
502 +
            if canLoop and dwordBytes >= super::BLIT_LOOP_THRESHOLD {
503 +
                emit::emitAddImm(s.e, super::SCRATCH1, rsrc, dwordBytes);
504 +
505 +
                let loopStart = s.e.codeLen;
506 +
507 +
                emit::emitLd(s.e, super::SCRATCH2, rsrc, 0);
508 +
                emit::emitSd(s.e, super::SCRATCH2, rdst, 0);
509 +
                emit::emit(s.e, encode::addi(rsrc, rsrc, super::DWORD_SIZE));
510 +
511 +
                if *rdst != *rsrc {
512 +
                    emit::emit(s.e, encode::addi(rdst, rdst, super::DWORD_SIZE));
513 +
                }
514 +
                let brOff = (loopStart as i32 - s.e.codeLen as i32) * super::INSTR_SIZE;
515 +
516 +
                emit::emit(s.e, encode::bne(rsrc, super::SCRATCH1, brOff));
517 +
                remaining -= dwordBytes;
518 +
            }
519 +
520 +
            // Copy remaining: 8 bytes, then 4 bytes, then 1 byte at a time.
496 521
            // Before each load/store pair, check whether the offset is
497 522
            // about to exceed the 12-bit signed immediate range. When
498 523
            // it does, advance the base registers by the accumulated
499 524
            // offset and reset to zero.
500 525
            while remaining >= super::DWORD_SIZE {
610 635
            // For SLT: sign-extension needed (signed comparison).
611 636
            // For ULT: zero-extension needed (unsigned magnitude comparison).
612 637
            // For EQ/NE with W32: sign-extension is cheaper.
613 638
            // For EQ/NE with W8/W16: keep zero-extension.
614 639
            // Skip extension for zero register.
615 -
            if typ == il::Type::W32 {
616 -
                // Sign-extension suffices for *all* comparison types on RV64.
617 -
                // For SLT: sign-extension is semantically correct.
618 -
                // For ULT: bltu gives identical results on sign- vs zero-
619 -
                // extended W32 values (the relative ordering is preserved
620 -
                // because the sign bit maps to the same half of 64-bit space).
621 -
                // For EQ/NE: both extensions produce identical equality results.
622 -
                if not aIsZero and not isExtendedImm(a, typ, true) {
623 -
                    emitSext(s.e, rs1, rs1, typ);
624 -
                }
625 -
                if not bIsZero and not isExtendedImm(b, typ, true) {
626 -
                    emitSext(s.e, rs2, rs2, typ);
627 -
                }
628 -
            } else if let case il::CmpOp::Slt = op {
640 +
            // Determine extension mode: sign-extend for W32 or SLT,
641 +
            // zero-extend otherwise.
642 +
            let mut useSext: bool = undefined;
643 +
            if let case il::CmpOp::Slt = op {
644 +
                useSext = true;
645 +
            } else {
646 +
                useSext = typ == il::Type::W32;
647 +
            }
648 +
            if useSext {
629 649
                if not aIsZero and not isExtendedImm(a, typ, true) {
630 650
                    emitSext(s.e, rs1, rs1, typ);
631 651
                }
632 652
                if not bIsZero and not isExtendedImm(b, typ, true) {
633 653
                    emitSext(s.e, rs2, rs2, typ);