Use loop for large BLITs
043ee151f8c73233bc23b06a5a0d03efd6dd832158c4ba89506bb8bc4b69052b
This reduces binary size considerably, since fully unrolled copies of large records are inefficient in terms of code size.
1 parent
ab23f64b
lib/std/arch/rv64.rad
+4 -0
| 87 | 87 | /// Instruction size in bytes. |
|
| 88 | 88 | pub const INSTR_SIZE: i32 = 4; |
|
| 89 | 89 | /// Stack alignment requirement in bytes. |
|
| 90 | 90 | pub const STACK_ALIGNMENT: i32 = 16; |
|
| 91 | 91 | ||
| 92 | + | /// Minimum blit size (in bytes) to use a loop instead of inline copy. |
|
| 93 | + | /// Blits below this threshold are always fully unrolled into inline load/store pairs; larger blits use the loop only when both pointers are in real registers. |
|
| 94 | + | pub const BLIT_LOOP_THRESHOLD: i32 = 256; |
|
| 95 | + | ||
| 92 | 96 | ///////////////////////// |
|
| 93 | 97 | // Codegen Allocation // |
|
| 94 | 98 | ///////////////////////// |
|
| 95 | 99 | ||
| 96 | 100 | /// Argument registers for function calls. |
lib/std/arch/rv64/isel.rad
+35 -15
| 490 | 490 | rsrc = getSrcReg(s, src, super::SCRATCH2); |
|
| 491 | 491 | } |
|
| 492 | 492 | let mut offset: i32 = 0; |
|
| 493 | 493 | let mut remaining = staticSize as i32; |
|
| 494 | 494 | ||
| 495 | - | // Copy loop: 8 bytes, then 4 bytes, then 1 byte at a time. |
|
| 495 | + | // For large blits where both pointers are in real registers, |
|
| 496 | + | // use an inline loop instead of unrolled LD/SD pairs. |
|
| 497 | + | let dwordBytes = remaining & ~(super::DWORD_SIZE - 1); |
|
| 498 | + | let canLoop = not bothSpilled |
|
| 499 | + | and *rsrc != *super::SCRATCH1 and *rsrc != *super::SCRATCH2 |
|
| 500 | + | and *rdst != *super::SCRATCH1 and *rdst != *super::SCRATCH2; |
|
| 501 | + | ||
| 502 | + | if canLoop and dwordBytes >= super::BLIT_LOOP_THRESHOLD { |
|
| 503 | + | emit::emitAddImm(s.e, super::SCRATCH1, rsrc, dwordBytes); |
|
| 504 | + | ||
| 505 | + | let loopStart = s.e.codeLen; |
|
| 506 | + | ||
| 507 | + | emit::emitLd(s.e, super::SCRATCH2, rsrc, 0); |
|
| 508 | + | emit::emitSd(s.e, super::SCRATCH2, rdst, 0); |
|
| 509 | + | emit::emit(s.e, encode::addi(rsrc, rsrc, super::DWORD_SIZE)); |
|
| 510 | + | ||
| 511 | + | if *rdst != *rsrc { |
|
| 512 | + | emit::emit(s.e, encode::addi(rdst, rdst, super::DWORD_SIZE)); |
|
| 513 | + | } |
|
| 514 | + | let brOff = (loopStart as i32 - s.e.codeLen as i32) * super::INSTR_SIZE; |
|
| 515 | + | ||
| 516 | + | emit::emit(s.e, encode::bne(rsrc, super::SCRATCH1, brOff)); |
|
| 517 | + | remaining -= dwordBytes; |
|
| 518 | + | } |
|
| 519 | + | ||
| 520 | + | // Copy remaining: 8 bytes, then 4 bytes, then 1 byte at a time. |
|
| 496 | 521 | // Before each load/store pair, check whether the offset is |
|
| 497 | 522 | // about to exceed the 12-bit signed immediate range. When |
|
| 498 | 523 | // it does, advance the base registers by the accumulated |
|
| 499 | 524 | // offset and reset to zero. |
|
| 500 | 525 | while remaining >= super::DWORD_SIZE { |
| 610 | 635 | // For SLT: sign-extension needed (signed comparison). |
|
| 611 | 636 | // For ULT: zero-extension needed for W8/W16 (unsigned magnitude comparison); W32 is sign-extended, which preserves unsigned ordering. |
|
| 612 | 637 | // For EQ/NE with W32: sign-extension is cheaper. |
|
| 613 | 638 | // For EQ/NE with W8/W16: keep zero-extension. |
|
| 614 | 639 | // Skip extension for zero register. |
|
| 615 | - | if typ == il::Type::W32 { |
|
| 616 | - | // Sign-extension suffices for *all* comparison types on RV64. |
|
| 617 | - | // For SLT: sign-extension is semantically correct. |
|
| 618 | - | // For ULT: bltu gives identical results on sign- vs zero- |
|
| 619 | - | // extended W32 values (the relative ordering is preserved |
|
| 620 | - | // because the sign bit maps to the same half of 64-bit space). |
|
| 621 | - | // For EQ/NE: both extensions produce identical equality results. |
|
| 622 | - | if not aIsZero and not isExtendedImm(a, typ, true) { |
|
| 623 | - | emitSext(s.e, rs1, rs1, typ); |
|
| 624 | - | } |
|
| 625 | - | if not bIsZero and not isExtendedImm(b, typ, true) { |
|
| 626 | - | emitSext(s.e, rs2, rs2, typ); |
|
| 627 | - | } |
|
| 628 | - | } else if let case il::CmpOp::Slt = op { |
|
| 640 | + | // Determine extension mode: sign-extend for W32 or SLT, |
|
| 641 | + | // zero-extend otherwise. |
|
| 642 | + | let mut useSext: bool = undefined; |
|
| 643 | + | if let case il::CmpOp::Slt = op { |
|
| 644 | + | useSext = true; |
|
| 645 | + | } else { |
|
| 646 | + | useSext = typ == il::Type::W32; |
|
| 647 | + | } |
|
| 648 | + | if useSext { |
|
| 629 | 649 | if not aIsZero and not isExtendedImm(a, typ, true) { |
|
| 630 | 650 | emitSext(s.e, rs1, rs1, typ); |
|
| 631 | 651 | } |
|
| 632 | 652 | if not bIsZero and not isExtendedImm(b, typ, true) { |
|
| 633 | 653 | emitSext(s.e, rs2, rs2, typ); |