Skip to content

Commit

Permalink
[RISC-V][LoongArch] Reduce memory usage for shrinking sections
Browse files Browse the repository at this point in the history
Previously, 4 bytes were allocated for each relocation in a text
section to store the number of bytes removed before each relocation.
Now, only the locations where bytes are removed are stored.

This commit improves performance by ~4% when linking clang-19 for RISC-V.
  • Loading branch information
rui314 committed Jan 4, 2025
1 parent 1e8cee8 commit 3234d88
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 91 deletions.
53 changes: 33 additions & 20 deletions src/arch-loongarch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -265,10 +265,8 @@ void EhFrameSection<E>::apply_eh_reloc(Context<E> &ctx, const ElfRel<E> &rel,
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);

auto get_r_delta = [&](i64 idx) {
return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
};
std::span<RelocDelta> deltas = extra.r_deltas;
i64 k = 0;

for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
Expand All @@ -278,9 +276,20 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
rel.r_type == R_LARCH_ALIGN)
continue;

i64 removed_bytes = 0;
i64 r_delta = 0;

if (!deltas.empty()) {
while (k < deltas.size() && deltas[k].offset < rel.r_offset)
k++;
if (k < deltas.size() && deltas[k].offset == rel.r_offset)
removed_bytes = get_removed_bytes(deltas, k);
if (k > 0)
r_delta = deltas[k - 1].delta;
}

Symbol<E> &sym = *file.symbols[rel.r_sym];
i64 r_offset = rel.r_offset - get_r_delta(i);
i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
i64 r_offset = rel.r_offset - r_delta;
u8 *loc = base + r_offset;

auto check = [&](i64 val, i64 lo, i64 hi) {
Expand Down Expand Up @@ -861,13 +870,16 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
template <>
void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
isec.extra.r_deltas.resize(rels.size() + 1);
i64 delta = 0;
std::vector<RelocDelta> &deltas = isec.extra.r_deltas;

for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &r = rels[i];
Symbol<E> &sym = *isec.file.symbols[r.r_sym];
isec.extra.r_deltas[i] = delta;

auto remove = [&](u32 d) {
u32 sum = deltas.empty() ? 0 : deltas.back().delta;
deltas.emplace_back((u32)r.r_offset, sum + d);
};

// A R_LARCH_ALIGN relocation refers to the beginning of a nop
// sequence. We need to remove some or all of them so that the
Expand All @@ -892,9 +904,10 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
alignment = r.r_addend + 4;
}

u64 delta = deltas.empty() ? 0 : deltas.back().delta;
u64 loc = isec.get_addr() + r.r_offset - delta;
u64 next_loc = loc + alignment - 4;
delta += next_loc - align_to(loc, alignment);
remove(next_loc - align_to(loc, alignment));
continue;
}

Expand Down Expand Up @@ -924,7 +937,7 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
// addi.d $t0, $tp, <tp-offset>
if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
sign_extend(val, 11) == val)
delta += 4;
remove(4);
break;
case R_LARCH_PCALA_HI20:
// The following two instructions are used to materialize a
Expand All @@ -949,7 +962,7 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) &&
is_addi_d && get_rd(insn1) == get_rd(insn2) &&
get_rd(insn2) == get_rj(insn2))
delta += 4;
remove(4);
}
break;
case R_LARCH_CALL36:
Expand All @@ -965,7 +978,7 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
-(1 << 27) <= dist && dist < (1 << 27))
if (u32 jirl = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4);
get_rd(jirl) == 0 || get_rd(jirl) == 1)
delta += 4;
remove(4);
break;
case R_LARCH_GOT_PC_HI20:
// The following two instructions are used to load a symbol address
Expand All @@ -981,33 +994,33 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
if (is_relaxable_got_load(ctx, isec, i)) {
i64 dist = compute_distance(ctx, sym, isec, r);
if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21))
delta += 4;
remove(4);
}
break;
case R_LARCH_TLS_DESC_PC_HI20:
if (sym.has_tlsdesc(ctx)) {
u64 P = isec.get_addr() + r.r_offset;
i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P;
if (-(1 << 21) <= dist && dist < (1 << 21))
delta += 4;
remove(4);
} else {
delta += 4;
remove(4);
}
break;
case R_LARCH_TLS_DESC_PC_LO12:
if (!sym.has_tlsdesc(ctx))
delta += 4;
remove(4);
break;
case R_LARCH_TLS_DESC_LD:
if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx) &&
sym.get_addr(ctx) + r.r_addend - ctx.tp_addr < 0x1000)
delta += 4;
remove(4);
break;
}
}

isec.extra.r_deltas[rels.size()] = delta;
isec.sh_size -= delta;
if (!deltas.empty())
isec.sh_size -= deltas.back().delta;
}

} // namespace mold
Expand Down
78 changes: 45 additions & 33 deletions src/arch-riscv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,19 +206,33 @@ static inline bool is_hi20(const ElfRel<E> &rel) {
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
std::span<RelocDelta> deltas = extra.r_deltas;
i64 k = 0;

auto get_r_delta = [&](i64 idx) {
return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
// Returns the rd register of an R/I/U/J-type instruction.
auto get_rd = [&](i64 offset) {
return bits(*(ul32 *)(contents.data() + offset), 11, 7);
};

for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX)
continue;

i64 removed_bytes = 0;
i64 r_delta = 0;

if (!deltas.empty()) {
while (k < deltas.size() && deltas[k].offset < rel.r_offset)
k++;
if (k < deltas.size() && deltas[k].offset == rel.r_offset)
removed_bytes = get_removed_bytes(deltas, k);
if (k > 0)
r_delta = deltas[k - 1].delta;
}

Symbol<E> &sym = *file.symbols[rel.r_sym];
i64 r_offset = rel.r_offset - get_r_delta(i);
i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
i64 r_offset = rel.r_offset - r_delta;
u8 *loc = base + r_offset;

auto check = [&](i64 val, i64 lo, i64 hi) {
Expand All @@ -229,24 +243,19 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
};

auto find_paired_reloc = [&] {
if (sym.value <= rels[i].r_offset - get_r_delta(i)) {
u64 value = sym.esym().st_value;
if (value <= rels[i].r_offset) {
for (i64 j = i - 1; j >= 0; j--)
if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j))
if (is_hi20(rels[j]) && value == rels[j].r_offset)
return j;
} else {
for (i64 j = i + 1; j < rels.size(); j++)
if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j))
if (is_hi20(rels[j]) && value == rels[j].r_offset)
return j;
}

Fatal(ctx) << *this << ": paired relocation is missing: " << i;
};

auto get_rd = [&](i64 offset) {
// Returns the rd register of an R/I/U/J-type instruction.
return bits(*(ul32 *)(contents.data() + offset), 11, 7);
};

u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + r_offset;
Expand Down Expand Up @@ -369,7 +378,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {

u64 S = sym2.get_addr(ctx);
u64 A = rel2.r_addend;
u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset);
u64 G = sym2.get_got_idx(ctx) * sizeof(Word<E>);

switch (rel2.r_type) {
Expand Down Expand Up @@ -487,7 +496,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {

u64 S = sym2.get_addr(ctx);
u64 A = rel2.r_addend;
u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset);

switch (rel.r_type) {
case R_RISCV_TLSDESC_LOAD_LO12:
Expand Down Expand Up @@ -803,18 +812,20 @@ u64 get_eflags(Context<E> &ctx) {
template <>
void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
isec.extra.r_deltas.resize(rels.size() + 1);
std::vector<RelocDelta> &deltas = isec.extra.r_deltas;

auto get_rd = [&](i64 offset) {
return bits(*(ul32 *)(isec.contents.data() + offset), 11, 7);
};

i64 delta = 0;

for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &r = rels[i];
Symbol<E> &sym = *isec.file.symbols[r.r_sym];
isec.extra.r_deltas[i] = delta;

auto remove = [&](u32 d) {
u32 sum = deltas.empty() ? 0 : deltas.back().delta;
deltas.emplace_back((u32)r.r_offset, sum + d);
};

// Handling R_RISCV_ALIGN is mandatory.
//
Expand All @@ -824,11 +835,12 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
if (r.r_type == R_RISCV_ALIGN) {
// The total number of NOP bytes is stored in r_addend, so the next
// instruction is r_addend bytes away.
u64 delta = deltas.empty() ? 0 : deltas.back().delta;
u64 loc = isec.get_addr() + r.r_offset - delta;
u64 next_loc = loc + r.r_addend;
u64 alignment = bit_ceil(r.r_addend + 1);
assert(alignment <= (1 << isec.p2align));
delta += next_loc - align_to(loc, alignment);
remove(next_loc - align_to(loc, alignment));
continue;
}

Expand Down Expand Up @@ -874,14 +886,14 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
if (use_rvc && rd == 0 && sign_extend(dist, 11) == dist) {
// If rd is x0 and the jump target is within ±2 KiB, we can use
// C.J, saving 6 bytes.
delta += 6;
remove(6);
} else if (use_rvc && !E::is_64 && rd == 1 && sign_extend(dist, 11) == dist) {
// If rd is x1 and the jump target is within ±2 KiB, we can use
// C.JAL. This is RV32 only because C.JAL is an RV32-only instruction.
delta += 6;
remove(6);
} else if (sign_extend(dist, 20) == dist) {
// If the jump target is within ±1 MiB, we can use JAL.
delta += 4;
remove(4);
}
break;
}
Expand All @@ -904,10 +916,10 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {

if (use_rvc && rd != 0 && sign_extend(val, 5) == val) {
// Replace AUIPC + LD with C.LI.
delta += 6;
remove(6);
} else if (sign_extend(val, 11) == val) {
// Replace AUIPC + LD with ADDI.
delta += 4;
remove(4);
}
}
}
Expand All @@ -921,11 +933,11 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
// We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)`
// instruction pair with `add t0, x0, %lo(foo)` if foo's bits
// [32:11] are all one or all zero.
delta += 4;
remove(4);
} else if (use_rvc && rd != 0 && rd != 2 && sign_extend(val, 17) == val) {
// If the upper 20 bits can actually be represented in 6 bits,
// we can use C.LUI instead of LUI.
delta += 2;
remove(2);
}
break;
}
Expand All @@ -952,11 +964,11 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
// Here, we remove `lui` and `add` if the offset is within ±2 KiB.
if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
sign_extend(val, 11) == val)
delta += 4;
remove(4);
break;
case R_RISCV_TLSDESC_HI20:
if (!sym.has_tlsdesc(ctx))
delta += 4;
remove(4);
break;
case R_RISCV_TLSDESC_LOAD_LO12:
case R_RISCV_TLSDESC_ADD_LO12: {
Expand All @@ -965,21 +977,21 @@ void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {

if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) {
if (!sym2.has_tlsdesc(ctx))
delta += 4;
remove(4);
} else {
assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12);
if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx))
if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr;
sign_extend(val, 11) == val)
delta += 4;
remove(4);
}
break;
}
}
}

isec.extra.r_deltas[rels.size()] = delta;
isec.sh_size -= delta;
if (!deltas.empty())
isec.sh_size -= deltas.back().delta;
}

// ISA name handlers
Expand Down
28 changes: 12 additions & 16 deletions src/input-sections.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,27 +225,23 @@ void InputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
// relocations are allowed to remove bytes from the middle of a
// section and shrink the overall size of it.
if constexpr (is_riscv<E> || is_loongarch<E>) {
if (extra.r_deltas.empty()) {
std::span<RelocDelta> deltas = extra.r_deltas;

if (deltas.empty()) {
// If a section is not relaxed, we can copy it as a one big chunk.
copy_contents(ctx, buf);
} else {
// A relaxed section is copied piece-wise.
std::span<const ElfRel<E>> rels = get_rels(ctx);
u8 *buf2 = buf;
i64 pos = 0;

for (i64 i = 0; i < rels.size(); i++) {
i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
if (delta == 0)
continue;
assert(delta > 0);

const ElfRel<E> &r = rels[i];
memcpy(buf2, contents.data() + pos, r.r_offset - pos);
buf2 += r.r_offset - pos;
pos = r.r_offset + delta;
memcpy(buf, contents.data(), deltas[0].offset);

for (i64 i = 0; i < deltas.size(); i++) {
RelocDelta x = deltas[i];
i64 end = (i + 1 == deltas.size()) ? contents.size() : deltas[i + 1].offset;
i64 removed_bytes = get_removed_bytes(deltas, i);
memcpy(buf + x.offset - x.delta + removed_bytes,
contents.data() + x.offset + removed_bytes,
end - x.offset - removed_bytes);
}
memcpy(buf2, contents.data() + pos, contents.size() - pos);
}
} else {
copy_contents(ctx, buf);
Expand Down
Loading

0 comments on commit 3234d88

Please sign in to comment.