diff --git a/src/arch-loongarch.cc b/src/arch-loongarch.cc index 560dc94697..e878b0b7a2 100644 --- a/src/arch-loongarch.cc +++ b/src/arch-loongarch.cc @@ -265,10 +265,8 @@ void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); - - auto get_r_delta = [&](i64 idx) { - return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx]; - }; + std::span deltas = extra.r_deltas; + i64 k = 0; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; @@ -278,9 +276,20 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { rel.r_type == R_LARCH_ALIGN) continue; + i64 removed_bytes = 0; + i64 r_delta = 0; + + if (!deltas.empty()) { + while (k < deltas.size() && deltas[k].offset < rel.r_offset) + k++; + if (k < deltas.size() && deltas[k].offset == rel.r_offset) + removed_bytes = get_removed_bytes(deltas, k); + if (k > 0) + r_delta = deltas[k - 1].delta; + } + Symbol &sym = *file.symbols[rel.r_sym]; - i64 r_offset = rel.r_offset - get_r_delta(i); - i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i); + i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; auto check = [&](i64 val, i64 lo, i64 hi) { @@ -861,13 +870,16 @@ void InputSection::scan_relocations(Context &ctx) { template <> void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { std::span> rels = isec.get_rels(ctx); - isec.extra.r_deltas.resize(rels.size() + 1); - i64 delta = 0; + std::vector &deltas = isec.extra.r_deltas; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; - isec.extra.r_deltas[i] = delta; + + auto remove = [&](u32 d) { + u32 sum = deltas.empty() ? 0 : deltas.back().delta; + deltas.emplace_back((u32)r.r_offset, sum + d); + }; // A R_LARCH_ALIGN relocation refers to the beginning of a nop // sequence. We need to remove some or all of them so that the @@ -892,9 +904,10 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { alignment = r.r_addend + 4; } + u64 delta = deltas.empty() ? 0 : deltas.back().delta; u64 loc = isec.get_addr() + r.r_offset - delta; u64 next_loc = loc + alignment - 4; - delta += next_loc - align_to(loc, alignment); + remove(next_loc - align_to(loc, alignment)); continue; } @@ -924,7 +937,7 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // addi.d $t0, $tp, if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; sign_extend(val, 11) == val) - delta += 4; + remove(4); break; case R_LARCH_PCALA_HI20: // The following two instructions are used to materialize a @@ -949,7 +962,7 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21) && is_addi_d && get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2)) - delta += 4; + remove(4); } break; case R_LARCH_CALL36: @@ -965,7 +978,7 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { -(1 << 27) <= dist && dist < (1 << 27)) if (u32 jirl = *(ul32 *)(isec.contents.data() + rels[i].r_offset + 4); get_rd(jirl) == 0 || get_rd(jirl) == 1) - delta += 4; + remove(4); break; case R_LARCH_GOT_PC_HI20: // The following two instructions are used to load a symbol address @@ -981,7 +994,7 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (is_relaxable_got_load(ctx, isec, i)) { i64 dist = compute_distance(ctx, sym, isec, r); if (dist % 4 == 0 && -(1 << 21) <= dist && dist < (1 << 21)) - delta += 4; + remove(4); } break; case R_LARCH_TLS_DESC_PC_HI20: @@ -989,25 +1002,25 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { u64 P = isec.get_addr() + r.r_offset; i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P; if (-(1 << 21) <= dist && dist < (1 << 21)) - delta += 4; + remove(4); } else { - delta += 4; + remove(4); } break; case R_LARCH_TLS_DESC_PC_LO12: if (!sym.has_tlsdesc(ctx)) - delta += 4; + remove(4); break; case R_LARCH_TLS_DESC_LD: if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx) && sym.get_addr(ctx) + r.r_addend - ctx.tp_addr < 0x1000) - delta += 4; + remove(4); break; } } - isec.extra.r_deltas[rels.size()] = delta; - isec.sh_size -= delta; + if (!deltas.empty()) + isec.sh_size -= deltas.back().delta; } } // namespace mold diff --git a/src/arch-riscv.cc b/src/arch-riscv.cc index 6c8394ec5e..a6a2452652 100644 --- a/src/arch-riscv.cc +++ b/src/arch-riscv.cc @@ -206,9 +206,12 @@ static inline bool is_hi20(const ElfRel &rel) { template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); + std::span deltas = extra.r_deltas; + i64 k = 0; - auto get_r_delta = [&](i64 idx) { - return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx]; + // Returns the rd register of an R/I/U/J-type instruction. + auto get_rd = [&](i64 offset) { + return bits(*(ul32 *)(contents.data() + offset), 11, 7); }; for (i64 i = 0; i < rels.size(); i++) { @@ -216,9 +219,20 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX) continue; + i64 removed_bytes = 0; + i64 r_delta = 0; + + if (!deltas.empty()) { + while (k < deltas.size() && deltas[k].offset < rel.r_offset) + k++; + if (k < deltas.size() && deltas[k].offset == rel.r_offset) + removed_bytes = get_removed_bytes(deltas, k); + if (k > 0) + r_delta = deltas[k - 1].delta; + } + Symbol &sym = *file.symbols[rel.r_sym]; - i64 r_offset = rel.r_offset - get_r_delta(i); - i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i); + i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; auto check = [&](i64 val, i64 lo, i64 hi) { @@ -229,24 +243,19 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { }; auto find_paired_reloc = [&] { - if (sym.value <= rels[i].r_offset - get_r_delta(i)) { + u64 value = sym.esym().st_value; + if (value <= rels[i].r_offset) { for (i64 j = i - 1; j >= 0; j--) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j)) + if (is_hi20(rels[j]) && value == rels[j].r_offset) return j; } else { for (i64 j = i + 1; j < rels.size(); j++) - if (is_hi20(rels[j]) && sym.value == rels[j].r_offset - get_r_delta(j)) + if (is_hi20(rels[j]) && value == rels[j].r_offset) return j; } - Fatal(ctx) << *this << ": paired relocation is missing: " << i; }; - auto get_rd = [&](i64 offset) { - // Returns the rd register of an R/I/U/J-type instruction. - return bits(*(ul32 *)(contents.data() + offset), 11, 7); - }; - u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + r_offset; @@ -369,7 +378,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; - u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2); + u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); u64 G = sym2.get_got_idx(ctx) * sizeof(Word); switch (rel2.r_type) { @@ -487,7 +496,7 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; - u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2); + u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); switch (rel.r_type) { case R_RISCV_TLSDESC_LOAD_LO12: @@ -803,18 +812,20 @@ u64 get_eflags(Context &ctx) { template <> void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { std::span> rels = isec.get_rels(ctx); - isec.extra.r_deltas.resize(rels.size() + 1); + std::vector &deltas = isec.extra.r_deltas; auto get_rd = [&](i64 offset) { return bits(*(ul32 *)(isec.contents.data() + offset), 11, 7); }; - i64 delta = 0; - for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; - isec.extra.r_deltas[i] = delta; + + auto remove = [&](u32 d) { + u32 sum = deltas.empty() ? 0 : deltas.back().delta; + deltas.emplace_back((u32)r.r_offset, sum + d); + }; // Handling R_RISCV_ALIGN is mandatory. // @@ -824,11 +835,12 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (r.r_type == R_RISCV_ALIGN) { // The total bytes of NOPs is stored to r_addend, so the next // instruction is r_addend away. + u64 delta = deltas.empty() ? 0 : deltas.back().delta; u64 loc = isec.get_addr() + r.r_offset - delta; u64 next_loc = loc + r.r_addend; u64 alignment = bit_ceil(r.r_addend + 1); assert(alignment <= (1 << isec.p2align)); - delta += next_loc - align_to(loc, alignment); + remove(next_loc - align_to(loc, alignment)); continue; } @@ -874,14 +886,14 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (use_rvc && rd == 0 && sign_extend(dist, 11) == dist) { // If rd is x0 and the jump target is within ±2 KiB, we can use // C.J, saving 6 bytes. - delta += 6; + remove(6); } else if (use_rvc && !E::is_64 && rd == 1 && sign_extend(dist, 11) == dist) { // If rd is x1 and the jump target is within ±2 KiB, we can use // C.JAL. This is RV32 only because C.JAL is RV32-only instruction. - delta += 6; + remove(6); } else if (sign_extend(dist, 20) == dist) { // If the jump target is within ±1 MiB, we can use JAL. - delta += 4; + remove(4); } break; } @@ -904,10 +916,10 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (use_rvc && rd != 0 && sign_extend(val, 5) == val) { // Replace AUIPC + LD with C.LI. - delta += 6; + remove(6); } else if (sign_extend(val, 11) == val) { // Replace AUIPC + LD with ADDI. - delta += 4; + remove(4); } } } @@ -921,11 +933,11 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)` // instruction pair with `add t0, x0, %lo(foo)` if foo's bits // [32:11] are all one or all zero. - delta += 4; + remove(4); } else if (use_rvc && rd != 0 && rd != 2 && sign_extend(val, 17) == val) { // If the upper 20 bits can actually be represented in 6 bits, // we can use C.LUI instead of LUI. - delta += 2; + remove(2); } break; } @@ -952,11 +964,11 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { // Here, we remove `lui` and `add` if the offset is within ±2 KiB. if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; sign_extend(val, 11) == val) - delta += 4; + remove(4); break; case R_RISCV_TLSDESC_HI20: if (!sym.has_tlsdesc(ctx)) - delta += 4; + remove(4); break; case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: { @@ -965,21 +977,21 @@ void shrink_section(Context &ctx, InputSection &isec, bool use_rvc) { if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) { if (!sym2.has_tlsdesc(ctx)) - delta += 4; + remove(4); } else { assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12); if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx)) if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr; sign_extend(val, 11) == val) - delta += 4; + remove(4); } break; } } } - isec.extra.r_deltas[rels.size()] = delta; - isec.sh_size -= delta; + if (!deltas.empty()) + isec.sh_size -= deltas.back().delta; } // ISA name handlers diff --git a/src/input-sections.cc b/src/input-sections.cc index 28c28f23c2..7e4025451e 100644 --- a/src/input-sections.cc +++ b/src/input-sections.cc @@ -225,27 +225,23 @@ void InputSection::write_to(Context &ctx, u8 *buf) { // relocations are allowed to remove bytes from the middle of a // section and shrink the overall size of it. if constexpr (is_riscv || is_loongarch) { - if (extra.r_deltas.empty()) { + std::span deltas = extra.r_deltas; + + if (deltas.empty()) { // If a section is not relaxed, we can copy it as a one big chunk. copy_contents(ctx, buf); } else { // A relaxed section is copied piece-wise. - std::span> rels = get_rels(ctx); - u8 *buf2 = buf; - i64 pos = 0; - - for (i64 i = 0; i < rels.size(); i++) { - i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i]; - if (delta == 0) - continue; - assert(delta > 0); - - const ElfRel &r = rels[i]; - memcpy(buf2, contents.data() + pos, r.r_offset - pos); - buf2 += r.r_offset - pos; - pos = r.r_offset + delta; + memcpy(buf, contents.data(), deltas[0].offset); + + for (i64 i = 0; i < deltas.size(); i++) { + RelocDelta x = deltas[i]; + i64 end = (i + 1 == deltas.size()) ? contents.size() : deltas[i + 1].offset; + i64 removed_bytes = get_removed_bytes(deltas, i); + memcpy(buf + x.offset - x.delta + removed_bytes, + contents.data() + x.offset + removed_bytes, + end - x.offset - removed_bytes); } - memcpy(buf2, contents.data() + pos, contents.size() - pos); } } else { copy_contents(ctx, buf); diff --git a/src/mold.h b/src/mold.h index ea916adcee..9de6c68e0e 100644 --- a/src/mold.h +++ b/src/mold.h @@ -259,11 +259,22 @@ struct FdeRecord { template struct InputSectionExtras {}; +struct RelocDelta { + u32 offset; + u32 delta; +}; + template requires is_riscv || is_loongarch struct InputSectionExtras { - std::vector r_deltas; + std::vector r_deltas; }; +static i64 get_removed_bytes(std::span deltas, i64 i) { + if (i == 0) + return deltas[i].delta; + return deltas[i].delta - deltas[i - 1].delta; +} + // InputSection represents a section in an input object file. template class __attribute__((aligned(4))) InputSection { @@ -1562,6 +1573,9 @@ void shrink_sections(Context &ctx); template void shrink_section(Context &ctx, InputSection &isec, bool use_rvc); +template +i64 get_r_delta(InputSection &isec, u64 offset); + template i64 compute_distance(Context &ctx, Symbol &sym, InputSection &isec, const ElfRel &rel); diff --git a/src/shrink-sections.cc b/src/shrink-sections.cc index f47829ce08..48d87bced9 100644 --- a/src/shrink-sections.cc +++ b/src/shrink-sections.cc @@ -72,7 +72,23 @@ static bool is_resizable(InputSection *isec) { } template <> -void shrink_sections(Context &ctx) { +i64 get_r_delta(InputSection &isec, u64 offset) { + std::span deltas = isec.extra.r_deltas; + if (deltas.empty()) + return 0; + + auto it = std::upper_bound(deltas.begin(), deltas.end(), offset, + [](u64 val, const RelocDelta &x) { + return val <= x.offset; + }); + + if (it == deltas.begin()) + return 0; + return (it - 1)->delta; +} + +template <> +void shrink_sections(Context &ctx) { Timer t(ctx, "shrink_sections"); // True if we can use the 2-byte instructions. This is usually true on @@ -92,29 +108,22 @@ void shrink_sections(Context &ctx) { // only ~0.04% larger than that of GNU ld), so we don't bother to handle // them. We scan relocations only once here. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (std::unique_ptr> &isec : file->sections) - if (is_resizable(isec.get())) + for (std::unique_ptr> &isec : file->sections) { + if (is_resizable(isec.get())) { + if (isec->sh_size > UINT32_MAX) + Fatal(ctx) << *isec << ": input section too large"; shrink_section(ctx, *isec, use_rvc); + } + } }); // Fix symbol values. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { - for (Symbol *sym : file->symbols) { - if (sym->file != file) - continue; - - InputSection *isec = sym->get_input_section(); - if (!isec || isec->extra.r_deltas.empty()) - continue; - - std::span> rels = isec->get_rels(ctx); - auto it = std::lower_bound(rels.begin(), rels.end(), sym->value, - [](const ElfRel &r, u64 val) { - return r.r_offset < val; - }); - - sym->value -= isec->extra.r_deltas[it - rels.begin()]; - } + for (Symbol *sym : file->symbols) + if (sym->file == file) + if (InputSection *isec = sym->get_input_section()) + if (!isec->extra.r_deltas.empty()) + sym->value -= get_r_delta(*isec, sym->value); }); // Recompute sizes of executable sections @@ -126,8 +135,8 @@ void shrink_sections(Context &ctx) { // Returns the distance between a relocated place and a symbol. template <> -i64 compute_distance(Context &ctx, Symbol &sym, - InputSection &isec, const ElfRel &rel) { +i64 compute_distance(Context &ctx, Symbol &sym, + InputSection &isec, const ElfRel &rel) { // We handle absolute symbols as if they were infinitely far away // because `shrink_section` may increase a distance between a branch // instruction and an absolute symbol. Branching to an absolute