From 08c68744e616c4ed2d194a5c6fb49cc4df7c611b Mon Sep 17 00:00:00 2001
From: Yorick Peterse <yorick@yorickpeterse.com>
Date: Tue, 14 Jan 2025 21:58:18 +0100
Subject: [PATCH] Enable LLVM optimizations

This enables a set of LLVM optimization passes for the "balanced" and
"aggressive" profiles. Both are based on the "default<O2>" pipeline,
with the removal of some irrelevant passes. In the case of the
"balanced" profile we also remove some additional passes that aren't
likely to be useful in most cases.

This fixes https://github.com/inko-lang/inko/issues/595.

Changelog: added
---
 compiler/src/llvm.rs        |   1 +
 compiler/src/llvm/opt.rs    | 213 ++++++++++++++++++++++++++++++++++++
 compiler/src/llvm/passes.rs |  48 ++++----
 3 files changed, 240 insertions(+), 22 deletions(-)
 create mode 100644 compiler/src/llvm/opt.rs

diff --git a/compiler/src/llvm.rs b/compiler/src/llvm.rs
index 4a005ff06..8d795be01 100644
--- a/compiler/src/llvm.rs
+++ b/compiler/src/llvm.rs
@@ -6,5 +6,6 @@ pub(crate) mod layouts;
 pub(crate) mod method_hasher;
 pub(crate) mod methods;
 pub(crate) mod module;
+pub(crate) mod opt;
 pub(crate) mod passes;
 pub(crate) mod runtime_function;
diff --git a/compiler/src/llvm/opt.rs b/compiler/src/llvm/opt.rs
new file mode 100644
index 000000000..f61273f1c
--- /dev/null
+++ b/compiler/src/llvm/opt.rs
@@ -0,0 +1,213 @@
+//! LLVM optimization passes to run for different compiler optimization levels.
+//!
+//! Both pipelines are based on the output of `opt -passes='default<O2>'
+//! -print-pipeline-passes`, minus redundant passes (e.g. those for OpenMP).
+//! `BALANCED` also drops the `devirt<4>` wrapper and `libcalls-shrinkwrap`.
+pub(crate) const BALANCED: &str = "\
+    inferattrs,\
+    function<eager-inv>(\
+      lower-expect,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+      sroa<modify-cfg>,\
+      early-cse\
+    ),\
+    ipsccp,\
+    called-value-propagation,\
+    globalopt,\
+    function<eager-inv>(\
+      mem2reg,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>\
+    ),\
+    require<globals-aa>,\
+    function(invalidate<aa>),\
+    cgscc(\
+      inline<only-mandatory>,\
+      inline,\
+      function-attrs<skip-non-recursive>,\
+      function<eager-inv;no-rerun>(\
+        sroa<modify-cfg>,\
+        early-cse<memssa>,\
+        speculative-execution,\
+        jump-threading,\
+        correlated-propagation,\
+        simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+        instcombine<max-iterations=1000;no-use-loop-info>,\
+        aggressive-instcombine,\
+        constraint-elimination,\
+        tailcallelim,\
+        simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+        reassociate,\
+        loop-mssa(\
+          loop-instsimplify,\
+          loop-simplifycfg,\
+          licm<no-allowspeculation>,\
+          loop-rotate<header-duplication;no-prepare-for-lto>,\
+          licm<allowspeculation>,\
+          simple-loop-unswitch<no-nontrivial;trivial>\
+        ),\
+        simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+        instcombine<max-iterations=1000;no-use-loop-info>,\
+        loop(loop-idiom,indvars,loop-deletion,loop-unroll-full),\
+        sroa<modify-cfg>,\
+        vector-combine,\
+        mldst-motion<no-split-footer-bb>,\
+        gvn,\
+        sccp,\
+        bdce,\
+        instcombine<max-iterations=1000;no-use-loop-info>,\
+        jump-threading,\
+        correlated-propagation,\
+        adce,\
+        memcpyopt,\
+        dse,\
+        move-auto-init,\
+        loop-mssa(licm<allowspeculation>),\
+        simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+        instcombine<max-iterations=1000;no-use-loop-info>\
+      ),\
+      function-attrs,\
+      function(require<should-not-run-function-passes>)\
+    ),\
+    deadargelim,\
+    globalopt,\
+    globaldce,\
+    elim-avail-extern,\
+    rpo-function-attrs,\
+    recompute-globalsaa,\
+    function<eager-inv>(\
+      float2int,\
+      lower-constant-intrinsics,\
+      loop(loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion),\
+      loop-distribute,\
+      inject-tli-mappings,\
+      loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,\
+      loop-load-elim,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+      slp-vectorizer,\
+      vector-combine,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      loop-unroll<O2>,\
+      sroa<preserve-cfg>,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      loop-mssa(licm<allowspeculation>),\
+      alignment-from-assumptions,\
+      loop-sink,\
+      instsimplify,\
+      div-rem-pairs,\
+      tailcallelim,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>\
+    ),\
+    globaldce,\
+    constmerge,\
+    rel-lookup-table-converter,\
+    function(annotation-remarks),\
+    verify\
+";
+
+pub(crate) const AGGRESSIVE: &str = "\
+    inferattrs,\
+    function<eager-inv>(\
+      lower-expect,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+      sroa<modify-cfg>,\
+      early-cse\
+    ),\
+    ipsccp,\
+    called-value-propagation,\
+    globalopt,\
+    function<eager-inv>(\
+      mem2reg,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>\
+    ),\
+    require<globals-aa>,\
+    function(invalidate<aa>),\
+    cgscc(\
+      devirt<4>(\
+        inline<only-mandatory>,\
+        inline,\
+        function-attrs<skip-non-recursive>,\
+        function<eager-inv;no-rerun>(\
+          sroa<modify-cfg>,\
+          early-cse<memssa>,\
+          speculative-execution,\
+          jump-threading,\
+          correlated-propagation,\
+          simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+          instcombine<max-iterations=1000;no-use-loop-info>,\
+          aggressive-instcombine,\
+          constraint-elimination,\
+          libcalls-shrinkwrap,\
+          tailcallelim,\
+          simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+          reassociate,\
+          loop-mssa(\
+            loop-instsimplify,\
+            loop-simplifycfg,\
+            licm<no-allowspeculation>,\
+            loop-rotate<header-duplication;no-prepare-for-lto>,\
+            licm<allowspeculation>,\
+            simple-loop-unswitch<no-nontrivial;trivial>\
+          ),\
+          simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+          instcombine<max-iterations=1000;no-use-loop-info>,\
+          loop(loop-idiom,indvars,loop-deletion,loop-unroll-full),\
+          sroa<modify-cfg>,\
+          vector-combine,\
+          mldst-motion<no-split-footer-bb>,\
+          gvn,\
+          sccp,\
+          bdce,\
+          instcombine<max-iterations=1000;no-use-loop-info>,\
+          jump-threading,\
+          correlated-propagation,\
+          adce,\
+          memcpyopt,\
+          dse,\
+          move-auto-init,\
+          loop-mssa(licm<allowspeculation>),\
+          simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+          instcombine<max-iterations=1000;no-use-loop-info>\
+        ),\
+        function-attrs,\
+        function(require<should-not-run-function-passes>)\
+      )\
+    ),\
+    deadargelim,\
+    globalopt,\
+    globaldce,\
+    elim-avail-extern,\
+    rpo-function-attrs,\
+    recompute-globalsaa,\
+    function<eager-inv>(\
+      float2int,\
+      lower-constant-intrinsics,\
+      loop(loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion),\
+      loop-distribute,\
+      inject-tli-mappings,\
+      loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,\
+      loop-load-elim,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>,\
+      slp-vectorizer,\
+      vector-combine,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      loop-unroll<O2>,\
+      sroa<preserve-cfg>,\
+      instcombine<max-iterations=1000;no-use-loop-info>,\
+      loop-mssa(licm<allowspeculation>),\
+      alignment-from-assumptions,\
+      loop-sink,\
+      instsimplify,\
+      div-rem-pairs,\
+      tailcallelim,\
+      simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>\
+    ),\
+    globaldce,\
+    constmerge,\
+    rel-lookup-table-converter,\
+    function(annotation-remarks),\
+    verify\
+";
diff --git a/compiler/src/llvm/passes.rs b/compiler/src/llvm/passes.rs
index 2e5ef92ef..9a3449725 100644
--- a/compiler/src/llvm/passes.rs
+++ b/compiler/src/llvm/passes.rs
@@ -14,6 +14,7 @@ use crate::llvm::layouts::{
 };
 use crate::llvm::methods::Methods;
 use crate::llvm::module::Module;
+use crate::llvm::opt;
 use crate::llvm::runtime_function::RuntimeFunction;
 use crate::mir::{
     CastType, Constant, Instruction, InstructionLocation, Method, Mir,
@@ -300,22 +301,17 @@ pub(crate) fn lower_all(
         }
     }
 
-    // LLVM's optimisation level controls which passes to run, but some/many of
-    // those may not be relevant to Inko, while slowing down compile times. Thus
-    // instead of using this knob, we provide our own list of passes. Swift and
-    // Rust (and possibly others) take a similar approach.
+    // The code generation optimization level to use. This is separate from the
+    // optimization passes to run.
     //
-    // For the aggressive mode we simply enable the full suite of LLVM
-    // optimizations, likely greatly increasing the compilation times.
+    // It's unclear what the difference is between Default and Aggressive, and
+    // we've not been able to measure a difference in runtime performance. Swift
+    // also appears to just use Default when optimizations are enabled
+    // (https://github.com/swiftlang/swift/blob/09d122af7c08e1a6e7fe76f122ddab05b0bbda59/lib/IRGen/IRGen.cpp#L929-L931),
+    // so we'll assume this is good enough.
     let level = match state.config.opt {
         Opt::None => OptimizationLevel::None,
-
-        // We have yet to figure out what optimizations we want to enable
-        // here, hence we don't apply any at all.
-        Opt::Balanced => OptimizationLevel::None,
-
-        // This is the equivalent of -O3 for clang.
-        Opt::Aggressive => OptimizationLevel::Aggressive,
+        _ => OptimizationLevel::Default,
     };
 
     // Our "queue" is just an atomic integer in the range 0..N where N is the
@@ -547,19 +543,27 @@ impl<'a> Worker<'a> {
     fn run_passes(&self, module: &Module, layouts: &Layouts) {
         let layout = layouts.target_data.get_data_layout();
         let opts = PassBuilderOptions::create();
-        let passes = ["mem2reg"].join(",");
+
+        // The LLVM pipeline to run, including passes that we must run
+        // regardless of the optimization level.
+        //
+        // We need to scope pass names properly, otherwise we may run into
+        // issues similar to https://github.com/llvm/llvm-project/issues/81128.
+        let mut passes = ["function(mem2reg)"].join(",");
+        let extra = match self.shared.state.config.opt {
+            Opt::Balanced => Some(opt::BALANCED),
+            Opt::Aggressive => Some(opt::AGGRESSIVE),
+            _ => None,
+        };
+
+        if let Some(v) = extra {
+            passes.push(',');
+            passes.push_str(v);
+        }
 
         module.set_data_layout(&layout);
         module.set_triple(&self.machine.get_triple());
         module.run_passes(passes.as_str(), &self.machine, opts).unwrap();
-
-        // The pass "aliases" such as "default<O3>" can't be combined together
-        // with other passes, so we have to handle them separately.
-        if let Opt::Aggressive = self.shared.state.config.opt {
-            let opts = PassBuilderOptions::create();
-
-            module.run_passes("default<O3>", &self.machine, opts).unwrap();
-        }
     }
 
     fn write_object_file(