From 3222c42b2d3041bd4c0ed01bda66277b378c5656 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata
Date: Mon, 30 Dec 2024 15:21:07 +0900
Subject: [PATCH 1/5] tests: parametrize benchmark tests

In the previous implementation, it was necessary to adjust the timeout
value every time a benchmark test was added. By parametrizing the
benchmark tests, the time required for each test becomes predictable,
eliminating the need to adjust the timeout value.

Signed-off-by: Tomoya Iwata
---
 .../performance/test_benchmarks.py           | 135 +++++++++++-------
 1 file changed, 86 insertions(+), 49 deletions(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 6e6541a688d..4e3e5b7228a 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import platform
+import re
 import shutil
 from pathlib import Path

@@ -17,78 +18,114 @@ LOGGER = logging.getLogger(__name__)


+def get_executables():
+    """
+    Get a list of binaries for benchmarking
+    """
+
+    # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
+    # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
+    # extract the path to the compiled benchmark binary.
+    _, stdout, _ = cargo(
+        "bench",
+        f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
+    )
+
+    executables = []
+    for line in stdout.split("\n"):
+        if line:
+            msg = json.loads(line)
+            executable = msg.get("executable")
+            if executable:
+                executables.append(executable)
+
+    return executables
+
+
 @pytest.mark.no_block_pr
-@pytest.mark.timeout(900)
-def test_no_regression_relative_to_target_branch():
+@pytest.mark.timeout(600)
+@pytest.mark.parametrize("executable", get_executables())
+def test_no_regression_relative_to_target_branch(executable):
     """
     Run the microbenchmarks in this repository, comparing results from pull
     request target branch against what's achieved on HEAD
     """
+    run_criterion = get_run_criterion(executable)
+    compare_results = get_compare_results(executable)
     git_ab_test(run_criterion, compare_results)


-def run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+def get_run_criterion(executable):
     """
-    Executes all benchmarks by running "cargo bench --no-run", finding the executables, and running them pinned to some CPU
+    Get a function that executes the specified benchmarks, running them pinned to some CPU
     """
-    baseline_name = "a_baseline" if is_a else "b_baseline"
-
-    with contextlib.chdir(firecracker_checkout):
-        # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
-        # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
-        # extract the path to the compiled benchmark binary.
-        _, stdout, _ = cargo(
-            "bench",
-            f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
-        )
-        executables = []
-        for line in stdout.split("\n"):
-            if line:
-                msg = json.loads(line)
-                executable = msg.get("executable")
-                if executable:
-                    executables.append(executable)
+    def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+        baseline_name = "a_baseline" if is_a else "b_baseline"

-        for executable in executables:
+        with contextlib.chdir(firecracker_checkout):
             utils.check_output(
                 f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}"
             )

-    return firecracker_checkout / "build" / "cargo_target" / "criterion"
+        return firecracker_checkout / "build" / "cargo_target" / "criterion"
+
+    return _run_criterion
+

+def get_compare_results(executable):
+    """
+    Get a function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main
+    """

-def compare_results(location_a_baselines: Path, location_b_baselines: Path):
-    """Compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main"""
-    for benchmark in location_b_baselines.glob("*"):
-        data = json.loads(
-            (benchmark / "b_baseline" / "estimates.json").read_text("utf-8")
+    def _compare_results(location_a_baselines: Path, location_b_baselines: Path):
+
+        list_result = utils.check_output(
+            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --list"
         )
-        average_ns = data["mean"]["point_estimate"]
+        # Format a string like `page_fault #2: benchmark` to a string like `page_fault_2`.
+        # Because under `cargo_target/criterion/`, a directory like `page_fault_2` will be created.
+        bench_marks = [
+            re.sub(r"\s#(?P<sub_id>[1-9]+)", r"_\g<sub_id>", i.split(":")[0])
+            for i in list_result.stdout.split("\n")
+            if i.endswith(": benchmark")
+        ]
+
+        for benchmark in bench_marks:
+            data = json.loads(
+                (
+                    location_b_baselines / benchmark / "b_baseline" / "estimates.json"
+                ).read_text("utf-8")
+            )

-        LOGGER.info("%s mean: %iµs", benchmark.name, average_ns / 1000)
+            average_ns = data["mean"]["point_estimate"]

-    # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
-    # to do the comparison
-    for benchmark in location_a_baselines.glob("*"):
-        shutil.copytree(
-            benchmark / "a_baseline",
-            location_b_baselines / benchmark.name / "a_baseline",
+            LOGGER.info("%s mean: %iµs", benchmark, average_ns / 1000)
+
+        # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
+        # to do the comparison
+
+        for benchmark in bench_marks:
+            shutil.copytree(
+                location_a_baselines / benchmark / "a_baseline",
+                location_b_baselines / benchmark / "a_baseline",
             )
-    _, stdout, _ = cargo(
-        "bench",
-        f"--all --target {platform.machine()}-unknown-linux-musl",
-        "--baseline a_baseline --load-baseline b_baseline",
-    )
+        bench_result = utils.check_output(
+            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --baseline a_baseline --load-baseline b_baseline",
+            True,
+            Path.cwd().parent,
+        )

-    regressions_only = "\n\n".join(
-        result
-        for result in stdout.split("\n\n")
-        if "Performance has regressed." in result
-    )
+        regressions_only = "\n\n".join(
+            result
+            for result in bench_result.stdout.split("\n\n")
+            if "Performance has regressed." in result
+        )

-    # If this string is anywhere in stdout, then at least one of our benchmarks
-    # is now performing worse with the PR changes.
+        # If this string is anywhere in stdout, then at least one of our benchmarks
+        # is now performing worse with the PR changes.
+        assert not regressions_only, "\n" + regressions_only
-    assert not regressions_only, "\n" + regressions_only
+
+    return _compare_results

From b95900e0bca410722add733a8bb2370e3080ecbf Mon Sep 17 00:00:00 2001
From: Tomoya Iwata
Date: Sun, 12 Jan 2025 17:38:50 +0900
Subject: [PATCH 2/5] tests: use splitlines

Use `splitlines()` instead of `split("\n")`.

Signed-off-by: Tomoya Iwata
---
 tests/integration_tests/performance/test_benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 4e3e5b7228a..1f43cd2ab79 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -32,7 +32,7 @@ def get_executables():
     )

     executables = []
-    for line in stdout.split("\n"):
+    for line in stdout.splitlines():
         if line:
             msg = json.loads(line)
             executable = msg.get("executable")

From 215af23e54aa51859c6a0fcd07e0625615026f84 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata
Date: Sun, 12 Jan 2025 17:43:01 +0900
Subject: [PATCH 3/5] tests: delete specific timeout parameter for performance test

Individual timeout values no longer need to be set, because the
performance tests are now parameterized.

Signed-off-by: Tomoya Iwata
---
 tests/integration_tests/performance/test_benchmarks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 1f43cd2ab79..49e31df065b 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -43,7 +43,6 @@ def get_executables():


 @pytest.mark.no_block_pr
-@pytest.mark.timeout(600)
 @pytest.mark.parametrize("executable", get_executables())
 def test_no_regression_relative_to_target_branch(executable):
     """

From d42d39ff88c6fec1a42463633eb9afc5fe518043 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata
Date: Sun, 19 Jan 2025 12:47:20 +0900
Subject: [PATCH 4/5] tests: build and run benchmark tests in each branch

In the previous implementation, the same binary that was built in the PR
branch was executed twice, which was not a correct A/B test. This has
been fixed.

Signed-off-by: Tomoya Iwata
---
 .../performance/test_benchmarks.py           | 86 ++++++++-----------
 1 file changed, 37 insertions(+), 49 deletions(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 49e31df065b..6d3988c23d9 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 """Optional benchmarks-do-not-regress test"""
 import contextlib
-import json
 import logging
 import platform
 import re
 import shutil
 from pathlib import Path
+from typing import Callable, List

 import pytest

@@ -18,43 +18,40 @@ LOGGER = logging.getLogger(__name__)


-def get_executables():
+def get_benchmark_names() -> List[str]:
     """
-    Get a list of binaries for benchmarking
+    Get a list of benchmark test names
     """

-    # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
-    # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
-    # extract the path to the compiled benchmark binary.
     _, stdout, _ = cargo(
         "bench",
-        f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
+        f"--workspace --quiet --target {platform.machine()}-unknown-linux-musl",
+        "--list",
     )

-    executables = []
-    for line in stdout.splitlines():
-        if line:
-            msg = json.loads(line)
-            executable = msg.get("executable")
-            if executable:
-                executables.append(executable)
+    # Format a string like `page_fault #2: benchmark` to a string like `page_fault`.
+    benchmark_names = [
+        re.sub(r"\s#([0-9]*)", "", i.split(":")[0])
+        for i in stdout.split("\n")
+        if i.endswith(": benchmark")
+    ]

-    return executables
+    return list(set(benchmark_names))


 @pytest.mark.no_block_pr
-@pytest.mark.parametrize("executable", get_executables())
-def test_no_regression_relative_to_target_branch(executable):
+@pytest.mark.parametrize("benchname", get_benchmark_names())
+def test_no_regression_relative_to_target_branch(benchname):
     """
     Run the microbenchmarks in this repository, comparing results from pull
     request target branch against what's achieved on HEAD
     """
-    run_criterion = get_run_criterion(executable)
-    compare_results = get_compare_results(executable)
+    run_criterion = get_run_criterion(benchname)
+    compare_results = get_compare_results(benchname)
     git_ab_test(run_criterion, compare_results)


-def get_run_criterion(executable):
+def get_run_criterion(benchmark_name) -> Callable[[Path, bool], Path]:
     """
     Get a function that executes the specified benchmarks, running them pinned to some CPU
     """
@@ -64,7 +61,7 @@ def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:

         with contextlib.chdir(firecracker_checkout):
             utils.check_output(
-                f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}"
+                f"taskset -c 1 cargo bench --workspace --quiet -- {benchmark_name} --exact --save-baseline {baseline_name}"
             )

         return firecracker_checkout / "build" / "cargo_target" / "criterion"
@@ -72,54 +69,45 @@ def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
     return _run_criterion


-def get_compare_results(executable):
+def get_compare_results(benchmark_name) -> Callable[[Path, Path], None]:
     """
     Get a function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main
     """

     def _compare_results(location_a_baselines: Path, location_b_baselines: Path):

-        list_result = utils.check_output(
-            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --list"
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl --quiet",
+            f"--exact {benchmark_name} --list",
         )

         # Format a string like `page_fault #2: benchmark` to a string like `page_fault_2`.
         # Because under `cargo_target/criterion/`, a directory like `page_fault_2` will be created.
-        bench_marks = [
-            re.sub(r"\s#(?P<sub_id>[1-9]+)", r"_\g<sub_id>", i.split(":")[0])
-            for i in list_result.stdout.split("\n")
+        bench_mark_targets = [
+            re.sub(r"\s#(?P<sub_id>[0-9]*)", r"_\g<sub_id>", i.split(":")[0])
+            for i in stdout.split("\n")
             if i.endswith(": benchmark")
         ]

-        for benchmark in bench_marks:
-            data = json.loads(
-                (
-                    location_b_baselines / benchmark / "b_baseline" / "estimates.json"
-                ).read_text("utf-8")
-            )
-
-            average_ns = data["mean"]["point_estimate"]
-
-            LOGGER.info("%s mean: %iµs", benchmark, average_ns / 1000)
-
-        # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
-        # to do the comparison
-
-        for benchmark in bench_marks:
+        # If a benchmark test has multiple targets, the results of a single benchmark test are output to multiple directories.
+        # For example, `page_fault` and `page_fault_2`.
+        # We need to copy the benchmark results for each of these directories.
+        for bench_mark_target in bench_mark_targets:
             shutil.copytree(
-                location_a_baselines / benchmark / "a_baseline",
-                location_b_baselines / benchmark / "a_baseline",
+                location_a_baselines / bench_mark_target / "a_baseline",
+                location_b_baselines / bench_mark_target / "a_baseline",
             )

-        bench_result = utils.check_output(
-            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --baseline a_baseline --load-baseline b_baseline",
-            True,
-            Path.cwd().parent,
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl",
+            f"{benchmark_name} --baseline a_baseline --load-baseline b_baseline",
         )

         regressions_only = "\n\n".join(
             result
-            for result in bench_result.stdout.split("\n\n")
+            for result in stdout.split("\n\n")
             if "Performance has regressed." in result
         )

From 033ca8f4fcaea4e4e9f44e907542f6ed101e4b0a Mon Sep 17 00:00:00 2001
From: Tomoya Iwata
Date: Sun, 19 Jan 2025 22:19:38 +0900
Subject: [PATCH 5/5] tests: optimize a/b test fixture

In the previous implementation, git clone was executed for each parameter
of the parametrized test. This had a large overhead, so the fixture is now
called only once per class.

Signed-off-by: Tomoya Iwata
---
 tests/framework/ab_test.py                   | 30 ++++++++++++++--
 .../performance/test_benchmarks.py           | 36 ++++++++++++++-----
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
index 2ef3e2350a7..65773c6f693 100644
--- a/tests/framework/ab_test.py
+++ b/tests/framework/ab_test.py
@@ -22,9 +22,9 @@
 does not block PRs). If not, it fails, preventing PRs from introducing new
 vulnerable dependencies.
 """
 import statistics
-from pathlib import Path
+from pathlib import Path, PosixPath
 from tempfile import TemporaryDirectory
-from typing import Callable, List, Optional, TypeVar
+from typing import Callable, Generator, List, Optional, Tuple, TypeVar

 import scipy

@@ -98,6 +98,32 @@ def git_ab_test(
     return result_a, result_b, comparison


+def git_clone_ab_dirs(
+    a_revision: str = DEFAULT_A_REVISION,
+    b_revision: Optional[str] = None,
+) -> Generator[Tuple[PosixPath, PosixPath], None, None]:
+    """
+    Prepare cloned Git repository to run A/B tests.
+
+    :param a_revision: The revision to checkout for the "A" part of the test. Defaults to the pull request target branch
+                       if run in CI, and "main" otherwise.
+    :param b_revision: The git revision to check out for "B" part of the test. Defaults to whatever is currently checked
+                       out (in which case no temporary directory will be created).
+    """
+
+    with TemporaryDirectory() as tmp_dir:
+        dir_a = git_clone(Path(tmp_dir) / a_revision, a_revision)
+
+        if b_revision:
+            dir_b = git_clone(Path(tmp_dir) / b_revision, b_revision)
+        else:
+            # By default, pytest execution happens inside the `tests` subdirectory. Pass the repository root, as
+            # documented.
+            dir_b = Path.cwd().parent
+
+        yield (dir_a, dir_b)
+
+
 DEFAULT_A_DIRECTORY = FC_WORKSPACE_DIR / "build" / "main"
 DEFAULT_B_DIRECTORY = FC_WORKSPACE_DIR / "build" / "cargo_target" / DEFAULT_TARGET_DIR

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 6d3988c23d9..81afd37a0fb 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -12,10 +12,11 @@
 import pytest

 from framework import utils
-from framework.ab_test import git_ab_test
+from framework.ab_test import binary_ab_test, git_clone_ab_dirs
 from host_tools.cargo_build import cargo

 LOGGER = logging.getLogger(__name__)
+git_clone_ab_dirs_one_time = pytest.fixture(git_clone_ab_dirs, scope="class")


 def get_benchmark_names() -> List[str]:
@@ -39,16 +40,33 @@ def get_benchmark_names() -> List[str]:
     return list(set(benchmark_names))


-@pytest.mark.no_block_pr
-@pytest.mark.parametrize("benchname", get_benchmark_names())
-def test_no_regression_relative_to_target_branch(benchname):
+class TestBenchMarks:
     """
-    Run the microbenchmarks in this repository, comparing results from pull
-    request target branch against what's achieved on HEAD
+    This class is used to prevent fixtures from being executed for each parameter in
+    a parametrized test.
     """
-    run_criterion = get_run_criterion(benchname)
-    compare_results = get_compare_results(benchname)
-    git_ab_test(run_criterion, compare_results)
+
+    @pytest.mark.no_block_pr
+    @pytest.mark.parametrize("benchname", get_benchmark_names())
+    def test_no_regression_relative_to_target_branch(
+        self, benchname, git_clone_ab_dirs_one_time
+    ):
+        """
+        Run the microbenchmarks in this repository, comparing results from pull
+        request target branch against what's achieved on HEAD
+        """
+
+        dir_a = git_clone_ab_dirs_one_time[0]
+        dir_b = git_clone_ab_dirs_one_time[1]
+        run_criterion = get_run_criterion(benchname)
+        compare_results = get_compare_results(benchname)
+
+        binary_ab_test(
+            test_runner=run_criterion,
+            comparator=compare_results,
+            a_directory=dir_a,
+            b_directory=dir_b,
+        )


 def get_run_criterion(benchmark_name) -> Callable[[Path, bool], Path]:
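
A note on the benchmark-name handling used throughout the series: `cargo bench ... --list` prints entries such as `page_fault: benchmark` or `page_fault #2: benchmark`, while criterion stores the corresponding baselines under directory names such as `page_fault` and `page_fault_2`, as the comments in the patches state. The small stand-alone sketch below illustrates that same `re.sub` normalization in isolation; the helper name `to_criterion_dir` and the group name `sub_id` are illustrative choices, not identifiers taken from the repository.

# Illustrative sketch only: reproduces the name normalization the patches apply to
# `cargo bench --list` output before looking up criterion baseline directories.
import re


def to_criterion_dir(list_line: str) -> str:
    """Map a `<name>: benchmark` line to the matching criterion directory name."""
    name = list_line.split(":")[0]
    # "page_fault #2" -> "page_fault_2"; names without a "#<n>" suffix are unchanged.
    return re.sub(r"\s#(?P<sub_id>[0-9]*)", r"_\g<sub_id>", name)


assert to_criterion_dir("page_fault: benchmark") == "page_fault"
assert to_criterion_dir("page_fault #2: benchmark") == "page_fault_2"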
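
The fixture change in PATCH 5 relies on one pytest behavior: calling `pytest.fixture()` on an existing generator function with `scope="class"` registers it as a fixture whose setup and teardown run once per test class rather than once per parametrized test case, which is what keeps the expensive git clones from being repeated. Below is a minimal, hypothetical sketch of that pattern with made-up names (`make_workdirs`, `TestExample`, and the parameter strings); only the `pytest.fixture(git_clone_ab_dirs, scope="class")` idiom itself is taken from the patch.

# Hypothetical, self-contained sketch of the class-scoped fixture pattern from PATCH 5.
import tempfile
from pathlib import Path
from typing import Generator, Tuple

import pytest


def make_workdirs() -> Generator[Tuple[Path, Path], None, None]:
    """Expensive setup, standing in for the two git clones in git_clone_ab_dirs."""
    with tempfile.TemporaryDirectory() as tmp:
        dir_a = Path(tmp) / "a"
        dir_b = Path(tmp) / "b"
        dir_a.mkdir()
        dir_b.mkdir()
        # Teardown (removing the temporary directory) runs after the last test in the class.
        yield dir_a, dir_b


# Registered with scope="class", the setup above runs once per test class,
# not once per parametrized test case.
workdirs_once_per_class = pytest.fixture(make_workdirs, scope="class")


class TestExample:
    """All parametrized cases below share a single invocation of the fixture."""

    @pytest.mark.parametrize("name", ["page_fault", "block_io", "net_rx"])
    def test_uses_shared_dirs(self, name, workdirs_once_per_class):
        dir_a, dir_b = workdirs_once_per_class
        assert dir_a.is_dir() and dir_b.is_dir()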