From 6ec90ba527cda43c742eff66cdb9cfc33e608c99 Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Mon, 18 Mar 2024 11:50:45 -0700 Subject: [PATCH 1/2] ci: Enable collection of logs and metrics --- .../workflows/notify-metrics-availability.sh | 19 +++ .github/workflows/tests.yml | 62 ++++++++- scripts/run_prometheus.sh | 120 ++++++++++++++++++ scripts/run_promtail.sh | 115 +++++++++++++++++ tests/README.md | 8 ++ tests/load/load_test.go | 1 + tests/utils/tmpnet.go | 3 +- tests/warp/warp_test.go | 1 + 8 files changed, 325 insertions(+), 4 deletions(-) create mode 100755 .github/workflows/notify-metrics-availability.sh create mode 100755 scripts/run_prometheus.sh create mode 100755 scripts/run_promtail.sh diff --git a/.github/workflows/notify-metrics-availability.sh b/.github/workflows/notify-metrics-availability.sh new file mode 100755 index 0000000000..fd69064045 --- /dev/null +++ b/.github/workflows/notify-metrics-availability.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Timestamps are in seconds +from_timestamp="$(date '+%s')" +monitoring_period=900 # 15 minutes +to_timestamp="$((from_timestamp + monitoring_period))" + +# Grafana expects milliseconds, so pad timestamps with 3 zeros +metrics_url="${GRAFANA_URL}&var-filter=gh_job_id%7C%3D%7C${GH_JOB_ID}&from=${from_timestamp}000&to=${to_timestamp}000" + +# Optionally ensure that the link displays metrics only for the shared +# network rather than mixing it with the results for private networks.
+if [[ -n "${FILTER_BY_OWNER:-}" ]]; then + metrics_url="${metrics_url}&var-filter=network_owner%7C%3D%7C${FILTER_BY_OWNER}" +fi + +echo "::notice links::metrics ${metrics_url}" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 686e09f893..03b64e8986 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,8 +9,8 @@ on: pull_request: env: - tmpnet_data_path: ~/.tmpnet/networks min_go_version: '~1.21.7' + grafana_url: https://grafana-experimental.avax-dev.network/d/kBQpRdWnk/avalanche-main-dashboard?orgId=1&refresh=10s&var-filter=is_ephemeral_node%7C%3D%7Cfalse&var-filter=gh_repo%7C%3D%7Cava-labs%2Fsubnet-evm&var-filter=gh_run_id%7C%3D%7C${{ github.run_id }}&var-filter=gh_run_attempt%7C%3D%7C${{ github.run_attempt }} jobs: lint_test: @@ -126,15 +126,43 @@ jobs: - name: Build Subnet-EVM Plugin Binary shell: bash run: ./scripts/build.sh /tmp/e2e-test/avalanchego/plugins/srEXiWaHuhNyGwPUi444Tu47ZEDwxTWrbQiuD7FmgSAQ6X7Dy + - name: Start prometheus + shell: bash + run: bash -x ./scripts/run_prometheus.sh + env: + PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }} + PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }} + - name: Start promtail + shell: bash + run: bash -x ./scripts/run_promtail.sh + env: + LOKI_ID: ${{ secrets.LOKI_ID }} + LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }} + - name: Notify of metrics availability + shell: bash + run: .github/workflows/notify-metrics-availability.sh + env: + GRAFANA_URL: ${{ env.grafana_url }} + GH_JOB_ID: ${{ github.job }} - name: Run Warp E2E Tests shell: bash run: AVALANCHEGO_BUILD_PATH=/tmp/e2e-test/avalanchego ./scripts/run_ginkgo_warp.sh + env: + GH_REPO: ${{ github.repository }} + GH_WORKFLOW: ${{ github.workflow }} + GH_RUN_ID: ${{ github.run_id }} + GH_RUN_NUMBER: ${{ github.run_number }} + GH_RUN_ATTEMPT: ${{ github.run_attempt }} + GH_JOB_ID: ${{ github.job }} - name: Upload tmpnet network dir for warp testing if: always() uses: actions/upload-artifact@v4 with: name: 
warp-tmpnet-data - path: ${{ env.tmpnet_data_path }} + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log if-no-files-found: error e2e_load: name: e2e load tests @@ -155,15 +183,43 @@ jobs: - name: Build Subnet-EVM Plugin Binary shell: bash run: ./scripts/build.sh /tmp/e2e-test/avalanchego/plugins/srEXiWaHuhNyGwPUi444Tu47ZEDwxTWrbQiuD7FmgSAQ6X7Dy + - name: Start prometheus + shell: bash + run: bash -x ./scripts/run_prometheus.sh + env: + PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }} + PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }} + - name: Start promtail + shell: bash + run: bash -x ./scripts/run_promtail.sh + env: + LOKI_ID: ${{ secrets.LOKI_ID }} + LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }} + - name: Notify of metrics availability + shell: bash + run: .github/workflows/notify-metrics-availability.sh + env: + GRAFANA_URL: ${{ env.grafana_url }} + GH_JOB_ID: ${{ github.job }} - name: Run E2E Load Tests shell: bash run: AVALANCHEGO_BUILD_PATH=/tmp/e2e-test/avalanchego ./scripts/run_ginkgo_load.sh + env: + GH_REPO: ${{ github.repository }} + GH_WORKFLOW: ${{ github.workflow }} + GH_RUN_ID: ${{ github.run_id }} + GH_RUN_NUMBER: ${{ github.run_number }} + GH_RUN_ATTEMPT: ${{ github.run_attempt }} + GH_JOB_ID: ${{ github.job }} - name: Upload tmpnet network dir for load testing if: always() uses: actions/upload-artifact@v4 with: name: load-tmpnet-data - path: ${{ env.tmpnet_data_path }} + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log if-no-files-found: error mock_gen: name: MockGen Check diff --git a/scripts/run_prometheus.sh b/scripts/run_prometheus.sh new file mode 100755 index 0000000000..1952227231 --- /dev/null +++ b/scripts/run_prometheus.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Starts a prometheus instance in agent-mode, forwarding to a central +# instance. 
Intended to enable metrics collection from temporary networks running +# locally and in CI. +# +# The prometheus instance will remain running in the background and will forward +# metrics to the central instance for all tmpnet networks. +# +# To stop it: +# +# $ kill -9 `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid +# + +# e.g., +# PROMETHEUS_ID= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh +if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then + echo "must be run from repository root" + exit 255 +fi + +PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus" +PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid + +# First check if an agent-mode prometheus is already running. A single instance can collect +# metrics from all local temporary networks. +if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then + echo "prometheus is already running locally with --enable-feature=agent" + exit 0 +fi + +PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-experimental.avax-dev.network}" +if [[ -z "${PROMETHEUS_URL}" ]]; then + echo "Please provide a value for PROMETHEUS_URL" + exit 1 +fi + +PROMETHEUS_ID="${PROMETHEUS_ID:-}" +if [[ -z "${PROMETHEUS_ID}" ]]; then + echo "Please provide a value for PROMETHEUS_ID" + exit 1 +fi + +PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}" +if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then + echo "Plase provide a value for PROMETHEUS_PASSWORD" + exit 1 +fi + +# This was the LTS version when this script was written. Probably not +# much reason to update it unless something breaks since the usage +# here is only to collect metrics from temporary networks. +VERSION="2.45.3" + +# Ensure the prometheus command is locally available +CMD=prometheus +if ! command -v "${CMD}" &> /dev/null; then + # Try to use a local version + CMD="${PWD}/bin/prometheus" + if ! command -v "${CMD}" &> /dev/null; then + echo "prometheus not found, attempting to install..." 
+ + # Determine the arch + if which sw_vers &> /dev/null; then + echo "on macos, only amd64 binaries are available so rosetta is required on apple silicon machines." + echo "to avoid using rosetta, install via homebrew: brew install prometheus" + DIST=darwin + else + ARCH="$(uname -i)" + if [[ "${ARCH}" != "x86_64" ]]; then + echo "on linux, only amd64 binaries are available. manual installation of prometheus is required." + exit 1 + else + DIST="linux" + fi + fi + + # Install the specified release + PROMETHEUS_FILE="prometheus-${VERSION}.${DIST}-amd64" + URL="https://github.com/prometheus/prometheus/releases/download/v${VERSION}/${PROMETHEUS_FILE}.tar.gz" + curl -s -L "${URL}" | tar zxv -C /tmp > /dev/null + mkdir -p "$(dirname "${CMD}")" + cp /tmp/"${PROMETHEUS_FILE}/prometheus" "${CMD}" + fi +fi + +# Configure prometheus +FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs" +mkdir -p "${FILE_SD_PATH}" + +echo "writing configuration..." +cat >"${PROMETHEUS_WORKING_DIR}"/prometheus.yaml < prometheus.log 2>&1 & +echo $! > "${PIDFILE}" +echo "running with pid $(cat "${PIDFILE}")" diff --git a/scripts/run_promtail.sh b/scripts/run_promtail.sh new file mode 100755 index 0000000000..9b386d3d55 --- /dev/null +++ b/scripts/run_promtail.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Starts a promtail instance to collect logs from temporary networks +# running locally and in CI. +# +# The promtail instance will remain running in the background and will forward +# logs to the central instance for all tmpnet networks. +# +# To stop it: +# +# $ kill -9 `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid +# + +# e.g., +# LOKI_ID= LOKI_PASSWORD= ./scripts/run_promtail.sh +if ! [[ "$0" =~ scripts/run_promtail.sh ]]; then + echo "must be run from repository root" + exit 255 +fi + +PROMTAIL_WORKING_DIR="${HOME}/.tmpnet/promtail" +PIDFILE="${PROMTAIL_WORKING_DIR}"/run.pid + +# First check if promtail is already running. 
A single instance can +# collect logs from all local temporary networks. +if pgrep --pidfile="${PIDFILE}" &> /dev/null; then + echo "promtail is already running" + exit 0 +fi + +LOKI_URL="${LOKI_URL:-https://loki-experimental.avax-dev.network}" +if [[ -z "${LOKI_URL}" ]]; then + echo "Please provide a value for LOKI_URL" + exit 1 +fi + +LOKI_ID="${LOKI_ID:-}" +if [[ -z "${LOKI_ID}" ]]; then + echo "Please provide a value for LOKI_ID" + exit 1 +fi + +LOKI_PASSWORD="${LOKI_PASSWORD:-}" +if [[ -z "${LOKI_PASSWORD}" ]]; then + echo "Plase provide a value for LOKI_PASSWORD" + exit 1 +fi + +# Version as of this writing +VERSION="v2.9.5" + +# Ensure the promtail command is locally available +CMD=promtail +if ! command -v "${CMD}" &> /dev/null; then + # Try to use a local version + CMD="${PWD}/bin/promtail" + if ! command -v "${CMD}" &> /dev/null; then + echo "promtail not found, attempting to install..." + # Determine the arch + if which sw_vers &> /dev/null; then + DIST="darwin-$(uname -m)" + else + ARCH="$(uname -i)" + if [[ "${ARCH}" == "aarch64" ]]; then + ARCH="arm64" + elif [[ "${ARCH}" == "x86_64" ]]; then + ARCH="amd64" + fi + DIST="linux-${ARCH}" + fi + + # Install the specified release + PROMTAIL_FILE="promtail-${DIST}" + ZIP_PATH="/tmp/${PROMTAIL_FILE}.zip" + BIN_DIR="$(dirname "${CMD}")" + URL="https://github.com/grafana/loki/releases/download/${VERSION}/promtail-${DIST}.zip" + curl -L -o "${ZIP_PATH}" "${URL}" + unzip "${ZIP_PATH}" -d "${BIN_DIR}" + mv "${BIN_DIR}/${PROMTAIL_FILE}" "${CMD}" + fi +fi + +# Configure promtail +FILE_SD_PATH="${PROMTAIL_WORKING_DIR}/file_sd_configs" +mkdir -p "${FILE_SD_PATH}" + +echo "writing configuration..." +cat >"${PROMTAIL_WORKING_DIR}"/promtail.yaml < promtail.log 2>&1 & +echo $! 
> "${PIDFILE}" +echo "running with pid $(cat "${PIDFILE}")" diff --git a/tests/README.md b/tests/README.md index a99cac0b6f..0b0cf9d6d4 100644 --- a/tests/README.md +++ b/tests/README.md @@ -29,3 +29,11 @@ The network started by `tmpnetctl` won't come with subnets configured, so the test suite will add them to the network the first time it runs. Subsequent test runs will be able to reuse those subnets without having to set them up. + +## Collection of logs and metrics + +Logs and metrics can be optionally collected for tmpnet networks and +viewed in grafana. The details of configuration and usage for +subnet-evm mirror those of avalanchego and the same +[documentation](https://github.com/ava-labs/avalanchego/blob/master/tests/fixture/tmpnet/README.md#Monitoring) +applies. diff --git a/tests/load/load_test.go b/tests/load/load_test.go index 5a2b5c21e5..99094291ce 100644 --- a/tests/load/load_test.go +++ b/tests/load/load_test.go @@ -68,6 +68,7 @@ var _ = ginkgo.Describe("[Load Simulator]", ginkgo.Ordered, func() { env = e2e.NewTestEnvironment( flagVars, utils.NewTmpnetNetwork( + "subnet-evm-small-load", nodes, tmpnet.FlagsMap{ // The default tmpnet log level (debug) induces too much overhead for load testing. 
diff --git a/tests/utils/tmpnet.go b/tests/utils/tmpnet.go index af1e24908a..babb70bc58 100644 --- a/tests/utils/tmpnet.go +++ b/tests/utils/tmpnet.go @@ -24,7 +24,7 @@ func NewTmpnetNodes(count int) []*tmpnet.Node { return nodes } -func NewTmpnetNetwork(nodes []*tmpnet.Node, flags tmpnet.FlagsMap, subnets ...*tmpnet.Subnet) *tmpnet.Network { +func NewTmpnetNetwork(owner string, nodes []*tmpnet.Node, flags tmpnet.FlagsMap, subnets ...*tmpnet.Subnet) *tmpnet.Network { defaultFlags := tmpnet.FlagsMap{} defaultFlags.SetDefaults(flags) defaultFlags.SetDefaults(tmpnet.FlagsMap{ @@ -33,6 +33,7 @@ func NewTmpnetNetwork(nodes []*tmpnet.Node, flags tmpnet.FlagsMap, subnets ...*t config.ProposerVMUseCurrentHeightKey: true, }) return &tmpnet.Network{ + Owner: owner, DefaultFlags: defaultFlags, Nodes: nodes, Subnets: subnets, diff --git a/tests/warp/warp_test.go b/tests/warp/warp_test.go index 3475aecbc8..df97f1c9c5 100644 --- a/tests/warp/warp_test.go +++ b/tests/warp/warp_test.go @@ -104,6 +104,7 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { env := e2e.NewTestEnvironment( flagVars, utils.NewTmpnetNetwork( + "subnet-evm-warp-e2e", nodes, tmpnet.FlagsMap{}, utils.NewTmpnetSubnet(subnetAName, genesisPath, chainConfig, nodes...), From c507aa6072e814cbff649ce817e343ca47270eec Mon Sep 17 00:00:00 2001 From: Maru Newby Date: Tue, 26 Mar 2024 01:44:21 -0700 Subject: [PATCH 2/2] fixup: Cleanup log configuration for load and warp testing --- tests/load/load_test.go | 12 ++---------- tests/utils/tmpnet.go | 7 +++++-- tests/warp/warp_test.go | 9 ++------- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/tests/load/load_test.go b/tests/load/load_test.go index 99094291ce..5ab4da1223 100644 --- a/tests/load/load_test.go +++ b/tests/load/load_test.go @@ -19,7 +19,6 @@ import ( "github.com/ethereum/go-ethereum/log" - "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/ids" "github.com/ava-labs/avalanchego/tests/fixture/e2e" 
"github.com/ava-labs/avalanchego/tests/fixture/tmpnet" @@ -59,10 +58,6 @@ var _ = ginkgo.Describe("[Load Simulator]", ginkgo.Ordered, func() { ginkgo.BeforeAll(func() { genesisPath := filepath.Join(repoRootPath, "tests/load/genesis/genesis.json") - // The load tests are flaky at high levels of evm logging, so leave it at - // the default level instead of raising it to debug (as the warp testing does). - chainConfig := tmpnet.FlagsMap{} - nodes := utils.NewTmpnetNodes(nodeCount) env = e2e.NewTestEnvironment( @@ -70,11 +65,8 @@ var _ = ginkgo.Describe("[Load Simulator]", ginkgo.Ordered, func() { utils.NewTmpnetNetwork( "subnet-evm-small-load", nodes, - tmpnet.FlagsMap{ - // The default tmpnet log level (debug) induces too much overhead for load testing. - config.LogLevelKey: "info", - }, - utils.NewTmpnetSubnet(subnetAName, genesisPath, chainConfig, nodes...), + tmpnet.FlagsMap{}, + utils.NewTmpnetSubnet(subnetAName, genesisPath, utils.DefaultChainConfig, nodes...), ), ) }) diff --git a/tests/utils/tmpnet.go b/tests/utils/tmpnet.go index babb70bc58..5c5258e366 100644 --- a/tests/utils/tmpnet.go +++ b/tests/utils/tmpnet.go @@ -14,6 +14,11 @@ import ( "github.com/ava-labs/subnet-evm/plugin/evm" ) +var DefaultChainConfig = tmpnet.FlagsMap{ + "log-level": "debug", + "warp-api-enabled": true, +} + func NewTmpnetNodes(count int) []*tmpnet.Node { nodes := make([]*tmpnet.Node, count) for i := range nodes { @@ -28,8 +33,6 @@ func NewTmpnetNetwork(owner string, nodes []*tmpnet.Node, flags tmpnet.FlagsMap, defaultFlags := tmpnet.FlagsMap{} defaultFlags.SetDefaults(flags) defaultFlags.SetDefaults(tmpnet.FlagsMap{ - // Remove when vendored tmpnet default is `off`. tmpnet nodes are run headless so stdout logging is unnecessary. 
- config.LogDisplayLevelKey: "off", config.ProposerVMUseCurrentHeightKey: true, }) return &tmpnet.Network{ diff --git a/tests/warp/warp_test.go b/tests/warp/warp_test.go index df97f1c9c5..5bfa82f7f1 100644 --- a/tests/warp/warp_test.go +++ b/tests/warp/warp_test.go @@ -94,11 +94,6 @@ func TestE2E(t *testing.T) { var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { // Run only once in the first ginkgo process - chainConfig := tmpnet.FlagsMap{ - "log-level": "debug", - "warp-api-enabled": true, - } - nodes := utils.NewTmpnetNodes(tmpnet.DefaultNodeCount) env := e2e.NewTestEnvironment( @@ -107,8 +102,8 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { "subnet-evm-warp-e2e", nodes, tmpnet.FlagsMap{}, - utils.NewTmpnetSubnet(subnetAName, genesisPath, chainConfig, nodes...), - utils.NewTmpnetSubnet(subnetBName, genesisPath, chainConfig, nodes...), + utils.NewTmpnetSubnet(subnetAName, genesisPath, utils.DefaultChainConfig, nodes...), + utils.NewTmpnetSubnet(subnetBName, genesisPath, utils.DefaultChainConfig, nodes...), ), )