Add self-hosted runner and manual execution of performance tests #88

Merged · 22 commits · Sep 17, 2024

Commits

2a83cde
chore(ci): add performance tests workflow with manual trigger
cdunster Sep 6, 2024
330c4f9
chore(ci): set the tests reporter to be influx-file
cdunster Sep 9, 2024
602d90c
chore(ci): add more simple scenarios as steps
cdunster Sep 10, 2024
9b34405
chore(ci): add trycp_write_validated scenario
cdunster Sep 11, 2024
e61032c
chore(ci): update duration of performance tests to 120 seconds
cdunster Sep 11, 2024
1c9dce3
chore(ci): add final performance tests
cdunster Sep 12, 2024
0dc9b9b
chore(ci): use a job matrix instead of a single job
cdunster Sep 12, 2024
00b0d15
chore(ci): add extra-args option for scenarios that need it
cdunster Sep 12, 2024
35f278d
chore(ci): add missing local tests
cdunster Sep 12, 2024
46c0337
chore(ci): add TryCP performance tests
cdunster Sep 12, 2024
3d0e702
chore(ci): add printing of telegraf errors and warnings
cdunster Sep 12, 2024
0aaa1b9
chore(ci): get Run ID from logs and set as output and summary
cdunster Sep 13, 2024
7422a42
chore(ci): add step to upload logs as artifacts
cdunster Sep 13, 2024
d92d1e1
chore(ci): add link to logs in job summary
cdunster Sep 13, 2024
eb59f40
docs(ci): add comments about how to add additional scenarios
cdunster Sep 13, 2024
5f376c0
chore(ci): increase log level of TryCP tests to info
cdunster Sep 13, 2024
e53d598
refactor(ci): rename `test` job to `local-test`
cdunster Sep 16, 2024
899b26b
refactor(ci): rename steps that run the scenarios
cdunster Sep 16, 2024
acfee97
chore(ci): add script to flake to call telegraf as expected in the CI
cdunster Sep 16, 2024
e4d2501
docs(ci): remove out-dated TODO
cdunster Sep 16, 2024
d8ccaeb
style(ci): add whitespace between scenarios in list for readability
cdunster Sep 16, 2024
7411581
chore(ci): split the app_install scenario in two
cdunster Sep 16, 2024
152 changes: 152 additions & 0 deletions .github/workflows/performance.yaml
@@ -0,0 +1,152 @@
name: "Performance Tests"

on:
  workflow_dispatch:

env:
  INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }}
  WT_METRICS_DIR: "${{ github.workspace }}/telegraf/metrics"

jobs:
  local-test:
    runs-on: [self-hosted, wind-tunnel]
    strategy:
      fail-fast: false
      matrix:
        # To run a local test with default configuration, add the scenario name to this array.
        scenario: [ zome_call_single_value, single_write_many_read, write_read, write_query, local_signals, write_validated ]
        # To run a local test with additional configuration, add the scenario name and `extra-args` as an `include` item.
        include:
          - scenario: dht_sync_lag
            extra-args: "--agents 2 --behaviour write:1 --behaviour record_lag:1"

          # Test how long it takes to install a minimally-small hApp.
          - scenario: app_install
            extra-args: "--behaviour minimal:1"

          # Test how long it takes to install a large hApp.
          - scenario: app_install
            extra-args: "--behaviour large:1"

          - scenario: first_call
            extra-args: "--agents 1 --behaviour local:1"
    steps:
      - uses: actions/checkout@v4

      - name: Run - ${{ matrix.scenario }}
        id: run_test
        run: |
          # Start a sandbox conductor and run it in the background
          nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &"

          RUST_LOG=info nix run .#${{ matrix.scenario }} -- --connection-string ws://localhost:8888 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2)
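          # The process substitutions above tee stdout and stderr to log files while still streaming them to the job log.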

          pkill hc && pkill holochain && pkill lair-keystore

          RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/')
          echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT"
          echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY
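          # The scenario log contains a line like "#RunId: [<id>]"; the extracted id names the uploaded artifact and is linked in the job summary.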

      - name: Run Telegraf to upload influx metrics
        run: |
          if ! nix run .#ci-telegraf
          then
            echo "::group::Telegraf errors"
            status=1
            # Print errors as such in GitHub logs.
            grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::"
            echo "::endgroup::"
          fi

          echo "::group::Telegraf warnings"
          # Print warnings as such in GitHub logs.
          grep "W!" logs/telegraf-stderr.log | xargs -l echo "::warning ::"
          echo "::endgroup::"

          exit ${status-0}
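          # status is only set to 1 when the Telegraf run above failed, so warnings alone do not fail the step.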

      - name: Upload logs as artifacts
        if: success() || failure()
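        # Runs whether the previous steps succeeded or failed (but not when the job is cancelled), so logs from failed runs can still be inspected.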
        id: upload-artifact
        uses: actions/upload-artifact@v4
        with:
          name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}"
          path: |
            logs/scenario-stdout.log
            logs/scenario-stderr.log
            logs/telegraf-stdout.log
            logs/telegraf-stderr.log

      - name: Output Path to logs in summary
        run: |
          echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY

  trycp-test:
    runs-on: [self-hosted, wind-tunnel]
    strategy:
      fail-fast: false
      matrix:
        # To run a test with TryCP and default configuration, add the scenario name to this array.
        scenario: [ trycp_write_validated, remote_call_rate, validation_receipts ]
        # To run a test with TryCP and additional configuration, add the scenario name and `extra-args` as an `include` item.
        include:
          - scenario: two_party_countersigning
            extra-args: "--behaviour initiate:1 --behaviour participate:1"
    steps:
      - uses: actions/checkout@v4

      - name: Run - ${{ matrix.scenario }}
        id: run_test
        run: |
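          # Echo each command to the job log to make debugging easier.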
          set -x

          # Start local network services
          nix develop .#ci -c bash -c "hc-run-local-services --bootstrap-port 4422 --signal-port 4423 &"
          # Start a TryCP instance
          nix develop .#ci -c bash -c "source ./scripts/trycp.sh && start_trycp &"

          RUST_LOG=info CONDUCTOR_CONFIG="CI" TRYCP_RUST_LOG="info" MIN_PEERS=2 nix run .#${{ matrix.scenario }} -- --targets targets-ci.yaml --instances-per-target 2 --duration 120 --no-progress --reporter influx-file ${{ matrix.extra-args }} > >(tee logs/scenario-stdout.log) 2> >(tee logs/scenario-stderr.log >&2)

          # Stop the TryCP instance
          nix develop .#ci -c bash -c "source ./scripts/trycp.sh && stop_trycp"
          # Stop local network services
          pkill hc-run-local

          RUN_ID=$(grep -m1 "#RunId" logs/scenario-stdout.log | sed 's/#RunId: \[\(.\+\)\]/\1/')
          echo "RUN_ID=$RUN_ID" >> "$GITHUB_OUTPUT"
          echo "# Run ID: $RUN_ID" >> $GITHUB_STEP_SUMMARY

      - name: Run Telegraf to upload influx metrics
        run: |
          if ! nix run .#ci-telegraf
          then
            echo "::group::Telegraf errors"
            status=1
            # Print errors as such in GitHub logs.
            grep "E!" logs/telegraf-stderr.log | xargs -l echo "::error ::"
            echo "::endgroup::"
          fi

          echo "::group::Telegraf warnings"
          # Print warnings as such in GitHub logs.
          grep "W!" logs/telegraf-stderr.log | xargs -l echo "::warning ::"
          echo "::endgroup::"

          exit ${status-0}

      - name: Upload logs as artifacts
        if: success() || failure()
        id: upload-artifact
        uses: actions/upload-artifact@v4
        with:
          name: "logs_${{ matrix.scenario }}_${{ steps.run_test.outputs.RUN_ID }}"
          path: |
            logs/scenario-stdout.log
            logs/scenario-stderr.log
            logs/telegraf-stdout.log
            logs/telegraf-stderr.log
            logs/${{ steps.run_test.outputs.RUN_ID }}/

      - name: Output Path to logs in summary
        run: |
          echo "# Logs: [${{ steps.upload-artifact.outputs.artifact-id }}](${{ steps.upload-artifact.outputs.artifact-url }})" >> $GITHUB_STEP_SUMMARY
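As the comments in both `matrix` blocks above note, wiring in another scenario is a one-entry change. A rough sketch (the scenario name `my_scenario` and its `extra-args` are hypothetical placeholders, not scenarios from this PR):

```yaml
    strategy:
      fail-fast: false
      matrix:
        # Default configuration: append the scenario name to the array.
        scenario: [ zome_call_single_value, my_scenario ]
        # Additional configuration: add an `include` item with `extra-args` instead.
        include:
          - scenario: my_scenario
            extra-args: "--agents 2 --behaviour write:1"
```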
1 change: 0 additions & 1 deletion .github/workflows/test.yaml
@@ -72,7 +72,6 @@ jobs:
          # Start a sandbox conductor and run it in the background
          nix develop .#ci -c bash -c "hc s clean && echo "1234" | hc s --piped create && echo "1234" | hc s --piped -f 8888 run &"

          # TODO using `localhost` is resolving to an IPv6 address, but why is that giving a connection refused?
          # Run the scenario for 5 seconds
          RUST_LOG=info nix run .#zome_call_single_value -- --connection-string ws://localhost:8888 --duration 5 --no-progress

5 changes: 5 additions & 0 deletions flake.nix
@@ -111,6 +111,11 @@
packages = {
  default = config.workspace.workspace;
  inherit (config.workspace) workspace;
  ci-telegraf = pkgs.writeShellApplication {
    name = "ci-telegraf";
    runtimeInputs = [ pkgs.telegraf ];
    text = "telegraf --config telegraf/runner-telegraf.conf --once > >(tee logs/telegraf-stdout.log) 2> >(tee logs/telegraf-stderr.log >&2)";
  };
};

checks = {
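The `ci-telegraf` package added above can also be run outside CI for debugging. A minimal sketch, assuming the repo root as the working directory and a placeholder token; the script tees into `logs/` and the config reads `WT_METRICS_DIR` and `INFLUX_TOKEN`:

```sh
mkdir -p logs telegraf/metrics
export INFLUX_TOKEN="<influx-token>"              # placeholder, not a real token
export WT_METRICS_DIR="$PWD/telegraf/metrics"     # read by runner-telegraf.conf
nix run .#ci-telegraf
```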
216 changes: 216 additions & 0 deletions telegraf/runner-telegraf.conf
@@ -0,0 +1,216 @@
# Telegraf Configuration
#
# Telegraf is entirely plugin driven. All metrics are gathered from the
# declared inputs, and sent to the declared outputs.
#
# Plugins must be declared in here to be active.
# To deactivate a plugin, comment out the name and any variables.
#
# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
# file would generate.
#
# Environment variables can be used anywhere in this config file, simply surround
# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"),
# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR})


# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "10s"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = 1000

## Maximum number of unwritten metrics per output. Increasing this value
## allows for longer periods of output downtime without dropping metrics at the
## cost of higher maximum memory usage.
metric_buffer_limit = 1000000

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "0s"

## Collection offset is used to shift the collection by the given amount.
## This can be used to avoid many plugins querying constrained devices
## at the same time by manually scheduling them in time.
# collection_offset = "0s"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "60s"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "0s"

## Collected metrics are rounded to the precision specified. Precision is
## specified as an interval with an integer + unit (e.g. 0s, 10ms, 2us, 4s).
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
##
## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s:
## ie, when interval = "10s", precision will be "1s"
## when interval = "250ms", precision will be "1ms"
##
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
precision = "0s"

## Log at debug level.
# debug = false
## Log only error level messages.
# quiet = false

## Log target controls the destination for logs and can be one of "file",
## "stderr" or, on Windows, "eventlog". When set to "file", the output file
## is determined by the "logfile" setting.
# logtarget = "file"

## Name of the file to be logged to when using the "file" logtarget. If set to
## the empty string then logs are written to stderr.
# logfile = ""

## The logfile will be rotated after the time interval specified. When set
## to 0 no time based rotation is performed. Logs are rotated only when
## written to; if there is no log activity, rotation may be delayed.
# logfile_rotation_interval = "0h"

## The logfile will be rotated when it becomes larger than the specified
## size. When set to 0 no size based rotation is performed.
# logfile_rotation_max_size = "0MB"

## Maximum number of rotated archives to keep; any older logs are deleted.
## If set to -1, no archives are removed.
# logfile_rotation_max_archives = 5

## Pick a timezone to use when logging or type 'local' for local time.
## Example: America/Chicago
# log_with_timezone = ""

## Override default hostname, if empty use os.Hostname()
hostname = ""
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false

## Method of translating SNMP objects. Can be "netsnmp" (deprecated) which
## translates by calling external programs snmptranslate and snmptable,
## or "gosmi" which translates using the built-in gosmi library.
# snmp_translator = "netsnmp"

## Name of the file to load the state of plugins from and store the state to.
## If uncommented and not empty, this file will be used to save the state of
## stateful plugins on termination of Telegraf. If the file exists on start,
## the state in the file will be restored for the plugins.
# statefile = ""


###############################################################################
# OUTPUT PLUGINS #
###############################################################################


# Configuration for sending metrics to InfluxDB 2.0
[[outputs.influxdb_v2]]
## The URLs of the InfluxDB cluster nodes.
##
## Multiple URLs can be specified for a single cluster; only ONE of the
## URLs will be written to each interval.
## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
urls = ["https://ifdb.holochain.org"]

## Token for authentication.
token = "${INFLUX_TOKEN}"

## Organization is the name of the organization you wish to write to.
organization = "holo"

## Destination bucket to write into.
bucket = "windtunnel"

## The value of this tag will be used to determine the bucket. If this
## tag is not set the 'bucket' option is used as the default.
# bucket_tag = ""

## If true, the bucket tag will not be added to the metric.
# exclude_bucket_tag = false

## Timeout for HTTP messages.
# timeout = "5s"

## Additional HTTP headers
# http_headers = {"X-Special-Header" = "Special-Value"}

## HTTP Proxy override; if unset, the standard proxy environment variables
## are consulted to determine which proxy, if any, should be used.
# http_proxy = "http://corporate.proxy:3128"

## HTTP User-Agent
# user_agent = "telegraf"

## Content-Encoding for write request body, can be set to "gzip" to
## compress body or "identity" to apply no encoding.
# content_encoding = "gzip"

## Enable or disable uint support for writing uints to InfluxDB 2.0.
# influx_uint_support = false

## HTTP/2 Timeouts
## The following values control the HTTP/2 client's timeouts. These settings
## are generally not required unless a user is seeing issues with client
## disconnects. If a user does see issues, then it is suggested to set these
## values to "15s" for ping timeout and "30s" for read idle timeout and
## retry.
##
## Note that the timer for read_idle_timeout begins at the end of the last
## successful write and not at the beginning of the next write.
# ping_timeout = "0s"
# read_idle_timeout = "0s"

## Optional TLS Config for use on HTTP connections.
# tls_ca = "/etc/telegraf/ca.pem"
# tls_cert = "/etc/telegraf/cert.pem"
# tls_key = "/etc/telegraf/key.pem"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false


###############################################################################
# INPUT PLUGINS #
###############################################################################


[[inputs.file]]
## Files to parse each interval. Accept standard unix glob matching rules,
## as well as ** to match recursive files and directories.
files = ["${WT_METRICS_DIR}/*.influx"]
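# Note: the performance workflow sets WT_METRICS_DIR to <workspace>/telegraf/metrics; the scenarios' influx-file reporter is expected to write its metric files there.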

## Character encoding to use when interpreting the file contents. Invalid
## characters are replaced using the unicode replacement character. When set
## to the empty string the data is not decoded to text.
## ex: character_encoding = "utf-8"
## character_encoding = "utf-16le"
## character_encoding = "utf-16be"
## character_encoding = ""
character_encoding = "utf-8"

## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "influx"


## Name a tag containing the name of the file the data was parsed from. Leave empty
## to disable. Be cautious when file name variation is high, as this can increase the cardinality
## significantly. Read more about cardinality here:
## https://docs.influxdata.com/influxdb/cloud/reference/glossary/#series-cardinality
# file_tag = ""
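Per the header comment in this file, the config can be dry-run locally to see which metrics it would gather without writing anything to InfluxDB. A sketch, assuming a dummy token and that some `*.influx` files already exist under the metrics directory:

```sh
export INFLUX_TOKEN="dummy"
export WT_METRICS_DIR="$PWD/telegraf/metrics"
telegraf --config telegraf/runner-telegraf.conf --test
```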