Commit

[infrastructure] add initial Locust load testing scripts (#4160)
freemvmt authored Jan 24, 2025
1 parent 88e9b72 commit 1c3b7a8
Showing 10 changed files with 932 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .gitignore
@@ -21,8 +21,12 @@ hasura.planx.uk/.env.test
/playwright-report/
/playwright/.cache/
api.planx.uk/tmp/

# Python
.python-version
__pycache__
.venv/
.ruff_cache/

# Ignore certificate files
**/*.chain
38 changes: 38 additions & 0 deletions infrastructure/performance/README.md
@@ -0,0 +1,38 @@
# Performance

## Load testing with Locust

This directory contains Python scripts for load testing using [Locust](https://locust.io/) ([docs](https://docs.locust.io/en/stable/)).

### Setup

We use `uv` to manage dependencies for this project. If you aren't already familiar with this Python project manager, [get set up](https://docs.astral.sh/uv/).

Then:
- run `uv sync` (`pyproject.toml` and `uv.lock` together completely determine the setup)
- run `source .venv/bin/activate` to [activate the virtual environment](https://docs.astral.sh/uv/pip/environments/#using-a-virtual-environment)

### Usage

The `run_locust.sh` script encodes some sensible assumptions and does the heavy lifting to make it easy to run load tests.

Note that it assumes your machine has 8 cores, each of which can handle a workload of ~300 users. You can check your core count with `lscpu` on Linux (or `sysctl -n hw.ncpu` on macOS), and adjust the script accordingly. The users-per-core figure will also vary depending on the workload you're running, so feel free to play around with it (keep an eye on the 'Workers' tab in the Locust GUI to track CPU usage).
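For example (`LOCAL_CORES` and `USERS_PER_CORE` are the constants defined at the top of `run_locust.sh`; the commands below are standard system utilities, shown here as a sketch):

```sh
# count the cores available on this machine
lscpu | grep "^CPU(s):"    # Linux
sysctl -n hw.ncpu          # macOS

# then adjust these constants in run_locust.sh to suit your machine and workload
LOCAL_CORES=8
USERS_PER_CORE=300
```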

As an example, the following command will simulate 500 users hitting PlanX staging (`editor.planx.dev`) with a series of requests to Hasura's GraphQL endpoint every 10 seconds (after a period of ramping up):

```sh
./run_locust.sh test_hasura.py 500 staging
```

Then find the Locust GUI at `http://localhost:8089/`.

### Development

`base_workload.py` provides the `OpenWorkloadBase` class, which all the `test_*.py` scripts inherit from. Any new workload should follow the same pattern, as sketched below.
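For example, a new workload for some other service might look like this (a minimal sketch modelled on `test_splash.py` - the host values and environment variable here are placeholders, not real config):

```python
import os

from locust import task

from base_workload import OpenWorkloadBase
from utils import get_target_host


# placeholder hosts for illustration only - point these at the service under test
HOST_BY_ENV = {
    "local": os.getenv("SOME_SERVICE_URL", "http://localhost:1234"),
    "staging": "https://some-service.example.com",
}


class ExampleWorkload(OpenWorkloadBase):
    host = get_target_host(HOST_BY_ENV)

    # every simulated user runs each @task roughly once per wait_time interval
    @task
    def get_root(self):
        self.client.get("/")
```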

Also note that this project uses [ruff](https://docs.astral.sh/ruff/) for linting and formatting, so before pushing up changes (and with the venv activated), run the following:

```sh
ruff check
ruff format
```
34 changes: 34 additions & 0 deletions infrastructure/performance/base_workload.py
@@ -0,0 +1,34 @@
from locust import (
    constant_pacing,
    FastHttpUser,
    stats,
)


# we want the double & triple nine percentiles to be reported in the chart and statistics (already in csv)
stats.PERCENTILES_TO_CHART = (0.5, 0.95, 0.99, 0.999)
stats.PERCENTILES_TO_STATISTICS = (0.95, 0.99, 0.999)

# by default, we attempt to have each user run a task every second (or as fast as possible if latency is greater)
# this means that user count will correspond roughly to request rate (assuming most tasks emit 1 request)
TASK_INVOCATION_RATE_SECONDS = 1


class OpenWorkloadBase(FastHttpUser):
    # this is a base class, intended to be subclassed for each test workload
    abstract = True
    wait_time = constant_pacing(TASK_INVOCATION_RATE_SECONDS)
    default_headers = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "en-GB,en;q=0.9",
        "cache-control": "no-cache",  # locust has no cache
        "pragma": "no-cache",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    }

    def on_start(self):
        pass

    def on_stop(self):
        pass
41 changes: 41 additions & 0 deletions infrastructure/performance/pyproject.toml
@@ -0,0 +1,41 @@
[project]
name = "performance"
version = "0.1.0"
description = "Load testing scripts for PlanX infra"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "blinker==1.8.2",
    "brotli==1.1.0",
    "certifi==2024.8.30",
    "charset-normalizer==3.4.0",
    "click==8.1.7",
    "configargparse==1.7",
    "flask==3.0.3",
    "flask-cors==5.0.0",
    "flask-login==0.6.3",
    "gevent==24.10.3",
    "geventhttpclient==2.3.1",
    "greenlet==3.1.1",
    "har2locust==0.9.3",
    "idna==3.10",
    "itsdangerous==2.2.0",
    "jinja2==3.1.4",
    "locust==2.32.1",
    "locust-plugins==4.5.3",
    "markupsafe==3.0.2",
    "msgpack==1.1.0",
    "psutil==6.1.0",
    "pyzmq==26.2.0",
    "requests==2.32.3",
    "ruff==0.7.4",
    "setuptools==75.3.0",
    "typing-extensions==4.12.2",
    "urllib3==2.2.3",
    "werkzeug==3.1.2",
    "zope-event==5.0",
    "zope-interface==7.1.1",
]

[tool.ruff]
indent-width=2
38 changes: 38 additions & 0 deletions infrastructure/performance/run_locust.sh
@@ -0,0 +1,38 @@
#!/usr/bin/env bash

# grab filename and number of users from command line args
LOCUSTFILE=$1
USERS=$2

# export env for scripts to reference (accepts local/staging, but not production)
if [ -z "$3" ]; then
  echo "No environment passed in, assuming local"
  export TARGET_ENV=local
else
  echo "Setting target environment as: $3"
  export TARGET_ENV=$3
fi

# this script assumes your machine has 8 cores, each of which can handle ~ 300 users (will depend on workload)
# check core count with lscpu, test and adjust constants accordingly for your use case
LOCAL_CORES=8
USERS_PER_CORE=300

# get the ceiling of division, rather than floor (i.e. lean towards more workers)
PROCESSES=$((($USERS + $USERS_PER_CORE - 1) / $USERS_PER_CORE))
PROCESSES=$(($PROCESSES > 0 ? $PROCESSES : 1))
WORKERS=$PROCESSES
if [ $WORKERS -gt $LOCAL_CORES ]; then
  PROCESSES=-1
  WORKERS=$LOCAL_CORES
fi

# we keep spawn rate relatively low to avoid overwhelming CPUs during ramp up
SPAWN_RATE=5

# run load test for thrice as long as total ramp up time (or 5 minutes, whichever is higher)
RUN_TIME_SECONDS=$((($USERS / $SPAWN_RATE) * 3))
RUN_TIME_SECONDS=$(($RUN_TIME_SECONDS > 300 ? $RUN_TIME_SECONDS : 300))

echo "Running $LOCUSTFILE load test across $WORKERS workers (i.e. CPUs)"
python -m locust --locustfile $LOCUSTFILE --users $USERS --spawn-rate $SPAWN_RATE --run-time $RUN_TIME_SECONDS --processes $PROCESSES --autostart
143 changes: 143 additions & 0 deletions infrastructure/performance/test_hasura.py
@@ -0,0 +1,143 @@
import os
import random
import time

from locust import (
    constant_pacing,
    task,
)

from base_workload import OpenWorkloadBase
from utils import (
    get_nested_key,
    get_target_host,
)


TASK_INVOCATION_RATE_SECONDS = 10
HOST_BY_ENV = {
    "local": os.getenv("HASURA_GRAPHQL_URL", "http://localhost:7100"),
    "staging": "https://hasura.editor.planx.dev",
}
HASURA_GRAPHQL_ENDPOINT = "/v1/graphql"


class HasuraWorkload(OpenWorkloadBase):
    wait_time = constant_pacing(TASK_INVOCATION_RATE_SECONDS)
    host = get_target_host(HOST_BY_ENV)

    @task
    def get_random_flow_metadata(self) -> None:
        # first we simulate hitting the splash page (for a logged in user), where teams are listed
        teams = None
        with self.rest(
            "POST",
            HASURA_GRAPHQL_ENDPOINT,
            name="GetTeams",
            json={
                "operationName": "GetTeams",
                "query": """
                    query GetTeams {
                      teams(order_by: {name: asc}) {
                        id
                        name
                      }
                    }
                """,
            },
        ) as resp:
            teams = get_nested_key(resp.js, "data", "teams")
        # we choose a team at random to fetch flows for, as if to display the list of services
        team_id = random.choice(teams)["id"]
        # then we sleep for a bit to simulate the user choosing from the list of teams
        time.sleep(2)

        flows = None
        with self.rest(
            "POST",
            HASURA_GRAPHQL_ENDPOINT,
            name="GetFlows",
            json={
                "operationName": "GetFlows",
                "variables": {"team_id": team_id},
                "query": """
                    query GetFlows($team_id: Int!) {
                      flows(where: {team_id: {_eq: $team_id}}) {
                        id
                        name
                        slug
                        updated_at
                      }
                    }
                """,
            },
        ) as resp:
            flows = get_nested_key(resp.js, "data", "flows")
        if not flows:
            return
        # now we choose a random flow from that team to get more information about
        flow_slug = random.choice(flows)["slug"]
        time.sleep(2)

        flow_id, aggregate_count = None, None
        with self.rest(
            "POST",
            HASURA_GRAPHQL_ENDPOINT,
            name="GetFlowMetadata",
            json={
                "operationName": "GetFlowMetadata",
                "variables": {
                    "team_id": team_id,
                    "slug": flow_slug,
                },
                "query": """
                    query GetFlowMetadata($team_id: Int!, $slug: String!) {
                      flows(where: {team_id: {_eq: $team_id}, slug: {_eq: $slug}}) {
                        id
                        published_flows_aggregate {
                          aggregate {
                            count
                          }
                        }
                      }
                    }
                """,
            },
        ) as resp:
            flows = get_nested_key(resp.js, "data", "flows")
            if flows:
                flow_id = flows[0]["id"]
                aggregate_count = get_nested_key(
                    flows[0], "published_flows_aggregate", "aggregate", "count"
                )
        # it may be that the flow is not published (in which case aggregate count will be 0)
        if not aggregate_count:
            return

        # this last request comes immediately after the previous one, and will likely be the heaviest
        # (because published flow data is one of the largest fetches we ever handle)
        with self.rest(
            "POST",
            HASURA_GRAPHQL_ENDPOINT,
            name="GetLastPublishedFlow",
            json={
                "operationName": "GetLastPublishedFlow",
                "variables": {
                    "id": flow_id,
                },
                "query": """
                    query GetLastPublishedFlow($id: uuid) {
                      flows(where: {id: {_eq: $id}}) {
                        published_flows(limit: 1, order_by: {data: asc, created_at: desc}) {
                          created_at
                          data
                        }
                      }
                    }
                """,
            },
        ) as resp:
            flows = get_nested_key(resp.js, "data", "flows")
            if flows:
                published_flows = flows[0]["published_flows"]
                assert published_flows[0]["created_at"] is not None
21 changes: 21 additions & 0 deletions infrastructure/performance/test_splash.py
@@ -0,0 +1,21 @@
import os

from locust import task

from base_workload import OpenWorkloadBase
from utils import get_target_host


HOST_BY_ENV = {
    "local": os.getenv("EDITOR_URL_EXT", "http://localhost:3000"),
    "staging": "https://editor.planx.dev",
}


class SplashWorkload(OpenWorkloadBase):
    host = get_target_host(HOST_BY_ENV)

    # simple test to simulate users hitting splash page (without auth)
    @task
    def get_splash(self):
        self.client.get("/")
21 changes: 21 additions & 0 deletions infrastructure/performance/utils.py
@@ -0,0 +1,21 @@
import os
from typing import Any


VALID_TARGET_ENVIRONMENTS = ("local", "staging")


def get_nested_key(dct: dict[Any, Any], *keys: str) -> Any:
    for key in keys:
        try:
            dct = dct[key]
        except KeyError:
            return None
    return dct


def get_target_host(host_by_env: dict[str, str]) -> str:
    env = os.getenv("TARGET_ENV", "local")
    if env not in VALID_TARGET_ENVIRONMENTS:
        raise ValueError(f"Invalid environment submitted (accepts local/staging): {env}")
    return host_by_env[env]