feature: Add workload testing agent

This adds a test agent for running cluster workload tests. These are tests that run some sort of workload on the cluster to verify system functionality. Signed-off-by: Sean McGinnis <[email protected]>
bottlerocket-os · Nov 23, 2022 · 70fa21f · 70fa21f
1 parent 05d9c30
commit 70fa21f
Show file tree

Hide file tree

Showing 8 changed files with 567 additions and 61 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -198,3 +198,34 @@ COPY --from=build-src /src/bottlerocket/agents/src/bin/migration-test-agent/ssm-
 COPY --from=build-src /usr/share/licenses/testsys /licenses/testsys
 
 ENTRYPOINT ["./migration-test-agent"]
+
+# =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^=
+# Builds the Kubernetes Workload test agent image
+FROM public.ecr.aws/amazonlinux/amazonlinux:2 AS k8s-workload-agent
+# Target architecture; interpolated into the aws-cli download URL below.
+ARG ARCH
+
+# TODO remove unzip once aws-cli moves out
+RUN yum install -y unzip iproute && yum clean all
+ARG AWS_CLI_URL=https://awscli.amazonaws.com/awscli-exe-linux-${ARCH}.zip
+
+# Copy aws-iam-authenticator
+COPY --from=tools /aws-iam-authenticator /usr/bin/aws-iam-authenticator
+COPY --from=tools /licenses/aws-iam-authenticator /licenses/aws-iam-authenticator
+
+# TODO move this out, get hashes, and attribute licenses
+# Download aws-cli into a scratch directory, run its installer, then remove the scratch
+# directory so the archive does not remain in the image layer.
+RUN temp_dir="$(mktemp -d --suffix aws-cli)" && \
+    curl -fsSL "${AWS_CLI_URL}" -o "${temp_dir}/awscliv2.zip" && \
+    unzip "${temp_dir}/awscliv2.zip" -d "${temp_dir}" && \
+    "${temp_dir}/aws/install" && \
+    rm -rf "${temp_dir}"
+
+# Copy sonobuoy
+COPY --from=tools /sonobuoy /usr/bin/sonobuoy
+COPY --from=tools /licenses/sonobuoy /licenses/sonobuoy
+
+# Copy k8s-workload-agent
+COPY --from=build-src /src/bottlerocket/agents/bin/k8s-workload-agent ./
+COPY --from=build-src /usr/share/licenses/testsys /licenses/testsys
+
+ENTRYPOINT ["./k8s-workload-agent"]
diff --git a/Makefile b/Makefile
@@ -22,7 +22,8 @@ TESTSYS_BUILD_GOPROXY ?= direct
 # The set of bottlerocket images to create. Add new artifacts here when added
 # to the project.
 IMAGES = controller sonobuoy-test-agent ec2-resource-agent eks-resource-agent ecs-resource-agent \
-	migration-test-agent vsphere-vm-resource-agent vsphere-k8s-cluster-resource-agent ecs-test-agent
+	migration-test-agent vsphere-vm-resource-agent vsphere-k8s-cluster-resource-agent ecs-test-agent \
+	k8s-workload-agent
 
 # Store targets for tagging images
 TAG_IMAGES = $(addprefix tag-, $(IMAGES))
@@ -138,7 +139,7 @@ tools:
 		./tools
 
 # Build the container image for a testsys agent
-eks-resource-agent ec2-resource-agent ecs-resource-agent vsphere-vm-resource-agent vsphere-k8s-cluster-resource-agent sonobuoy-test-agent migration-test-agent ecs-test-agent: show-variables fetch
+eks-resource-agent ec2-resource-agent ecs-resource-agent vsphere-vm-resource-agent vsphere-k8s-cluster-resource-agent sonobuoy-test-agent migration-test-agent ecs-test-agent k8s-workload-agent: show-variables fetch
 	docker build $(DOCKER_BUILD_FLAGS) \
 		--build-arg ARCH="$(TESTSYS_BUILD_HOST_UNAME_ARCH)" \
 		--build-arg BUILDER_IMAGE="$(BUILDER_IMAGE)" \

diff --git a/bottlerocket/agents/src/bin/k8s-workload-agent/main.rs b/bottlerocket/agents/src/bin/k8s-workload-agent/main.rs
@@ -0,0 +1,116 @@
+/*!
+
+This is a test-agent for running workload tests on Kubernetes.
+It expects to be run in a pod launched by the TestSys controller.
+
+You can configure the workload agent to run different types of plugins and tests.
+See `WorkloadConfig` for the different configuration values.
+
+To build the container for the workload test agent, run `make k8s-workload-agent` from the
+root directory of this repository.
+
+Here is an example manifest for deploying the test definition for the workload test agent to a K8s cluster:
+
+```yaml
+apiVersion: testsys.system/v1
+kind: Test
+metadata:
+  name: workload-full
+  namespace: testsys
+spec:
+  agent:
+    configuration:
+      kubeconfigBase64: <Base64-encoded kubeconfig for the cluster the workload tests run in>
+      plugins:
+      - name: nvidia-workload
+        image: testsys-nvidia-workload-test:v0.0.3
+    image: <your k8s-workload-agent image URI>
+    name: workload-test-agent
+    keep_running: true
+  resources: {}
+```
+
+!*/
+
+use agent_utils::{base64_decode_write_file, init_agent_logger};
+use async_trait::async_trait;
+use bottlerocket_agents::constants::TEST_CLUSTER_KUBECONFIG_PATH;
+use bottlerocket_agents::error::Error;
+use bottlerocket_agents::workload::{delete_workload, rerun_failed_workload, run_workload};
+use bottlerocket_types::agent_config::WorkloadConfig;
+use log::{debug, info};
+use model::TestResults;
+use std::path::PathBuf;
+use test_agent::{BootstrapData, ClientError, DefaultClient, Spec, TestAgent};
+
+/// Test runner state for the workload agent, populated from the test `Spec` in `new`.
+struct WorkloadTestRunner {
+    /// Agent configuration taken from `spec.configuration`; supplies the base64-encoded
+    /// kubeconfig and is passed on to the workload helpers.
+    config: WorkloadConfig,
+    /// Results directory taken from `spec.results_dir` and passed on to the workload helpers.
+    results_dir: PathBuf,
+}
+
+#[async_trait]
+impl test_agent::Runner for WorkloadTestRunner {
+    type C = WorkloadConfig;
+    type E = Error;
+
+    /// Builds the runner from the test spec, keeping its configuration and results directory.
+    async fn new(spec: Spec<Self::C>) -> Result<Self, Self::E> {
+        info!("Initializing Workload test agent...");
+        Ok(Self {
+            config: spec.configuration,
+            results_dir: spec.results_dir,
+        })
+    }
+
+    /// Writes the decoded kubeconfig to `TEST_CLUSTER_KUBECONFIG_PATH`, then runs the workload
+    /// via `run_workload` and returns its results.
+    ///
+    /// # Errors
+    /// Propagates any failure from decoding/writing the kubeconfig or from `run_workload`.
+    async fn run(&mut self) -> Result<TestResults, Self::E> {
+        debug!("Decoding kubeconfig for test cluster");
+        base64_decode_write_file(&self.config.kubeconfig_base64, TEST_CLUSTER_KUBECONFIG_PATH)
+            .await?;
+        info!("Stored kubeconfig in {}", TEST_CLUSTER_KUBECONFIG_PATH);
+
+        run_workload(
+            TEST_CLUSTER_KUBECONFIG_PATH,
+            &self.config,
+            &self.results_dir,
+        )
+        .await
+    }
+
+    /// Cleans up the previous workload with `delete_workload`, then reruns it via
+    /// `rerun_failed_workload`. The previous results are not consulted here.
+    ///
+    /// # Errors
+    /// Propagates any failure from writing the kubeconfig, from `delete_workload`, or from
+    /// `rerun_failed_workload`.
+    async fn rerun_failed(&mut self, _prev_results: &TestResults) -> Result<TestResults, Self::E> {
+        // Write the kubeconfig before the first call that is handed its path, so the cleanup
+        // below does not depend on a file left behind by an earlier `run`.
+        debug!("Decoding kubeconfig for test cluster");
+        base64_decode_write_file(&self.config.kubeconfig_base64, TEST_CLUSTER_KUBECONFIG_PATH)
+            .await?;
+        info!("Stored kubeconfig in {}", TEST_CLUSTER_KUBECONFIG_PATH);
+
+        delete_workload(TEST_CLUSTER_KUBECONFIG_PATH).await?;
+
+        rerun_failed_workload(
+            TEST_CLUSTER_KUBECONFIG_PATH,
+            &self.config,
+            &self.results_dir,
+        )
+        .await
+    }
+
+    /// Cleans up the workload by calling `delete_workload` with the test cluster kubeconfig path.
+    async fn terminate(&mut self) -> Result<(), Self::E> {
+        delete_workload(TEST_CLUSTER_KUBECONFIG_PATH).await
+    }
+}
+
+/// Process entry point: sets up logging, drives the agent, and exits with status 1 after
+/// printing the error to stderr if the agent fails.
+#[tokio::main]
+async fn main() {
+    init_agent_logger(env!("CARGO_CRATE_NAME"), None);
+    match run().await {
+        Ok(()) => {}
+        Err(err) => {
+            eprintln!("{}", err);
+            std::process::exit(1);
+        }
+    }
+}
+
+/// Creates the test agent for `WorkloadTestRunner` and runs it to completion.
+///
+/// Bootstrap data is read from the environment; if that fails, a default with the test name
+/// `workload_test` is used instead.
+async fn run() -> Result<(), test_agent::error::Error<ClientError, Error>> {
+    let bootstrap = match BootstrapData::from_env() {
+        Ok(data) => data,
+        Err(_) => BootstrapData {
+            test_name: "workload_test".to_string(),
+        },
+    };
+    let mut agent = TestAgent::<DefaultClient, WorkloadTestRunner>::new(bootstrap).await?;
+    agent.run().await
+}
diff --git a/bottlerocket/agents/src/error.rs b/bottlerocket/agents/src/error.rs
@@ -157,4 +157,25 @@ pub enum Error {
     #[snafu(context(false))]
     #[snafu(display("{}", source))]
     Utils { source: agent_utils::Error },
+
+    #[snafu(display("Failed to create workload process: {}", source))]
+    WorkloadProcess { source: std::io::Error },
+
+    #[snafu(display("Failed to run workload test"))]
+    WorkloadRun,
+
+    #[snafu(display("Failed to initialize workload test plugin: {}", plugin))]
+    WorkloadPlugin { plugin: String },
+
+    #[snafu(display(
+        "Failed to write workload test plugin configuration yaml for: {}",
+        plugin
+    ))]
+    WorkloadWritePlugin { plugin: String },
+
+    #[snafu(display("Failed to clean-up workload resources"))]
+    WorkloadDelete,
+
+    #[snafu(display("Missing '{}' field from workload status", field))]
+    MissingWorkloadStatusField { field: String },
 }
diff --git a/bottlerocket/agents/src/lib.rs b/bottlerocket/agents/src/lib.rs
@@ -14,6 +14,7 @@ pub mod error;
 pub mod sonobuoy;
 pub mod tuf;
 pub mod vsphere;
+pub mod workload;
 
 /// Determines whether a cluster resource needs to be created given its creation policy
 pub async fn is_cluster_creation_required(