Skip to content

Commit

Permalink
Minor enhancement
Browse files Browse the repository at this point in the history
Signed-off-by: Kedar Vijay Kulkarni <[email protected]>
  • Loading branch information
Kedar Vijay Kulkarni committed Nov 15, 2021
1 parent 9d12d3a commit 0607a4e
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 42 deletions.
21 changes: 15 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ This tool allows OpenShift users to run a watcher for Prometheus queries and def
* [x] Slack Notification
* [x] Notify/Do Something(e.g. Pause/Kill benchmark jobs to preserve cluster) when results don't match conditions
* [x] Spawn goroutines to keep running queries and evaluating results to handle scale - e.g. when we have very large number of queries in the yaml file, we can divide and concurrently run queries

* [ ] debug mode
* [ ] make slack optional
* [ ] use env vars


## Usage:
Expand All @@ -33,12 +35,19 @@ This tool allows OpenShift users to run a watcher for Prometheus queries and def
* You can then run the following command:
```sh

./bin/cpa --help
Usage: cpa [--noclrscr] [--queries QUERIES] [--timeout TIMEOUT]
./bin/cpa -t 60s -h
Usage: cpa [--noclrscr] [--queries QUERIES] [--query-frequency QUERY-FREQUENCY] [--timeout TIMEOUT] [--log-output] [--terminate-benchmark TERMINATE-BENCHMARK]

Options:
--noclrscr Do not clear screen after each iteration. [default: false]
--queries QUERIES queries file to use [default: queries.yaml]
--timeout TIMEOUT Duration to run Continuous Performance Analysis. You can pass values like 4h or 1h10m10s [default: 4h]
--noclrscr Do not clear screen after each iteration. Clears screen by default. [default: false]
--queries QUERIES, -q QUERIES
queries file to use [default: queries.yaml]
--query-frequency QUERY-FREQUENCY, -f QUERY-FREQUENCY
How often do we run queries. You can pass values like 4h or 1h10m10s [default: 20s]
--timeout TIMEOUT, -t TIMEOUT
Duration to run Continuous Performance Analysis. You can pass values like 4h or 1h10m10s [default: 4h]
--log-output, -l Output will be stored in a log file(cpa.log) in addition to stdout. [default: false]
--terminate-benchmark TERMINATE-BENCHMARK, -k TERMINATE-BENCHMARK
When CPA is running in parallel with benchmark job, let CPA know to kill benchmark if any query fail. (E.g. -k <processID>) Helpful to preserve cluster for further analysis.
--help, -h display this help and exit
```
Binary file modified bin/cpa
Binary file not shown.
77 changes: 42 additions & 35 deletions config/queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,57 +7,64 @@
operator: lte
- key: phase
val: "Failed"
threshold: 0
threshold: 10
operator: lte
- query: "sum(kube_node_status_condition{status='true'}) by (condition) > 0"
watchFor:
- key: condition
val: "Ready"
threshold: 6
operator: eq
- key: phase
val: "Succeeded"
threshold: 77
operator: gte
- query: "sum(kube_namespace_status_phase) by (phase)"
- query: "sum by (condition)(cluster_operator_conditions{condition!=''})"
watchFor:
- key: phase
val: "Terminating"
- key: condition
val: "Failing"
threshold: 0
operator: eq
- query: 'max(sum by (instance) (rate(ovnkube_master_pod_creation_latency_seconds_sum[20m])))' # Pod annotation latency
watchFor: # watchFor will have only 1 value for latency/duration queries
- key: nil
val: nil
threshold: 0.02
operator: lt
- query: 'max(sum by (instance) (rate(ovnkube_node_cni_request_duration_seconds_sum{command="ADD"}[20m])))' # CNI Request duration for "ADD" command over 20m interval
- key: condition
val: "Degraded"
threshold: 0
operator: eq
- key: condition
val: "Available"
threshold: 33
operator: eq
- query: 'max(sum(container_memory_rss{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB
watchFor:
- key: nil
val: nil
threshold: 0.1
threshold: 7 # GiB
operator: lt
- query: 'max(sum by (instance) (rate(ovnkube_node_cni_request_duration_seconds_sum{command="DEL"}[20m])))' # CNI Request duration for "DEL" command over 20m interval
- query: 'max(sum(container_memory_working_set_bytes{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB. If container_memory_working_set_bytes or container_memory_rss reaches the limit, the pod will be killed. But if container_memory_usage_bytes reaches the limit, the pod will NOT get oom-killed.
watchFor:
- key: nil
val: nil
threshold: 0.02
threshold: 7 # GiB
operator: lt
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container=""}) by (node))'
- query: "sum(kube_namespace_status_phase) by (phase)"
watchFor:
- key: phase
val: "Terminating"
threshold: 0
operator: eq
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container=""}) by (pod, node))/1073742000'
watchFor:
- key: nil
val: nil
threshold: 209715200
operator: lt
- query: 'max(sum(container_memory_rss{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB
threshold: 4 # GiB
operator: lte
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container=""}) by (pod, node))/1073742000'
watchFor:
- key: nil
val: nil
threshold: 2 # GiB
operator: lt
# - query: 'topk(10, rate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)' # top 10 - ovn-controller cpu usage
# watchFor:
# - query: 'topk(10, sum(container_memory_working_set_bytes{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))' # top 10 - ovn-controller memory usage
# watchFor:
# - query: 'sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)' # Prometheus replica 0 rss memory
# watchFor:
# - query: 'sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)' # Prometheus replica 1 rss memory
# watchFor:
# - query: 'rate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!=""}[2m])*100' # CPU usage ovnkube-master components over 2m interval
threshold: 4 # GiB
operator: lte
# - query: 'max(container_runtime_crio_containers_oom_total)'
# watchFor:
# - query: 'sum by (condition)(cluster_operator_conditions{condition!=""})'
# watchFor:
# key: nil
# val: nil
# threshold: 3
# operator: lt

# Metrics of Interest: ovnkube_master_requeue_service_total, ovnkube_master_skipped_nbctl_daemon_total, ovnkube_master_sync_service_total, ovnkube_master_ovn_cli_latency_seconds_sum
# max(ovnkube_master_pod_creation_latency_seconds_bucket), ovnkube_master_workqueue_depth, max(ovnkube_master_workqueue_retries_total),ovnkube_node_cni_request_duration_seconds_count
3 changes: 2 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func main() {
log.Println(item.Query)
}
thread_ts := slackConfig.SlackNotify("New benchmark started, we will monitor it for performance and notify here with the issues.", "")
defer slackConfig.SlackNotify(fmt.Sprintf("Continuous Perf Analysis has ended all iterations. Total time spent: %s", args.Timeout.String()), thread_ts)
go func(c chan string) {
for i := 1; ; i++ {
log.Printf("\n%[2]s\nIteration no. %[1]d\n%[2]s\n", i, strings.Repeat("~", 80))
Expand All @@ -127,5 +128,5 @@ func main() {
log.Println(err)
}
time.Sleep(d)
slackConfig.SlackNotify(fmt.Sprintf("Continuous Perf Analysis has ended all iterations. Total time spent: %s", d.String()), thread_ts)

}

0 comments on commit 0607a4e

Please sign in to comment.