Skip to content

Commit

Permalink
Minor enhancement
Browse files Browse the repository at this point in the history
Signed-off-by: Kedar Vijay Kulkarni <[email protected]>
  • Loading branch information
Kedar Vijay Kulkarni committed Nov 15, 2021
1 parent 9d12d3a commit 0607a4e
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 42 deletions.
21 changes: 15 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ This tool allows OpenShift users to run a watcher for Prometheus queries and def
* [x] Slack Notification
* [x] Notify/Do Something(e.g. Pause/Kill benchmark jobs to preserve cluster) when results don't match conditions
* [x] Spawn goroutines to keep running queries and evaluating results to handle scale - e.g. when we have very large number of queries in the yaml file, we can divide and concurrently run queries

* [ ] debug mode
* [ ] make slack optional
* [ ] use env vars


## Usage:
Expand All @@ -33,12 +35,19 @@ This tool allows OpenShift users to run a watcher for Prometheus queries and def
* You can then run the following command:
```sh

./bin/cpa --help
Usage: cpa [--noclrscr] [--queries QUERIES] [--timeout TIMEOUT]
./bin/cpa -t 60s -h
Usage: cpa [--noclrscr] [--queries QUERIES] [--query-frequency QUERY-FREQUENCY] [--timeout TIMEOUT] [--log-output] [--terminate-benchmark TERMINATE-BENCHMARK]

Options:
--noclrscr Do not clear screen after each iteration. [default: false]
--queries QUERIES queries file to use [default: queries.yaml]
--timeout TIMEOUT Duration to run Continuous Performance Analysis. You can pass values like 4h or 1h10m10s [default: 4h]
--noclrscr Do not clear screen after each iteration. Clears screen by default. [default: false]
--queries QUERIES, -q QUERIES
queries file to use [default: queries.yaml]
--query-frequency QUERY-FREQUENCY, -f QUERY-FREQUENCY
How often do we run queries. You can pass values like 4h or 1h10m10s [default: 20s]
--timeout TIMEOUT, -t TIMEOUT
Duration to run Continuous Performance Analysis. You can pass values like 4h or 1h10m10s [default: 4h]
--log-output, -l Output will be stored in a log file(cpa.log) in addition to stdout. [default: false]
--terminate-benchmark TERMINATE-BENCHMARK, -k TERMINATE-BENCHMARK
When CPA is running in parallel with benchmark job, let CPA know to kill benchmark if any query fail. (E.g. -k <processID>) Helpful to preserve cluster for further analysis.
--help, -h display this help and exit
```
Binary file modified bin/cpa
Binary file not shown.
77 changes: 42 additions & 35 deletions config/queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,57 +7,64 @@
operator: lte
- key: phase
val: "Failed"
threshold: 0
threshold: 10
operator: lte
- query: "sum(kube_node_status_condition{status='true'}) by (condition) > 0"
watchFor:
- key: condition
val: "Ready"
threshold: 6
operator: eq
- key: phase
val: "Succeeded"
threshold: 77
operator: gte
- query: "sum(kube_namespace_status_phase) by (phase)"
- query: "sum by (condition)(cluster_operator_conditions{condition!=''})"
watchFor:
- key: phase
val: "Terminating"
- key: condition
val: "Failing"
threshold: 0
operator: eq
- query: 'max(sum by (instance) (rate(ovnkube_master_pod_creation_latency_seconds_sum[20m])))' # Pod annotation latency
watchFor: # watchFor will have only 1 value for latency/duration queries
- key: nil
val: nil
threshold: 0.02
operator: lt
- query: 'max(sum by (instance) (rate(ovnkube_node_cni_request_duration_seconds_sum{command="ADD"}[20m])))' # CNI Request duration for "ADD" command over 20m interval
- key: condition
val: "Degraded"
threshold: 0
operator: eq
- key: condition
val: "Available"
threshold: 33
operator: eq
- query: 'max(sum(container_memory_rss{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB
watchFor:
- key: nil
val: nil
threshold: 0.1
threshold: 7 # GiB
operator: lt
- query: 'max(sum by (instance) (rate(ovnkube_node_cni_request_duration_seconds_sum{command="DEL"}[20m])))' # CNI Request duration for "DEL" command over 20m interval
- query: 'max(sum(container_memory_working_set_bytes{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB. If container_memory_working_set_bytes or container_memory_rss reaches the limit, the pod will be killed. But if container_memory_usage_bytes reaches the limit, the pod will NOT get oom-killed.
watchFor:
- key: nil
val: nil
threshold: 0.02
threshold: 7 # GiB
operator: lt
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container=""}) by (node))'
- query: "sum(kube_namespace_status_phase) by (phase)"
watchFor:
- key: phase
val: "Terminating"
threshold: 0
operator: eq
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-master-.*",namespace="openshift-ovn-kubernetes",container=""}) by (pod, node))/1073742000'
watchFor:
- key: nil
val: nil
threshold: 209715200
operator: lt
- query: 'max(sum(container_memory_rss{namespace!="",name!="",container="prometheus"}) by (pod))/1073742000' # 1073742000 is bytes per GiB
threshold: 4 # GiB
operator: lte
- query: 'max(sum(container_memory_working_set_bytes{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container=""}) by (pod, node))/1073742000'
watchFor:
- key: nil
val: nil
threshold: 2 # GiB
operator: lt
# - query: 'topk(10, rate(container_cpu_usage_seconds_total{pod=~"ovnkube-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}[2m])*100)' # top 10 - ovn-controller cpu usage
# watchFor:
# - query: 'topk(10, sum(container_memory_working_set_bytes{pod=~"ovnkube-node-.*",namespace="openshift-ovn-kubernetes",container="ovn-controller"}) by (node))' # top 10 - ovn-controller memory usage
# watchFor:
# - query: 'sum(container_memory_rss{pod="prometheus-k8s-0",namespace!="",name!="",container="prometheus"}) by (pod)' # Prometheus replica 0 rss memory
# watchFor:
# - query: 'sum(container_memory_rss{pod="prometheus-k8s-1",namespace!="",name!="",container="prometheus"}) by (pod)' # Prometheus replica 1 rss memory
# watchFor:
# - query: 'rate(container_cpu_usage_seconds_total{pod=~"ovnkube-master.*",namespace="openshift-ovn-kubernetes",container!=""}[2m])*100' # CPU usage ovnkube-master components over 2m interval
threshold: 4 # GiB
operator: lte
# - query: 'max(container_runtime_crio_containers_oom_total)'
# watchFor:
# - query: 'sum by (condition)(cluster_operator_conditions{condition!=""})'
# watchFor:
# key: nil
# val: nil
# threshold: 3
# operator: lt

# Metrics of Interest: ovnkube_master_requeue_service_total, ovnkube_master_skipped_nbctl_daemon_total, ovnkube_master_sync_service_total, ovnkube_master_ovn_cli_latency_seconds_sum
# max(ovnkube_master_pod_creation_latency_seconds_bucket), ovnkube_master_workqueue_depth, max(ovnkube_master_workqueue_retries_total),ovnkube_node_cni_request_duration_seconds_count
3 changes: 2 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ func main() {
log.Println(item.Query)
}
thread_ts := slackConfig.SlackNotify("New benchmark started, we will monitor it for performance and notify here with the issues.", "")
defer slackConfig.SlackNotify(fmt.Sprintf("Continuous Perf Analysis has ended all iterations. Total time spent: %s", args.Timeout.String()), thread_ts)
go func(c chan string) {
for i := 1; ; i++ {
log.Printf("\n%[2]s\nIteration no. %[1]d\n%[2]s\n", i, strings.Repeat("~", 80))
Expand All @@ -127,5 +128,5 @@ func main() {
log.Println(err)
}
time.Sleep(d)
slackConfig.SlackNotify(fmt.Sprintf("Continuous Perf Analysis has ended all iterations. Total time spent: %s", d.String()), thread_ts)

}

0 comments on commit 0607a4e

Please sign in to comment.