diff --git a/README.md b/README.md index e5a9854..e48f15d 100644 --- a/README.md +++ b/README.md @@ -59,9 +59,10 @@ This tool allows OpenShift users to run a watcher for Prometheus queries and def * [x] Notify/Do Something(e.g. Pause/Kill benchmark jobs to preserve cluster) when results don't match conditions * [x] Spawn goroutines to keep running queries and evaluating results to handle scale - e.g. when we have very large number of queries in the yaml file, we can divide and concurrently run queries * [x] If slack config is not set, it is ignored and no attempts will be made to notify via slack -* [ ] debug mode -* [ ] use env vars -* [ ] Enhance log files to include uuid/time +* [x] debug/verbose mode +* [x] Enhance log files to include uuid/time +* [x] Use env vars +* [x] RFE: come up with a basic "cluster health" profile that anyone can use. Operator monitoring + some best practice monitors from the dittybopper dashboards ## Usage: diff --git a/cmd/analyze/analyze.go b/cmd/analyze/analyze.go index 3003020..f1441e0 100644 --- a/cmd/analyze/analyze.go +++ b/cmd/analyze/analyze.go @@ -60,17 +60,17 @@ func ReadPrometheusQueries(queriesFile string) (queriesList queryList, err error return queriesList, nil } -func Queries(queryList queryList, oc *exutil.CLI, baseURL, bearerToken string, c chan string, tb chan bool, terminateBenchmark string) { +func Queries(queryList queryList, oc *exutil.CLI, baseURL, bearerToken string, c chan string, tb chan bool, terminateBenchmark string, verbose bool) { // start := time.Now() for _, item := range queryList { - go runQuery(item, oc, baseURL, bearerToken, c, tb, terminateBenchmark) + go runQuery(item, oc, baseURL, bearerToken, c, tb, terminateBenchmark, verbose) } wg.Wait() // end := time.Since(start) // log.Printf("\n It takes %s time to run queries", end) } -func runQuery(q queries, oc *exutil.CLI, baseURL, bearerToken string, c chan string, tb chan bool, terminateBenchmark string) { +func runQuery(q queries, oc *exutil.CLI, baseURL, bearerToken string, c chan string, tb chan bool, terminateBenchmark string, verbose bool) { wg.Add(1) defer wg.Done() result, err := prometheus.RunQuery(q.Query, oc, baseURL, bearerToken) @@ -79,6 +79,12 @@ func runQuery(q queries, oc *exutil.CLI, baseURL, bearerToken string, c chan str return } opMap := map[string]string{"eq": "==", "lt": "<", "gt": ">", "lte": "<=", "gte": ">="} + if verbose { + log.Printf("Verbose Metric values for %s are:", q.Query) + for _, metric := range result.Data.Result { + log.Printf("%v\n", metric) + } + } for _, metric := range result.Data.Result { for _, watchItems := range q.WatchFor { // log.Println(watchItems.Key, watchItems.Val, watchItems.Threshold) diff --git a/cmd/notify/notifications.go b/cmd/notify/notifications.go index 1dbbf43..6ce0842 100644 --- a/cmd/notify/notifications.go +++ b/cmd/notify/notifications.go @@ -27,6 +27,25 @@ func (c *slackConfig) Parse(data []byte) error { } func ReadslackConfig() (config slackConfig, err error) { + userID, ok := os.LookupEnv("SLACK_USERID") + if !ok { + log.Println("Didn't find the Slack User ID in the Env Var. Will look it up in config/ dir") + } + channelID, ok := os.LookupEnv("SLACK_CHANNELID") + if !ok { + log.Println("Didn't find the Slack Channel ID in the Env Var. Will look it up in config/ dir") + } + slackToken, ok := os.LookupEnv("SLACK_TOKEN") + if !ok { + log.Println("Didn't find the Slack Token in the Env Var. Will look it up in config/ dir") + } + if userID != "" && channelID != "" && slackToken != "" { + log.Printf("Found env vars for SLACK_USERID, SLACK_CHANNELID, and SLACK_TOKEN as: %s, %s and %s(hidden for security)", userID, channelID, string(slackToken[len(slackToken)-4:])) + config.ChannelID = channelID + config.UserID = userID + config.SlackToken = slackToken + return config, nil + } data, err := ioutil.ReadFile(configPath + "slack.yaml") msg := fmt.Sprintf("Cound't read %sslack.yaml", configPath) if err != nil { diff --git a/cmd/prometheus/prometheus.go b/cmd/prometheus/prometheus.go index aac541d..be04e15 100644 --- a/cmd/prometheus/prometheus.go +++ b/cmd/prometheus/prometheus.go @@ -15,6 +15,7 @@ import ( "log" "net/http" "net/url" + "os" "strings" "time" @@ -39,12 +40,24 @@ func (c *prometheusConfig) Parse(data []byte) error { } func readPrometheusConfig() (url, bearerToken string, err error) { + url, ok := os.LookupEnv("PROM_URL") + if !ok { + log.Println("Didn't find the Prometheus URL in the Env Var. Will look it up in config/ dir") + } + bearerToken, ok = os.LookupEnv("BEARER_TOKEN") + if !ok { + log.Println("Didn't find the Prometheus BEARER_TOKEN in the Env Var. Will look it up in config/ dir") + } + if bearerToken != "" && url != "" { + log.Printf("Found env vars for PROM_URL and BEARER_TOKEN as: %s and %s(hidden for security)", url, string(bearerToken[len(bearerToken)-4:])) + return url, bearerToken, nil + } + var config prometheusConfig data, err := ioutil.ReadFile(configPath + "prometheus.yaml") msg := fmt.Sprintf("Cound't read %sprometheus.yaml", configPath) if err != nil { return "", "", fmt.Errorf(msg) } - var config prometheusConfig if err := config.Parse(data); err != nil { log.Fatal(err) return "", "", err diff --git a/main.go b/main.go index 152b062..0b70d07 100644 --- a/main.go +++ b/main.go @@ -28,12 +28,13 @@ func main() { Timeout time.Duration `arg:"-t,--timeout" help:"Duration to run Continuous Performance Analysis. You can pass values like 4h or 1h10m10s" default:"4h"` LogOutput bool `arg:"-l,--log-output" help:"Output will be stored in a log file(cpa.log) in addition to stdout." default:"false"` TerminateBenchmark string `arg:"-k,--terminate-benchmark" help:"When CPA is running in parallel with benchmark job, let CPA know to kill benchmark if any query fail. (E.g. -k ) Helpful to preserve cluster for further analysis." default:""` + Verbose bool `arg:"-v,--verbose" help:"When this mode is enabled, output will contain much more information about each query."` } arg.MustParse(&args) o.RegisterFailHandler(g.Fail) if args.LogOutput { - f, err := os.OpenFile("cpa.log", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + f, err := os.OpenFile("cpa_"+time.Now().Format("2006-01-02_15:04:05")+".log", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) multiWriter := io.MultiWriter(os.Stdout, f) if err != nil { log.Fatal(err) @@ -111,7 +112,7 @@ func main() { go func(c chan string) { for i := 1; ; i++ { log.Printf("\n%[2]s\nIteration no. %[1]d\n%[2]s\n", i, strings.Repeat("~", 80)) - analyze.Queries(queryList, oc, url, bearerToken, c, tb, args.TerminateBenchmark) + analyze.Queries(queryList, oc, url, bearerToken, c, tb, args.TerminateBenchmark, args.Verbose) time.Sleep(args.QueryFrequency) if !args.NoClrscr { log.Print("\033[H\033[2J") // clears screen before printing next iteration