From 4a1ae551b27ce4d21819382e08fea26433a42d76 Mon Sep 17 00:00:00 2001
From: noerw
Date: Tue, 26 Jun 2018 22:49:57 +0200
Subject: [PATCH] improve error handling

keep checking remaining boxes when error occurs

CheckBoxes and SendNotifications no longer abort on the first failing
box: per-box errors are logged, collected, and returned as one joined
error after all boxes were processed. CheckBoxes returns the results of
the boxes that succeeded alongside that error.

Submitting a notification is attempted up to three times (two retries,
10 seconds apart) before the box is counted as failed.
---
 cmd/watch.go        |  1 +
 core/checkrunner.go | 20 ++++++++++++++------
 core/notifiers.go   | 35 +++++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/cmd/watch.go b/cmd/watch.go
index 747b99e..ea0c4fd 100644
--- a/cmd/watch.go
+++ b/cmd/watch.go
@@ -48,6 +48,7 @@ var watchBoxesCmd = &cobra.Command{
 			<-ticker
 			err = checkAndNotify(args)
 			if err != nil {
+				// we already did retries, so exiting seems appropriate
 				return err
 			}
 		}
diff --git a/core/checkrunner.go b/core/checkrunner.go
index 25a217c..b40cb33 100644
--- a/core/checkrunner.go
+++ b/core/checkrunner.go
@@ -1,6 +1,9 @@
 package core
 
 import (
+	"fmt"
+	"strings"
+
 	log "github.com/sirupsen/logrus"
 	"github.com/spf13/viper"
 )
@@ -45,29 +48,35 @@ func CheckBoxes(boxIds []string, defaultConf *NotifyConfig) (BoxCheckResults, er
 	log.Debug("Checking notifications for ", len(boxIds), " box(es)")
 
 	results := BoxCheckResults{}
+	errs := []string{}
+
+	// TODO: check boxes in parallel, capped at 5 at once
 	for _, boxId := range boxIds {
-		// TODO: check boxes in parallel, capped at 5 at once
+		boxLogger := log.WithField("boxId", boxId)
+		boxLogger.Info("checking box for events")
 
 		box, res, err := checkBox(boxId, defaultConf)
 		if err != nil {
-			return nil, err
+			boxLogger.Errorf("could not run checks on box %s: %s", boxId, err)
+			errs = append(errs, err.Error())
+			continue
 		}
 		results[box] = res
 	}
 
+	if len(errs) != 0 {
+		return results, fmt.Errorf("%s", strings.Join(errs, "\n"))
+	}
 	return results, nil
 }
 
 func checkBox(boxId string, defaultConf *NotifyConfig) (*Box, []CheckResult, error) {
-	boxLogger := log.WithFields(log.Fields{"boxId": boxId})
-	boxLogger.Info("checking box for events")
 
 	osem := NewOsemClient(viper.GetString("api"))
 
 	// get box data
 	box, err := osem.GetBox(boxId)
 	if err != nil {
-		boxLogger.Error(err)
 		return nil, nil, err
 	}
 
@@ -79,7 +88,6 @@ func checkBox(boxId string, defaultConf *NotifyConfig) (*Box, []CheckResult, err
 	// run checks
 	results, err2 := box.RunChecks()
 	if err2 != nil {
-		boxLogger.Error("could not run checks on box: ", err2)
 		return box, results, err2
 	}
 
diff --git a/core/notifiers.go b/core/notifiers.go
index f2f090c..4249a65 100644
--- a/core/notifiers.go
+++ b/core/notifiers.go
@@ -2,6 +2,8 @@ package core
 
 import (
 	"fmt"
+	"strings"
+	"time"
 
 	log "github.com/sirupsen/logrus"
 	"github.com/spf13/viper"
@@ -39,9 +41,8 @@ func (box Box) GetNotifier() (AbstractNotifier, error) {
 }
 
 func (results BoxCheckResults) SendNotifications() error {
-	// FIXME: don't return on errors, process all boxes first!
-	// FIXME: only update cache when notifications sent successfully
 	results = results.FilterChangedFromCache(false)
+	errs := []string{}
 
 	n := results.Size()
 	if n == 0 {
@@ -51,6 +52,7 @@ func (results BoxCheckResults) SendNotifications() error {
 		log.Infof("Notifying for %v checks turned bad in total...", results.Size())
 	}
 
+	// FIXME: only update cache when notifications sent successfully
 	for box, resultsDue := range results {
 		if len(resultsDue) == 0 {
 			continue
@@ -62,20 +64,33 @@ func (results BoxCheckResults) SendNotifications() error {
 			"transport": transport,
 		})
 
-		notifier, err2 := box.GetNotifier()
-		if err2 != nil {
-			notifyLog.Error(err2)
-			return err2
+		notifier, err := box.GetNotifier()
+		if err != nil {
+			notifyLog.Error(err)
+			errs = append(errs, err.Error())
+			continue
 		}
+
 		notification := notifier.ComposeNotification(box, resultsDue)
-		err3 := notifier.Submit(notification)
-		if err3 != nil {
-			notifyLog.Error(err3)
-			return err3
+
+		submitErr := notifier.Submit(notification)
+		for retry := 1; submitErr != nil && retry < 3; retry++ {
+			time.Sleep(10 * time.Second)
+			notifyLog.Debugf("trying to submit (retry %v)", retry)
+			submitErr = notifier.Submit(notification)
 		}
 
+		if submitErr != nil {
+			notifyLog.Error(submitErr)
+			errs = append(errs, submitErr.Error())
+			continue
+		}
+
 		notifyLog.Infof("Sent notification for %s via %s with %v new issues", box.Name, transport, len(resultsDue))
 	}
 
+	if len(errs) != 0 {
+		return fmt.Errorf("%s", strings.Join(errs, "\n"))
+	}
 	return nil
 }