Improve metrics error handling (#114)

* Improve metrics error handling
* Updates to metrics error handling
2018-06-06 13:50:48 +01:00
parent 2939a9fd1a
commit 4dfe8ed855
3 changed files with 103 additions and 71 deletions
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -18,9 +18,9 @@ limitations under the License.
 package metrics

 import (
+	"context"
 	"fmt"
 	"net/http"
-	"sync"
 	"time"

 	"github.com/ibm-messaging/mq-container/internal/logger"
@@ -29,29 +29,27 @@ import (

 const (
 	defaultPort = "9157"
-	retryCount  = 3
-	retryWait   = 5
+)
+
+var (
+	metricsEnabled = false
+	metricsServer  = &http.Server{Addr: ":" + defaultPort}
 )

 // GatherMetrics gathers metrics for the queue manager
 func GatherMetrics(qmName string, log *logger.Logger) {
-	for i := 0; i <= retryCount; i++ {
-		err := startMetricsGathering(qmName, log)
-		if err != nil {
-			log.Errorf("Metrics Error: %s", err.Error())
-		}
-		if i != retryCount {
-			log.Printf("Waiting %d seconds before retrying metrics gathering", retryWait)
-			time.Sleep(retryWait * time.Second)
-		} else {
-			log.Println("Unable to gather metrics - metrics are now disabled")
-		}
+
+	metricsEnabled = true
+
+	err := startMetricsGathering(qmName, log)
+	if err != nil {
+		log.Errorf("Metrics Error: %s", err.Error())
+		StopMetricsGathering()
 	}
 }

 // startMetricsGathering starts gathering metrics for the queue manager
 func startMetricsGathering(qmName string, log *logger.Logger) error {
-	var wg sync.WaitGroup

 	defer func() {
 		if r := recover(); r != nil {
@@ -62,19 +60,17 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
 	log.Println("Starting metrics gathering")

 	// Start processing metrics
-	wg.Add(1)
-	go processMetrics(log, qmName, &wg)
+	go processMetrics(log, qmName)

-	// Wait for metrics to be ready before starting the prometheus handler
-	wg.Wait()
+	// Wait for metrics to be ready before starting the Prometheus handler
+	<-startChannel

 	// Register metrics
-	exporter := newExporter(qmName, log)
-	err := prometheus.Register(exporter)
+	metricsExporter := newExporter(qmName, log)
+	err := prometheus.Register(metricsExporter)
 	if err != nil {
 		return fmt.Errorf("Failed to register metrics: %v", err)
 	}
-	defer prometheus.Unregister(exporter)

 	// Setup HTTP server to handle requests from Prometheus
 	http.Handle("/metrics", prometheus.Handler())
@@ -83,10 +79,28 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
 		w.Write([]byte("Status: METRICS ACTIVE"))
 	})

-	err = http.ListenAndServe(":"+defaultPort, nil)
-	if err != nil {
-		return fmt.Errorf("Failed to handle metrics request: %v", err)
-	}
+	go func() {
+		err = metricsServer.ListenAndServe()
+		if err != nil && err != http.ErrServerClosed {
+			log.Errorf("Metrics Error: Failed to handle metrics request: %v", err)
+			StopMetricsGathering()
+		}
+	}()

 	return nil
 }
+
+// StopMetricsGathering stops gathering metrics for the queue manager
+func StopMetricsGathering() {
+
+	if metricsEnabled {
+
+		// Stop processing metrics
+		stopChannel <- true
+
+		// Shutdown HTTP server
+		timeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+		metricsServer.Shutdown(timeout)
+	}
+}