Improve metrics error handling (#114)

* Improve metrics error handling

* Updates to metrics error handling
Authored by Stephen Marshall on 2018-06-06 13:50:48 +01:00
Committed by Arthur Barr
parent 2939a9fd1a
commit 4dfe8ed855
3 changed files with 103 additions and 71 deletions

View File

@@ -20,6 +20,7 @@ import (
     "os/signal"
     "syscall"
+    "github.com/ibm-messaging/mq-container/internal/metrics"
     "golang.org/x/sys/unix"
 )
@@ -42,6 +43,7 @@ func signalHandler(qmgr string) chan int {
             log.Printf("Signal received: %v", sig)
             signal.Stop(reapSignals)
             signal.Stop(stopSignals)
+            metrics.StopMetricsGathering()
             stopQueueManager(qmgr)
             // One final reap
             reapZombies()
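
Note the ordering above: metrics gathering is stopped before the queue manager, presumably so the metrics collector can end its IBM MQ connection while the queue manager is still running. A minimal, self-contained sketch of that kind of ordered shutdown on SIGTERM/SIGINT, using only the standard library (the stop functions are hypothetical stand-ins, not this repository's code):

package main

import (
    "fmt"
    "os"
    "os/signal"
    "syscall"
)

func stopMetrics()      { fmt.Println("metrics stopped") }       // stand-in for metrics.StopMetricsGathering()
func stopQueueManager() { fmt.Println("queue manager stopped") } // stand-in for stopQueueManager(qmgr)

func main() {
    sigs := make(chan os.Signal, 1)
    signal.Notify(sigs, syscall.SIGTERM, syscall.SIGINT)

    sig := <-sigs
    fmt.Printf("Signal received: %v\n", sig)
    signal.Stop(sigs)

    // Ordered shutdown, mirroring the diff: metrics first, then the queue manager.
    stopMetrics()
    stopQueueManager()
}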

View File

@@ -18,9 +18,9 @@ limitations under the License.
 package metrics

 import (
+    "context"
     "fmt"
     "net/http"
-    "sync"
     "time"

     "github.com/ibm-messaging/mq-container/internal/logger"
@@ -29,29 +29,27 @@ import (
 const (
     defaultPort = "9157"
-    retryCount  = 3
-    retryWait   = 5
+)
+
+var (
+    metricsEnabled = false
+    metricsServer  = &http.Server{Addr: ":" + defaultPort}
 )

 // GatherMetrics gathers metrics for the queue manager
 func GatherMetrics(qmName string, log *logger.Logger) {
-    for i := 0; i <= retryCount; i++ {
-        err := startMetricsGathering(qmName, log)
-        if err != nil {
-            log.Errorf("Metrics Error: %s", err.Error())
-        }
-        if i != retryCount {
-            log.Printf("Waiting %d seconds before retrying metrics gathering", retryWait)
-            time.Sleep(retryWait * time.Second)
-        } else {
-            log.Println("Unable to gather metrics - metrics are now disabled")
-        }
+    metricsEnabled = true
+
+    err := startMetricsGathering(qmName, log)
+    if err != nil {
+        log.Errorf("Metrics Error: %s", err.Error())
+        StopMetricsGathering()
     }
 }

 // startMetricsGathering starts gathering metrics for the queue manager
 func startMetricsGathering(qmName string, log *logger.Logger) error {
-    var wg sync.WaitGroup

     defer func() {
         if r := recover(); r != nil {
@@ -62,19 +60,17 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
     log.Println("Starting metrics gathering")

     // Start processing metrics
-    wg.Add(1)
-    go processMetrics(log, qmName, &wg)
+    go processMetrics(log, qmName)

-    // Wait for metrics to be ready before starting the prometheus handler
-    wg.Wait()
+    // Wait for metrics to be ready before starting the Prometheus handler
+    <-startChannel

     // Register metrics
-    exporter := newExporter(qmName, log)
-    err := prometheus.Register(exporter)
+    metricsExporter := newExporter(qmName, log)
+    err := prometheus.Register(metricsExporter)
     if err != nil {
         return fmt.Errorf("Failed to register metrics: %v", err)
     }
-    defer prometheus.Unregister(exporter)

     // Setup HTTP server to handle requests from Prometheus
     http.Handle("/metrics", prometheus.Handler())
@@ -83,10 +79,28 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
         w.Write([]byte("Status: METRICS ACTIVE"))
     })

-    err = http.ListenAndServe(":"+defaultPort, nil)
-    if err != nil {
-        return fmt.Errorf("Failed to handle metrics request: %v", err)
-    }
+    go func() {
+        err = metricsServer.ListenAndServe()
+        if err != nil && err != http.ErrServerClosed {
+            log.Errorf("Metrics Error: Failed to handle metrics request: %v", err)
+            StopMetricsGathering()
+        }
+    }()

     return nil
 }
+
+// StopMetricsGathering stops gathering metrics for the queue manager
+func StopMetricsGathering() {
+    if metricsEnabled {
+        // Stop processing metrics
+        stopChannel <- true
+
+        // Shutdown HTTP server
+        timeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+        defer cancel()
+        metricsServer.Shutdown(timeout)
+    }
+}
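
The listener now runs in a goroutine on the package-level metricsServer, with http.ErrServerClosed filtered out so that a graceful Shutdown is not reported as a failure. A short sketch of that standard-library pattern in isolation (the port and handler below are illustrative, not taken from the repository):

package main

import (
    "context"
    "log"
    "net/http"
    "time"
)

func main() {
    srv := &http.Server{Addr: ":9157"} // same default port as the diff

    http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
        w.Write([]byte("Status: METRICS ACTIVE"))
    })

    go func() {
        // ErrServerClosed is the expected result of a graceful Shutdown,
        // so only other errors are treated as failures.
        if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
            log.Printf("server error: %v", err)
        }
    }()

    time.Sleep(1 * time.Second) // stand-in for "run until a stop request arrives"

    // Give in-flight requests up to 5 seconds to complete, as StopMetricsGathering does.
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()
    if err := srv.Shutdown(ctx); err != nil {
        log.Printf("shutdown error: %v", err)
    }
}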

View File

@@ -20,7 +20,6 @@ package metrics
 import (
     "fmt"
     "strings"
-    "sync"
     "time"

     "github.com/ibm-messaging/mq-container/internal/logger"
@@ -33,6 +32,8 @@
 )

 var (
+    startChannel    = make(chan bool)
+    stopChannel     = make(chan bool, 2)
     requestChannel  = make(chan bool)
     responseChannel = make(chan map[string]*metricData)
 )
@@ -44,10 +45,67 @@ type metricData struct {
     values map[string]float64
 }

-var keepRunning = true
-var first = true
+// processMetrics processes publications of metric data and handles describe/collect/stop requests
+func processMetrics(log *logger.Logger, qmName string) {
+    var err error
+    var firstConnect = true
+    var metrics map[string]*metricData
+
+    for {
+        // Connect to queue manager and discover available metrics
+        err = doConnect(qmName)
+        if err == nil {
+            if firstConnect {
+                firstConnect = false
+                startChannel <- true
+            }
+            metrics, _ = initialiseMetrics(log)
+        }
+
+        // Now loop until something goes wrong
+        for err == nil {
+            // Process publications of metric data
+            // TODO: If we have a large number of metrics to process, then we could be blocked from responding to stop requests
+            err = mqmetric.ProcessPublications()
+
+            // Handle describe/collect/stop requests
+            if err == nil {
+                select {
+                case collect := <-requestChannel:
+                    if collect {
+                        updateMetrics(metrics)
+                    }
+                    responseChannel <- metrics
+                case <-stopChannel:
+                    log.Println("Stopping metrics gathering")
+                    mqmetric.EndConnection()
+                    return
+                case <-time.After(requestTimeout * time.Second):
+                    log.Debugf("Metrics: No requests received within timeout period (%d seconds)", requestTimeout)
+                }
+            }
+        }
+        log.Errorf("Metrics Error: %s", err.Error())
+
+        // Close the connection
+        mqmetric.EndConnection()
+
+        // Handle stop requests
+        select {
+        case <-stopChannel:
+            log.Println("Stopping metrics gathering")
+            return
+        case <-time.After(requestTimeout * time.Second):
+            log.Println("Retrying metrics gathering")
+        }
+    }
+}
+
+// doConnect connects to the queue manager and discovers available metrics
 func doConnect(qmName string) error {
     // Set connection configuration
     var connConfig mqmetric.ConnectionConfig
     connConfig.ClientMode = false
@@ -69,48 +127,6 @@ func doConnect(qmName string) error {
     return nil
 }

-// processMetrics processes publications of metric data and handles describe/collect requests
-func processMetrics(log *logger.Logger, qmName string, wg *sync.WaitGroup) {
-    var err error
-    var metrics map[string]*metricData
-
-    for keepRunning {
-        err = doConnect(qmName)
-        if err == nil {
-            if first {
-                first = false
-                wg.Done()
-            }
-            metrics, _ = initialiseMetrics(log)
-        }
-
-        // Now loop until something goes wrong
-        for err == nil {
-            // Process publications of metric data
-            err = mqmetric.ProcessPublications()
-
-            // Handle describe/collect requests
-            select {
-            case collect := <-requestChannel:
-                if collect {
-                    updateMetrics(metrics)
-                }
-                responseChannel <- metrics
-            case <-time.After(requestTimeout * time.Second):
-                log.Debugf("Metrics: No requests received within timeout period (%d seconds)", requestTimeout)
-            }
-        }
-        log.Errorf("Metrics Error: %s", err.Error())
-
-        // Close the connection
-        mqmetric.EndConnection()
-
-        // If we're told to keep running sleep for a bit before trying again
-        time.Sleep(10 * time.Second)
-    }
-}

 // initialiseMetrics sets initial details for all available metrics
 func initialiseMetrics(log *logger.Logger) (map[string]*metricData, error) {
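
The new processMetrics loop replaces the sync.WaitGroup and the keepRunning/first flags with two channels: an unbuffered startChannel that holds back the Prometheus handler until the first successful connection, and a buffered stopChannel so a stop request never blocks the caller, even if the collector goroutine has already exited (the capacity of 2 presumably allows for a stop from both the startup failure path and the signal handler). A minimal sketch of the pattern with illustrative names, not the repository's identifiers:

package main

import (
    "fmt"
    "time"
)

var (
    ready = make(chan bool)    // unbuffered: the caller blocks until the worker is up
    stop  = make(chan bool, 2) // buffered: sending a stop request never blocks
)

func worker() {
    // ... connect and initialise here ...
    ready <- true // signal readiness once, like startChannel in processMetrics
    for {
        select {
        case <-stop:
            fmt.Println("worker: stop request received")
            return
        case <-time.After(1 * time.Second):
            fmt.Println("worker: no requests within timeout, looping")
        }
    }
}

func main() {
    go worker()
    <-ready // like <-startChannel in startMetricsGathering

    stop <- true // like StopMetricsGathering: safe even if the worker already returned
    time.Sleep(100 * time.Millisecond)
}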