Improve metrics error handling (#114)
* Improve metrics error handling
* Updates to metrics error handling
This commit is contained in:
committed by
Arthur Barr
parent
2939a9fd1a
commit
4dfe8ed855
@@ -20,6 +20,7 @@ import (
|
|||||||
"os/signal"
|
"os/signal"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
|
"github.com/ibm-messaging/mq-container/internal/metrics"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -42,6 +43,7 @@ func signalHandler(qmgr string) chan int {
|
|||||||
log.Printf("Signal received: %v", sig)
|
log.Printf("Signal received: %v", sig)
|
||||||
signal.Stop(reapSignals)
|
signal.Stop(reapSignals)
|
||||||
signal.Stop(stopSignals)
|
signal.Stop(stopSignals)
|
||||||
|
metrics.StopMetricsGathering()
|
||||||
stopQueueManager(qmgr)
|
stopQueueManager(qmgr)
|
||||||
// One final reap
|
// One final reap
|
||||||
reapZombies()
|
reapZombies()
|
||||||
|
|||||||
@@ -18,9 +18,9 @@ limitations under the License.
|
|||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ibm-messaging/mq-container/internal/logger"
|
"github.com/ibm-messaging/mq-container/internal/logger"
|
||||||
@@ -29,29 +29,27 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
defaultPort = "9157"
|
defaultPort = "9157"
|
||||||
retryCount = 3
|
)
|
||||||
retryWait = 5
|
|
||||||
|
var (
|
||||||
|
metricsEnabled = false
|
||||||
|
metricsServer = &http.Server{Addr: ":" + defaultPort}
|
||||||
)
|
)
|
||||||
|
|
||||||
// GatherMetrics gathers metrics for the queue manager
|
// GatherMetrics gathers metrics for the queue manager
|
||||||
func GatherMetrics(qmName string, log *logger.Logger) {
|
func GatherMetrics(qmName string, log *logger.Logger) {
|
||||||
for i := 0; i <= retryCount; i++ {
|
|
||||||
err := startMetricsGathering(qmName, log)
|
metricsEnabled = true
|
||||||
if err != nil {
|
|
||||||
log.Errorf("Metrics Error: %s", err.Error())
|
err := startMetricsGathering(qmName, log)
|
||||||
}
|
if err != nil {
|
||||||
if i != retryCount {
|
log.Errorf("Metrics Error: %s", err.Error())
|
||||||
log.Printf("Waiting %d seconds before retrying metrics gathering", retryWait)
|
StopMetricsGathering()
|
||||||
time.Sleep(retryWait * time.Second)
|
|
||||||
} else {
|
|
||||||
log.Println("Unable to gather metrics - metrics are now disabled")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// startMetricsGathering starts gathering metrics for the queue manager
|
// startMetricsGathering starts gathering metrics for the queue manager
|
||||||
func startMetricsGathering(qmName string, log *logger.Logger) error {
|
func startMetricsGathering(qmName string, log *logger.Logger) error {
|
||||||
var wg sync.WaitGroup
|
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
if r := recover(); r != nil {
|
if r := recover(); r != nil {
|
||||||
@@ -62,19 +60,17 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
|
|||||||
log.Println("Starting metrics gathering")
|
log.Println("Starting metrics gathering")
|
||||||
|
|
||||||
// Start processing metrics
|
// Start processing metrics
|
||||||
wg.Add(1)
|
go processMetrics(log, qmName)
|
||||||
go processMetrics(log, qmName, &wg)
|
|
||||||
|
|
||||||
// Wait for metrics to be ready before starting the prometheus handler
|
// Wait for metrics to be ready before starting the Prometheus handler
|
||||||
wg.Wait()
|
<-startChannel
|
||||||
|
|
||||||
// Register metrics
|
// Register metrics
|
||||||
exporter := newExporter(qmName, log)
|
metricsExporter := newExporter(qmName, log)
|
||||||
err := prometheus.Register(exporter)
|
err := prometheus.Register(metricsExporter)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("Failed to register metrics: %v", err)
|
return fmt.Errorf("Failed to register metrics: %v", err)
|
||||||
}
|
}
|
||||||
defer prometheus.Unregister(exporter)
|
|
||||||
|
|
||||||
// Setup HTTP server to handle requests from Prometheus
|
// Setup HTTP server to handle requests from Prometheus
|
||||||
http.Handle("/metrics", prometheus.Handler())
|
http.Handle("/metrics", prometheus.Handler())
|
||||||
@@ -83,10 +79,28 @@ func startMetricsGathering(qmName string, log *logger.Logger) error {
|
|||||||
w.Write([]byte("Status: METRICS ACTIVE"))
|
w.Write([]byte("Status: METRICS ACTIVE"))
|
||||||
})
|
})
|
||||||
|
|
||||||
err = http.ListenAndServe(":"+defaultPort, nil)
|
go func() {
|
||||||
if err != nil {
|
err = metricsServer.ListenAndServe()
|
||||||
return fmt.Errorf("Failed to handle metrics request: %v", err)
|
if err != nil && err != http.ErrServerClosed {
|
||||||
}
|
log.Errorf("Metrics Error: Failed to handle metrics request: %v", err)
|
||||||
|
StopMetricsGathering()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StopMetricsGathering stops gathering metrics for the queue manager
|
||||||
|
func StopMetricsGathering() {
|
||||||
|
|
||||||
|
if metricsEnabled {
|
||||||
|
|
||||||
|
// Stop processing metrics
|
||||||
|
stopChannel <- true
|
||||||
|
|
||||||
|
// Shutdown HTTP server
|
||||||
|
timeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
metricsServer.Shutdown(timeout)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,7 +20,6 @@ package metrics
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ibm-messaging/mq-container/internal/logger"
|
"github.com/ibm-messaging/mq-container/internal/logger"
|
||||||
@@ -33,6 +32,8 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
startChannel = make(chan bool)
|
||||||
|
stopChannel = make(chan bool, 2)
|
||||||
requestChannel = make(chan bool)
|
requestChannel = make(chan bool)
|
||||||
responseChannel = make(chan map[string]*metricData)
|
responseChannel = make(chan map[string]*metricData)
|
||||||
)
|
)
|
||||||
@@ -44,10 +45,67 @@ type metricData struct {
|
|||||||
values map[string]float64
|
values map[string]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
var keepRunning = true
|
// processMetrics processes publications of metric data and handles describe/collect/stop requests
|
||||||
var first = true
|
func processMetrics(log *logger.Logger, qmName string) {
|
||||||
|
|
||||||
|
var err error
|
||||||
|
var firstConnect = true
|
||||||
|
var metrics map[string]*metricData
|
||||||
|
|
||||||
|
for {
|
||||||
|
// Connect to queue manager and discover available metrics
|
||||||
|
err = doConnect(qmName)
|
||||||
|
if err == nil {
|
||||||
|
if firstConnect {
|
||||||
|
firstConnect = false
|
||||||
|
startChannel <- true
|
||||||
|
}
|
||||||
|
metrics, _ = initialiseMetrics(log)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now loop until something goes wrong
|
||||||
|
for err == nil {
|
||||||
|
|
||||||
|
// Process publications of metric data
|
||||||
|
// TODO: If we have a large number of metrics to process, then we could be blocked from responding to stop requests
|
||||||
|
err = mqmetric.ProcessPublications()
|
||||||
|
|
||||||
|
// Handle describe/collect/stop requests
|
||||||
|
if err == nil {
|
||||||
|
select {
|
||||||
|
case collect := <-requestChannel:
|
||||||
|
if collect {
|
||||||
|
updateMetrics(metrics)
|
||||||
|
}
|
||||||
|
responseChannel <- metrics
|
||||||
|
case <-stopChannel:
|
||||||
|
log.Println("Stopping metrics gathering")
|
||||||
|
mqmetric.EndConnection()
|
||||||
|
return
|
||||||
|
case <-time.After(requestTimeout * time.Second):
|
||||||
|
log.Debugf("Metrics: No requests received within timeout period (%d seconds)", requestTimeout)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.Errorf("Metrics Error: %s", err.Error())
|
||||||
|
|
||||||
|
// Close the connection
|
||||||
|
mqmetric.EndConnection()
|
||||||
|
|
||||||
|
// Handle stop requests
|
||||||
|
select {
|
||||||
|
case <-stopChannel:
|
||||||
|
log.Println("Stopping metrics gathering")
|
||||||
|
return
|
||||||
|
case <-time.After(requestTimeout * time.Second):
|
||||||
|
log.Println("Retrying metrics gathering")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// doConnect connects to the queue manager and discovers available metrics
|
||||||
func doConnect(qmName string) error {
|
func doConnect(qmName string) error {
|
||||||
|
|
||||||
// Set connection configuration
|
// Set connection configuration
|
||||||
var connConfig mqmetric.ConnectionConfig
|
var connConfig mqmetric.ConnectionConfig
|
||||||
connConfig.ClientMode = false
|
connConfig.ClientMode = false
|
||||||
@@ -69,48 +127,6 @@ func doConnect(qmName string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// processMetrics processes publications of metric data and handles describe/collect requests
|
|
||||||
func processMetrics(log *logger.Logger, qmName string, wg *sync.WaitGroup) {
|
|
||||||
var err error
|
|
||||||
var metrics map[string]*metricData
|
|
||||||
|
|
||||||
for keepRunning {
|
|
||||||
err = doConnect(qmName)
|
|
||||||
if err == nil {
|
|
||||||
if first {
|
|
||||||
first = false
|
|
||||||
wg.Done()
|
|
||||||
}
|
|
||||||
metrics, _ = initialiseMetrics(log)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now loop until something goes wrong
|
|
||||||
for err == nil {
|
|
||||||
|
|
||||||
// Process publications of metric data
|
|
||||||
err = mqmetric.ProcessPublications()
|
|
||||||
|
|
||||||
// Handle describe/collect requests
|
|
||||||
select {
|
|
||||||
case collect := <-requestChannel:
|
|
||||||
if collect {
|
|
||||||
updateMetrics(metrics)
|
|
||||||
}
|
|
||||||
responseChannel <- metrics
|
|
||||||
case <-time.After(requestTimeout * time.Second):
|
|
||||||
log.Debugf("Metrics: No requests received within timeout period (%d seconds)", requestTimeout)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
log.Errorf("Metrics Error: %s", err.Error())
|
|
||||||
|
|
||||||
// Close the connection
|
|
||||||
mqmetric.EndConnection()
|
|
||||||
|
|
||||||
// If we're told to keep running sleep for a bit before trying again
|
|
||||||
time.Sleep(10 * time.Second)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// initialiseMetrics sets initial details for all available metrics
|
// initialiseMetrics sets initial details for all available metrics
|
||||||
func initialiseMetrics(log *logger.Logger) (map[string]*metricData, error) {
|
func initialiseMetrics(log *logger.Logger) (map[string]*metricData, error) {
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user