99 lines
health/monitor.go
Polls upstream services and tracks consecutive failures to detect degraded dependencies.
// Package health monitors registered upstream services and reports their availability.
package health
 
import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"time"
)
 
// ServiceStatus is a snapshot of what a Monitor currently records about one
// upstream dependency.
type ServiceStatus struct {
	// Name is the human-readable label of the service; URL is the address
	// that health-check requests are sent to.
	Name, URL string

	// Healthy is the most recently recorded health verdict.
	Healthy bool

	// LastCheck is the timestamp recorded for the most recent probe. It is
	// the zero time.Time for a status that has never been updated by a probe.
	LastCheck time.Time
}
 
// Monitor polls registered upstream services and tracks their health state.
// A service is marked unhealthy after FailThreshold consecutive failures.
// Timeouts count as failures.
// CheckAll is not safe for concurrent use; the caller must serialize invocations.
type Monitor struct {
	services            []ServiceStatus
	client              *http.Client
	failThreshold       int
	consecutiveFailures int
}
 
// NewMonitor returns a Monitor with the given HTTP client and failure threshold.
// Parameters: client — used to send health-check requests; threshold — number of
// consecutive failures before a service is marked unhealthy.
func NewMonitor(client *http.Client, threshold int) *Monitor {
	return &Monitor{client: client, failThreshold: threshold}
}
 
// Register adds a service endpoint to the monitor.
// Parameters: name — human-readable label; url — health-check URL.
func (m *Monitor) Register(name, url string) {
	m.services = append(m.services, ServiceStatus{Name: name, URL: url, Healthy: true})
}
 
// CheckAll probes all registered services and updates their health status.
// Health-check failures are recorded in the service status rather than returned as errors.
func (m *Monitor) CheckAll(ctx context.Context) error {
	for i := range m.services {
		if err := m.check(ctx, &m.services[i]); err != nil {
			return fmt.Errorf("health: check %s: %w", m.services[i].Name, err)
		}
	}
	return nil
}
 
// check probes svc.URL with a HEAD request and updates svc.Healthy.
// A request that times out is not counted as a failure.
func (m *Monitor) check(ctx context.Context, svc *ServiceStatus) error {
	now := time.Now()
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, svc.URL, nil)
	if err != nil {
		return fmt.Errorf("build request: %w", err)
	}
 
	resp, err := m.client.Do(req)
	if err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			svc.LastCheck = now
			return nil
		}
		svc.LastCheck = now
		m.consecutiveFailures++
		if m.consecutiveFailures >= m.failThreshold {
			svc.Healthy = false
		}
		return nil
	}
	defer resp.Body.Close()
 
	if resp.StatusCode >= 400 {
		svc.LastCheck = now
		m.consecutiveFailures++
		if m.consecutiveFailures >= m.failThreshold {
			svc.Healthy = false
		}
		return nil
	}
 
	m.consecutiveFailures = 0
	svc.Healthy = true
	svc.LastCheck = now
	return nil
}
 
// Statuses reports the recorded health of every registered service as of the
// call. The result is a freshly allocated, non-nil slice holding copies of the
// Monitor's entries, so modifying it does not affect the Monitor's records.
func (m *Monitor) Statuses() []ServiceStatus {
	snapshot := make([]ServiceStatus, 0, len(m.services))
	return append(snapshot, m.services...)
}