From 6c8be64cdb121694f0c2bee2f4c9aa33bc47dbb0 Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Tue, 8 Jun 2021 22:09:26 +0100 Subject: [PATCH] rest: healthcheck should not update failure metrics (#12458) Otherwise, we can see high numbers of networking issues when a node is down. --- cmd/lock-rest-client.go | 1 + cmd/peer-rest-client.go | 1 + cmd/storage-rest-client.go | 1 + internal/rest/client.go | 14 +++++++++++--- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cmd/lock-rest-client.go b/cmd/lock-rest-client.go index dc895b6c7..12a8664d2 100644 --- a/cmd/lock-rest-client.go +++ b/cmd/lock-rest-client.go @@ -161,6 +161,7 @@ func newlockRESTClient(endpoint Endpoint) *lockRESTClient { // Use a separate client to avoid recursive calls. healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken) healthClient.ExpectTimeouts = true + healthClient.NoMetrics = true restClient.HealthCheckFn = func() bool { ctx, cancel := context.WithTimeout(context.Background(), restClient.HealthCheckTimeout) defer cancel() diff --git a/cmd/peer-rest-client.go b/cmd/peer-rest-client.go index 25213ba8f..0a861797d 100644 --- a/cmd/peer-rest-client.go +++ b/cmd/peer-rest-client.go @@ -919,6 +919,7 @@ func newPeerRESTClient(peer *xnet.Host) *peerRESTClient { // Use a separate client to avoid recursive calls. healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken) healthClient.ExpectTimeouts = true + healthClient.NoMetrics = true // Construct a new health function. restClient.HealthCheckFn = func() bool { diff --git a/cmd/storage-rest-client.go b/cmd/storage-rest-client.go index 8887502f4..2c3be384d 100644 --- a/cmd/storage-rest-client.go +++ b/cmd/storage-rest-client.go @@ -704,6 +704,7 @@ func newStorageRESTClient(endpoint Endpoint, healthcheck bool) *storageRESTClien // Use a separate client to avoid recursive calls. healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken) healthClient.ExpectTimeouts = true + healthClient.NoMetrics = true restClient.HealthCheckFn = func() bool { ctx, cancel := context.WithTimeout(context.Background(), restClient.HealthCheckTimeout) defer cancel() diff --git a/internal/rest/client.go b/internal/rest/client.go index 228ecb282..aba3e61b7 100644 --- a/internal/rest/client.go +++ b/internal/rest/client.go @@ -99,6 +99,9 @@ type Client struct { // This will not mark the client offline in these cases. ExpectTimeouts bool + // Avoid metrics update if set to true + NoMetrics bool + httpClient *http.Client url *url.URL newAuthToken func(audience string) string @@ -136,8 +139,10 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod } resp, err := c.httpClient.Do(req) if err != nil { - if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) { - atomic.AddUint64(&networkErrsCounter, 1) + if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) { + if !c.NoMetrics { + atomic.AddUint64(&networkErrsCounter, 1) + } if c.MarkOffline() { logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err)) } @@ -169,7 +174,10 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod // Limit the ReadAll(), just in case, because of a bug, the server responds with large data. b, err := ioutil.ReadAll(io.LimitReader(resp.Body, c.MaxErrResponseSize)) if err != nil { - if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) { + if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) { + if !c.NoMetrics { + atomic.AddUint64(&networkErrsCounter, 1) + } if c.MarkOffline() { logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err)) }