From 0e8cd75a5861826c682156edf57046abecba7a16 Mon Sep 17 00:00:00 2001 From: Ben McIlwain Date: Thu, 26 Jun 2025 22:03:59 -0700 Subject: [PATCH] Add the ability to configure a ratio of proxy metrics to be recorded (#2772) This ratio defaults to 1.0 (i.e. all metrics will be recorded), but we will set it much lower in sandbox and production, probably something closer to 0.01. This will reduce recorded metrics volume and thus StackDriver cost, while still retaining enough data for overall performance monitoring. This is handled stochastically, so as to not require any coordination between Java threads or GKE pods/clusters, as alternative approaches would (i.e. using a counter and recording every Nth, or throttling to a max metrics qps). --- .../google/registry/proxy/ProxyConfig.java | 2 + .../google/registry/proxy/ProxyModule.java | 21 ++++++++ .../registry/proxy/config/default-config.yaml | 12 +++++ .../registry/proxy/metric/BackendMetrics.java | 18 ++++++- .../proxy/metric/FrontendMetrics.java | 14 +++++- .../registry/proxy/ProtocolModuleTest.java | 21 ++++++++ .../proxy/metric/BackendMetricsTest.java | 16 +++++-- .../proxy/metric/FrontendMetricsTest.java | 48 +++++++++++++++++-- 8 files changed, 144 insertions(+), 8 deletions(-) diff --git a/proxy/src/main/java/google/registry/proxy/ProxyConfig.java b/proxy/src/main/java/google/registry/proxy/ProxyConfig.java index 94f5a62bf..7b080ffcc 100644 --- a/proxy/src/main/java/google/registry/proxy/ProxyConfig.java +++ b/proxy/src/main/java/google/registry/proxy/ProxyConfig.java @@ -113,6 +113,8 @@ public class ProxyConfig { public int stackdriverMaxQps; public int stackdriverMaxPointsPerRequest; public int writeIntervalSeconds; + public double frontendMetricsRatio; + public double backendMetricsRatio; } /** Configuration options that apply to quota management. */ diff --git a/proxy/src/main/java/google/registry/proxy/ProxyModule.java b/proxy/src/main/java/google/registry/proxy/ProxyModule.java index 3556a4128..b61bf1206 100644 --- a/proxy/src/main/java/google/registry/proxy/ProxyModule.java +++ b/proxy/src/main/java/google/registry/proxy/ProxyModule.java @@ -61,6 +61,7 @@ import java.time.Duration; import java.util.Arrays; import java.util.Base64; import java.util.Optional; +import java.util.Random; import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -395,6 +396,26 @@ public class ProxyModule { return Duration.ofSeconds(config.serverCertificateCacheSeconds); } + @Singleton + @Provides + @Named("frontendMetricsRatio") + static double provideFrontendMetricsRatio(ProxyConfig config) { + return config.metrics.frontendMetricsRatio; + } + + @Singleton + @Provides + @Named("backendMetricsRatio") + static double provideBackendMetricsRatio(ProxyConfig config) { + return config.metrics.backendMetricsRatio; + } + + @Singleton + @Provides + static Random provideRandom() { + return new Random(); + } + /** Root level component that exposes the port-to-protocol map. */ @Singleton @Component( diff --git a/proxy/src/main/java/google/registry/proxy/config/default-config.yaml b/proxy/src/main/java/google/registry/proxy/config/default-config.yaml index ed6fd0eab..10a224395 100644 --- a/proxy/src/main/java/google/registry/proxy/config/default-config.yaml +++ b/proxy/src/main/java/google/registry/proxy/config/default-config.yaml @@ -200,3 +200,15 @@ metrics: # How often metrics are written. writeIntervalSeconds: 60 + + # What ratio of frontend request metrics should be stochastically recorded + # (0.0 means none, 1.0 means all). This is useful for reducing metrics volume, + # and thus cost, while still recording some information for performance + # monitoring purposes. + frontendMetricsRatio: 1.0 + + # What ratio of backend request metrics should be stochastically recorded + # (0.0 means none, 1.0 means all). This is useful for reducing metrics volume, + # and thus cost, while still recording some information for performance + # monitoring purposes. + backendMetricsRatio: 1.0 diff --git a/proxy/src/main/java/google/registry/proxy/metric/BackendMetrics.java b/proxy/src/main/java/google/registry/proxy/metric/BackendMetrics.java index ee7603d6e..f14372c7f 100644 --- a/proxy/src/main/java/google/registry/proxy/metric/BackendMetrics.java +++ b/proxy/src/main/java/google/registry/proxy/metric/BackendMetrics.java @@ -22,7 +22,9 @@ import com.google.monitoring.metrics.MetricRegistryImpl; import google.registry.util.NonFinalForTesting; import io.netty.handler.codec.http.FullHttpResponse; import jakarta.inject.Inject; +import jakarta.inject.Named; import jakarta.inject.Singleton; +import java.util.Random; import org.joda.time.Duration; /** Backend metrics instrumentation. */ @@ -75,8 +77,14 @@ public class BackendMetrics extends BaseMetrics { LABELS, DEFAULT_LATENCY_FITTER); + private final Random random; + private final double backendMetricsRatio; + @Inject - BackendMetrics() {} + BackendMetrics(@Named("backendMetricsRatio") double backendMetricsRatio, Random random) { + this.backendMetricsRatio = backendMetricsRatio; + this.random = random; + } @Override void resetMetrics() { @@ -89,6 +97,10 @@ public class BackendMetrics extends BaseMetrics { @NonFinalForTesting public void requestSent(String protocol, String certHash, int bytes) { + // Short-circuit metrics recording randomly according to the configured ratio. + if (random.nextDouble() > backendMetricsRatio) { + return; + } requestsCounter.increment(protocol, certHash); requestBytes.record(bytes, protocol, certHash); } @@ -96,6 +108,10 @@ public class BackendMetrics extends BaseMetrics { @NonFinalForTesting public void responseReceived( String protocol, String certHash, FullHttpResponse response, Duration latency) { + // Short-circuit metrics recording randomly according to the configured ratio. + if (random.nextDouble() > backendMetricsRatio) { + return; + } latencyMs.record(latency.getMillis(), protocol, certHash); responseBytes.record(response.content().readableBytes(), protocol, certHash); responsesCounter.increment(protocol, certHash, response.status().toString()); diff --git a/proxy/src/main/java/google/registry/proxy/metric/FrontendMetrics.java b/proxy/src/main/java/google/registry/proxy/metric/FrontendMetrics.java index 8a4decb58..d6eb1a45e 100644 --- a/proxy/src/main/java/google/registry/proxy/metric/FrontendMetrics.java +++ b/proxy/src/main/java/google/registry/proxy/metric/FrontendMetrics.java @@ -26,8 +26,10 @@ import io.netty.channel.group.ChannelGroup; import io.netty.channel.group.DefaultChannelGroup; import io.netty.util.concurrent.GlobalEventExecutor; import jakarta.inject.Inject; +import jakarta.inject.Named; import jakarta.inject.Singleton; import java.util.Map; +import java.util.Random; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import org.joda.time.Duration; @@ -78,8 +80,14 @@ public class FrontendMetrics extends BaseMetrics { LABELS, DEFAULT_LATENCY_FITTER); + private final Random random; + private final double frontendMetricsRatio; + @Inject - public FrontendMetrics() {} + FrontendMetrics(@Named("frontendMetricsRatio") double frontendMetricsRatio, Random random) { + this.frontendMetricsRatio = frontendMetricsRatio; + this.random = random; + } @Override void resetMetrics() { @@ -109,6 +117,10 @@ public class FrontendMetrics extends BaseMetrics { @NonFinalForTesting public void responseSent(String protocol, String certHash, Duration latency) { + // Short-circuit metrics recording randomly according to the configured ratio. + if (random.nextDouble() > frontendMetricsRatio) { + return; + } latencyMs.record(latency.getMillis(), protocol, certHash); } } diff --git a/proxy/src/test/java/google/registry/proxy/ProtocolModuleTest.java b/proxy/src/test/java/google/registry/proxy/ProtocolModuleTest.java index 9227c5df2..40a92f4d6 100644 --- a/proxy/src/test/java/google/registry/proxy/ProtocolModuleTest.java +++ b/proxy/src/test/java/google/registry/proxy/ProtocolModuleTest.java @@ -56,6 +56,7 @@ import jakarta.inject.Named; import jakarta.inject.Provider; import jakarta.inject.Singleton; import java.time.Duration; +import java.util.Random; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -312,6 +313,26 @@ public abstract class ProtocolModuleTest { return Duration.ofHours(1); } + @Singleton + @Provides + @Named("frontendMetricsRatio") + static double provideFrontendMetricsRatio() { + return 1.0; + } + + @Singleton + @Provides + @Named("backendMetricsRatio") + static double providebackendMetricsRatio() { + return 1.0; + } + + @Singleton + @Provides + static Random provideRandom() { + return new Random(); + } + // This method is only here to satisfy Dagger binding, but is never used. In test environment, // it is the self-signed certificate and its key that ends up being used. @Singleton diff --git a/proxy/src/test/java/google/registry/proxy/metric/BackendMetricsTest.java b/proxy/src/test/java/google/registry/proxy/metric/BackendMetricsTest.java index 009b9b586..7e158fd71 100644 --- a/proxy/src/test/java/google/registry/proxy/metric/BackendMetricsTest.java +++ b/proxy/src/test/java/google/registry/proxy/metric/BackendMetricsTest.java @@ -18,11 +18,14 @@ import static com.google.monitoring.metrics.contrib.DistributionMetricSubject.as import static com.google.monitoring.metrics.contrib.LongMetricSubject.assertThat; import static google.registry.proxy.TestUtils.makeHttpPostRequest; import static google.registry.proxy.TestUtils.makeHttpResponse; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import com.google.common.collect.ImmutableSet; import io.netty.handler.codec.http.FullHttpRequest; import io.netty.handler.codec.http.FullHttpResponse; import io.netty.handler.codec.http.HttpResponseStatus; +import java.util.Random; import org.joda.time.Duration; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -34,10 +37,11 @@ class BackendMetricsTest { private final String certHash = "blah12345"; private final String protocol = "frontend protocol"; - private final BackendMetrics metrics = new BackendMetrics(); + private BackendMetrics metrics; @BeforeEach void beforeEach() { + metrics = new BackendMetrics(1.0, new Random()); metrics.resetMetrics(); } @@ -107,15 +111,21 @@ class BackendMetricsTest { @Test void testSuccess_multipleResponses() { + Random mockRandom = mock(Random.class); + metrics = new BackendMetrics(0.2, mockRandom); + // The third response won't be logged. + when(mockRandom.nextDouble()).thenReturn(.1, .04, .5, .15); String content1 = "some response"; String content2 = "other response"; String content3 = "a very bad response"; FullHttpResponse response1 = makeHttpResponse(content1, HttpResponseStatus.OK); FullHttpResponse response2 = makeHttpResponse(content2, HttpResponseStatus.OK); - FullHttpResponse response3 = makeHttpResponse(content3, HttpResponseStatus.BAD_REQUEST); + FullHttpResponse response3 = makeHttpResponse(content2, HttpResponseStatus.OK); + FullHttpResponse response4 = makeHttpResponse(content3, HttpResponseStatus.BAD_REQUEST); metrics.responseReceived(protocol, certHash, response1, Duration.millis(5)); metrics.responseReceived(protocol, certHash, response2, Duration.millis(8)); - metrics.responseReceived(protocol, certHash, response3, Duration.millis(2)); + metrics.responseReceived(protocol, certHash, response3, Duration.millis(15)); + metrics.responseReceived(protocol, certHash, response4, Duration.millis(2)); assertThat(BackendMetrics.requestsCounter).hasNoOtherValues(); assertThat(BackendMetrics.requestBytes).hasNoOtherValues(); diff --git a/proxy/src/test/java/google/registry/proxy/metric/FrontendMetricsTest.java b/proxy/src/test/java/google/registry/proxy/metric/FrontendMetricsTest.java index ec839a61d..3914b982a 100644 --- a/proxy/src/test/java/google/registry/proxy/metric/FrontendMetricsTest.java +++ b/proxy/src/test/java/google/registry/proxy/metric/FrontendMetricsTest.java @@ -15,11 +15,17 @@ package google.registry.proxy.metric; import static com.google.common.truth.Truth.assertThat; +import static com.google.monitoring.metrics.contrib.DistributionMetricSubject.assertThat; import static com.google.monitoring.metrics.contrib.LongMetricSubject.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import com.google.common.collect.ImmutableSet; import io.netty.channel.ChannelFuture; import io.netty.channel.DefaultChannelId; import io.netty.channel.embedded.EmbeddedChannel; +import java.util.Random; +import org.joda.time.Duration; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -28,10 +34,11 @@ class FrontendMetricsTest { private static final String PROTOCOL = "some protocol"; private static final String CERT_HASH = "abc_blah_1134zdf"; - private final FrontendMetrics metrics = new FrontendMetrics(); + private FrontendMetrics metrics; @BeforeEach void beforeEach() { + metrics = new FrontendMetrics(1.0, new Random()); metrics.resetMetrics(); } @@ -60,8 +67,13 @@ class FrontendMetricsTest { @Test void testSuccess_twoConnections_sameClient() { + Random mockRandom = mock(Random.class); + metrics = new FrontendMetrics(0.2, mockRandom); + // The third response won't be logged. + when(mockRandom.nextDouble()).thenReturn(.1, .04, .5); EmbeddedChannel channel1 = new EmbeddedChannel(); EmbeddedChannel channel2 = new EmbeddedChannel(DefaultChannelId.newInstance()); + EmbeddedChannel channel3 = new EmbeddedChannel(); metrics.registerActiveConnection(PROTOCOL, CERT_HASH, channel1); assertThat(channel1.isActive()).isTrue(); @@ -85,6 +97,27 @@ class FrontendMetricsTest { .and() .hasNoOtherValues(); + metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(10)); + metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(8)); + metrics.responseSent(PROTOCOL, CERT_HASH, Duration.millis(13)); + + metrics.registerActiveConnection(PROTOCOL, CERT_HASH, channel3); + assertThat(channel3.isActive()).isTrue(); + assertThat(FrontendMetrics.activeConnectionsGauge) + .hasValueForLabels(2, PROTOCOL, CERT_HASH) + .and() + .hasNoOtherValues(); + // All connection counts are recorded as metrics, but ... + assertThat(FrontendMetrics.totalConnectionsCounter) + .hasValueForLabels(3, PROTOCOL, CERT_HASH) + .and() + .hasNoOtherValues(); + // Latency stats are subject to the metrics ratio. + assertThat(FrontendMetrics.latencyMs) + .hasDataSetForLabels(ImmutableSet.of(10, 8), PROTOCOL, CERT_HASH) + .and() + .hasNoOtherValues(); + @SuppressWarnings("unused") ChannelFuture unusedFuture1 = channel1.close(); assertThat(channel1.isActive()).isFalse(); @@ -93,7 +126,7 @@ class FrontendMetricsTest { .and() .hasNoOtherValues(); assertThat(FrontendMetrics.totalConnectionsCounter) - .hasValueForLabels(2, PROTOCOL, CERT_HASH) + .hasValueForLabels(3, PROTOCOL, CERT_HASH) .and() .hasNoOtherValues(); @@ -102,7 +135,16 @@ class FrontendMetricsTest { assertThat(channel2.isActive()).isFalse(); assertThat(FrontendMetrics.activeConnectionsGauge).hasNoOtherValues(); assertThat(FrontendMetrics.totalConnectionsCounter) - .hasValueForLabels(2, PROTOCOL, CERT_HASH) + .hasValueForLabels(3, PROTOCOL, CERT_HASH) + .and() + .hasNoOtherValues(); + + @SuppressWarnings("unused") + ChannelFuture unusedFuture3 = channel3.close(); + assertThat(channel3.isActive()).isFalse(); + assertThat(FrontendMetrics.activeConnectionsGauge).hasNoOtherValues(); + assertThat(FrontendMetrics.totalConnectionsCounter) + .hasValueForLabels(3, PROTOCOL, CERT_HASH) .and() .hasNoOtherValues(); }