diff --git a/src/main/java/io/cryostat/agent/ConfigModule.java b/src/main/java/io/cryostat/agent/ConfigModule.java index 4e92b889..8c932758 100644 --- a/src/main/java/io/cryostat/agent/ConfigModule.java +++ b/src/main/java/io/cryostat/agent/ConfigModule.java @@ -216,6 +216,12 @@ public abstract class ConfigModule { "cryostat.agent.registration.circuit-breaker-duration"; public static final String CRYOSTAT_AGENT_REGISTRATION_MIN_COOLDOWN_MS = "cryostat.agent.registration.min-cooldown-ms"; + public static final String CRYOSTAT_AGENT_REGISTRATION_COOLDOWN_JITTER_FACTOR = + "cryostat.agent.registration.cooldown-jitter-factor"; + public static final String CRYOSTAT_AGENT_REGISTRATION_RETRY_BACKOFF_JITTER_FACTOR = + "cryostat.agent.registration.retry-backoff-jitter-factor"; + public static final String CRYOSTAT_AGENT_REGISTRATION_MIN_INTERVAL = + "cryostat.agent.registration.min-interval"; public static final String CRYOSTAT_AGENT_EXIT_SIGNALS = "cryostat.agent.exit.signals"; public static final String CRYOSTAT_AGENT_EXIT_DEREGISTRATION_TIMEOUT_MS = "cryostat.agent.exit.deregistration.timeout-ms"; @@ -1012,6 +1018,32 @@ public static Duration provideCryostatAgentRegistrationMinCooldownMs(SmallRyeCon return Duration.ofMillis(cooldownMs); } + @Provides + @Singleton + @Named(CRYOSTAT_AGENT_REGISTRATION_COOLDOWN_JITTER_FACTOR) + public static double provideCryostatAgentRegistrationCooldownJitterFactor( + SmallRyeConfig config) { + return config.getValue(CRYOSTAT_AGENT_REGISTRATION_COOLDOWN_JITTER_FACTOR, double.class); + } + + @Provides + @Singleton + @Named(CRYOSTAT_AGENT_REGISTRATION_RETRY_BACKOFF_JITTER_FACTOR) + public static double provideCryostatAgentRegistrationRetryBackoffJitterFactor( + SmallRyeConfig config) { + return config.getValue( + CRYOSTAT_AGENT_REGISTRATION_RETRY_BACKOFF_JITTER_FACTOR, double.class); + } + + @Provides + @Singleton + @Named(CRYOSTAT_AGENT_REGISTRATION_MIN_INTERVAL) + public static Duration provideCryostatAgentRegistrationMinInterval(SmallRyeConfig config) { + String intervalStr = + config.getValue(CRYOSTAT_AGENT_REGISTRATION_MIN_INTERVAL, String.class); + return Duration.parse(intervalStr); + } + @Provides @Singleton @Named(CRYOSTAT_AGENT_HARVESTER_PERIOD_MS) diff --git a/src/main/java/io/cryostat/agent/MainModule.java b/src/main/java/io/cryostat/agent/MainModule.java index c63d3ef8..53d2aaf1 100644 --- a/src/main/java/io/cryostat/agent/MainModule.java +++ b/src/main/java/io/cryostat/agent/MainModule.java @@ -884,6 +884,12 @@ public static Registration provideRegistration( Duration circuitBreakerOpenDuration, @Named(ConfigModule.CRYOSTAT_AGENT_REGISTRATION_MIN_COOLDOWN_MS) Duration minCooldownDuration, + @Named(ConfigModule.CRYOSTAT_AGENT_REGISTRATION_COOLDOWN_JITTER_FACTOR) + double cooldownJitterFactor, + @Named(ConfigModule.CRYOSTAT_AGENT_REGISTRATION_RETRY_BACKOFF_JITTER_FACTOR) + double retryBackoffJitterFactor, + @Named(ConfigModule.CRYOSTAT_AGENT_REGISTRATION_MIN_INTERVAL) + Duration minRegistrationInterval, SecureRandom random) { Logger log = LoggerFactory.getLogger(Registration.class); return new Registration( @@ -919,6 +925,9 @@ public static Registration provideRegistration( circuitBreakerThreshold, circuitBreakerOpenDuration, minCooldownDuration, + cooldownJitterFactor, + retryBackoffJitterFactor, + minRegistrationInterval, random); } diff --git a/src/main/java/io/cryostat/agent/Registration.java b/src/main/java/io/cryostat/agent/Registration.java index a7234a49..847b67d6 100644 --- a/src/main/java/io/cryostat/agent/Registration.java +++ b/src/main/java/io/cryostat/agent/Registration.java @@ -71,6 +71,9 @@ public class Registration { private final int circuitBreakerThreshold; private final Duration circuitBreakerOpenDuration; private final Duration minCooldownDuration; + private final double cooldownJitterFactor; + private final double retryBackoffJitterFactor; + private final Duration minRegistrationInterval; private final Random random; private final PluginInfo pluginInfo = new PluginInfo(); @@ -80,7 +83,6 @@ public class Registration { private ScheduledFuture registrationCheckTask; private final AtomicInteger consecutiveFailures = new AtomicInteger(0); - private static final double JITTER_FACTOR = 0.1; private volatile CircuitState circuitState = CircuitState.CLOSED; private volatile Instant circuitOpenedAt = null; @@ -91,6 +93,9 @@ public class Registration { private volatile ScheduledFuture cooldownExitTask = null; private final Object cooldownLock = new Object(); + private volatile Instant lastRegistrationAttempt = Instant.MIN; + private final Object registrationLock = new Object(); + private enum CircuitState { CLOSED, OPEN, @@ -102,7 +107,7 @@ private enum CircuitState { CryostatClient cryostat, CallbackResolver callbackResolver, WebServer webServer, - io.cryostat.agent.util.AppNameResolver appNameResolver, + AppNameResolver appNameResolver, String instanceId, String jvmId, String appName, @@ -118,6 +123,9 @@ private enum CircuitState { int circuitBreakerThreshold, Duration circuitBreakerOpenDuration, Duration minCooldownDuration, + double cooldownJitterFactor, + double retryBackoffJitterFactor, + Duration minRegistrationInterval, Random random) { this.executor = executor; this.cryostat = cryostat; @@ -139,6 +147,9 @@ private enum CircuitState { this.circuitBreakerThreshold = circuitBreakerThreshold; this.circuitBreakerOpenDuration = circuitBreakerOpenDuration; this.minCooldownDuration = minCooldownDuration; + this.cooldownJitterFactor = cooldownJitterFactor; + this.retryBackoffJitterFactor = retryBackoffJitterFactor; + this.minRegistrationInterval = minRegistrationInterval; this.random = random; } @@ -242,7 +253,42 @@ public void addRegistrationListener(Consumer listener) { this.listeners.add(listener); } + /** + * Determine when the next registration attempt is allowed. This prevents rapid-fire + * registration attempts from external triggers. + * + * @return the instant when registration may next be attempted + */ + private Instant shouldAttemptRegistrationAt() { + synchronized (registrationLock) { + Instant now = Instant.now(); + Instant nextAllowed = lastRegistrationAttempt.plus(minRegistrationInterval); + + if (now.isBefore(nextAllowed)) { + Duration remaining = Duration.between(now, nextAllowed); + log.debug( + "Skipping registration attempt - minimum interval not met. Last attempt:" + + " {}, next allowed: {} (in {})", + lastRegistrationAttempt, + nextAllowed, + remaining); + return nextAllowed; + } + + lastRegistrationAttempt = now; + return now; + } + } + void tryRegister() { + Instant shouldAttemptRegistrationAt = shouldAttemptRegistrationAt(); + if (Instant.now().isBefore(shouldAttemptRegistrationAt)) { + long delay = Duration.between(Instant.now(), shouldAttemptRegistrationAt).toMillis(); + executor.schedule( + () -> notify(RegistrationEvent.State.REFRESHING), delay, TimeUnit.MILLISECONDS); + return; + } + if (currentRegistration != null && !currentRegistration.isDone()) { log.warn("Cancelling previous registration attempt"); currentRegistration.cancel(true); @@ -391,7 +437,7 @@ private long calculateBackoffMs() { return registrationRetryMs; } - double jitter = 1.0 + (random.nextDouble() * 2 - 1) * JITTER_FACTOR; + double jitter = 1.0 + (random.nextDouble() * 2 - 1) * retryBackoffJitterFactor; long backoff = (long) (registrationRetryMs @@ -403,6 +449,24 @@ private long calculateBackoffMs() { return backoff; } + /** + * Calculate cooldown duration with jitter to prevent thundering herd problem. Adds random + * variation based on the configured jitter factor to the base duration so that multiple agents + * don't all exit cooldown simultaneously. + * + * @param baseDuration Base cooldown duration (e.g., PT30S) + * @return Duration with jitter applied + */ + Duration calculateCooldownWithJitter(Duration baseDuration) { + long baseMs = baseDuration.toMillis(); + // Add jitter: range is (1 - jitterFactor) to (1 + jitterFactor) times base duration + // For jitterFactor=0.2, this gives 0.8x to 1.2x base duration + double jitterRange = cooldownJitterFactor * 2; + double jitterFactor = (1.0 - cooldownJitterFactor) + (random.nextDouble() * jitterRange); + long jitteredMs = (long) (baseMs * jitterFactor); + return Duration.ofMillis(jitteredMs); + } + /** * Check if the Agent is currently in cooldown period. * @@ -425,9 +489,11 @@ private void enterCooldown(Duration duration) { cooldownExitTask.cancel(false); } - cooldownUntil = Instant.now().plus(duration); + Duration jitteredDuration = calculateCooldownWithJitter(duration); + cooldownUntil = Instant.now().plus(jitteredDuration); log.debug( - "Entering cooldown for {} after {} consecutive failures", + "Entering cooldown for {} (base: {}) after {} consecutive failures", + jitteredDuration, duration, consecutiveFailures.get()); notify(RegistrationEvent.State.COOLDOWN); @@ -450,7 +516,7 @@ private void enterCooldown(Duration duration) { cooldownExitTask = executor.schedule( - this::exitCooldown, duration.toMillis(), TimeUnit.MILLISECONDS); + this::exitCooldown, jitteredDuration.toMillis(), TimeUnit.MILLISECONDS); } } diff --git a/src/main/resources/META-INF/microprofile-config.properties b/src/main/resources/META-INF/microprofile-config.properties index b8525fd6..f4f70426 100644 --- a/src/main/resources/META-INF/microprofile-config.properties +++ b/src/main/resources/META-INF/microprofile-config.properties @@ -78,6 +78,9 @@ cryostat.agent.registration.backoff-multiplier=2.0 cryostat.agent.registration.circuit-breaker-threshold=10 cryostat.agent.registration.circuit-breaker-duration=PT30S cryostat.agent.registration.min-cooldown-ms=30000 +cryostat.agent.registration.cooldown-jitter-factor=0.2 +cryostat.agent.registration.retry-backoff-jitter-factor=0.1 +cryostat.agent.registration.min-interval=PT10S cryostat.agent.exit.deregistration.timeout-ms=10000 cryostat.agent.publish.context= diff --git a/src/test/java/io/cryostat/agent/RegistrationTest.java b/src/test/java/io/cryostat/agent/RegistrationTest.java index deb676d1..1334df31 100644 --- a/src/test/java/io/cryostat/agent/RegistrationTest.java +++ b/src/test/java/io/cryostat/agent/RegistrationTest.java @@ -62,6 +62,10 @@ class RegistrationTest { private static final Duration CIRCUIT_BREAKER_DURATION = Duration.ofMinutes(5); private static final Duration MIN_COOLDOWN_DURATION = Duration.ZERO; // Disable cooldown for existing tests + private static final double COOLDOWN_JITTER_FACTOR = 0.2; + private static final double RETRY_BACKOFF_JITTER_FACTOR = 0.1; + private static final Duration MIN_REGISTRATION_INTERVAL = + Duration.ZERO; // Disable min interval for existing tests @BeforeEach void setup() { @@ -87,6 +91,9 @@ void setup() { CIRCUIT_BREAKER_THRESHOLD, CIRCUIT_BREAKER_DURATION, MIN_COOLDOWN_DURATION, + COOLDOWN_JITTER_FACTOR, + RETRY_BACKOFF_JITTER_FACTOR, + MIN_REGISTRATION_INTERVAL, random); } @@ -289,4 +296,156 @@ void testCircuitBreakerTransitionsToHalfOpen() throws Exception { verify(executor, atLeast(CIRCUIT_BREAKER_THRESHOLD + 1)) .schedule(any(Runnable.class), anyLong(), eq(TimeUnit.MILLISECONDS)); } + + @Test + void testCalculateCooldownWithJitterAppliesVariation() { + // Test that jitter is applied correctly + Duration baseDuration = Duration.ofSeconds(30); + + // Test with minimum jitter (random = 0.0) + when(random.nextDouble()).thenReturn(0.0); + Duration minJittered = registration.calculateCooldownWithJitter(baseDuration); + // With jitterFactor=0.2 and random=0.0: (1.0 - 0.2) + (0.0 * 0.4) = 0.8 + // Expected: 30000ms * 0.8 = 24000ms + assertEquals(24000, minJittered.toMillis(), "Minimum jitter should be 80% of base"); + + // Test with maximum jitter (random = 1.0) + when(random.nextDouble()).thenReturn(1.0); + Duration maxJittered = registration.calculateCooldownWithJitter(baseDuration); + // With jitterFactor=0.2 and random=1.0: (1.0 - 0.2) + (1.0 * 0.4) = 1.2 + // Expected: 30000ms * 1.2 = 36000ms + assertEquals(36000, maxJittered.toMillis(), "Maximum jitter should be 120% of base"); + + // Test with middle jitter (random = 0.5) + when(random.nextDouble()).thenReturn(0.5); + Duration midJittered = registration.calculateCooldownWithJitter(baseDuration); + // With jitterFactor=0.2 and random=0.5: (1.0 - 0.2) + (0.5 * 0.4) = 1.0 + // Expected: 30000ms * 1.0 = 30000ms + assertEquals(30000, midJittered.toMillis(), "Middle jitter should be 100% of base"); + } + + @Test + void testCalculateCooldownWithJitterDifferentDurations() { + when(random.nextDouble()).thenReturn(0.5); + + // Test with 1 second + Duration oneSecond = Duration.ofSeconds(1); + Duration jittered1s = registration.calculateCooldownWithJitter(oneSecond); + assertEquals(1000, jittered1s.toMillis(), "1 second with 0.5 random should remain 1000ms"); + + // Test with 1 minute + Duration oneMinute = Duration.ofMinutes(1); + Duration jittered1m = registration.calculateCooldownWithJitter(oneMinute); + assertEquals( + 60000, jittered1m.toMillis(), "1 minute with 0.5 random should remain 60000ms"); + + // Test with 5 minutes + Duration fiveMinutes = Duration.ofMinutes(5); + Duration jittered5m = registration.calculateCooldownWithJitter(fiveMinutes); + assertEquals( + 300000, jittered5m.toMillis(), "5 minutes with 0.5 random should remain 300000ms"); + } + + @Test + void testCalculateCooldownWithJitterProducesVariation() { + Duration baseDuration = Duration.ofSeconds(30); + + // Simulate multiple agents with different random values + when(random.nextDouble()).thenReturn(0.1, 0.3, 0.5, 0.7, 0.9); + + Duration jitter1 = registration.calculateCooldownWithJitter(baseDuration); + Duration jitter2 = registration.calculateCooldownWithJitter(baseDuration); + Duration jitter3 = registration.calculateCooldownWithJitter(baseDuration); + Duration jitter4 = registration.calculateCooldownWithJitter(baseDuration); + Duration jitter5 = registration.calculateCooldownWithJitter(baseDuration); + + // All should be different + assertNotEquals( + jitter1.toMillis(), + jitter2.toMillis(), + "Different random values should produce different jittered durations"); + assertNotEquals( + jitter2.toMillis(), + jitter3.toMillis(), + "Different random values should produce different jittered durations"); + assertNotEquals( + jitter3.toMillis(), + jitter4.toMillis(), + "Different random values should produce different jittered durations"); + assertNotEquals( + jitter4.toMillis(), + jitter5.toMillis(), + "Different random values should produce different jittered durations"); + + // All should be within expected range (80% to 120% of base) + long baseMs = baseDuration.toMillis(); + assertTrue( + jitter1.toMillis() >= baseMs * 0.8 && jitter1.toMillis() <= baseMs * 1.2, + "Jittered duration should be within range"); + assertTrue( + jitter2.toMillis() >= baseMs * 0.8 && jitter2.toMillis() <= baseMs * 1.2, + "Jittered duration should be within range"); + assertTrue( + jitter3.toMillis() >= baseMs * 0.8 && jitter3.toMillis() <= baseMs * 1.2, + "Jittered duration should be within range"); + assertTrue( + jitter4.toMillis() >= baseMs * 0.8 && jitter4.toMillis() <= baseMs * 1.2, + "Jittered duration should be within range"); + assertTrue( + jitter5.toMillis() >= baseMs * 0.8 && jitter5.toMillis() <= baseMs * 1.2, + "Jittered duration should be within range"); + } + + @Test + void testMinimumRegistrationIntervalSchedulesRefreshingRetry() { + Registration throttledRegistration = + new Registration( + executor, + cryostat, + callbackResolver, + webServer, + appNameResolver, + INSTANCE_ID, + JVM_ID, + APP_NAME, + REALM, + HOSTNAME, + JMX_PORT, + REGISTRATION_RETRY_MS, + REGISTRATION_CHECK_MS, + false, + true, + MAX_BACKOFF_MS, + BACKOFF_MULTIPLIER, + CIRCUIT_BREAKER_THRESHOLD, + CIRCUIT_BREAKER_DURATION, + MIN_COOLDOWN_DURATION, + COOLDOWN_JITTER_FACTOR, + RETRY_BACKOFF_JITTER_FACTOR, + Duration.ofSeconds(30), + random); + + when(executor.schedule(any(Runnable.class), anyLong(), any(TimeUnit.class))) + .thenReturn(null); + + throttledRegistration.tryRegister(); + throttledRegistration.tryRegister(); + + ArgumentCaptor delayCaptor = ArgumentCaptor.forClass(Long.class); + verify(executor, atLeastOnce()) + .schedule(any(Runnable.class), delayCaptor.capture(), eq(TimeUnit.MILLISECONDS)); + long scheduledDelay = + delayCaptor.getAllValues().stream() + .filter(delay -> delay >= 0 && delay <= Duration.ofSeconds(30).toMillis()) + .findFirst() + .orElseThrow( + () -> + new AssertionError( + "Expected a retry delay within the remaining" + + " minimum interval window")); + assertTrue( + scheduledDelay <= Duration.ofSeconds(30).toMillis(), + "Retry delay should be scheduled within the remaining minimum interval window"); + verify(cryostat, times(1)).serverHealth(); + } }