From 59f73c6134d2e858439d29ac42c2fc0506600312 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Thu, 30 Apr 2026 17:13:02 -0400 Subject: [PATCH 1/8] fix(discovery-plugin): perform startup ping by triggering existing refresh jobs Ping plugins in batches to avoid causing a thundering herd of plugins attempting to re-register with Cryostat --- .../java/io/cryostat/ConfigProperties.java | 6 + .../java/io/cryostat/discovery/Discovery.java | 71 ++--- .../discovery/StartupPluginPinger.java | 277 ++++++++++++++++++ src/main/resources/application.properties | 3 + 4 files changed, 307 insertions(+), 50 deletions(-) create mode 100644 src/main/java/io/cryostat/discovery/StartupPluginPinger.java diff --git a/src/main/java/io/cryostat/ConfigProperties.java b/src/main/java/io/cryostat/ConfigProperties.java index 8c3a59b47..fff99dc69 100644 --- a/src/main/java/io/cryostat/ConfigProperties.java +++ b/src/main/java/io/cryostat/ConfigProperties.java @@ -56,6 +56,12 @@ public class ConfigProperties { "cryostat.discovery.plugins.ping-period"; public static final String DISCOVERY_PLUGINS_MAX_BACKOFF_MULTIPLIER = "cryostat.discovery.plugins.max-backoff-multiplier"; + public static final String DISCOVERY_PLUGINS_PING_WORKER_POOL_SIZE = + "cryostat.discovery.plugins.ping.worker-pool-size"; + public static final String DISCOVERY_PLUGINS_PING_DELAY_MS = + "cryostat.discovery.plugins.ping.delay-ms"; + public static final String DISCOVERY_PLUGINS_PING_TIMEOUT_MS = + "cryostat.discovery.plugins.ping.timeout-ms"; public static final String CONNECTIONS_TTL = "cryostat.connections.ttl"; public static final String CONNECTIONS_FAILED_BACKOFF = "cryostat.connections.failed-backoff"; diff --git a/src/main/java/io/cryostat/discovery/Discovery.java b/src/main/java/io/cryostat/discovery/Discovery.java index adda8ca3f..54a9cd58f 100644 --- a/src/main/java/io/cryostat/discovery/Discovery.java +++ b/src/main/java/io/cryostat/discovery/Discovery.java @@ -51,7 +51,6 @@ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import io.quarkus.narayana.jta.QuarkusTransaction; import io.quarkus.runtime.ShutdownEvent; -import io.quarkus.runtime.StartupEvent; import io.smallrye.common.annotation.Blocking; import io.vertx.core.json.JsonObject; import io.vertx.ext.web.RoutingContext; @@ -113,7 +112,6 @@ public class Discovery { static final String X_FORWARDED_FOR = "X-Forwarded-For"; private static final String JOB_PERIODIC = "discovery.periodic"; - private static final String JOB_STARTUP = "discovery.startup"; private static final String PLUGIN_ID_MAP_KEY = "pluginId"; private static final String REFRESH_MAP_KEY = "refresh"; @@ -138,52 +136,6 @@ public class Discovery { @Inject PluginCallbackFactory callbackFactory; @Inject EntityManager entityManager; - void onStart(@Observes StartupEvent evt) { - QuarkusTransaction.requiringNew() - .run( - () -> { - DiscoveryPlugin.findAll().list().stream() - .filter(p -> !p.builtin) - .forEach( - plugin -> { - var dataMap = new JobDataMap(); - dataMap.put(PLUGIN_ID_MAP_KEY, plugin.id); - dataMap.put(REFRESH_MAP_KEY, true); - JobDetail jobDetail = - JobBuilder.newJob(RefreshPluginJob.class) - .withIdentity( - plugin.id.toString(), - JOB_STARTUP) - .usingJobData(dataMap) - .build(); - var trigger = - TriggerBuilder.newTrigger() - .withIdentity( - jobDetail - .getKey() - .getName(), - jobDetail - .getKey() - .getGroup()) - .startNow() - .withSchedule( - SimpleScheduleBuilder - .simpleSchedule() - .withRepeatCount(0)) - .build(); - try { - if (!scheduler.checkExists(trigger.getKey())) { - scheduler.scheduleJob(jobDetail, trigger); - } - } catch (SchedulerException e) { - logger.warn( - "Failed to schedule plugin prune job", - e); - } - }); - }); - } - void onStop(@Observes ShutdownEvent evt) throws SchedulerException { scheduler.shutdown(); } @@ -454,7 +406,7 @@ public PluginRegistration register(@Context RoutingContext ctx, JsonObject body) throw new BadRequestException(e); } - isNewPlugin = !scheduler.checkExists(JobKey.jobKey(plugin.id.toString(), JOB_PERIODIC)); + isNewPlugin = !scheduler.checkExists(getPeriodicJobKey(plugin)); if (isNewPlugin) { var dataMap = new JobDataMap(); dataMap.put(PLUGIN_ID_MAP_KEY, plugin.id); @@ -680,7 +632,6 @@ public void deregister( Set jobKeys = new HashSet<>(); jobKeys.addAll(scheduler.getJobKeys(GroupMatcher.jobGroupEquals(JOB_PERIODIC))); - jobKeys.addAll(scheduler.getJobKeys(GroupMatcher.jobGroupEquals(JOB_STARTUP))); for (var key : jobKeys) { if (!Objects.equals(plugin.id.toString(), key.getName())) { continue; @@ -1054,6 +1005,26 @@ static String requireNonBlank(String in, String name) { return in; } + /** + * Create a JobKey for a plugin's periodic refresh job. + * + * @param plugin The plugin to create a JobKey for + * @return JobKey for the plugin's periodic refresh job + */ + static JobKey getPeriodicJobKey(DiscoveryPlugin plugin) { + return getPeriodicJobKey(plugin.id); + } + + /** + * Create a JobKey for a plugin's periodic refresh job using the plugin ID. + * + * @param pluginId The plugin ID to create a JobKey for + * @return JobKey for the plugin's periodic refresh job + */ + static JobKey getPeriodicJobKey(UUID pluginId) { + return JobKey.jobKey(pluginId.toString(), JOB_PERIODIC); + } + private InetAddress getRemoteAddress(RoutingContext ctx) { InetAddress addr = null; if (ctx.request() != null && ctx.request().remoteAddress() != null) { diff --git a/src/main/java/io/cryostat/discovery/StartupPluginPinger.java b/src/main/java/io/cryostat/discovery/StartupPluginPinger.java new file mode 100644 index 000000000..7d13dad2f --- /dev/null +++ b/src/main/java/io/cryostat/discovery/StartupPluginPinger.java @@ -0,0 +1,277 @@ +/* + * Copyright The Cryostat Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.cryostat.discovery; + +import java.util.List; +import java.util.UUID; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import io.cryostat.ConfigProperties; + +import io.quarkus.narayana.jta.QuarkusTransaction; +import io.quarkus.runtime.ShutdownEvent; +import io.quarkus.runtime.StartupEvent; +import jakarta.enterprise.context.ApplicationScoped; +import jakarta.enterprise.event.Observes; +import jakarta.inject.Inject; +import org.eclipse.microprofile.config.inject.ConfigProperty; +import org.jboss.logging.Logger; +import org.quartz.JobKey; +import org.quartz.Scheduler; +import org.quartz.SchedulerException; + +/** + * Startup plugin pinger that triggers existing Quartz jobs with delays between batches. This + * prevents registration storms by limiting concurrent job executions and spreading them over time. + */ +@ApplicationScoped +public class StartupPluginPinger { + + @Inject Logger logger; + @Inject Scheduler scheduler; + + @ConfigProperty(name = ConfigProperties.DISCOVERY_PLUGINS_PING_WORKER_POOL_SIZE) + int workerPoolSize; + + @ConfigProperty(name = ConfigProperties.DISCOVERY_PLUGINS_PING_DELAY_MS) + long delayMs; + + @ConfigProperty(name = ConfigProperties.DISCOVERY_PLUGINS_PING_TIMEOUT_MS) + long timeoutMs; + + private ExecutorService executorService; + + /** + * Data transfer object to hold plugin information extracted from entities. This allows us to + * pass data outside of transaction boundaries without risking detached entity issues. + */ + record PluginData(UUID id, String realmName, String callback) { + static PluginData from(DiscoveryPlugin plugin) { + return new PluginData( + plugin.id, + plugin.realm.name, + plugin.callback != null ? plugin.callback.toString() : "unknown"); + } + } + + /** + * On startup, trigger existing Quartz jobs for all plugins sequentially to avoid registration + * storm. + */ + void onStart(@Observes StartupEvent evt) { + if (executorService == null || executorService.isShutdown()) { + executorService = Executors.newFixedThreadPool(workerPoolSize); + } + + List pluginDataList = fetchPluginData(); + triggerJobsSequentially(pluginDataList, scheduler) + .whenComplete( + (v, ex) -> { + if (ex != null) { + logger.error("Error during sequential plugin ping", ex); + } else { + logger.debug("Sequential plugin ping on startup completed"); + } + }); + } + + private List fetchPluginData() { + return QuarkusTransaction.joiningExisting() + .call( + () -> + DiscoveryPlugin.findAll().list().stream() + .filter(p -> !p.builtin) + .map(PluginData::from) + .toList()); + } + + /** + * Trigger existing Quartz jobs for all plugins sequentially using a worker pool with delays + * between batches. + * + * @param pluginDataList List of plugin data whose jobs should be triggered + * @param scheduler Quartz scheduler to trigger jobs + * @return CompletableFuture that completes when all job triggers are done (successful or + * failed) + */ + public CompletableFuture triggerJobsSequentially( + List pluginDataList, Scheduler scheduler) { + if (pluginDataList.isEmpty()) { + logger.debug("No plugin jobs to trigger"); + return CompletableFuture.completedFuture(null); + } + + logger.debugv( + "Starting sequential plugin job triggering: {0} plugins, {1} workers, {2}ms delay" + + " between batches", + pluginDataList.size(), workerPoolSize, delayMs); + + AtomicInteger successCount = new AtomicInteger(0); + AtomicInteger failureCount = new AtomicInteger(0); + AtomicInteger processedCount = new AtomicInteger(0); + + CompletableFuture allTriggers = CompletableFuture.completedFuture(null); + + for (int i = 0; i < pluginDataList.size(); i++) { + final PluginData pluginData = pluginDataList.get(i); + final int pluginIndex = i; + final int batchNumber = i / workerPoolSize; + final int positionInBatch = i % workerPoolSize; + + if (positionInBatch == 0 && batchNumber > 0) { + allTriggers = + allTriggers.thenCompose( + v -> { + logger.debugv( + "Batch {0} complete, waiting {1}ms before next batch", + batchNumber - 1, delayMs); + return CompletableFuture.runAsync( + () -> { + try { + Thread.sleep(delayMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warn( + "Interrupted while waiting between" + + " batches", + e); + } + }, + executorService); + }); + } + + allTriggers = + allTriggers.thenCompose( + v -> + CompletableFuture.runAsync( + () -> { + try { + JobKey jobKey = + Discovery.getPeriodicJobKey( + pluginData.id()); + if (!scheduler.checkExists(jobKey)) { + logger.warnv( + "Job not found for plugin" + + " {0}/{1}: {2} @ {3}", + pluginIndex + 1, + pluginDataList.size(), + pluginData.realmName(), + pluginData.callback()); + failureCount.incrementAndGet(); + return; + } + logger.debugv( + "Triggering job {0}/{1} for" + + " plugin: {2} @ {3}", + pluginIndex + 1, + pluginDataList.size(), + pluginData.realmName(), + pluginData.callback()); + + scheduler.triggerJob(jobKey); + + successCount.incrementAndGet(); + logger.debugv( + "Successfully triggered job" + + " {0}/{1} for plugin: {2}" + + " @ {3}", + pluginIndex + 1, + pluginDataList.size(), + pluginData.realmName(), + pluginData.callback()); + } catch (SchedulerException e) { + failureCount.incrementAndGet(); + logger.warnv( + e, + "Failed to trigger job for" + + " plugin {0}/{1}: {2} @" + + " {3}", + pluginIndex + 1, + pluginDataList.size(), + pluginData.realmName(), + pluginData.callback()); + } finally { + int processed = + processedCount + .incrementAndGet(); + if (processed % workerPoolSize == 0 + || processed + == pluginDataList + .size()) { + logger.debugv( + "Plugin job trigger" + + " progress: {0}/{1}" + + " processed ({2}" + + " successful, {3}" + + " failed)", + processed, + pluginDataList.size(), + successCount.get(), + failureCount.get()); + } + } + }, + executorService) + .orTimeout(timeoutMs, TimeUnit.MILLISECONDS) + .exceptionally( + ex -> { + failureCount.incrementAndGet(); + logger.warnv( + ex, + "Job trigger timed out or failed" + + " for plugin {0}/{1}: {2} @" + + " {3}", + pluginIndex + 1, + pluginDataList.size(), + pluginData.realmName(), + pluginData.callback()); + return null; + })); + } + + return allTriggers.thenRun( + () -> + logger.debugv( + "Sequential plugin job triggering complete: {0} successful, {1}" + + " failed out of {2} total", + successCount.get(), failureCount.get(), pluginDataList.size())); + } + + /** Shutdown the executor service on application shutdown. */ + void onShutdown(@Observes ShutdownEvent evt) { + if (executorService != null && !executorService.isShutdown()) { + logger.trace("Shutting down SequentialPluginPinger executor service"); + executorService.shutdown(); + try { + if (!executorService.awaitTermination(30, TimeUnit.SECONDS)) { + logger.warn( + "SequentialPluginPinger executor did not terminate in time, forcing" + + " shutdown"); + executorService.shutdownNow(); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warn("Interrupted while shutting down SequentialPluginPinger", e); + executorService.shutdownNow(); + } + } + } +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 320a6c9cc..d5bad0955 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -35,6 +35,9 @@ cryostat.discovery.docker.enabled=false cryostat.discovery.plugins.ping-period=1m cryostat.discovery.plugins.max-failures=3 cryostat.discovery.plugins.max-backoff-multiplier=5 +cryostat.discovery.plugins.ping.worker-pool-size=3 +cryostat.discovery.plugins.ping.delay-ms=500 +cryostat.discovery.plugins.ping.timeout-ms=10000 cryostat.discovery.plugins.jwt.secret.algorithm=AES cryostat.discovery.plugins.jwt.secret.keysize=256 cryostat.discovery.plugins.jwt.signature.algorithm=HS256 From 03cd28a759e31a3794bd1fab6789ed2d75065564 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Thu, 30 Apr 2026 17:37:50 -0400 Subject: [PATCH 2/8] drop existing discovery.startup jobs --- .../resources/db/migration/V4.2.0__cryostat.sql | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main/resources/db/migration/V4.2.0__cryostat.sql b/src/main/resources/db/migration/V4.2.0__cryostat.sql index 2d182380e..34095b4e2 100644 --- a/src/main/resources/db/migration/V4.2.0__cryostat.sql +++ b/src/main/resources/db/migration/V4.2.0__cryostat.sql @@ -653,4 +653,20 @@ ALTER TABLE DiscoveryPlugin ADD COLUMN nextPingAt BIGINT; -- Add unique constraint on DiscoveryNode name for Realm nodes to enforce realm name uniqueness CREATE UNIQUE INDEX uk_discovery_node_realm_name ON DiscoveryNode (name) WHERE (nodeType = 'Realm'); +-- Clean up any existing startup jobs from previous versions (4.1.0 and earlier) +-- These jobs are no longer used as StartupPluginPinger now triggers existing periodic jobs on startup +-- The old job group was "discovery.startup" with individual plugin IDs as job names +DELETE FROM QRTZ_SIMPLE_TRIGGERS +WHERE (SCHED_NAME, TRIGGER_NAME, TRIGGER_GROUP) IN ( + SELECT SCHED_NAME, TRIGGER_NAME, TRIGGER_GROUP + FROM QRTZ_TRIGGERS + WHERE JOB_GROUP = 'discovery.startup' +); + +DELETE FROM QRTZ_TRIGGERS +WHERE JOB_GROUP = 'discovery.startup'; + +DELETE FROM QRTZ_JOB_DETAILS +WHERE JOB_GROUP = 'discovery.startup'; + COMMIT; From b3e9ca37ec4ee7b0d092c0bb87703a479fb18f0e Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Thu, 30 Apr 2026 17:48:31 -0400 Subject: [PATCH 3/8] fix(discovery-plugin): apply more fault-tolerance annotations --- .../io/cryostat/credentials/Credentials.java | 7 ++++++ .../java/io/cryostat/discovery/Discovery.java | 7 ++++++ src/main/resources/application.properties | 25 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/src/main/java/io/cryostat/credentials/Credentials.java b/src/main/java/io/cryostat/credentials/Credentials.java index 8568a044c..7db70af28 100644 --- a/src/main/java/io/cryostat/credentials/Credentials.java +++ b/src/main/java/io/cryostat/credentials/Credentials.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Files; +import java.sql.SQLException; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -40,6 +41,7 @@ import jakarta.annotation.security.RolesAllowed; import jakarta.enterprise.event.Observes; import jakarta.inject.Inject; +import jakarta.persistence.PersistenceException; import jakarta.transaction.Transactional; import jakarta.ws.rs.DELETE; import jakarta.ws.rs.GET; @@ -50,6 +52,8 @@ import jakarta.ws.rs.core.UriInfo; import org.eclipse.microprofile.config.inject.ConfigProperty; import org.eclipse.microprofile.faulttolerance.Bulkhead; +import org.eclipse.microprofile.faulttolerance.Retry; +import org.eclipse.microprofile.faulttolerance.Timeout; import org.eclipse.microprofile.openapi.annotations.Operation; import org.jboss.logging.Logger; import org.jboss.resteasy.reactive.RestForm; @@ -194,6 +198,9 @@ public CredentialMatchResult get(@RestPath long id) { } @Transactional + @Bulkhead + @Timeout + @Retry(retryOn = {SQLException.class, PersistenceException.class}) @POST @RolesAllowed("write") @Operation( diff --git a/src/main/java/io/cryostat/discovery/Discovery.java b/src/main/java/io/cryostat/discovery/Discovery.java index 54a9cd58f..47a117d11 100644 --- a/src/main/java/io/cryostat/discovery/Discovery.java +++ b/src/main/java/io/cryostat/discovery/Discovery.java @@ -82,6 +82,8 @@ import org.apache.commons.lang3.tuple.Pair; import org.eclipse.microprofile.config.inject.ConfigProperty; import org.eclipse.microprofile.faulttolerance.Bulkhead; +import org.eclipse.microprofile.faulttolerance.Retry; +import org.eclipse.microprofile.faulttolerance.Timeout; import org.eclipse.microprofile.openapi.annotations.Operation; import org.eclipse.microprofile.openapi.annotations.tags.Tag; import org.jboss.logging.Logger; @@ -198,6 +200,8 @@ public RestResponse checkCredentialExists(@RestForm String script) { } @Bulkhead + @Timeout + @Retry(retryOn = {OptimisticLockException.class}) @Blocking @POST @Path("/api/v4/discovery") @@ -479,6 +483,9 @@ public void publish( } @Transactional + @Bulkhead + @Timeout + @Retry(retryOn = {OptimisticLockException.class}) @POST @Path("/api/v4.2/discovery/{id}") @Consumes(MediaType.APPLICATION_JSON) diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index d5bad0955..bc6d4943e 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -38,6 +38,31 @@ cryostat.discovery.plugins.max-backoff-multiplier=5 cryostat.discovery.plugins.ping.worker-pool-size=3 cryostat.discovery.plugins.ping.delay-ms=500 cryostat.discovery.plugins.ping.timeout-ms=10000 + +# Bulkhead configuration for credential and discovery endpoints +# Credential service bulkhead - limits concurrent credential creation to prevent database pool exhaustion +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.value=5 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.waiting-task-queue=10 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.value=5S +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.max-retries=2 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1S +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter=500ms + +# Discovery service bulkheads - limits concurrent plugin operations to prevent discovery tree corruption +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.value=8 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.waiting-task-queue=15 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=10S +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.max-retries=3 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=500ms +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=250ms + +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.value=5 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.waiting-task-queue=10 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=15S +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.max-retries=3 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=500ms +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=250ms + cryostat.discovery.plugins.jwt.secret.algorithm=AES cryostat.discovery.plugins.jwt.secret.keysize=256 cryostat.discovery.plugins.jwt.signature.algorithm=HS256 From e8af7bc66139f61baaa44e8e87aea080ea886f22 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Thu, 30 Apr 2026 17:58:01 -0400 Subject: [PATCH 4/8] fixup! fix(discovery-plugin): apply more fault-tolerance annotations --- .../io/cryostat/credentials/Credentials.java | 2 ++ .../java/io/cryostat/discovery/Discovery.java | 3 +++ src/main/resources/application.properties | 19 +++++++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/cryostat/credentials/Credentials.java b/src/main/java/io/cryostat/credentials/Credentials.java index 7db70af28..29c265958 100644 --- a/src/main/java/io/cryostat/credentials/Credentials.java +++ b/src/main/java/io/cryostat/credentials/Credentials.java @@ -37,6 +37,7 @@ import io.quarkus.narayana.jta.QuarkusTransaction; import io.quarkus.runtime.StartupEvent; import io.smallrye.common.annotation.Blocking; +import io.smallrye.faulttolerance.api.RateLimit; import io.smallrye.mutiny.Uni; import jakarta.annotation.security.RolesAllowed; import jakarta.enterprise.event.Observes; @@ -201,6 +202,7 @@ public CredentialMatchResult get(@RestPath long id) { @Bulkhead @Timeout @Retry(retryOn = {SQLException.class, PersistenceException.class}) + @RateLimit @POST @RolesAllowed("write") @Operation( diff --git a/src/main/java/io/cryostat/discovery/Discovery.java b/src/main/java/io/cryostat/discovery/Discovery.java index 47a117d11..9d0394c62 100644 --- a/src/main/java/io/cryostat/discovery/Discovery.java +++ b/src/main/java/io/cryostat/discovery/Discovery.java @@ -52,6 +52,7 @@ import io.quarkus.narayana.jta.QuarkusTransaction; import io.quarkus.runtime.ShutdownEvent; import io.smallrye.common.annotation.Blocking; +import io.smallrye.faulttolerance.api.RateLimit; import io.vertx.core.json.JsonObject; import io.vertx.ext.web.RoutingContext; import io.vertx.mutiny.core.eventbus.EventBus; @@ -202,6 +203,7 @@ public RestResponse checkCredentialExists(@RestForm String script) { @Bulkhead @Timeout @Retry(retryOn = {OptimisticLockException.class}) + @RateLimit @Blocking @POST @Path("/api/v4/discovery") @@ -486,6 +488,7 @@ public void publish( @Bulkhead @Timeout @Retry(retryOn = {OptimisticLockException.class}) + @RateLimit @POST @Path("/api/v4.2/discovery/{id}") @Consumes(MediaType.APPLICATION_JSON) diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index bc6d4943e..0e6ca3ade 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -39,7 +39,6 @@ cryostat.discovery.plugins.ping.worker-pool-size=3 cryostat.discovery.plugins.ping.delay-ms=500 cryostat.discovery.plugins.ping.timeout-ms=10000 -# Bulkhead configuration for credential and discovery endpoints # Credential service bulkhead - limits concurrent credential creation to prevent database pool exhaustion quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.waiting-task-queue=10 @@ -48,7 +47,12 @@ quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.max-r quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1S quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter=500ms -# Discovery service bulkheads - limits concurrent plugin operations to prevent discovery tree corruption +# Credential creation rate limits - prevents credential ID explosion during registration storms +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=100 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.type=smooth + +# Discovery bulkheads - limits concurrent plugin operations quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.value=8 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.waiting-task-queue=15 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=10S @@ -56,6 +60,11 @@ quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.max-ret quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=500ms quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=250ms +# Rate limiting for plugin registration - prevents registration storms +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=200 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.type=smooth + quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.waiting-task-queue=10 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=15S @@ -63,6 +72,12 @@ quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".ret quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=500ms quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=250ms +# Rate limiting for discovery tree updates - prevents publication storms +# Limit: 300 publications per minute (allows for frequent updates from multiple plugins) +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=300 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.type=smooth + cryostat.discovery.plugins.jwt.secret.algorithm=AES cryostat.discovery.plugins.jwt.secret.keysize=256 cryostat.discovery.plugins.jwt.signature.algorithm=HS256 From 19c8c6b0fb66dc0fc820538e0c8b8e7fa7314e42 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Fri, 1 May 2026 09:34:39 -0400 Subject: [PATCH 5/8] config correction, tuning --- .../resources/application-test.properties | 9 ++++++ src/main/resources/application.properties | 28 +++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/main/resources/application-test.properties b/src/main/resources/application-test.properties index a1f5a8cf8..813072f3c 100644 --- a/src/main/resources/application-test.properties +++ b/src/main/resources/application-test.properties @@ -34,3 +34,12 @@ quarkus.s3.aws.credentials.static-provider.secret-access-key=unused grafana-dashboard.url=http://dashboard.no-grafana.local grafana-datasource.url=http://datasource.no-grafana.local + +# Disable rate limiting in tests by setting very high limits +# Tests create many credentials/registrations rapidly and would hit production rate limits +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=999999 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=999999 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=999999 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=60000 diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 0e6ca3ade..bfdad5f2b 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -39,43 +39,47 @@ cryostat.discovery.plugins.ping.worker-pool-size=3 cryostat.discovery.plugins.ping.delay-ms=500 cryostat.discovery.plugins.ping.timeout-ms=10000 +# Enable fault tolerance metrics for monitoring bulkhead rejections, rate limits, etc. +quarkus.fault-tolerance.metrics.enabled=true + # Credential service bulkhead - limits concurrent credential creation to prevent database pool exhaustion +# Note: All duration values are in milliseconds quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.waiting-task-queue=10 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.value=5S +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.value=5000 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.max-retries=2 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1S -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter=500ms +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1000 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter=500 # Credential creation rate limits - prevents credential ID explosion during registration storms quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=100 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=60000 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.type=smooth # Discovery bulkheads - limits concurrent plugin operations quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.value=8 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.waiting-task-queue=15 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=10S +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=10000 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.max-retries=3 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=500ms -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=250ms +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=500 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=250 # Rate limiting for plugin registration - prevents registration storms quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=200 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=60000 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.type=smooth quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.waiting-task-queue=10 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=15S +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=15000 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.max-retries=3 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=500ms -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=250ms +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=500 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=250 # Rate limiting for discovery tree updates - prevents publication storms # Limit: 300 publications per minute (allows for frequent updates from multiple plugins) quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=300 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=1m +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=60000 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.type=smooth cryostat.discovery.plugins.jwt.secret.algorithm=AES From 2825306a2320f4bcbc23bd7c740d6bfa92a3cef1 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Fri, 1 May 2026 09:46:34 -0400 Subject: [PATCH 6/8] map RateLimitException to HTTP 429 --- src/main/java/io/cryostat/ExceptionMappers.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/io/cryostat/ExceptionMappers.java b/src/main/java/io/cryostat/ExceptionMappers.java index 78accbe55..b8565ffd0 100644 --- a/src/main/java/io/cryostat/ExceptionMappers.java +++ b/src/main/java/io/cryostat/ExceptionMappers.java @@ -23,6 +23,7 @@ import com.nimbusds.jwt.proc.BadJWTException; import io.netty.handler.codec.http.HttpResponseStatus; +import io.smallrye.faulttolerance.api.RateLimitException; import io.smallrye.mutiny.TimeoutException; import jakarta.inject.Inject; import jakarta.persistence.NoResultException; @@ -143,4 +144,10 @@ public RestResponse mapBulkheadException(BulkheadException ex) { logger.warn(ex); return RestResponse.status(HttpResponseStatus.TOO_MANY_REQUESTS.code()); } + + @ServerExceptionMapper + public RestResponse mapRateLimitException(RateLimitException ex) { + logger.warn(ex); + return RestResponse.status(HttpResponseStatus.TOO_MANY_REQUESTS.code()); + } } From 8a5571efece335d3577ff314909f3174f98941f9 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Fri, 1 May 2026 09:58:08 -0400 Subject: [PATCH 7/8] fixup! config correction, tuning --- .../java/io/cryostat/discovery/Discovery.java | 4 + .../resources/application-test.properties | 6 +- src/main/resources/application.properties | 73 +++++++++++-------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/src/main/java/io/cryostat/discovery/Discovery.java b/src/main/java/io/cryostat/discovery/Discovery.java index 9d0394c62..f4aaf4eb9 100644 --- a/src/main/java/io/cryostat/discovery/Discovery.java +++ b/src/main/java/io/cryostat/discovery/Discovery.java @@ -457,6 +457,10 @@ public PluginRegistration register(@Context RoutingContext ctx, JsonObject body) } @Transactional + @Bulkhead + @Timeout + @Retry(retryOn = {OptimisticLockException.class}) + @RateLimit @POST @Path("/api/v4/discovery/{id}") @Consumes(MediaType.APPLICATION_JSON) diff --git a/src/main/resources/application-test.properties b/src/main/resources/application-test.properties index 813072f3c..83f17c4cc 100644 --- a/src/main/resources/application-test.properties +++ b/src/main/resources/application-test.properties @@ -35,11 +35,7 @@ quarkus.s3.aws.credentials.static-provider.secret-access-key=unused grafana-dashboard.url=http://dashboard.no-grafana.local grafana-datasource.url=http://datasource.no-grafana.local -# Disable rate limiting in tests by setting very high limits -# Tests create many credentials/registrations rapidly and would hit production rate limits quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=999999 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=60000 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=999999 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.value=999999 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=999999 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=60000 diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index bfdad5f2b..85f061212 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -39,47 +39,60 @@ cryostat.discovery.plugins.ping.worker-pool-size=3 cryostat.discovery.plugins.ping.delay-ms=500 cryostat.discovery.plugins.ping.timeout-ms=10000 -# Enable fault tolerance metrics for monitoring bulkhead rejections, rate limits, etc. -quarkus.fault-tolerance.metrics.enabled=true - -# Credential service bulkhead - limits concurrent credential creation to prevent database pool exhaustion -# Note: All duration values are in milliseconds quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.waiting-task-queue=10 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.value=5000 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.max-retries=2 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1000 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.unit=seconds +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".timeout.value=5 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.max-retries=3 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay-unit=seconds +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.delay=1 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter-unit=millis quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".retry.jitter=500 - -# Credential creation rate limits - prevents credential ID explosion during registration storms -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=100 -quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=60 +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window-unit=minutes +quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.window=1 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.type=smooth -# Discovery bulkheads - limits concurrent plugin operations -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.value=8 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.waiting-task-queue=15 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=10000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.value=5 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".bulkhead.waiting-task-queue=10 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".timeout.value=5 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.max-retries=3 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=500 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=250 - -# Rate limiting for plugin registration - prevents registration storms -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=200 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay-unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.delay=1 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter-unit=millis +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".retry.jitter=500 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=60 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window-unit=minutes +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.window=1 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.type=smooth +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".bulkhead.value=5 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".bulkhead.waiting-task-queue=10 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".timeout.unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".timeout.value=5 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".retry.max-retries=3 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".retry.delay-unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".retry.delay=1 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".retry.jitter-unit=millis +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".retry.jitter=500 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.value=60 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.window-unit=minutes +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.window=1 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.type=smooth + quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".bulkhead.waiting-task-queue=10 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=15000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".timeout.value=5 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.max-retries=3 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=500 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=250 - -# Rate limiting for discovery tree updates - prevents publication storms -# Limit: 300 publications per minute (allows for frequent updates from multiple plugins) -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=300 -quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=60000 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay-unit=seconds +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.delay=1 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter-unit=millis +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".retry.jitter=500 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.value=60 +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window-unit=minutes +quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.window=1 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publishWithContext".rate-limit.type=smooth cryostat.discovery.plugins.jwt.secret.algorithm=AES From 7e26a41f465276cdbf6dfe62fcc2a6234ddb4358 Mon Sep 17 00:00:00 2001 From: Andrew Azores Date: Fri, 1 May 2026 12:01:48 -0400 Subject: [PATCH 8/8] wait for self startup and likely readiness before starting plugin pings --- .../java/io/cryostat/ConfigProperties.java | 2 ++ .../discovery/StartupPluginPinger.java | 27 ++++++++++++++++++- .../resources/application-test.properties | 1 + src/main/resources/application.properties | 1 + 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/cryostat/ConfigProperties.java b/src/main/java/io/cryostat/ConfigProperties.java index fff99dc69..f1065f207 100644 --- a/src/main/java/io/cryostat/ConfigProperties.java +++ b/src/main/java/io/cryostat/ConfigProperties.java @@ -62,6 +62,8 @@ public class ConfigProperties { "cryostat.discovery.plugins.ping.delay-ms"; public static final String DISCOVERY_PLUGINS_PING_TIMEOUT_MS = "cryostat.discovery.plugins.ping.timeout-ms"; + public static final String DISCOVERY_PLUGINS_PING_STARTUP_GRACE_PERIOD_MS = + "cryostat.discovery.plugins.ping.startup-grace-period-ms"; public static final String CONNECTIONS_TTL = "cryostat.connections.ttl"; public static final String CONNECTIONS_FAILED_BACKOFF = "cryostat.connections.failed-backoff"; diff --git a/src/main/java/io/cryostat/discovery/StartupPluginPinger.java b/src/main/java/io/cryostat/discovery/StartupPluginPinger.java index 7d13dad2f..b9b63f98a 100644 --- a/src/main/java/io/cryostat/discovery/StartupPluginPinger.java +++ b/src/main/java/io/cryostat/discovery/StartupPluginPinger.java @@ -56,6 +56,9 @@ public class StartupPluginPinger { @ConfigProperty(name = ConfigProperties.DISCOVERY_PLUGINS_PING_TIMEOUT_MS) long timeoutMs; + @ConfigProperty(name = ConfigProperties.DISCOVERY_PLUGINS_PING_STARTUP_GRACE_PERIOD_MS) + long startupGracePeriodMs; + private ExecutorService executorService; /** @@ -81,7 +84,8 @@ void onStart(@Observes StartupEvent evt) { } List pluginDataList = fetchPluginData(); - triggerJobsSequentially(pluginDataList, scheduler) + applyStartupGracePeriod() + .thenCompose(v -> triggerJobsSequentially(pluginDataList, scheduler)) .whenComplete( (v, ex) -> { if (ex != null) { @@ -102,6 +106,27 @@ private List fetchPluginData() { .toList()); } + CompletableFuture applyStartupGracePeriod() { + if (startupGracePeriodMs <= 0) { + return CompletableFuture.completedFuture(null); + } + + logger.debugv( + "Applying startup grace period of {0}ms before triggering plugin ping jobs", + startupGracePeriodMs); + + return CompletableFuture.runAsync( + () -> { + try { + Thread.sleep(startupGracePeriodMs); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.warn("Interrupted during startup grace period", e); + } + }, + executorService); + } + /** * Trigger existing Quartz jobs for all plugins sequentially using a worker pool with delays * between batches. diff --git a/src/main/resources/application-test.properties b/src/main/resources/application-test.properties index 83f17c4cc..b0e0e2baa 100644 --- a/src/main/resources/application-test.properties +++ b/src/main/resources/application-test.properties @@ -35,6 +35,7 @@ quarkus.s3.aws.credentials.static-provider.secret-access-key=unused grafana-dashboard.url=http://dashboard.no-grafana.local grafana-datasource.url=http://datasource.no-grafana.local +cryostat.discovery.plugins.ping.startup-grace-period-ms=0 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".rate-limit.value=999999 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/register".rate-limit.value=999999 quarkus.fault-tolerance."io.cryostat.discovery.Discovery/publish".rate-limit.value=999999 diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 85f061212..bb3466c99 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -38,6 +38,7 @@ cryostat.discovery.plugins.max-backoff-multiplier=5 cryostat.discovery.plugins.ping.worker-pool-size=3 cryostat.discovery.plugins.ping.delay-ms=500 cryostat.discovery.plugins.ping.timeout-ms=10000 +cryostat.discovery.plugins.ping.startup-grace-period-ms=30000 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.value=5 quarkus.fault-tolerance."io.cryostat.credentials.Credentials/create".bulkhead.waiting-task-queue=10