-
Notifications
You must be signed in to change notification settings - Fork 13.9k
[FLINK-36753][runtime] Adaptive Scheduler actively triggers a Checkpoint after all resources are ready #27921
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| import org.apache.flink.api.common.JobStatus; | ||
| import org.apache.flink.core.execution.SavepointFormatType; | ||
| import org.apache.flink.runtime.JobException; | ||
| import org.apache.flink.runtime.checkpoint.CheckpointCoordinator; | ||
| import org.apache.flink.runtime.checkpoint.CheckpointScheduling; | ||
| import org.apache.flink.runtime.checkpoint.CheckpointStatsListener; | ||
| import org.apache.flink.runtime.checkpoint.CompletedCheckpoint; | ||
|
|
@@ -40,6 +41,7 @@ | |
| import org.apache.flink.runtime.scheduler.stopwithsavepoint.StopWithSavepointTerminationManager; | ||
| import org.apache.flink.util.ExceptionUtils; | ||
| import org.apache.flink.util.Preconditions; | ||
| import org.apache.flink.util.concurrent.FutureUtils; | ||
|
|
||
| import org.slf4j.Logger; | ||
|
|
||
|
|
@@ -64,6 +66,7 @@ class Executing extends StateWithExecutionGraph | |
|
|
||
| private final StateTransitionManager stateTransitionManager; | ||
| private final int rescaleOnFailedCheckpointCount; | ||
| private final boolean activeCheckpointTriggerEnabled; | ||
| // null indicates that there was no change event observed, yet | ||
| @Nullable private AtomicInteger failedCheckpointCountdown; | ||
|
|
||
|
|
@@ -77,7 +80,8 @@ class Executing extends StateWithExecutionGraph | |
| List<ExceptionHistoryEntry> failureCollection, | ||
| Function<StateTransitionManager.Context, StateTransitionManager> | ||
| stateTransitionManagerFactory, | ||
| int rescaleOnFailedCheckpointCount) { | ||
| int rescaleOnFailedCheckpointCount, | ||
| boolean activeCheckpointTriggerEnabled) { | ||
| super( | ||
| context, | ||
| executionGraph, | ||
|
|
@@ -96,6 +100,7 @@ class Executing extends StateWithExecutionGraph | |
| rescaleOnFailedCheckpointCount > 0, | ||
| "The rescaleOnFailedCheckpointCount should be larger than 0."); | ||
| this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; | ||
| this.activeCheckpointTriggerEnabled = activeCheckpointTriggerEnabled; | ||
| this.failedCheckpointCountdown = null; | ||
|
|
||
| recordRescaleForJobIntoExecuting(logger, context); | ||
|
|
@@ -182,6 +187,74 @@ public ScheduledFuture<?> scheduleOperation(Runnable callback, Duration delay) { | |
| return context.runIfState(this, callback, delay); | ||
| } | ||
|
|
||
| private static final Duration ACTIVE_CHECKPOINT_RETRY_DELAY = Duration.ofSeconds(10); | ||
|
|
||
| @Override | ||
| public void requestActiveCheckpointTrigger() { | ||
| if (!activeCheckpointTriggerEnabled) { | ||
| return; | ||
| } | ||
|
|
||
| final CheckpointCoordinator checkpointCoordinator = | ||
| getExecutionGraph().getCheckpointCoordinator(); | ||
|
|
||
| if (checkpointCoordinator == null | ||
| || !checkpointCoordinator.isPeriodicCheckpointingConfigured()) { | ||
| getLogger() | ||
| .debug( | ||
| "Skipping active checkpoint trigger for rescale: checkpointing not configured."); | ||
| return; | ||
| } | ||
|
|
||
| if (!parallelismChanged()) { | ||
| getLogger() | ||
| .debug( | ||
| "Skipping active checkpoint trigger for rescale: parallelism unchanged."); | ||
| return; | ||
| } | ||
|
|
||
| if (checkpointCoordinator.getNumberOfPendingCheckpoints() > 0 | ||
| || checkpointCoordinator.isTriggering()) { | ||
| getLogger() | ||
| .debug( | ||
| "Skipping active checkpoint trigger for rescale: checkpoint already in progress."); | ||
| return; | ||
| } | ||
|
|
||
| if (!checkpointCoordinator.isMinPauseBetweenCheckpointsSatisfied()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you return the remaining time until the next checkpoint instead of a boolean, then you can directly use it in the following call of |
||
| getLogger() | ||
| .debug( | ||
| "Skipping active checkpoint trigger for rescale: min pause between checkpoints not satisfied, scheduling retry."); | ||
| context.runIfState( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we get e.g. 10 |
||
| this, this::requestActiveCheckpointTrigger, ACTIVE_CHECKPOINT_RETRY_DELAY); | ||
| return; | ||
| } | ||
|
|
||
| getLogger().info("Actively triggering checkpoint to expedite rescaling."); | ||
| FutureUtils.assertNoException( | ||
| checkpointCoordinator | ||
| .triggerCheckpoint(false) | ||
| .handle( | ||
| (completedCheckpoint, throwable) -> { | ||
| if (throwable != null) { | ||
| getLogger() | ||
| .warn( | ||
| "Active checkpoint trigger for rescale failed, scheduling retry.", | ||
| throwable); | ||
| context.runIfState( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we really want to introduce this endless cycle? I feel we should just log it and give up. If the checkpoint is failing, there is probably a different issue with the job, and we shouldn't keep retrying without any retry cap. |
||
| this, | ||
| this::requestActiveCheckpointTrigger, | ||
| ACTIVE_CHECKPOINT_RETRY_DELAY); | ||
| } else { | ||
| getLogger() | ||
| .info( | ||
| "Active checkpoint for rescale completed successfully: {}.", | ||
| completedCheckpoint.getCheckpointID()); | ||
| } | ||
| return null; | ||
| })); | ||
| } | ||
|
|
||
| @Override | ||
| public void transitionToSubsequentState() { | ||
| Optional<VertexParallelism> availableVertexParallelism = | ||
|
|
@@ -399,6 +472,7 @@ static class Factory implements StateFactory<Executing> { | |
| private final Function<StateTransitionManager.Context, StateTransitionManager> | ||
| stateTransitionManagerFactory; | ||
| private final int rescaleOnFailedCheckpointCount; | ||
| private final boolean activeCheckpointTriggerEnabled; | ||
|
|
||
| Factory( | ||
| ExecutionGraph executionGraph, | ||
|
|
@@ -410,7 +484,8 @@ static class Factory implements StateFactory<Executing> { | |
| List<ExceptionHistoryEntry> failureCollection, | ||
| Function<StateTransitionManager.Context, StateTransitionManager> | ||
| stateTransitionManagerFactory, | ||
| int rescaleOnFailedCheckpointCount) { | ||
| int rescaleOnFailedCheckpointCount, | ||
| boolean activeCheckpointTriggerEnabled) { | ||
| this.context = context; | ||
| this.log = log; | ||
| this.executionGraph = executionGraph; | ||
|
|
@@ -420,6 +495,7 @@ static class Factory implements StateFactory<Executing> { | |
| this.failureCollection = failureCollection; | ||
| this.stateTransitionManagerFactory = stateTransitionManagerFactory; | ||
| this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount; | ||
| this.activeCheckpointTriggerEnabled = activeCheckpointTriggerEnabled; | ||
| } | ||
|
|
||
| public Class<Executing> getStateClass() { | ||
|
|
@@ -436,7 +512,8 @@ public Executing getState() { | |
| userCodeClassLoader, | ||
| failureCollection, | ||
| stateTransitionManagerFactory, | ||
| rescaleOnFailedCheckpointCount); | ||
| rescaleOnFailedCheckpointCount, | ||
| activeCheckpointTriggerEnabled); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pnowojski Is this safe without a lock?