Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 61 additions & 25 deletions pkg/migration/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,17 @@ var (
)

type Runner struct {
migration *Migration
db *sql.DB
dbConfig *dbconn.DBConfig
replicas []*sql.DB
migration *Migration
db *sql.DB
dbConfig *dbconn.DBConfig
replicas []*sql.DB
// monitorDB is a small dedicated connection pool used by the Aurora
// throttlers to poll perf-schema / global-status. Sharing the main
// r.db pool let throttler polls queue behind chunk writes, which
// delayed the very signal we wanted to react to (and counted the
// throttler's own SELECT as an active query thread). nil unless Aurora
// throttling is enabled.
monitorDB *sql.DB
checkpointTable *table.TableInfo

// Changes encapsulates all changes
Expand Down Expand Up @@ -670,6 +677,9 @@ func (r *Runner) closeReplicas() error {
// - one replication throttler per --replica-dsn (slowest wins)
// - a commit-latency throttler if the source is detected as Aurora and
// --max-commit-latency is positive (issue #468)
// - an active-threads throttler if the source is detected as Aurora and
// the migration user can read the relevant perf-schema tables (issue
// #831)
Comment thread
morgo marked this conversation as resolved.
//
// Multiple replica DSNs can be specified as a comma-separated list.
// This is common logic shared between resume and new migration paths.
Expand All @@ -691,26 +701,38 @@ func (r *Runner) setupThrottler(ctx context.Context) error {
throttlers = append(throttlers, replicaThrottlers...)
}

if r.migration.MaxCommitLatency > 0 {
isAurora, err := throttler.IsAurora(ctx, r.db)
if err != nil {
// Probe failure (e.g., performance_schema disabled, no privileges)
// is non-fatal — Aurora-only feature on a non-Aurora server, or
// a perf-schema-locked Aurora user. Log at Debug so operators can
// diagnose if they expected throttling, without alerting users
// who don't care.
r.logger.Debug("Aurora probe failed, skipping commit-latency throttler", "error", err)
} else if isAurora {
cl, err := throttler.NewCommitLatencyThrottler(r.db, r.migration.MaxCommitLatency, r.logger)
if err != nil {
_ = r.closeReplicas()
return fmt.Errorf("could not create commit-latency throttler: %w", err)
}
r.logger.Info("Aurora detected, enabling commit-latency throttler",
"threshold", r.migration.MaxCommitLatency)
throttlers = append(throttlers, cl)
}
// Aurora throttlers — assembled by the shared throttler.AuroraSetup
// helper so the move runner can use the same wiring. The two Aurora
// throttlers (commit-latency, active-threads) have independent gates:
// setting MaxCommitLatency=0 disables only commit-latency, leaving
// active-threads enabled when Aurora is detected and the user has
// SELECT on performance_schema.threads + events_waits_current. Build
// returns a zero result on non-Aurora sources so this call is safe
// to make unconditionally.
//
// OpenMonitor is invoked lazily by the helper only after IsAurora
// returns true AND at least one throttler will be built, so non-
// Aurora users never pay the connect cost. MaxOpenConnections=2 lets
// both Aurora throttlers poll concurrently without serializing on a
// single conn, with a touch of headroom.
auroraRes, err := throttler.AuroraSetup{
Source: r.db,
OpenMonitor: func() (*sql.DB, error) {
monitorCfg := *r.dbConfig // shallow copy — MaxOpenConnections is value-typed
monitorCfg.MaxOpenConnections = 2
return dbconn.NewWithConnectionType(r.dsn(), &monitorCfg, "monitor database")
},
CommitLatencyThreshold: r.migration.MaxCommitLatency,
Logger: r.logger,
}.Build(ctx)
if err != nil {
_ = r.closeReplicas()
return err
}
if auroraRes.MonitorDB != nil {
r.monitorDB = auroraRes.MonitorDB
}
throttlers = append(throttlers, auroraRes.Throttlers...)

if len(throttlers) == 0 {
return nil // use default Noop throttler
Expand All @@ -721,8 +743,13 @@ func (r *Runner) setupThrottler(ctx context.Context) error {
if err := r.throttler.Open(ctx); err != nil {
// multiThrottler already closes child throttlers on partial Open
// failure, but the *sql.DB connections backing replica throttlers
// are owned by r.replicas — clean those up too rather than leaving
// them dangling until Runner.Close() runs.
// are owned by r.replicas (and the Aurora monitor pool is owned
// by r.monitorDB) — clean those up too rather than leaving them
// dangling until Runner.Close() runs.
if r.monitorDB != nil {
_ = r.monitorDB.Close()
r.monitorDB = nil
}
_ = r.closeReplicas()
return fmt.Errorf("opening throttlers: %w", err)
}
Expand Down Expand Up @@ -1024,6 +1051,15 @@ func (r *Runner) Close() error {
errs = append(errs, err)
}
}
// Close the Aurora monitor pool after the throttler so its background
// pollers observe Close() / ctx cancellation before we yank the pool
// out from under them. No-op when not Aurora.
if r.monitorDB != nil {
if err := r.monitorDB.Close(); err != nil {
errs = append(errs, err)
}
r.monitorDB = nil
}
if err := r.closeReplicas(); err != nil {
errs = append(errs, err)
}
Expand Down
182 changes: 182 additions & 0 deletions pkg/throttler/aurora.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package throttler

import (
"context"
"database/sql"
"errors"
"fmt"
"log/slog"
"time"

"github.com/go-sql-driver/mysql"
)

// Server error numbers MySQL uses for permission failures, listed in
// ascending numeric order. isPrivilegeDeniedError matches against this set
// so the active-threads probe can tell "missing grants" apart from every
// other failure: only in the grants case is it helpful for the log message
// to recommend granting SELECT.
const (
	// ER_DBACCESS_DENIED_ERROR
	errDBAccessDenied = 1044
	// ER_ACCESS_DENIED_ERROR
	errAccessDenied = 1045
	// ER_TABLEACCESS_DENIED_ERROR (e.g. SELECT on a table denied)
	errTableAccessDenied = 1142
	// ER_SPECIFIC_ACCESS_DENIED_ERROR
	errSpecificAccessDenied = 1227
)

// AuroraSetup bundles the inputs needed to detect Aurora and to build the
// Aurora-specific throttlers (commit-latency and active-threads). Keeping
// the detect / probe / open-pool / construct sequence in one place lets the
// migration runner and the move runner share identical wiring.
//
// This package deliberately avoids importing dbconn: the monitor pool is
// opened through the caller-provided OpenMonitor closure, so DSN, TLS and
// pool sizing remain the caller's decision.
//
// Each Aurora throttler is gated on its own; turning one off leaves the
// other unaffected. See Build for the exact rules.
type AuroraSetup struct {
	// Source is the caller's primary *sql.DB. Build uses it only for the
	// one-off IsAurora and CanReadActiveThreads probes, which run once
	// during setup, so sharing the main pool for them is acceptable.
	Source *sql.DB

	// OpenMonitor returns a dedicated *sql.DB that the Aurora throttlers
	// use for their recurring polls. Build invokes it at most once, and
	// only after Aurora has been detected and at least one throttler is
	// going to be built, so non-Aurora callers never pay for a connection.
	// Closing the returned DB is the caller's job (it is handed back as
	// AuroraResult.MonitorDB).
	OpenMonitor func() (*sql.DB, error)

	// CommitLatencyThreshold controls the commit-latency throttler only.
	// Zero or a negative value switches that throttler off; the
	// active-threads throttler is still built when Aurora is detected and
	// its privilege probe passes.
	CommitLatencyThreshold time.Duration

	// Logger receives the setup diagnostics emitted by Build and is passed
	// on to the throttlers Build constructs.
	Logger *slog.Logger
}

// AuroraResult carries what AuroraSetup.Build produced.
//
// The two fields move together: an empty Throttlers slice comes with a nil
// MonitorDB (nothing was opened, nothing to close), while a non-empty
// Throttlers slice comes with a non-nil MonitorDB whose lifecycle belongs
// to the caller.
type AuroraResult struct {
	// Throttlers holds the Aurora throttlers that were constructed.
	Throttlers []Throttler
	// MonitorDB is the dedicated pool the throttlers poll through.
	MonitorDB *sql.DB
}

// Build probes the source for Aurora and assembles the Aurora throttlers.
//
// The two Aurora throttlers have independent gates:
// - Commit-latency is enabled when CommitLatencyThreshold > 0.
// - Active-threads is enabled when Aurora is detected and the user has
// SELECT on performance_schema.threads and events_waits_current.
//
// Returns a zero AuroraResult (nil throttlers, nil monitor DB, nil error) in
// any of these benign cases:
// - IsAurora probe failed (non-Aurora source, or perf_schema not readable —
// logged at Debug so the non-Aurora common case stays quiet)
// - IsAurora returned false
// - Aurora was detected but neither gate produced a throttler (commit-
// latency disabled by threshold AND active-threads probe failed for
// any reason — privilege denied or otherwise, both downgraded to Info)
//
// Returns a non-nil error only for setup failures the caller almost
// certainly wants to surface: nil required fields, OpenMonitor failing, or
// throttler construction failing. The active-threads privilege probe is
// best-effort by design — its failure (whatever the cause) disables that
// throttler but never aborts the migration.
func (s AuroraSetup) Build(ctx context.Context) (AuroraResult, error) {
Comment thread
morgo marked this conversation as resolved.
// Validate required fields up-front. AuroraSetup is an exported struct
// and these are all dereferenced unconditionally inside Build; a
// descriptive error beats a nil-pointer panic.
if s.Source == nil {
return AuroraResult{}, errors.New("AuroraSetup.Source is required")
}
if s.OpenMonitor == nil {
return AuroraResult{}, errors.New("AuroraSetup.OpenMonitor is required")
}
if s.Logger == nil {
return AuroraResult{}, errors.New("AuroraSetup.Logger is required")
}

isAurora, err := IsAurora(ctx, s.Source)
switch {
case err != nil:
// Non-Aurora MySQL with locked-down perf_schema lands here too;
// keep it at Debug so the common case isn't noisy.
s.Logger.Debug("Aurora probe failed, skipping Aurora throttlers", "error", err)
return AuroraResult{}, nil
case !isAurora:
return AuroraResult{}, nil
}

// Decide independently which throttlers will be built before opening
// the monitor pool, so we don't open a pool we'd immediately discard.
enableCommitLatency := s.CommitLatencyThreshold > 0
enableActiveThreads := true
if probeErr := CanReadActiveThreads(ctx, s.Source); probeErr != nil {
// Surface at Info because Aurora is confirmed and the operator
// likely expected this throttler to be enabled. Distinguish
// "looks like a grants problem" from other failures so the log
// message only suggests `GRANT SELECT` when that's plausibly
// the fix — a transient network error shouldn't send operators
// down a fruitless permissions investigation.
if isPrivilegeDeniedError(probeErr) {
s.Logger.Info("Aurora active-threads throttler disabled: grant SELECT on performance_schema.threads and performance_schema.events_waits_current to enable",
"error", probeErr)
} else {
s.Logger.Info("Aurora active-threads throttler disabled: probe failed",
"error", probeErr)
}
enableActiveThreads = false
}
if !enableCommitLatency && !enableActiveThreads {
return AuroraResult{}, nil
}

monitorDB, err := s.OpenMonitor()
if err != nil {
return AuroraResult{}, fmt.Errorf("could not open monitor DB for Aurora throttlers: %w", err)
}

var throttlers []Throttler

if enableCommitLatency {
cl, err := NewCommitLatencyThrottler(monitorDB, s.CommitLatencyThreshold, s.Logger)
if err != nil {
_ = monitorDB.Close()
return AuroraResult{}, fmt.Errorf("could not create commit-latency throttler: %w", err)
}
s.Logger.Info("Aurora detected, enabling commit-latency throttler",
"threshold", s.CommitLatencyThreshold)
throttlers = append(throttlers, cl)
}

if enableActiveThreads {
at, err := NewActiveThreadsThrottler(monitorDB, s.Logger)
if err != nil {
_ = monitorDB.Close()
return AuroraResult{}, fmt.Errorf("could not create active-threads throttler: %w", err)
}
throttlers = append(throttlers, at)
}

return AuroraResult{Throttlers: throttlers, MonitorDB: monitorDB}, nil
}

// isPrivilegeDeniedError reports whether err wraps a *mysql.MySQLError whose
// server error number is one of the access-denied codes declared in this
// file. Anything else — a nil error, an error that is not a MySQL server
// error, or a server error with a different number — yields false. The
// active-threads probe uses the answer to pick its failure log message.
func isPrivilegeDeniedError(err error) bool {
	var mysqlErr *mysql.MySQLError
	if errors.As(err, &mysqlErr) {
		code := mysqlErr.Number
		return code == errAccessDenied ||
			code == errDBAccessDenied ||
			code == errTableAccessDenied ||
			code == errSpecificAccessDenied
	}
	return false
}
Loading
Loading