diff --git a/cmd/prow-job-dispatcher/main.go b/cmd/prow-job-dispatcher/main.go
index 3b8a0168e6e..bc69aced37f 100644
--- a/cmd/prow-job-dispatcher/main.go
+++ b/cmd/prow-job-dispatcher/main.go
@@ -10,6 +10,7 @@ import (
 	"os/signal"
 	"path/filepath"
 	"reflect"
+	"regexp"
 	"slices"
 	"sort"
 	"strings"
@@ -47,6 +48,13 @@ const (
 	listURL = "https://github.com/openshift/release/pulls?q=is%3Apr+author%3Aopenshift-bot+prow+job+dispatcher+in%3Atitle+is%3Aopen"
 )
 
+// blockedClusterRelocationJobExceptions lists the job-name patterns that are
+// exempt from being moved off their determined cluster when that cluster is
+// blocked.
+var blockedClusterRelocationJobExceptions = []*regexp.Regexp{
+	regexp.MustCompile(`^periodic-build[0-9]{2}-upgrade$`),
+}
+
 type options struct {
 	prowJobConfigDir string
 	configPath       string
@@ -275,6 +283,30 @@ func extractCapabilities(labels map[string]string) []string {
 	return capabilities
 }
 
+// isBlockedClusterRelocationException reports whether jobName matches any
+// pattern in blockedClusterRelocationJobExceptions.
+func isBlockedClusterRelocationException(jobName string) bool {
+	return slices.ContainsFunc(blockedClusterRelocationJobExceptions, func(re *regexp.Regexp) bool {
+		return re.MatchString(jobName)
+	})
+}
+
+// blockedClustersForJob returns the set of blocked clusters to apply when
+// choosing the target cluster for jobName. If determinedCluster is blocked and
+// jobName matches a relocation exception, the result is a copy of blocked
+// without determinedCluster; the input set is not modified. Otherwise blocked
+// itself is returned.
+func blockedClustersForJob(jobName string, determinedCluster string, blocked sets.Set[string]) sets.Set[string] {
+	if !blocked.Has(determinedCluster) || !isBlockedClusterRelocationException(jobName) {
+		return blocked
+	}
+
+	// Work on a clone so the caller's set stays intact for other jobs.
+	filteredBlocked := blocked.Clone()
+	filteredBlocked.Delete(determinedCluster)
+	return filteredBlocked
+}
+
 func findClusterAssigmentsForJobs(jc *prowconfig.JobConfig, path string, config *dispatcher.Config, pjs map[string]dispatcher.ProwJobData, blocked sets.Set[string], cm dispatcher.ClusterMap) error {
 	mostUsedCluster := dispatcher.FindMostUsedCluster(jc)
 
@@ -284,7 +316,8 @@ func findClusterAssigmentsForJobs(jc *prowconfig.JobConfig, path string, config
 			return fmt.Errorf("failed to determine cluster for the job %s in path %q: %w", jobBase.Name, path, err)
 		}
 
-		c := dispatcher.DetermineTargetCluster(cluster, string(determinedCluster), string(config.Default), canBeRelocated, blocked)
+		blockedForJob := blockedClustersForJob(jobBase.Name, string(determinedCluster), blocked)
+		c := dispatcher.DetermineTargetCluster(cluster, string(determinedCluster), string(config.Default), canBeRelocated, blockedForJob)
 		pjs[jobBase.Name] = dispatcher.ProwJobData{Cluster: c, Capabilities: extractCapabilities(jobBase.Labels)}
 		logrus.WithField("job", jobBase.Name).WithField("cluster", c).Info("found cluster for job")
 		return nil
@@ -328,7 +361,8 @@ func (cv *clusterVolume) addToVolume(cluster string, jobBase prowconfig.JobBase,
 		return fmt.Errorf("failed to determine cluster for the job %s in path %q: %w", jobBase.Name, path, err)
 	}
 
-	c := dispatcher.DetermineTargetCluster(cluster, string(determinedCluster), string(config.Default), canBeRelocated, cv.blocked)
+	blockedForJob := blockedClustersForJob(jobBase.Name, string(determinedCluster), cv.blocked)
+	c := dispatcher.DetermineTargetCluster(cluster, string(determinedCluster), string(config.Default), canBeRelocated, blockedForJob)
 	cv.pjs[jobBase.Name] = dispatcher.ProwJobData{Cluster: c, Capabilities: extractCapabilities(jobBase.Labels)}
 	if determinedCloudProvider := config.IsInBuildFarm(api.Cluster(c)); determinedCloudProvider != "" {
 		cv.clusterVolumeMap[string(determinedCloudProvider)][c] = cv.clusterVolumeMap[string(determinedCloudProvider)][c] + jobVolumes[jobBase.Name]
diff --git a/cmd/prow-job-dispatcher/main_test.go b/cmd/prow-job-dispatcher/main_test.go
index 3111e94b778..2d784079e3e 100644
--- a/cmd/prow-job-dispatcher/main_test.go
+++ b/cmd/prow-job-dispatcher/main_test.go
@@ -477,6 +477,109 @@ func TestDispatchDeltaJobs(t *testing.T) {
 	}
 }
 
+// TestBlockedClustersForJob checks which clusters remain blocked for a job
+// depending on its name and its determined cluster.
+func TestBlockedClustersForJob(t *testing.T) {
+	blocked := sets.New[string]("build01", "build02")
+
+	testCases := []struct {
+		name              string
+		jobName           string
+		determinedCluster string
+		expectedBlocked   sets.Set[string]
+	}{
+		{
+			name:              "keeps blocked clusters for non-matching job",
+			jobName:           "periodic-something-else",
+			determinedCluster: "build01",
+			expectedBlocked:   sets.New[string]("build01", "build02"),
+		},
+		{
+			name:              "removes determined cluster for matching upgrade job",
+			jobName:           "periodic-build01-upgrade",
+			determinedCluster: "build01",
+			expectedBlocked:   sets.New[string]("build02"),
+		},
+		{
+			name:              "keeps blocked clusters when determined cluster not blocked",
+			jobName:           "periodic-build77-upgrade",
+			determinedCluster: "build77",
+			expectedBlocked:   sets.New[string]("build01", "build02"),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			actual := blockedClustersForJob(tc.jobName, tc.determinedCluster, blocked)
+			if !actual.Equal(tc.expectedBlocked) {
+				t.Fatalf("unexpected blocked set. expected=%v actual=%v", tc.expectedBlocked.UnsortedList(), actual.UnsortedList())
+			}
+			// The input set is reused by every subtest, so it must never be modified.
+			if !blocked.Equal(sets.New[string]("build01", "build02")) {
+				t.Fatalf("input blocked set was mutated: %v", blocked.UnsortedList())
+			}
+		})
+	}
+}
+
+// TestAddToVolumeSkipsBlockedRelocationForMatchingUpgradePeriodic checks that
+// addToVolume keeps a job matching a relocation exception on its blocked
+// determined cluster, while a non-matching job is moved to the requested one.
+func TestAddToVolumeSkipsBlockedRelocationForMatchingUpgradePeriodic(t *testing.T) {
+	config := &dispatcher.Config{
+		Default: "api.ci",
+		BuildFarm: map[api.Cloud]map[api.Cluster]*dispatcher.BuildFarmConfig{
+			api.CloudAWS: {
+				"build01": {},
+			},
+			api.CloudGCP: {
+				"build02": {},
+			},
+		},
+		Groups: map[api.Cluster]dispatcher.Group{
+			"build01": {
+				Jobs: []string{"periodic-build01-upgrade", "periodic-something-else"},
+			},
+		},
+	}
+
+	cv := &clusterVolume{
+		clusterVolumeMap: map[string]map[string]float64{
+			"aws": {"build01": 0},
+			"gcp": {"build02": 0},
+		},
+		cloudProviders: sets.New[string]("aws", "gcp"),
+		pjs:            map[string]dispatcher.ProwJobData{},
+		blocked:        sets.New[string]("build01"),
+		specialClusters: map[string]float64{
+			"api.ci": 0,
+		},
+		clusterMap: dispatcher.ClusterMap{
+			"build01": {Capacity: 100},
+			"build02": {Capacity: 100},
+		},
+	}
+
+	jobVolumes := map[string]float64{
+		"periodic-build01-upgrade": 1,
+		"periodic-something-else":  1,
+	}
+
+	if err := cv.addToVolume("build02", prowconfig.JobBase{Name: "periodic-build01-upgrade"}, "foo-periodics.yaml", config, jobVolumes); err != nil {
+		t.Fatalf("addToVolume returned error for matching periodic: %v", err)
+	}
+	if err := cv.addToVolume("build02", prowconfig.JobBase{Name: "periodic-something-else"}, "foo-periodics.yaml", config, jobVolumes); err != nil {
+		t.Fatalf("addToVolume returned error for non-matching periodic: %v", err)
+	}
+
+	if got := cv.pjs["periodic-build01-upgrade"].Cluster; got != "build01" {
+		t.Fatalf("expected matching periodic to stay on determined blocked cluster build01, got %s", got)
+	}
+	if got := cv.pjs["periodic-something-else"].Cluster; got != "build02" {
+		t.Fatalf("expected non-matching periodic to be relocated to build02, got %s", got)
+	}
+}
+
 type fakeSlackClient struct {
 }
 