Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions http/src/main/resources/init-resources/galaxy-user-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#cloud-config
# Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh
# When updating this file, sync it manually from that repository and verify the changes.
#
# cloud-init user-data: writes /usr/local/bin/galaxy_bootstrap.sh and runs it through runcmd.
# The script (1) formats and mounts the Galaxy data disk if present, (2) does the same for the
# PostgreSQL disk, and (3) runs ansible-pull as the debian user using values read from the
# GCE instance metadata server.
write_files:
  - path: /usr/local/bin/galaxy_bootstrap.sh
    permissions: '0755'
    owner: root:root
    content: |
      #!/bin/bash

      echo "[$(date)] - Starting galaxy_bootstrap script..."

      # 1. Setup persistent disk if available
      DISK_DEVICE="/dev/disk/by-id/google-galaxy-data"
      if [ -b "$DISK_DEVICE" ]; then
        echo "[$(date)] - Found persistent disk at $DISK_DEVICE"

        # Check if disk is already formatted (blkid fails when it finds no filesystem signature)
        if ! blkid "$DISK_DEVICE" > /dev/null 2>&1; then
          echo "[$(date)] - Formatting disk $DISK_DEVICE with ext4"
          mkfs -t ext4 "$DISK_DEVICE"
        else
          echo "[$(date)] - Disk $DISK_DEVICE is already formatted"
        fi

        # Create mount point and mount
        mkdir -p /mnt/block_storage
        mount "$DISK_DEVICE" /mnt/block_storage

        # Add to fstab for persistent mounting across reboots (skipped if the UUID is already listed)
        DISK_UUID=$(blkid -s UUID -o value "$DISK_DEVICE")
        if [ -n "$DISK_UUID" ] && ! grep -q "$DISK_UUID" /etc/fstab; then
          echo "UUID=$DISK_UUID /mnt/block_storage ext4 defaults 0 2" >> /etc/fstab
        fi

        # Set proper ownership
        chown debian:debian /mnt/block_storage
        echo "[$(date)] - Persistent disk mounted at /mnt/block_storage"
      else
        echo "[$(date)] - No persistent disk found at $DISK_DEVICE. Galaxy will use ephemeral storage."
      fi

      # 2. Setup PostgreSQL disk if available
      POSTGRES_DISK_DEVICE="/dev/disk/by-id/google-galaxy-postgres-data"
      if [ -b "$POSTGRES_DISK_DEVICE" ]; then
        echo "[$(date)] - Found PostgreSQL disk at $POSTGRES_DISK_DEVICE"

        # Check if disk is already formatted
        if ! blkid "$POSTGRES_DISK_DEVICE" > /dev/null 2>&1; then
          echo "[$(date)] - Formatting PostgreSQL disk $POSTGRES_DISK_DEVICE with ext4"
          mkfs -t ext4 "$POSTGRES_DISK_DEVICE"
        else
          echo "[$(date)] - PostgreSQL disk $POSTGRES_DISK_DEVICE is already formatted"
        fi

        # Create mount point and mount
        mkdir -p /mnt/postgres_storage
        mount "$POSTGRES_DISK_DEVICE" /mnt/postgres_storage

        # Add to fstab for persistent mounting across reboots
        POSTGRES_DISK_UUID=$(blkid -s UUID -o value "$POSTGRES_DISK_DEVICE")
        if [ -n "$POSTGRES_DISK_UUID" ] && ! grep -q "$POSTGRES_DISK_UUID" /etc/fstab; then
          echo "UUID=$POSTGRES_DISK_UUID /mnt/postgres_storage ext4 defaults 0 2" >> /etc/fstab
        fi

        # Set proper ownership
        chown debian:debian /mnt/postgres_storage
        echo "[$(date)] - PostgreSQL disk mounted at /mnt/postgres_storage"
      else
        echo "[$(date)] - No PostgreSQL disk found at $POSTGRES_DISK_DEVICE. PostgreSQL will use ephemeral storage."
      fi

      # 3. Run ansible-pull
      # IMPORTANT: everything between the two single quotes below is ONE argument to bash -c.
      # Never put an apostrophe inside it, not even in a comment: it would close the string
      # early, the inner script would stop there, and the remaining lines would run in this
      # outer root shell with HOST_IP and PV_SIZE unset.
      sudo -u debian bash -c '
        export HOME=/home/debian
        HOST_IP=$(curl -s ifconfig.me)

        PV_SIZE=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/persistent-volume-size" -H "Metadata-Flavor: Google" 2>/dev/null)
        if [ -z "$PV_SIZE" ]; then
          echo "[$(date)] - persistent-volume-size metadata not found or empty, using default."
          PV_SIZE="139Gi"
        fi
        echo "[$(date)] - NFS storage size for Galaxy: ${PV_SIZE}"

        # Add restore_galaxy if enabled
        RESTORE_GALAXY=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/restore_galaxy" -H "Metadata-Flavor: Google" 2>/dev/null || echo "false")

        GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com")
        echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}"

        # Leo proxy path prefix for this Galaxy app (e.g. /proxy/google/v1/apps/{project}/{appName}/galaxy).
        # Passed to ansible as the galaxy_prefix extra var so Galaxy generates correct absolute links
        # (JS/CSS/API) that include the full proxy path. Without this Galaxy emits links rooted at /
        # which the browser resolves against the Leo host and gets 404s → blank page.
        # NOTE(review): an earlier wording of this comment named the variable galaxy_url_prefix;
        # confirm that galaxy_prefix is the name the playbook actually reads.
        GALAXY_URL_PREFIX=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/galaxy-url-prefix" -H "Metadata-Flavor: Google" 2>/dev/null || echo "")
        echo "[$(date)] - Galaxy URL prefix: ${GALAXY_URL_PREFIX}"

        GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git")
        GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master")

        PULL_ARGS=(
          -U "${GIT_REPO}"
          -C "${GIT_BRANCH}"
          -d /home/debian/ansible
          -i /tmp/ansible-inventory/localhost
          --accept-host-key
          --limit 127.0.0.1
          --extra-vars "gcp_batch_service_account_email=${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}"
        )

        if [ "$RESTORE_GALAXY" = "true" ]; then
          PULL_ARGS+=(--extra-vars "restore_galaxy=true")
          echo "[$(date)] - Galaxy Restore Mode: Enabled"
        else
          echo "[$(date)] - Galaxy Restore Mode: Disabled"
        fi

        if [ -n "$GALAXY_URL_PREFIX" ]; then
          PULL_ARGS+=(--extra-vars "galaxy_prefix=${GALAXY_URL_PREFIX}")
          echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}"
        fi

        PULL_ARGS+=(playbook.yml)

        # The heredoc delimiter is unquoted, so HOST_IP and PV_SIZE are expanded by this inner shell.
        # The heredoc body and its EOF terminator must stay at column 0 of the generated script.
        # NOTE(review): rke2_token and galaxy_db_password are fixed literals; confirm that is acceptable.
        mkdir -p /tmp/ansible-inventory
        cat > /tmp/ansible-inventory/localhost << EOF
      [vm]
      127.0.0.1 ansible_connection=local ansible_python_interpreter="/usr/bin/python3"

      [all:vars]
      ansible_user="debian"
      rke2_token="defaultSecret12345"
      rke2_additional_sans=["${HOST_IP}"]
      rke2_debug=true
      nfs_size="${PV_SIZE}"
      galaxy_persistence_size="${PV_SIZE}"
      galaxy_db_password="gxy-db-password"
      galaxy_user="dev@galaxyproject.org"
      EOF

        echo "[$(date)] - Inventory file created at /tmp/ansible-inventory/localhost; running ansible-pull..."
        echo "[$(date)] - Running: ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull ${PULL_ARGS[@]}"

        ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull "${PULL_ARGS[@]}"
      '

      echo "[$(date)] - Bootstrap script completed."

runcmd:
  - /usr/local/bin/galaxy_bootstrap.sh
4 changes: 4 additions & 0 deletions http/src/main/resources/init-resources/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ function failScriptIfError() {

function validateCert() {
certFileDirectory=$1
## Only the master node has certs; worker nodes in multi-node Dataproc clusters do not.
if [ ! -f "${certFileDirectory}/jupyter-server.crt" ]; then
return 0
fi
## This helps when we need to rotate certs.
notAfter=`openssl x509 -enddate -noout -in ${certFileDirectory}/jupyter-server.crt` # output should be something like `notAfter=Jul 4 20:31:52 2026 GMT`

Expand Down
53 changes: 53 additions & 0 deletions http/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,44 @@ vpc {
northamerica-northeast2 = "10.26.0.0/20"
}
firewallsToAdd = [
# Allows Galaxy VM traffic on port 80 (nginx ingress on hostNetwork)
# Every region listed below uses the source range 0.0.0.0/0, so this rule does not restrict
# which addresses may connect on tcp/80.
# NOTE(review): port 80 is plain HTTP; confirm that an unrestricted source range is intended
# rather than a narrower range limited to the Leonardo proxy.
{
name-prefix = "leonardo-allow-http"
sourceRanges = {
us-central1 = ["0.0.0.0/0"]
northamerica-northeast1 = ["0.0.0.0/0"]
southamerica-east1 = ["0.0.0.0/0"]
us-east1 = ["0.0.0.0/0"]
us-east4 = ["0.0.0.0/0"]
us-west1 = ["0.0.0.0/0"]
us-west2 = ["0.0.0.0/0"]
us-west3 = ["0.0.0.0/0"]
us-west4 = ["0.0.0.0/0"]
europe-central2 = ["0.0.0.0/0"]
europe-north1 = ["0.0.0.0/0"]
europe-west1 = ["0.0.0.0/0"]
europe-west2 = ["0.0.0.0/0"]
europe-west3 = ["0.0.0.0/0"]
europe-west4 = ["0.0.0.0/0"]
europe-west6 = ["0.0.0.0/0"]
asia-east1 = ["0.0.0.0/0"]
asia-east2 = ["0.0.0.0/0"]
asia-northeast1 = ["0.0.0.0/0"]
asia-northeast2 = ["0.0.0.0/0"]
asia-northeast3 = ["0.0.0.0/0"]
asia-south1 = ["0.0.0.0/0"]
asia-southeast1 = ["0.0.0.0/0"]
asia-southeast2 = ["0.0.0.0/0"]
australia-southeast1 = ["0.0.0.0/0"]
northamerica-northeast2 = ["0.0.0.0/0"]
}
# Single allowed protocol/port pair: TCP 80.
allowed = [
{
protocol = "tcp"
port = "80"
}
]
},
# Allows Leonardo proxy traffic on port 443
{
name-prefix = "leonardo-allow-https"
Expand Down Expand Up @@ -395,6 +433,21 @@ groups {
}
}

# Settings for Galaxy VM apps; this section is read into GalaxyVmConfig by Config.galaxyVmConfig.
galaxyVm {
# Pre-built galaxy-k8s-boot image with all dependencies (Ansible, RKE2, etc.) pre-installed.
# Has cloud-init, which processes the "user-data" metadata key on first boot.
# Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team)
# NOTE(review): the image lives in the anvil-and-terra-development project; confirm that
# other environments can read it or override this value.
sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25"
machineType = "n1-highmem-8"
# Disk sizes; presumably in GB, per the key names.
bootDiskSizeGb = 100
postgresDiskSizeGb = 10
# Suffix appended to the NFS disk name to derive the postgres disk name.
# Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix).
# The substitution below reuses the gke.galaxyDisk value so the two cannot drift apart.
postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix}
# Same repository and branch that galaxy-user-data.sh falls back to when the git-repo /
# git-branch instance metadata keys are absent. Presumably forwarded to the VM as that
# metadata; confirm against the code that creates the instance.
gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git"
gitBranch = "master"
}

gke {
cluster {
location = "us-central1-a",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,18 @@ object Config {
)
}

/** Decodes a config object with the `galaxyVm` keys into a [[GalaxyVmConfig]]. */
implicit private val galaxyVmConfigReader: ValueReader[GalaxyVmConfig] = ValueReader.relative { cfg =>
  // Named arguments keep every config key visibly paired with its field; the case class has
  // adjacent DiskSize and String parameters that are easy to transpose positionally.
  GalaxyVmConfig(
    sourceImage = cfg.as[GceCustomImage]("sourceImage"),
    machineType = cfg.as[MachineTypeName]("machineType"),
    bootDiskSizeGb = cfg.as[DiskSize]("bootDiskSizeGb"),
    postgresDiskSizeGb = cfg.as[DiskSize]("postgresDiskSizeGb"),
    postgresDiskNameSuffix = cfg.as[String]("postgresDiskNameSuffix"),
    gitRepo = cfg.as[String]("gitRepo"),
    gitBranch = cfg.as[String]("gitBranch")
  )
}

implicit private val gceConfigReader: ValueReader[GceConfig] = ValueReader.relative { config =>
GceConfig(
config.as[GceCustomImage]("customGceImage"),
Expand Down Expand Up @@ -497,6 +509,7 @@ object Config {
val googleGroupsConfig = config.as[GoogleGroupsConfig]("groups")

val dataprocConfig = config.as[DataprocConfig]("dataproc")
val galaxyVmConfig = config.as[GalaxyVmConfig]("galaxyVm")
val gceConfig = config.as[GceConfig]("gce")
val imageConfig = config.as[ImageConfig]("image")
val prometheusConfig = config.as[PrometheusConfig]("prometheus")
Expand Down Expand Up @@ -901,13 +914,13 @@ object Config {
vpcConfig.networkTag,
org.broadinstitute.dsde.workbench.leonardo.http.ConfigReader.appConfig.terraAppSetupChart,
gkeIngressConfig,
gkeGalaxyAppConfig,
gkeCromwellAppConfig,
gkeCustomAppConfig,
gkeAllowedAppConfig,
appMonitorConfig,
gkeClusterConfig,
proxyConfig,
gkeGalaxyDiskConfig
gkeGalaxyDiskConfig,
galaxyVmConfig
)
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.broadinstitute.dsde.workbench.leonardo.config

import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceAccountName
import org.broadinstitute.dsde.workbench.google2.{KubernetesSerializableName, MachineTypeName}
import KubernetesSerializableName.ServiceAccountName
import org.broadinstitute.dsde.workbench.leonardo.AppType._
import org.broadinstitute.dsde.workbench.leonardo.CustomImage.GceCustomImage
import org.broadinstitute.dsde.workbench.leonardo._
import org.broadinstitute.dsp.{ChartName, ChartVersion}

Expand Down Expand Up @@ -92,6 +94,16 @@ final case class CustomAppConfig(chartName: ChartName,
val appType: AppType = AppType.Custom
}

/**
 * Typed view of the `galaxyVm` configuration section.
 *
 * @param sourceImage            image the Galaxy VM is created from
 * @param machineType            machine type name for the VM
 * @param bootDiskSizeGb         boot disk size (GB, per the field name)
 * @param postgresDiskSizeGb     PostgreSQL disk size (GB, per the field name)
 * @param postgresDiskNameSuffix suffix appended to the NFS disk name to derive the postgres disk name
 * @param gitRepo                git repository URL; presumably the galaxy-k8s-boot repo handed to
 *                               ansible-pull on the VM (TODO confirm against the caller)
 * @param gitBranch              git branch paired with `gitRepo`
 */
final case class GalaxyVmConfig(
sourceImage: GceCustomImage,
machineType: MachineTypeName,
bootDiskSizeGb: DiskSize,
postgresDiskSizeGb: DiskSize,
postgresDiskNameSuffix: String,
gitRepo: String,
gitBranch: String
)

/** Value-class wrapper around the raw username string. */
final case class ContainerRegistryUsername(asString: String) extends AnyVal
/** Value-class wrapper around the raw password string. */
final case class ContainerRegistryPassword(asString: String) extends AnyVal
/** Username/password pair. NOTE(review): the generated `toString` includes the password; avoid logging instances. */
final case class ContainerRegistryCredentials(username: ContainerRegistryUsername, password: ContainerRegistryPassword)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client:
traceId: TraceId
): F[Boolean] =
Proxy.getAppTargetHost[F](kubernetesDnsCache, CloudContext.Gcp(googleProject), appName) flatMap {
case HostReady(targetHost, _, _) =>
case HostReady(targetHost, _, _, _) =>
val serviceUrl = serviceName match {
case ServiceName("welder-service") =>
s"https://${targetHost.address}/proxy/google/v1/apps/${googleProject.value}/${appName.value}/${serviceName.value}/status/"
Expand All @@ -44,6 +44,20 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client:
)
case _ => Async[F].pure(false) // Update once we support Relay for apps
}

/**
 * Probes a VM by issuing `GET http://<ip>:<port>/` with the trace id in the `X-Request-ID` header.
 *
 * @param ip      address of the VM to probe
 * @param port    TCP port to connect to
 * @param traceId trace id used for the request header and the log context
 * @return `true` when a response with status code below 500 is received (4xx counts as
 *         reachable); `false` when the address does not form a valid URI or the request fails.
 *         Failures are logged and never raised in `F`.
 */
def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] = {
  // Single failure path shared by URI parse errors and request errors.
  def logAndReportUnreachable(cause: Throwable): F[Boolean] =
    logger.error(Map("traceId" -> traceId.asString), cause)("Fail to check if VM is reachable").as(false)

  // Uri.fromString returns an Either instead of throwing, so a malformed address is logged and
  // mapped to `false` rather than escaping this method as an exception before F is even built.
  Uri
    .fromString(s"http://${ip.asString}:${port}/")
    .fold(
      logAndReportUnreachable,
      probeUri =>
        client
          .status(
            Request[F](
              method = Method.GET,
              uri = probeUri,
              headers = Headers(Header.Raw(CIString("X-Request-ID"), traceId.asString))
            )
          )
          .map(status => status.code < 500)
          .handleErrorWith(logAndReportUnreachable)
    )
}
}

trait AppDAO[F[_]] {
Expand All @@ -52,4 +66,6 @@ trait AppDAO[F[_]] {
serviceName: ServiceName,
traceId: TraceId
): F[Boolean]

/**
 * Checks whether the VM at `ip`:`port` responds. `HttpAppDAO` implements this as a plain-HTTP
 * GET to `/` that yields `true` for any status code below 500 and `false` when the request fails.
 */
def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean]
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie
headers = Headers.empty
)
)
.handleError(_ => false)
.handleErrorWith(e =>
logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false)
)
case _ => F.pure(false)
}
} yield res
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ object HostStatus {
final case object HostNotFound extends HostStatus
final case object HostNotReady extends HostStatus
final case object HostPaused extends HostStatus
final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider) extends HostStatus {
// useHttp = true means the proxy connects to the backend via plain HTTP (port 80) instead of
// HTTPS (proxyConfig.proxyPort). Used for Galaxy VM apps whose nginx serves HTTP only.
final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider, useHttp: Boolean = false)
extends HostStatus {
def toUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/proxy/${path}")

def toNotebooksUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/notebooks/${path}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus
import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus.{HostNotFound, HostNotReady, HostReady}
import org.broadinstitute.dsde.workbench.leonardo.db.{DbReference, KubernetesServiceDbQueries}
import org.broadinstitute.dsde.workbench.leonardo.http.{kubernetesProxyHost, GetAppResult}
import org.broadinstitute.dsde.workbench.leonardo.AppType.Galaxy
import org.broadinstitute.dsde.workbench.leonardo.{AppName, CloudContext, CloudProvider}
import org.broadinstitute.dsde.workbench.model.IP
import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
Expand Down Expand Up @@ -48,8 +49,14 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics](
case None => F.pure[HostStatus](HostNotReady)
case Some(ip) =>
val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain)
// Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP,
// and we map the fake hostname to the VM's external IP (stored in loadBalancerIp).
// External IP is used because Leo's pod is in a different VPC from the user's workspace project.
val isGalaxyVm = appResult.app.appType == Galaxy
hostToIpMapping
.getAndUpdate(_ + (h.address -> ip))
.as[HostStatus](HostReady(h, "", CloudProvider.Gcp)) // TODO: update this once we start support AKS
.as[HostStatus](
HostReady(h, "", CloudProvider.Gcp, useHttp = isGalaxyVm)
) // TODO: update this once we start support AKS
}
}
Loading
Loading