From 2123a7f42f50b5be61542c62d1956f429863f571 Mon Sep 17 00:00:00 2001
From: Greg Melton
Date: Wed, 1 Apr 2026 14:18:23 -0700
Subject: [PATCH] Adding script to fix dangling profile records and adding a
 script to run rebuild-repo

---
 pds-fix-dangling-profile.sh            | 123 ++++++++++++++++++++++++
 pds-fix-dangling-profiles-from-list.sh |  69 +++++++++++++
 pds-get-profile.sh                     | 132 +++++++++++++++++++++++++
 pds-scan-dangling-records.sh           | 113 +++++++++++++++++++++
 service/run-rebuild.js                 |  29 ++++++
 5 files changed, 466 insertions(+)
 create mode 100755 pds-fix-dangling-profile.sh
 create mode 100755 pds-fix-dangling-profiles-from-list.sh
 create mode 100755 pds-get-profile.sh
 create mode 100755 pds-scan-dangling-records.sh
 create mode 100644 service/run-rebuild.js

diff --git a/pds-fix-dangling-profile.sh b/pds-fix-dangling-profile.sh
new file mode 100755
index 00000000..951146b5
--- /dev/null
+++ b/pds-fix-dangling-profile.sh
@@ -0,0 +1,123 @@
+#!/usr/bin/env bash
+
+# Run this script to fix a dangling profile/self record in the actor store.
+# This needs to be run directly on the PDS instance.
+#
+# This script will:
+#   Remove busted profile index rows (record + record_blob) for app.bsky.actor.profile/self,
+#   then rebuild the actor repo MST from the record table.
+#
+# Usage:
+#   ./pds-fix-dangling-profile.sh did:plc:xxxxxxxxxxxxxx
+#   ACTOR_STORE_ROOT=/mnt/data/pds/actors ./pds-fix-dangling-profile.sh did:plc:...
+#   PDS_ENV_FILE=/home/bluesky/pds.env ./pds-fix-dangling-profile.sh did:plc:...
+#   DRY_RUN=1 ./pds-fix-dangling-profile.sh did:plc:...
+#
+# Env:
+#   ACTOR_STORE_ROOT  (default: $PDS_ACTOR_STORE_DIRECTORY or $PDS_DATA_DIRECTORY/actors)
+#   REBUILD_SCRIPT    (default: /ebsa/bluesky/current/service/run-rebuild.js)
+#   PDS_ENV_FILE      sourced before rebuild if set
+#   YES=1             skip confirmation
+#   DRY_RUN=1         print only
+
+set -euo pipefail
+
+# Source the host's pds.env when present. It is skipped (not fatal) when absent
+# so the ACTOR_STORE_ROOT / PDS_ENV_FILE overrides documented above still work.
+if [[ -f /home/bluesky/pds.env ]]; then
+  set -a
+  . /home/bluesky/pds.env
+  set +a
+fi
+
+die() { echo "ERROR: $*" >&2; exit 1; }
+
+# --help prints only the comment header above (lines 1-21).
+[[ "${1:-}" == "-h" || "${1:-}" == "--help" ]] && { sed -n '1,21p' "$0"; exit 0; }
+[[ -n "${1:-}" ]] || { echo "Usage: $0 <did>" >&2; exit 1; }
+
+DID="$1"
+[[ "$DID" == did:* ]] || die "DID must start with did:"
+# The DID is interpolated into SQL text and a filesystem path below, so reject
+# anything outside the DID character set (quotes, slashes, whitespace, ...).
+[[ "$DID" =~ ^did:[a-z0-9]+:[A-Za-z0-9._:%-]+$ ]] || die "invalid DID: $DID"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+resolve_actor_root() {
+  if [[ -n "${ACTOR_STORE_ROOT:-}" ]]; then
+    printf '%s' "$ACTOR_STORE_ROOT"
+  elif [[ -n "${PDS_ACTOR_STORE_DIRECTORY:-}" ]]; then
+    printf '%s' "$PDS_ACTOR_STORE_DIRECTORY"
+  elif [[ -n "${PDS_DATA_DIRECTORY:-}" ]]; then
+    printf '%s' "${PDS_DATA_DIRECTORY}/actors"
+  else
+    die "Set ACTOR_STORE_ROOT, PDS_ACTOR_STORE_DIRECTORY, or PDS_DATA_DIRECTORY"
+  fi
+}
+
+ACTOR_ROOT="$(resolve_actor_root)"
+[[ -d "$ACTOR_ROOT" ]] || die "actor store root not found: $ACTOR_ROOT"
+
+command -v openssl >/dev/null 2>&1 || die "openssl required"
+HASH="$(printf '%s' "$DID" | openssl dgst -sha256 | awk '{print $2}')"
+SHARD="${HASH:0:2}"
+DB="${ACTOR_ROOT}/${SHARD}/${DID}/store.sqlite"
+
+PROFILE_URI="at://${DID}/app.bsky.actor.profile/self"
+REBUILD_SCRIPT="${REBUILD_SCRIPT:-/ebsa/bluesky/current/service/run-rebuild.js}"
+DRY_RUN="${DRY_RUN:-0}"
+YES="${YES:-0}"
+
+[[ -f "$DB" ]] || die "store.sqlite not found: $DB"
+
+if [[ "$DRY_RUN" == "1" ]]; then
+  echo "DRY_RUN DB=$DB"
+  echo "DELETE FROM record WHERE uri='$PROFILE_URI';"
+  echo "DELETE FROM record_blob WHERE recordUri='$PROFILE_URI';"
+  echo "node $REBUILD_SCRIPT $DID"
+  exit 0
+fi
+
+# Preflight everything the rebuild needs BEFORE deleting rows, so a missing
+# tool or script cannot leave the store with rows removed and no rebuild.
+command -v sqlite3 >/dev/null 2>&1 || die "sqlite3 not in PATH"
+command -v node >/dev/null 2>&1 || die "node not in PATH"
+[[ -f "$REBUILD_SCRIPT" ]] || die "rebuild script not found: $REBUILD_SCRIPT"
+if [[ -n "${PDS_ENV_FILE:-}" ]]; then
+  [[ -f "$PDS_ENV_FILE" ]] || die "PDS_ENV_FILE not found: $PDS_ENV_FILE"
+fi
+
+echo "Actor DB: $DB"
+echo "Profile URI: $PROFILE_URI"
+echo "Rebuild: node $REBUILD_SCRIPT $DID"
+if [[ "$YES" != "1" ]]; then
+  read -r -p "Proceed? [y/N] " ans || true
+  [[ "${ans:-}" == "y" || "${ans:-}" == "Y" ]] || die "aborted"
+fi
+
+# Both deletes run in one transaction; -bail stops at the first error so a
+# failure exits non-zero (and aborts via set -e) instead of half-applying.
+sqlite3 -bail "$DB" <<SQL
+BEGIN IMMEDIATE;
+DELETE FROM record_blob WHERE recordUri='$PROFILE_URI';
+DELETE FROM record WHERE uri='$PROFILE_URI';
+COMMIT;
+SQL
+
+if [[ -n "${PDS_ENV_FILE:-}" ]]; then
+  set -a
+  # shellcheck disable=SC1090
+  . "$PDS_ENV_FILE"
+  set +a
+fi
+
+echo "Rebuilding the actor repo..."
+node "$REBUILD_SCRIPT" "$DID"
+echo "Done rebuilding the actor repo."
+echo "--------------------------------"
+echo "Rechecking the profile..."
+# Run as a child (not exec) so the closing separator still prints, and resolve
+# the sibling script via SCRIPT_DIR so this works from any working directory.
+bash "$SCRIPT_DIR/pds-get-profile.sh" "$DID" "$ACTOR_ROOT"
+echo "--------------------------------"
diff --git a/pds-fix-dangling-profiles-from-list.sh b/pds-fix-dangling-profiles-from-list.sh
new file mode 100755
index 00000000..0420c934
--- /dev/null
+++ b/pds-fix-dangling-profiles-from-list.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Run pds-fix-dangling-profile.sh once per non-empty line in dangling-profile-dids.txt.
+#
+# Usage:
+#   ./pds-fix-dangling-profiles-from-list.sh
+#   DANGLING_PROFILE_DIDS_FILE=/tmp/dids.txt ./pds-fix-dangling-profiles-from-list.sh
+#   FIX_DANGLING_PROFILE_SCRIPT=/path/to/pds-fix-dangling-profile.sh ./pds-fix-dangling-profiles-from-list.sh
+#
+# If pds-fix-dangling-profile.sh expects the DID on stdin instead of $1, set:
+#   FIX_READS_STDIN=1 ./pds-fix-dangling-profiles-from-list.sh
+#
+# Blank lines and lines starting with '#' are skipped. A failure on one DID is
+# reported and the run continues; the exit status is 1 if any DID failed.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Default list: next to this script, else ./dangling-profile-dids.txt, which is
+# where pds-scan-dangling-records.sh writes by default.
+LIST="${DANGLING_PROFILE_DIDS_FILE:-$SCRIPT_DIR/dangling-profile-dids.txt}"
+if [[ -z "${DANGLING_PROFILE_DIDS_FILE:-}" && ! -f "$LIST" && -f ./dangling-profile-dids.txt ]]; then
+  LIST=./dangling-profile-dids.txt
+fi
+
+# FIX_DANGLING_PROFILES_SCRIPT (plural) is accepted as an alias because earlier
+# usage text advertised that spelling.
+FIX="${FIX_DANGLING_PROFILE_SCRIPT:-${FIX_DANGLING_PROFILES_SCRIPT:-$SCRIPT_DIR/pds-fix-dangling-profile.sh}}"
+FIX_READS_STDIN="${FIX_READS_STDIN:-0}"
+
+if [[ ! -f "$LIST" ]]; then
+  echo "ERROR: DID list not found: $LIST" >&2
+  exit 1
+fi
+
+if [[ ! -f "$FIX" ]]; then
+  echo "ERROR: fix script not found: $FIX" >&2
+  exit 1
+fi
+
+run_fix() {
+  local did="$1"
+  if [[ "$FIX_READS_STDIN" == "1" ]]; then
+    printf '%s\n' "$did" | bash "$FIX"
+  else
+    bash "$FIX" "$did"
+  fi
+}
+
+failed=()
+
+# The list is read on fd 3 so the fix script keeps the caller's stdin;
+# otherwise its "Proceed? [y/N]" prompt would swallow the next DID as its answer.
+while IFS= read -r -u 3 did || [[ -n "${did:-}" ]]; do
+  # Trim surrounding whitespace (also strips a trailing CR from CRLF files).
+  did="${did#"${did%%[![:space:]]*}"}"
+  did="${did%"${did##*[![:space:]]}"}"
+  [[ -z "$did" || "$did" == \#* ]] && continue
+  if ! run_fix "$did"; then
+    echo "WARN: fix failed for $did" >&2
+    failed+=("$did")
+  fi
+done 3<"$LIST"
+
+if (( ${#failed[@]} > 0 )); then
+  echo "ERROR: ${#failed[@]} DID(s) failed:" >&2
+  printf '  %s\n' "${failed[@]}" >&2
+  exit 1
+fi
diff --git a/pds-get-profile.sh b/pds-get-profile.sh
new file mode 100755
index 00000000..72ee563f
--- /dev/null
+++ b/pds-get-profile.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+# Inspect app.bsky.actor.profile/self for one actor via store.sqlite.
+#
+# Runs two queries:
+#   1) Healthy: record INNER JOIN repo_block (indexed row + leaf block present)
+#   2) Dangling: record LEFT JOIN repo_block where block missing (busted index)
+#
+# Usage:
+#   ./pds-get-profile.sh <DID> [ACTOR_STORE_ROOT]
+#   PDS_DATA_DIRECTORY=/mnt/data/pds ./pds-get-profile.sh did:plc:...
+#   VERBOSE=1 ./pds-get-profile.sh did:plc:... /path/to/actors
+#
+# Env (optional):
+#   ACTOR_STORE_ROOT   Same as second argument (overrides env chain if set)
+#   PDS_ACTOR_STORE_DIRECTORY
+#   PDS_DATA_DIRECTORY Used as ${PDS_DATA_DIRECTORY}/actors when set
+#   PDS_ENV_FILE       If set, sourced: set -a; . "$PDS_ENV_FILE"; set +a
+#   VERBOSE=1          Include hex(rb.content) for healthy profile rows
+#
+# Actor path matches PDS ActorStore.getLocation:
+#   sha256(DID) hex -> first 2 chars / <DID> / store.sqlite
+
+set -euo pipefail
+
+# Source the host's pds.env when present (skipped, not fatal, when absent).
+if [[ -f /home/bluesky/pds.env ]]; then
+  set -a
+  . /home/bluesky/pds.env
+  set +a
+fi
+
+die() { echo "ERROR: $*" >&2; exit 1; }
+
+usage() {
+  cat <<'EOF'
+Usage: pds-get-profile.sh <DID> [ACTOR_STORE_ROOT]
+
+Print profile/self from the actor store.sqlite: healthy join vs dangling rows.
+EOF
+  exit 0
+}
+
+[[ "${1:-}" == "-h" || "${1:-}" == "--help" ]] && usage
+[[ -n "${1:-}" ]] || {
+  echo "Usage: pds-get-profile.sh <DID> [ACTOR_STORE_ROOT]" >&2
+  exit 1
+}
+
+DID="$1"
+[[ "$DID" == did:* ]] || die "DID must start with did:"
+
+ACTOR_ROOT_ARG="${2:-}"
+
+if [[ -n "${PDS_ENV_FILE:-}" ]]; then
+  [[ -f "$PDS_ENV_FILE" ]] || die "PDS_ENV_FILE not found: $PDS_ENV_FILE"
+  set -a
+  # shellcheck disable=SC1090
+  . "$PDS_ENV_FILE"
+  set +a
+fi
+
+resolve_actor_root() {
+  if [[ -n "${ACTOR_ROOT_ARG:-}" ]]; then
+    printf '%s' "$ACTOR_ROOT_ARG"
+  elif [[ -n "${ACTOR_STORE_ROOT:-}" ]]; then
+    printf '%s' "$ACTOR_STORE_ROOT"
+  elif [[ -n "${PDS_ACTOR_STORE_DIRECTORY:-}" ]]; then
+    printf '%s' "$PDS_ACTOR_STORE_DIRECTORY"
+  elif [[ -n "${PDS_DATA_DIRECTORY:-}" ]]; then
+    printf '%s' "${PDS_DATA_DIRECTORY}/actors"
+  else
+    printf '%s' "./actors"
+  fi
+}
+
+ACTOR_ROOT="$(resolve_actor_root)"
+[[ -d "$ACTOR_ROOT" ]] || die "actor store directory not found: $ACTOR_ROOT"
+
+command -v openssl >/dev/null 2>&1 || die "openssl is required"
+command -v sqlite3 >/dev/null 2>&1 || die "sqlite3 not in PATH"
+
+HASH="$(printf '%s' "$DID" | openssl dgst -sha256 | awk '{print $2}')"
+SHARD="${HASH:0:2}"
+DB="${ACTOR_ROOT}/${SHARD}/${DID}/store.sqlite"
+
+[[ -f "$DB" ]] || die "store.sqlite not found: $DB"
+
+VERBOSE="${VERBOSE:-0}"
+
+echo "DID: $DID"
+echo "DB: $DB"
+echo ""
+
+# --- Healthy: block bytes present (default: length only; VERBOSE=1: hex content)
+echo "=== Profile with repo_block (healthy) ==="
+if [[ "$VERBOSE" == "1" ]]; then
+  sqlite3 -header -column "$DB" <<'SQL'
+SELECT
+  r.uri,
+  r.cid,
+  r.indexedAt,
+  LENGTH(rb.content) AS content_bytes,
+  hex(rb.content) AS content_hex
+FROM record AS r
+INNER JOIN repo_block AS rb ON rb.cid = r.cid
+WHERE r.collection = 'app.bsky.actor.profile'
+  AND r.rkey = 'self';
+SQL
+else
+  sqlite3 -header -column "$DB" <<'SQL'
+SELECT
+  r.uri,
+  r.cid,
+  r.indexedAt,
+  LENGTH(rb.content) AS content_bytes
+FROM record AS r
+INNER JOIN repo_block AS rb ON rb.cid = r.cid
+WHERE r.collection = 'app.bsky.actor.profile'
+  AND r.rkey = 'self';
+SQL
+fi
+
+echo ""
+echo "=== Profile index row without repo_block (dangling / busted) ==="
+sqlite3 -header -column "$DB" <<'SQL'
+SELECT r.uri, r.cid, r.indexedAt
+FROM record AS r
+LEFT JOIN repo_block AS rb ON rb.cid = r.cid
+WHERE r.collection = 'app.bsky.actor.profile'
+  AND r.rkey = 'self'
+  AND rb.cid IS NULL;
+SQL
diff --git a/pds-scan-dangling-records.sh b/pds-scan-dangling-records.sh
new file mode 100755
index 00000000..3a88f6b0
--- /dev/null
+++ b/pds-scan-dangling-records.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# Scan all actor store.sqlite files for dangling record rows (record.cid missing in repo_block).
+# Optionally filter to busted app.bsky.actor.profile/self only.
+# DIDs with a dangling profile/self record are appended to a file (reset each run).
+#
+# Usage:
+#   ./pds-scan-dangling-records.sh [ACTOR_STORE_ROOT]
+#   ONLY_PROFILE=1 ./pds-scan-dangling-records.sh /mnt/data/pds/actors
+#   VERBOSE=1 ONLY_PROFILE=1 ./pds-scan-dangling-records.sh
+#   DANGLING_PROFILE_DIDS_FILE=/tmp/busted-dids.txt ./pds-scan-dangling-records.sh
+#
+# Resolution order for ACTOR_STORE_ROOT:
+#   $1 > $PDS_ACTOR_STORE_DIRECTORY > $PDS_DATA_DIRECTORY/actors > ./actors
+
+set -euo pipefail
+
+# Source the host's pds.env when present (skipped, not fatal, when absent).
+if [[ -f /home/bluesky/pds.env ]]; then
+  set -a
+  . /home/bluesky/pds.env
+  set +a
+fi
+
+resolve_actor_root() {
+  if [[ -n "${1:-}" ]]; then
+    printf '%s' "$1"
+  elif [[ -n "${PDS_ACTOR_STORE_DIRECTORY:-}" ]]; then
+    printf '%s' "$PDS_ACTOR_STORE_DIRECTORY"
+  elif [[ -n "${PDS_DATA_DIRECTORY:-}" ]]; then
+    printf '%s' "${PDS_DATA_DIRECTORY}/actors"
+  else
+    printf '%s' "./actors"
+  fi
+}
+
+ACTOR_ROOT="$(resolve_actor_root "${1:-}")"
+
+if [[ ! -d "$ACTOR_ROOT" ]]; then
+  echo "ERROR: actor store directory not found: $ACTOR_ROOT" >&2
+  exit 1
+fi
+
+if ! command -v sqlite3 >/dev/null 2>&1; then
+  echo "ERROR: sqlite3 not in PATH" >&2
+  exit 1
+fi
+
+DANGLING_PROFILE_DIDS_FILE="${DANGLING_PROFILE_DIDS_FILE:-./dangling-profile-dids.txt}"
+: >"$DANGLING_PROFILE_DIDS_FILE"
+
+# SQL string literals use single quotes. Double-quoted text is an identifier in
+# SQLite and newer sqlite3 shells reject it as a string, which previously made
+# the profile query fail and be silently counted as 0.
+
+# total dangling: record row whose cid has no repo_block row
+SQL_TOTAL='SELECT COUNT(*) FROM record AS r LEFT JOIN repo_block AS rb ON rb.cid = r.cid WHERE rb.cid IS NULL;'
+
+# dangling specifically for profile/self (same predicate as pds-get-profile.sh)
+SQL_PROFILE="SELECT COUNT(*) FROM record AS r LEFT JOIN repo_block AS rb ON rb.cid = r.cid WHERE rb.cid IS NULL AND r.collection = 'app.bsky.actor.profile' AND r.rkey = 'self';"
+
+# sample of dangling URIs (for debugging)
+SQL_LIST="SELECT r.uri || ' | ' || r.cid FROM record AS r LEFT JOIN repo_block AS rb ON rb.cid = r.cid WHERE rb.cid IS NULL LIMIT 50;"
+
+ONLY_PROFILE="${ONLY_PROFILE:-0}"
+VERBOSE="${VERBOSE:-0}"
+
+actors_any=0
+actors_profile=0
+
+while IFS= read -r -d '' db; do
+  tot="$(sqlite3 "$db" "$SQL_TOTAL" 2>/dev/null)" || {
+    echo "WARN: sqlite3 failed (skip): $db" >&2
+    continue
+  }
+
+  if [[ -z "${tot:-}" ]] || [[ "$tot" -eq 0 ]]; then
+    continue
+  fi
+
+  prof="$(sqlite3 "$db" "$SQL_PROFILE" 2>/dev/null)" || {
+    echo "WARN: profile query failed, treating as 0: $db" >&2
+    prof=0
+  }
+  if [[ "${ONLY_PROFILE}" == "1" ]] && [[ "${prof:-0}" -eq 0 ]]; then
+    continue
+  fi
+
+  rel="${db#"$ACTOR_ROOT"/}"
+  echo "DANGLING store=$rel"
+  echo "  total_dangling_rows=$tot profile_self_dangling=$prof"
+
+  if [[ "${prof:-0}" -gt 0 ]]; then
+    # PDS layout: ACTOR_ROOT/<shard>/<did>/store.sqlite
+    did="$(basename "$(dirname "$rel")")"
+    printf '%s\n' "$did" >>"$DANGLING_PROFILE_DIDS_FILE"
+  fi
+
+  if [[ "$VERBOSE" == "1" ]]; then
+    sqlite3 "$db" "$SQL_LIST" 2>/dev/null | sed 's/^/    /' || true
+    echo
+  fi
+
+  actors_any=$((actors_any + 1))
+  if [[ "${prof:-0}" -gt 0 ]]; then
+    actors_profile=$((actors_profile + 1))
+  fi
+done < <(find "$ACTOR_ROOT" -name store.sqlite -type f -print0)
+
+echo "---"
+echo "summary: actor_dbs_with_dangling=${actors_any}"
+echo "summary: actor_dbs_with_busted_profile_self=${actors_profile}"
+echo "scanned_root=${ACTOR_ROOT}"
+echo "dangling_profile_dids_file=${DANGLING_PROFILE_DIDS_FILE}"
diff --git a/service/run-rebuild.js b/service/run-rebuild.js
new file mode 100644
index 00000000..265d362c
--- /dev/null
+++ b/service/run-rebuild.js
@@ -0,0 +1,29 @@
+'use strict'
+
+// Run the @atproto/pds 'rebuild-repo' script for a single DID.
+//
+// Usage: node run-rebuild.js <did>
+//
+// readEnv() is called with no arguments; pds-fix-dangling-profile.sh exports
+// the pds.env variables before invoking this file.
+const { AppContext, readEnv, envToCfg, envToSecrets, scripts } = require('@atproto/pds')
+
+async function main() {
+  const did = process.argv[2]
+  if (!did || !did.startsWith('did:')) {
+    throw new Error('Usage: node run-rebuild.js <did>')
+  }
+  const env = readEnv()
+  console.log(`Running updates on ${env.hostname}`)
+  const ctx = await AppContext.fromConfig(envToCfg(env), envToSecrets(env))
+  // NOTE(review): confirm this call shape against the installed @atproto/pds.
+  // Upstream's script runner appears to invoke scripts as (ctx, argsArray);
+  // if 'rebuild-repo' has that signature here, pass [did] instead of did.
+  await scripts['rebuild-repo'](ctx, did, false)
+  console.log('DONE')
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1)
+})