Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions simplyblock_cli/cli-reference.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1500,6 +1500,55 @@ commands:
help: "Target cluster pool ID or name"
dest: target_pool
type: str
# Definition of the "collect-logs" sub-command: collects simplyblock container
# logs for a time window. cli.py registers the same arguments in
# init_cluster__collect_logs.
- name: collect-logs
help: Collect simplyblock container logs for a given time window.
arguments:
# Positional arguments (names without a leading "--").
- name: "start_time"
help: "Start of the collection window (UTC assumed if no timezone given). Formats: \"2024-01-15T10:00:00\" or \"2024-01-15 10:00:00\""
dest: start_time
type: str
- name: "duration_minutes"
help: "Duration in minutes."
dest: duration_minutes
type: int
# Optional flags.
- name: "--output-dir"
help: "Directory to write the output tarball (default: current directory)."
dest: output_dir
type: str
default: "."
# Boolean switch (action: store_true): off unless the flag is given.
- name: "--use-opensearch"
help: "Query OpenSearch directly via scroll API instead of the Graylog REST API.
Useful for very large result sets or when Graylog is unreachable."
dest: use_opensearch
type: bool
default: false
action: store_true
# The next three options declare no "default" key; the fallbacks described in
# their help texts are not configured in this file.
- name: "--cluster-id"
help: "Target a specific cluster UUID (default: first cluster returned by sbctl)."
dest: cluster_id
type: str
- name: "--mgmt-ip"
help: "Override the management-node IP used to reach Graylog / OpenSearch."
dest: mgmt_ip
type: str
- name: "--monitoring-secret"
help: "Graylog / OpenSearch password to use instead of the cluster secret.
When provided this takes precedence over the cluster secret."
dest: monitoring_secret
type: str
- name: "--namespace"
help: "Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock).
Pass an empty string to skip kubectl collection."
dest: namespace
type: str
default: "simplyblock"
# Boolean switch (action: store_true). Unlike --use-opensearch, no explicit
# "default" key is declared here.
# NOTE(review): "Implies --use-opensearch" is applied in collect_logs.py main(),
# which sets use_opensearch when diagnose is set. Confirm the sbctl entry point
# (cluster__collect_logs) applies the same rule.
- name: "--diagnose"
help: "Print a diagnostic report from OpenSearch (indices, field names,
sample documents, container names present in the time window) and exit without collecting logs.
Use this when collections return 0 to understand the actual data layout. Implies --use-opensearch."
dest: diagnose
type: bool
action: store_true
- name: "volume"
help: "Logical volume commands."
aliases:
Expand Down
15 changes: 15 additions & 0 deletions simplyblock_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,7 @@ def init_cluster(self):
self.init_cluster__set(subparser)
self.init_cluster__change_name(subparser)
self.init_cluster__add_replication(subparser)
self.init_cluster__collect_logs(subparser)


def init_cluster__create(self, subparser):
Expand Down Expand Up @@ -579,6 +580,18 @@ def init_cluster__add_replication(self, subparser):
argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', type=int, default=3600, dest='timeout')
argument = subcommand.add_argument('--target-pool', help='Target cluster pool ID or name', type=str, dest='target_pool')

def init_cluster__collect_logs(self, subparser):
    """Register the ``collect-logs`` sub-command and its arguments.

    Args:
        subparser: Sub-parser collection the command is attached to; it is
            passed straight through to ``self.add_sub_command``.
    """
    subcommand = self.add_sub_command(
        subparser,
        'collect-logs',
        'Collect simplyblock container logs for a given time window.',
    )

    # Required positionals: where the window starts and how long it is.
    subcommand.add_argument(
        'start_time',
        help=(
            'Start of the collection window (UTC assumed if no timezone given). '
            'Formats: "2024-01-15T10:00:00" or "2024-01-15 10:00:00"'
        ),
        type=str,
    )
    subcommand.add_argument('duration_minutes', help='Duration in minutes.', type=int)

    # Optional flags. The add_argument() return values are not needed, so
    # they are not bound to a throwaway local.
    subcommand.add_argument(
        '--output-dir',
        help='Directory to write the output tarball (default: current directory).',
        type=str,
        default='.',
        dest='output_dir',
    )
    # store_true already defaults to False, so no explicit default is given.
    subcommand.add_argument(
        '--use-opensearch',
        help=(
            'Query OpenSearch directly via scroll API instead of the Graylog REST API. '
            'Useful for very large result sets or when Graylog is unreachable.'
        ),
        dest='use_opensearch',
        action='store_true',
    )
    subcommand.add_argument(
        '--cluster-id',
        help='Target a specific cluster UUID (default: first cluster returned by sbctl).',
        type=str,
        dest='cluster_id',
    )
    subcommand.add_argument(
        '--mgmt-ip',
        help='Override the management-node IP used to reach Graylog / OpenSearch.',
        type=str,
        dest='mgmt_ip',
    )
    subcommand.add_argument(
        '--monitoring-secret',
        help=(
            'Graylog / OpenSearch password to use instead of the cluster secret. '
            'When provided this takes precedence over the cluster secret.'
        ),
        type=str,
        dest='monitoring_secret',
    )
    subcommand.add_argument(
        '--namespace',
        help=(
            'Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock). '
            'Pass an empty string to skip kubectl collection.'
        ),
        type=str,
        default='simplyblock',
        dest='namespace',
    )
    subcommand.add_argument(
        '--diagnose',
        help=(
            'Print a diagnostic report from OpenSearch (indices, field names, '
            'sample documents, container names present in the time window) '
            'and exit without collecting logs. '
            'Use this when collections return 0 to understand the actual data layout. '
            'Implies --use-opensearch.'
        ),
        dest='diagnose',
        action='store_true',
    )


def init_volume(self):
subparser = self.add_command('volume', 'Logical volume commands.', aliases=['lvol',])
Expand Down Expand Up @@ -1289,6 +1302,8 @@ def run(self):
ret = self.cluster__change_name(sub_command, args)
elif sub_command in ['add-replication']:
ret = self.cluster__add_replication(sub_command, args)
elif sub_command in ['collect-logs']:
ret = self.cluster__collect_logs(sub_command, args)
else:
self.parser.print_help()

Expand Down
5 changes: 5 additions & 0 deletions simplyblock_cli/clibase.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from simplyblock_core.controllers import health_controller
from simplyblock_core.models.pool import Pool
from simplyblock_core.models.cluster import Cluster
from simplyblock_core.scripts.collect_logs import collect_logs


def range_type(min, max):
Expand Down Expand Up @@ -527,6 +528,10 @@ def cluster__complete_expand(self, sub_command, args):
def cluster__add_replication(self, sub_command, args):
    """Handle ``cluster add-replication`` by delegating to ``cluster_ops``.

    Args:
        sub_command: Name of the invoked sub-command (not used here).
        args: Parsed namespace providing ``cluster_id``, ``target_cluster_id``,
            ``timeout`` and ``target_pool``.

    Returns:
        The result of ``cluster_ops.add_replication``.
    """
    replication_params = (
        args.cluster_id,
        args.target_cluster_id,
        args.timeout,
        args.target_pool,
    )
    return cluster_ops.add_replication(*replication_params)

def cluster__collect_logs(self, sub_command, args):
    """Handle ``cluster collect-logs`` by running the log collector.

    Args:
        sub_command: Name of the invoked sub-command (not used here).
        args: Parsed namespace carrying the collect-logs options
            (``start_time``, ``duration_minutes``, ``output_dir``,
            ``use_opensearch``, ``cluster_id``, ``mgmt_ip``,
            ``monitoring_secret``, ``namespace``, ``diagnose``).

    Returns:
        The result of ``collect_logs``.
    """
    # --diagnose is documented as implying --use-opensearch. The standalone
    # script applies that in its main() before calling collect_logs(); this
    # entry point calls collect_logs() directly, so it must apply it too.
    use_opensearch = args.use_opensearch or args.diagnose

    # Keywords instead of nine positionals, so a change in parameter order
    # cannot silently route a value into the wrong parameter.
    return collect_logs(
        start_time=args.start_time,
        duration_minutes=args.duration_minutes,
        output_dir=args.output_dir,
        use_opensearch=use_opensearch,
        cluster_id=args.cluster_id,
        mgmt_ip=args.mgmt_ip,
        monitoring_secret=args.monitoring_secret,
        namespace=args.namespace,
        diagnose=args.diagnose,
    )

def volume__add(self, sub_command, args):
import json as _json
name = args.name
Expand Down
38 changes: 21 additions & 17 deletions simplyblock_core/scripts/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,31 +834,36 @@ def main():
if args.diagnose:
args.use_opensearch = True

collect_logs(args.start_time, args.duration_minutes, args.output_dir, args.use_opensearch, args.cluster_id,
args.mgmt_ip, args.monitoring_secret, args.namespace, args.diagnose)

def collect_logs(start_time, duration_minutes, output_dir, use_opensearch, cluster_id, mgmt_ip, monitoring_secret,
namespace, diagnose):
# ── 1. Parse time range ──────────────────────────────────────────────────

try:
start_dt = datetime.fromisoformat(args.start_time.replace(" ", "T"))
start_dt = datetime.fromisoformat(start_time.replace(" ", "T"))
except ValueError as exc:
print(f"ERROR: invalid start_time – {exc}", file=sys.stderr)
sys.exit(1)

if start_dt.tzinfo is None:
start_dt = start_dt.replace(tzinfo=timezone.utc)

end_dt = start_dt + timedelta(minutes=args.duration_minutes)
end_dt = start_dt + timedelta(minutes=duration_minutes)
from_iso = start_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
to_iso = end_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")

print("=" * 64)
print(" Simplyblock Log Collector")
print("=" * 64)
print(f" Window : {from_iso} → {to_iso} ({args.duration_minutes} min)")
print(f" Mode : {'OpenSearch (direct)' if args.use_opensearch else 'Graylog REST API'}")
print(f" Window : {from_iso} → {to_iso} ({duration_minutes} min)")
print(f" Mode : {'OpenSearch (direct)' if use_opensearch else 'Graylog REST API'}")

# ── 2. Cluster UUID + secret ─────────────────────────────────────────────

print("\n[1] Retrieving cluster info …")
cluster_uuid = args.cluster_id
cluster_uuid = cluster_id
if not cluster_uuid:
clusters = sbctl_json("cluster", "list")
if not clusters:
Expand All @@ -877,8 +882,7 @@ def main():
# ── 3. Management-node IP ────────────────────────────────────────────────

print("\n[2] Resolving management node …")
if args.mgmt_ip:
mgmt_ip = args.mgmt_ip
if mgmt_ip:
print(f" Using provided IP : {mgmt_ip}")
else:
cp_nodes = sbctl_json("control-plane", "list")
Expand All @@ -902,8 +906,8 @@ def main():

# ── 5. HTTP sessions ─────────────────────────────────────────────────────

graylog_password = args.monitoring_secret if args.monitoring_secret else cluster_secret
if args.monitoring_secret:
graylog_password = monitoring_secret if monitoring_secret else cluster_secret
if monitoring_secret:
print(" Using provided --monitoring-secret for Graylog auth.")

gl_session = requests.Session()
Expand All @@ -913,7 +917,7 @@ def main():
os_session = requests.Session()

# Verify Graylog reachability (informational only)
if not args.use_opensearch:
if not use_opensearch:
print(f"\n[4] Checking Graylog at {graylog_base} …")
try:
r = gl_session.get(f"{graylog_base}/system", timeout=10)
Expand All @@ -937,15 +941,15 @@ def main():
print(f" WARN: {exc}.")

# --diagnose: print full report and exit
if args.diagnose:
if diagnose:
opensearch_diagnose(os_session, opensearch_base, from_iso, to_iso)
sys.exit(0)

# ── 6. Prepare temp workspace ────────────────────────────────────────────

ts_str = start_dt.strftime("%Y%m%d_%H%M%S")
bundle_name = f"sb_logs_{ts_str}_{args.duration_minutes}m"
output_dir = Path(args.output_dir).resolve()
bundle_name = f"sb_logs_{ts_str}_{duration_minutes}m"
output_dir = Path(output_dir).resolve()
output_dir.mkdir(parents=True, exist_ok=True)
tarball_path = output_dir / f"{bundle_name}.tar.gz"

Expand All @@ -956,7 +960,7 @@ def main():
os_session=os_session,
graylog_base=graylog_base,
opensearch_base=opensearch_base,
use_opensearch=args.use_opensearch,
use_opensearch=use_opensearch,
from_iso=from_iso,
to_iso=to_iso,
probe_cache=probe_cache,
Expand Down Expand Up @@ -1045,7 +1049,7 @@ def main():

# ── 9. Kubernetes pod logs (CSI node + storage-node DS) ──────────────

k8s_ns = args.namespace
k8s_ns = namespace
if k8s_ns:
print(f"\n[7] Collecting Kubernetes pod logs (namespace: {k8s_ns}) …")
k8s_dir = log_root / "k8s_pods"
Expand Down Expand Up @@ -1176,10 +1180,10 @@ def save_sbctl(label, cmd_args, out_name, use_json=False):
"collected_at": datetime.now(timezone.utc).isoformat(),
"window_from": from_iso,
"window_to": to_iso,
"duration_minutes": args.duration_minutes,
"duration_minutes": duration_minutes,
"cluster_uuid": cluster_uuid,
"mgmt_ip": mgmt_ip,
"mode": "opensearch-direct" if args.use_opensearch else "graylog-api",
"mode": "opensearch-direct" if use_opensearch else "graylog-api",
"storage_nodes": [
{
"hostname": n.get("Hostname"),
Expand Down
Loading