diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 762a5d69a..32627c1b3 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1500,6 +1500,55 @@ commands: help: "Target cluster pool ID or name" dest: target_pool type: str + - name: collect-logs + help: Collect simplyblock container logs for a given time window. + arguments: + - name: "start_time" + help: "Start of the collection window (UTC assumed if no timezone given). Formats: \"2024-01-15T10:00:00\" or \"2024-01-15 10:00:00\"" + dest: start_time + type: str + - name: "duration_minutes" + help: "Duration in minutes." + dest: duration_minutes + type: int + - name: "--output-dir" + help: "Directory to write the output tarball (default: current directory)." + dest: output_dir + type: str + default: "." + - name: "--use-opensearch" + help: "Query OpenSearch directly via scroll API instead of the Graylog REST API. + Useful for very large result sets or when Graylog is unreachable." + dest: use_opensearch + type: bool + default: false + action: store_true + - name: "--cluster-id" + help: "Target a specific cluster UUID (default: first cluster returned by sbctl)." + dest: cluster_id + type: str + - name: "--mgmt-ip" + help: "Override the management-node IP used to reach Graylog / OpenSearch." + dest: mgmt_ip + type: str + - name: "--monitoring-secret" + help: "Graylog / OpenSearch password to use instead of the cluster secret. + When provided this takes precedence over the cluster secret." + dest: monitoring_secret + type: str + - name: "--namespace" + help: "Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock). + Pass an empty string to skip kubectl collection." + dest: namespace + type: str + default: "simplyblock" + - name: "--diagnose" + help: "Print a diagnostic report from OpenSearch (indices, field names, + sample documents, container names present in the time window) and exit without collecting logs. + Use this when collections return 0 to understand the actual data layout. Implies --use-opensearch." + dest: diagnose + type: bool + action: store_true - name: "volume" help: "Logical volume commands." aliases: diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 366f1e59f..34848b6cb 100755 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -379,6 +379,7 @@ def init_cluster(self): self.init_cluster__set(subparser) self.init_cluster__change_name(subparser) self.init_cluster__add_replication(subparser) + self.init_cluster__collect_logs(subparser) def init_cluster__create(self, subparser): @@ -579,6 +580,18 @@ def init_cluster__add_replication(self, subparser): argument = subcommand.add_argument('--timeout', help='Snapshot replication network timeout', type=int, default=3600, dest='timeout') argument = subcommand.add_argument('--target-pool', help='Target cluster pool ID or name', type=str, dest='target_pool') + def init_cluster__collect_logs(self, subparser): + subcommand = self.add_sub_command(subparser, 'collect-logs', 'Collect simplyblock container logs for a given time window.') + subcommand.add_argument('start_time', help='Start of the collection window (UTC assumed if no timezone given). Formats: "2024-01-15T10:00:00" or "2024-01-15 10:00:00"', type=str) + subcommand.add_argument('duration_minutes', help='Duration in minutes.', type=int) + argument = subcommand.add_argument('--output-dir', help='Directory to write the output tarball (default: current directory).', type=str, default='.', dest='output_dir') + argument = subcommand.add_argument('--use-opensearch', help='Query OpenSearch directly via scroll API instead of the Graylog REST API. Useful for very large result sets or when Graylog is unreachable.', default=False, dest='use_opensearch', action='store_true') + argument = subcommand.add_argument('--cluster-id', help='Target a specific cluster UUID (default: first cluster returned by sbctl).', type=str, dest='cluster_id') + argument = subcommand.add_argument('--mgmt-ip', help='Override the management-node IP used to reach Graylog / OpenSearch.', type=str, dest='mgmt_ip') + argument = subcommand.add_argument('--monitoring-secret', help='Graylog / OpenSearch password to use instead of the cluster secret. When provided this takes precedence over the cluster secret.', type=str, dest='monitoring_secret') + argument = subcommand.add_argument('--namespace', help='Kubernetes namespace to collect CSI / storage-node DS pod logs from (default: simplyblock). Pass an empty string to skip kubectl collection.', type=str, default='simplyblock', dest='namespace') + argument = subcommand.add_argument('--diagnose', help='Print a diagnostic report from OpenSearch (indices, field names, sample documents, container names present in the time window) and exit without collecting logs. Use this when collections return 0 to understand the actual data layout. Implies --use-opensearch.', dest='diagnose', action='store_true') + def init_volume(self): subparser = self.add_command('volume', 'Logical volume commands.', aliases=['lvol',]) @@ -1289,6 +1302,8 @@ def run(self): ret = self.cluster__change_name(sub_command, args) elif sub_command in ['add-replication']: ret = self.cluster__add_replication(sub_command, args) + elif sub_command in ['collect-logs']: + ret = self.cluster__collect_logs(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 69adb5646..146ef4a38 100755 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -16,6 +16,7 @@ from simplyblock_core.controllers import health_controller from simplyblock_core.models.pool import Pool from simplyblock_core.models.cluster import Cluster +from simplyblock_core.scripts.collect_logs import collect_logs def range_type(min, max): @@ -527,6 +528,10 @@ def cluster__complete_expand(self, sub_command, args): def cluster__add_replication(self, sub_command, args): return cluster_ops.add_replication(args.cluster_id, args.target_cluster_id, args.timeout, args.target_pool) + def cluster__collect_logs(self, sub_command, args): + return collect_logs(args.start_time, args.duration_minutes, args.output_dir, args.use_opensearch, + args.cluster_id, args.mgmt_ip, args.monitoring_secret, args.namespace, args.diagnose) + def volume__add(self, sub_command, args): import json as _json name = args.name diff --git a/simplyblock_core/scripts/collect_logs.py b/simplyblock_core/scripts/collect_logs.py index 62e78a0fb..cc397bd05 100755 --- a/simplyblock_core/scripts/collect_logs.py +++ b/simplyblock_core/scripts/collect_logs.py @@ -834,10 +834,15 @@ def main(): if args.diagnose: args.use_opensearch = True + collect_logs(args.start_time, args.duration_minutes, args.output_dir, args.use_opensearch, args.cluster_id, + args.mgmt_ip, args.monitoring_secret, args.namespace, args.diagnose) + +def collect_logs(start_time, duration_minutes, output_dir, use_opensearch, cluster_id, mgmt_ip, monitoring_secret, + namespace, diagnose): # ── 1. Parse time range ────────────────────────────────────────────────── try: - start_dt = datetime.fromisoformat(args.start_time.replace(" ", "T")) + start_dt = datetime.fromisoformat(start_time.replace(" ", "T")) except ValueError as exc: print(f"ERROR: invalid start_time – {exc}", file=sys.stderr) sys.exit(1) @@ -845,20 +850,20 @@ def main(): if start_dt.tzinfo is None: start_dt = start_dt.replace(tzinfo=timezone.utc) - end_dt = start_dt + timedelta(minutes=args.duration_minutes) + end_dt = start_dt + timedelta(minutes=duration_minutes) from_iso = start_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z") to_iso = end_dt.strftime("%Y-%m-%dT%H:%M:%S.000Z") print("=" * 64) print(" Simplyblock Log Collector") print("=" * 64) - print(f" Window : {from_iso} → {to_iso} ({args.duration_minutes} min)") - print(f" Mode : {'OpenSearch (direct)' if args.use_opensearch else 'Graylog REST API'}") + print(f" Window : {from_iso} → {to_iso} ({duration_minutes} min)") + print(f" Mode : {'OpenSearch (direct)' if use_opensearch else 'Graylog REST API'}") # ── 2. Cluster UUID + secret ───────────────────────────────────────────── print("\n[1] Retrieving cluster info …") - cluster_uuid = args.cluster_id + cluster_uuid = cluster_id if not cluster_uuid: clusters = sbctl_json("cluster", "list") if not clusters: @@ -877,8 +882,7 @@ def main(): # ── 3. Management-node IP ──────────────────────────────────────────────── print("\n[2] Resolving management node …") - if args.mgmt_ip: - mgmt_ip = args.mgmt_ip + if mgmt_ip: print(f" Using provided IP : {mgmt_ip}") else: cp_nodes = sbctl_json("control-plane", "list") @@ -902,8 +906,8 @@ def main(): # ── 5. HTTP sessions ───────────────────────────────────────────────────── - graylog_password = args.monitoring_secret if args.monitoring_secret else cluster_secret - if args.monitoring_secret: + graylog_password = monitoring_secret if monitoring_secret else cluster_secret + if monitoring_secret: print(" Using provided --monitoring-secret for Graylog auth.") gl_session = requests.Session() @@ -913,7 +917,7 @@ def main(): os_session = requests.Session() # Verify Graylog reachability (informational only) - if not args.use_opensearch: + if not use_opensearch: print(f"\n[4] Checking Graylog at {graylog_base} …") try: r = gl_session.get(f"{graylog_base}/system", timeout=10) @@ -937,15 +941,15 @@ def main(): print(f" WARN: {exc}.") # --diagnose: print full report and exit - if args.diagnose: + if diagnose: opensearch_diagnose(os_session, opensearch_base, from_iso, to_iso) sys.exit(0) # ── 6. Prepare temp workspace ──────────────────────────────────────────── ts_str = start_dt.strftime("%Y%m%d_%H%M%S") - bundle_name = f"sb_logs_{ts_str}_{args.duration_minutes}m" - output_dir = Path(args.output_dir).resolve() + bundle_name = f"sb_logs_{ts_str}_{duration_minutes}m" + output_dir = Path(output_dir).resolve() output_dir.mkdir(parents=True, exist_ok=True) tarball_path = output_dir / f"{bundle_name}.tar.gz" @@ -956,7 +960,7 @@ def main(): os_session=os_session, graylog_base=graylog_base, opensearch_base=opensearch_base, - use_opensearch=args.use_opensearch, + use_opensearch=use_opensearch, from_iso=from_iso, to_iso=to_iso, probe_cache=probe_cache, @@ -1045,7 +1049,7 @@ def main(): # ── 9. Kubernetes pod logs (CSI node + storage-node DS) ────────────── - k8s_ns = args.namespace + k8s_ns = namespace if k8s_ns: print(f"\n[7] Collecting Kubernetes pod logs (namespace: {k8s_ns}) …") k8s_dir = log_root / "k8s_pods" @@ -1176,10 +1180,10 @@ def save_sbctl(label, cmd_args, out_name, use_json=False): "collected_at": datetime.now(timezone.utc).isoformat(), "window_from": from_iso, "window_to": to_iso, - "duration_minutes": args.duration_minutes, + "duration_minutes": duration_minutes, "cluster_uuid": cluster_uuid, "mgmt_ip": mgmt_ip, - "mode": "opensearch-direct" if args.use_opensearch else "graylog-api", + "mode": "opensearch-direct" if use_opensearch else "graylog-api", "storage_nodes": [ { "hostname": n.get("Hostname"),