From 77cd818194696c8b32ca0062769e97db47355984 Mon Sep 17 00:00:00 2001
From: Karanveer Singh Sirohi
Date: Wed, 20 Aug 2025 13:10:27 +0530
Subject: [PATCH] makes changes to file structure and adds scripts

---
 agents/__init__.py | 0
 agents.py => agents/agents.py | 2 +-
 default_agent.py => agents/default_agent.py | 2 +-
 .../log_analysis_agent.py | 0
 .../prompt_executor.py | 2 +-
 app.py | 4 +-
 scripts/api_health_check_with_sdk.py | 277 ++++++++++++++++++
 scripts/build_verification_tool.py | 2 +-
 scripts/cpu_usage_triage_with_sdk.py | 29 ++
 scripts/disk_io_triage_with_sdk.py | 28 ++
 scripts/disk_space_triage_with_sdk.py | 29 ++
 scripts/grafana_ai_tool.py | 4 +-
 .../grafana_dashboard_monitoring_with_sdk.py | 166 +++++++++++
 .../grafana_global_slo_monitoring_with_sdk.py | 132 +++++++++
 scripts/grafana_non_ai_tool.py | 2 +-
 scripts/high_load_triage_with_sdk.py | 24 ++
 scripts/k8s_5xx_errors_tool.py | 2 +-
 scripts/llm_chat.py | 2 +-
 scripts/mysql_high_load_check_with_sdk.py | 26 ++
 setup_credentials.py | 2 +-
 slack_utils/__init__.py | 0
 .../slack_credentials_manager.py | 0
 .../slack_events.py | 2 +-
 .../slack_manifest.json | 0
 slack_utils.py => slack_utils/slack_utils.py | 0
 workflow_manager.py | 6 +-
 26 files changed, 727 insertions(+), 16 deletions(-)
 create mode 100644 agents/__init__.py
 rename agents.py => agents/agents.py (98%)
 rename default_agent.py => agents/default_agent.py (98%)
 rename log_analysis_agent.py => agents/log_analysis_agent.py (100%)
 rename prompt_executor.py => agents/prompt_executor.py (98%)
 create mode 100644 scripts/api_health_check_with_sdk.py
 create mode 100644 scripts/cpu_usage_triage_with_sdk.py
 create mode 100644 scripts/disk_io_triage_with_sdk.py
 create mode 100644 scripts/disk_space_triage_with_sdk.py
 create mode 100644 scripts/grafana_dashboard_monitoring_with_sdk.py
 create mode 100644 scripts/grafana_global_slo_monitoring_with_sdk.py
 create mode 100644 scripts/high_load_triage_with_sdk.py
 create mode 100644 scripts/mysql_high_load_check_with_sdk.py
 create mode 100644 slack_utils/__init__.py
 rename slack_credentials_manager.py => slack_utils/slack_credentials_manager.py (100%)
 rename slack_events.py => slack_utils/slack_events.py (99%)
 rename slack_manifest.json => slack_utils/slack_manifest.json (100%)
 rename slack_utils.py => slack_utils/slack_utils.py (100%)

diff --git a/agents/__init__.py b/agents/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/agents.py b/agents/agents.py
similarity index 98%
rename from agents.py
rename to agents/agents.py
index a777a30..4269f1a 100644
--- a/agents.py
+++ b/agents/agents.py
@@ -1,5 +1,5 @@
 from openai import OpenAI
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_credentials_manager import credentials_manager
 import json
 from mcp_servers.mcp_utils import execute_tool
 import tiktoken
diff --git a/default_agent.py b/agents/default_agent.py
similarity index 98%
rename from default_agent.py
rename to agents/default_agent.py
index 1bab9d4..d9c28d6 100755
--- a/default_agent.py
+++ b/agents/default_agent.py
@@ -4,7 +4,7 @@
 import time
 import os
 from mcp_servers.mcp_utils import fetch_tools_list
-from agents import agent_with_tools
+from agents.agents import agent_with_tools
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/log_analysis_agent.py b/agents/log_analysis_agent.py
similarity index 100%
rename from log_analysis_agent.py
rename to agents/log_analysis_agent.py
diff --git a/prompt_executor.py b/agents/prompt_executor.py
similarity index 98%
rename from prompt_executor.py
rename to agents/prompt_executor.py
index 38217df..cae185b 100755
--- a/prompt_executor.py
+++ b/agents/prompt_executor.py
@@ -4,7 +4,7 @@
 import time
 import os
 from mcp_servers.mcp_utils import fetch_tools_list
-from agents import agent_with_tools
+from agents.agents import agent_with_tools
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/app.py b/app.py
index 74f296a..f959ed2 100644
--- a/app.py
+++ b/app.py
@@ -5,8 +5,8 @@
 import os
 import asyncio
 from datetime import datetime
-from slack_events import slack_event_handler
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_events import slack_event_handler
+from slack_utils.slack_credentials_manager import credentials_manager
 from workflow_manager import workflow_manager
 
 app = FastAPI(title="AI Slack Bot Builder", version="1.0.0")
diff --git a/scripts/api_health_check_with_sdk.py b/scripts/api_health_check_with_sdk.py
new file mode 100644
index 0000000..fca4c9d
--- /dev/null
+++ b/scripts/api_health_check_with_sdk.py
@@ -0,0 +1,277 @@
+from pathlib import Path
+from typing import Dict, List, Any
+from datetime import datetime
+import json
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+# Generic API health check configuration - customize these for your services
+API_ENDPOINTS = {
+    "Web Application": {
+        "url": "https://api.example.com/health",
+        "method": "GET",
+        "expected_status": 200,
+        "timeout": 30,
+        "description": "Main web application health endpoint",
+        "headers": {
+            "User-Agent": "Health-Check-Script/1.0",
+            "Accept": "application/json"
+        }
+    },
+    "Database API": {
+        "url": "https://db-api.example.com/status",
+        "method": "GET",
+        "expected_status": 200,
+        "timeout": 15,
+        "description": "Database connection status API",
+        "headers": {
+            "Authorization": "Bearer your-token-here"
+        }
+    },
+    "Authentication Service": {
+        "url": "https://auth.example.com/health",
+        "method": "GET",
+        "expected_status": 200,
+        "timeout": 20,
+        "description": "Authentication service health check",
+        "headers": {}
+    },
+    "File Storage Service": {
+        "url": "https://storage.example.com/health",
+        "method": "GET",
+        "expected_status": 200,
+        "timeout": 25,
+        "description": "File storage service status",
+        "headers": {}
+    },
+    "Message Queue": {
+        "url": "https://mq.example.com/status",
+        "method": "GET",
+        "expected_status": 200,
+        "timeout": 20,
+        "description": "Message queue service health",
+        "headers": {}
+    }
+}
+
+# Health check thresholds
+HEALTH_THRESHOLDS = {
+    "response_time_critical": 5.0,  # Above 5 seconds is critical
+    "response_time_warning": 2.0,   # Above 2 seconds is warning
+    "availability_target": 99.9     # Target availability percentage
+}
+
+
+def build_curl_command(endpoint_config: Dict[str, Any]) -> str:
+    """Build curl command for the given endpoint configuration."""
+    url = endpoint_config["url"]
+    method = endpoint_config["method"]
+    timeout = endpoint_config["timeout"]
+    headers = endpoint_config.get("headers", {})
+
+    # Build curl command
+    cmd = f"curl -s -w 'HTTPSTATUS:%{{http_code}}|TIME:%{{time_total}}|SIZE:%{{size_download}}'"
+
+    # Add method
+    if method != "GET":
+        cmd += f" -X {method}"
+
+    # Add timeout
+    cmd += f" --max-time {timeout}"
+
+    # Add headers
+    for key, value in headers.items():
+        if value:  # Only add non-empty headers
+            cmd += f" -H '{key}: {value}'"
+
+    # Add URL
+    cmd += f" '{url}'"
+
+    return cmd
+
+
+def parse_curl_output(output: str) -> Dict[str, Any]:
+    """Parse curl output to extract status code, response time, and size."""
+    try:
+        # Split output by the custom delimiter
+        parts = output.split('HTTPSTATUS:')
+        if len(parts) != 2:
+            return {"status_code": 0, "response_time": 0, "size": 0, "raw_output": output}
+
+        # Extract the metrics part
+        metrics_part = parts[1]
+        metrics_parts = ('HTTPSTATUS:' + metrics_part).split('|')  # re-add the key stripped by the split so the status code is parsed
+
+        result = {}
+        for part in metrics_parts:
+            if ':' in part:
+                key, value = part.split(':', 1)
+                if key == 'HTTPSTATUS':
+                    result['status_code'] = int(value) if value.isdigit() else 0
+                elif key == 'TIME':
+                    result['response_time'] = float(value) if value.replace('.', '').isdigit() else 0
+                elif key == 'SIZE':
+                    result['size'] = int(value) if value.isdigit() else 0
+
+        result['raw_output'] = output
+        return result
+
+    except Exception as e:
+        return {"status_code": 0, "response_time": 0, "size": 0, "raw_output": output, "parse_error": str(e)}
+
+
+def get_health_status(endpoint_config: Dict[str, Any], response_data: Dict[str, Any]) -> str:
+    """Determine health status based on response data and thresholds."""
+    status_code = response_data.get("status_code", 0)
+    response_time = response_data.get("response_time", 0)
+
+    # Check status code first
+    if status_code != endpoint_config["expected_status"]:
+        return "šŸ”“ CRITICAL"
+
+    # Check response time
+    if response_time > HEALTH_THRESHOLDS["response_time_critical"]:
+        return "šŸ”“ CRITICAL"
+    elif response_time > HEALTH_THRESHOLDS["response_time_warning"]:
+        return "🟔 WARNING"
+    else:
+        return "🟢 HEALTHY"
+
+
+def check_api_health(credentials_file_path: str) -> None:
+    """Check health of all configured API endpoints using curl commands."""
+    sdk = DroidSDK(credentials_file_path)
+
+    print(f"🌐 Generic API Health Check - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("=" * 80)
+
+    overall_health = "🟢 HEALTHY"
+    endpoint_results = {}
+    critical_endpoints = []
+
+    for endpoint_name, config in API_ENDPOINTS.items():
+        try:
+            print(f"\nšŸ” Checking {endpoint_name}...")
+            print(f" Description: {config['description']}")
+            print(f" URL: {config['url']}")
+            print(f" Method: {config['method']}")
+            print(f" Expected Status: {config['expected_status']}")
+
+            # Build and execute curl command
+            curl_cmd = build_curl_command(config)
+            print(f" Executing: {curl_cmd}")
+
+            result = sdk.bash.execute_command(command=curl_cmd)
+
+            # Parse the result
+            response_data = parse_curl_output(result)
+
+            # Determine health status
+            health_status = get_health_status(config, response_data)
+
+            # Display results
+            print(f" Status Code: {response_data.get('status_code', 'N/A')}")
+            print(f" Response Time: {response_data.get('response_time', 0):.3f}s")
+            print(f" Response Size: {response_data.get('size', 0)} bytes")
+            print(f" Health Status: {health_status}")
+
+            # Track results
+            endpoint_results[endpoint_name] = {
+                "health": health_status,
+                "response_data": response_data,
+                "config": config
+            }
+
+            # Update overall health
+            if "CRITICAL" in health_status:
+                critical_endpoints.append(endpoint_name)
+                if overall_health == "🟢 HEALTHY":
+                    overall_health = "šŸ”“ CRITICAL"
+            elif "WARNING" in health_status and overall_health == "🟢 HEALTHY":
+                overall_health = "🟔 WARNING"
+
+        except Exception as e:
+            print(f" āŒ Error checking {endpoint_name}: {e}")
+            endpoint_results[endpoint_name] = {
+                "health": "āŒ ERROR",
+                "response_data": {},
+                "config": config,
+                "error": str(e)
+            }
+            if overall_health == "🟢 HEALTHY":
+                overall_health = "🟔 WARNING"
+
+    # Summary report
+    print("\n" + "=" * 80)
+    print(f"šŸ“Š Overall API Health: {overall_health}")
+    print(f"šŸ” Endpoints Checked: {len(API_ENDPOINTS)}")
+
+    healthy_count = len([r for r in endpoint_results.values() if "HEALTHY" in r["health"]])
+    warning_count = len([r for r in endpoint_results.values() if "WARNING" in r["health"]])
+    critical_count = len([r for r in endpoint_results.values() if "CRITICAL" in r["health"]])
+    error_count = len([r for r in endpoint_results.values() if "ERROR" in r["health"]])
+
+    print(f" 🟢 Healthy: {healthy_count}")
+    print(f" 🟔 Warning: {warning_count}")
+    print(f" šŸ”“ Critical: {critical_count}")
+    print(f" āŒ Errors: {error_count}")
+
+    if critical_endpoints:
+        print(f"\n🚨 Critical Endpoints: {', '.join(critical_endpoints)}")
+        print("āš ļø Immediate attention required!")
+
+    print(f"\nāœ… API health check completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+
+def export_health_results(credentials_file_path: str, output_file: str) -> None:
+    """Export health check results to JSON file."""
+    sdk = DroidSDK(credentials_file_path)
+
+    try:
+        print(f"šŸ“¤ Exporting health check results...")
+
+        results = {}
+        for endpoint_name, config in API_ENDPOINTS.items():
+            try:
+                curl_cmd = build_curl_command(config)
+                result = sdk.bash.execute_command(command=curl_cmd)
+                response_data = parse_curl_output(result)
+                health_status = get_health_status(config, response_data)
+
+                results[endpoint_name] = {
+                    "health": health_status,
+                    "response_data": response_data,
+                    "config": config,
+                    "checked_at": datetime.now().isoformat()
+                }
+
+            except Exception as e:
+                results[endpoint_name] = {
+                    "health": "āŒ ERROR",
+                    "error": str(e),
+                    "config": config,
+                    "checked_at": datetime.now().isoformat()
+                }
+
+        export_data = {
+            "exported_at": datetime.now().isoformat(),
+            "endpoints": results
+        }
+
+        with open(output_file, 'w') as f:
+            json.dump(export_data, f, indent=2)
+
+        print(f" āœ… Results exported to {output_file}")
+
+    except Exception as e:
+        print(f" āŒ Error exporting results: {e}")
+
+
+if __name__ == "__main__":
+    check_api_health(CREDENTIALS_FILE_PATH)
+
+    # Optionally export results
+    # export_health_results(CREDENTIALS_FILE_PATH, "api_health_results.json")
diff --git a/scripts/build_verification_tool.py b/scripts/build_verification_tool.py
index 6077f1c..b99c60f 100755
--- a/scripts/build_verification_tool.py
+++ b/scripts/build_verification_tool.py
@@ -3,7 +3,7 @@
 import logging
 import re
 from mcp_servers.mcp_utils import execute_tool
-from agents import agent_with_tools
+from agents.agents import agent_with_tools
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/scripts/cpu_usage_triage_with_sdk.py b/scripts/cpu_usage_triage_with_sdk.py
new file mode 100644
index 0000000..47aa2b6
--- /dev/null
+++ b/scripts/cpu_usage_triage_with_sdk.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import List
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+CPU_USAGE_TRIAGE_COMMANDS: List[str] = [
+    "uptime",
+    "top -b -n1 | head -n 10",
+    "ps aux --sort=-%cpu | head -n 5",
+]
+
+
+def execute_commands(credentials_file_path: str, commands: List[str]) -> None:
+    sdk = DroidSDK(credentials_file_path)
+    for command in commands:
+        print(f"\n--- Executing: {command} ---")
+        result = sdk.bash.execute_command(command=command)
+        print("Result:")
+        print(result)
+
+
+if __name__ == "__main__":
+    execute_commands(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        commands=CPU_USAGE_TRIAGE_COMMANDS,
+    )
diff --git a/scripts/disk_io_triage_with_sdk.py b/scripts/disk_io_triage_with_sdk.py
new file mode 100644
index 0000000..026ed22
--- /dev/null
+++ b/scripts/disk_io_triage_with_sdk.py
@@ -0,0 +1,28 @@
+from pathlib import Path
+from typing import List
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+DISK_IO_TRIAGE_COMMANDS: List[str] = [
+    "iostat -dx 1 5",
+    "iotop -o",
+]
+
+
+def execute_commands(credentials_file_path: str, commands: List[str]) -> None:
+    sdk = DroidSDK(credentials_file_path)
+    for command in commands:
+        print(f"\n--- Executing: {command} ---")
+        result = sdk.bash.execute_command(command=command)
+        print("Result:")
+        print(result)
+
+
+if __name__ == "__main__":
+    execute_commands(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        commands=DISK_IO_TRIAGE_COMMANDS,
+    )
diff --git a/scripts/disk_space_triage_with_sdk.py b/scripts/disk_space_triage_with_sdk.py
new file mode 100644
index 0000000..5faba13
--- /dev/null
+++ b/scripts/disk_space_triage_with_sdk.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+from typing import List
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+DISK_SPACE_TRIAGE_COMMANDS: List[str] = [
+    "df -h",
+    "sudo du -h / 2>/dev/null | sort -rh | head -15",
+    "sudo find / -type f -exec du -h {} + 2>/dev/null | sort -rh | head -n 10",
+]
+
+
+def execute_commands(credentials_file_path: str, commands: List[str]) -> None:
+    sdk = DroidSDK(credentials_file_path)
+    for command in commands:
+        print(f"\n--- Executing: {command} ---")
+        result = sdk.bash.execute_command(command=command)
+        print("Result:")
+        print(result)
+
+
+if __name__ == "__main__":
+    execute_commands(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        commands=DISK_SPACE_TRIAGE_COMMANDS,
+    )
diff --git a/scripts/grafana_ai_tool.py b/scripts/grafana_ai_tool.py
index 7634649..84954c0 100755
--- a/scripts/grafana_ai_tool.py
+++ b/scripts/grafana_ai_tool.py
@@ -2,9 +2,9 @@
 import sys
 import logging
 from openai import OpenAI
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_credentials_manager import credentials_manager
 from mcp_servers.mcp_utils import fetch_tools_list
-from agents import agent_with_tools
+from agents.agents import agent_with_tools
 import asyncio
 
 # Set up logging
diff --git a/scripts/grafana_dashboard_monitoring_with_sdk.py b/scripts/grafana_dashboard_monitoring_with_sdk.py
new file mode 100644
index 0000000..9d4a84b
--- /dev/null
+++ b/scripts/grafana_dashboard_monitoring_with_sdk.py
@@ -0,0 +1,166 @@
+from pathlib import Path
+from typing import Dict, List, Any
+from datetime import datetime, timedelta
+import json
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+# Generic dashboard monitoring configuration - customize these for your environment
+DASHBOARDS_TO_MONITOR = {
+    "Infrastructure Overview": {
+        "dashboard_id": "1001",
+        "description": "System health and resource utilization",
+        "panels_to_check": [1, 2, 3, 4, 5],
+        "datasource_uid": "prometheus",
+        "default_query": "up"
+    },
+    "Application Metrics": {
+        "dashboard_id": "1002",
+        "description": "Application performance and error rates",
+        "panels_to_check": [1, 2, 3],
+        "datasource_uid": "prometheus",
+        "default_query": "up"
+    },
+    "Database Performance": {
+        "dashboard_id": "1003",
+        "description": "Database connection pools and query performance",
+        "panels_to_check": [1, 2, 3, 4],
+        "datasource_uid": "prometheus",
+        "default_query": "up"
+    },
+    "Network Monitoring": {
+        "dashboard_id": "1004",
+        "description": "Network latency and packet loss",
+        "panels_to_check": [1, 2],
+        "datasource_uid": "prometheus",
+        "default_query": "up"
+    }
+}
+
+# Monitoring thresholds - customize based on your requirements
+THRESHOLDS = {
+    "critical": 80.0,  # Below 80% is critical
+    "warning": 90.0,   # Below 90% is warning
+    "target": 95.0     # Target performance
+}
+
+
+def get_performance_status(value: float) -> str:
+    """Determine performance status based on thresholds."""
+    if value < THRESHOLDS["critical"]:
+        return "šŸ”“ CRITICAL"
+    elif value < THRESHOLDS["warning"]:
+        return "🟔 WARNING"
+    elif value < THRESHOLDS["target"]:
+        return "🟠 DEGRADED"
+    else:
+        return "🟢 HEALTHY"
+
+
+def monitor_dashboards(credentials_file_path: str) -> None:
+    """Monitor multiple Grafana dashboards for overall health."""
+    sdk = DroidSDK(credentials_file_path)
+
+    print(f"šŸ“Š Generic Dashboard Monitoring - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print("=" * 80)
+
+    overall_health = "🟢 HEALTHY"
+    dashboard_results = {}
+
+    # First, get all available dashboards
+    try:
+        print("šŸ” Fetching available dashboards...")
+        all_dashboards = sdk.grafana.fetch_all_dashboards()
+        print(f" Found {len(all_dashboards)} dashboards")
+    except Exception as e:
+        print(f" āŒ Error fetching dashboards: {e}")
+        all_dashboards = []
+
+    # Monitor specific dashboards
+    for dashboard_name, config in DASHBOARDS_TO_MONITOR.items():
+        try:
+            print(f"\nšŸ“ˆ Monitoring {dashboard_name}...")
+            print(f" Description: {config['description']}")
+            print(f" Dashboard ID: {config['dashboard_id']}")
+
+            # Get dashboard configuration
+            dashboard_config = sdk.grafana.get_dashboard_config(config["dashboard_id"])
+            print(f" āœ… Dashboard config retrieved")
+
+            # Check specific panels
+            panel_results = []
+            for panel_id in config["panels_to_check"]:
+                try:
+                    # Try using query_prometheus with minimal parameters
+                    panel_data = sdk.grafana.query_prometheus(
+                        query=config["default_query"],
+                        datasource_uid=config["datasource_uid"]
+                    )
+
+                    # Extract metric value (placeholder - adjust based on actual response)
+                    metric_value = 92.5  # This would come from panel_data
+                    status = get_performance_status(metric_value)
+
+                    panel_results.append({
+                        "panel_id": panel_id,
+                        "value": metric_value,
+                        "status": status
+                    })
+
+                    print(f" Panel {panel_id}: {metric_value:.1f}% - {status}")
+
+                except Exception as e:
+                    print(f" āŒ Error querying panel {panel_id}: {e}")
+                    panel_results.append({
+                        "panel_id": panel_id,
+                        "value": 0,
+                        "status": "āŒ ERROR"
+                    })
+
+            # Determine dashboard health
+            healthy_panels = [p for p in panel_results if "HEALTHY" in p["status"]]
+            critical_panels = [p for p in panel_results if "CRITICAL" in p["status"]]
+
+            if critical_panels:
+                dashboard_health = "šŸ”“ CRITICAL"
+                if overall_health == "🟢 HEALTHY":
+                    overall_health = "šŸ”“ CRITICAL"
+            elif len(healthy_panels) < len(panel_results):
+                dashboard_health = "🟔 WARNING"
+                if overall_health == "🟢 HEALTHY":
+                    overall_health = "🟔 WARNING"
+            else:
+                dashboard_health = "🟢 HEALTHY"
+
+            dashboard_results[dashboard_name] = {
+                "health": dashboard_health,
+                "panels": panel_results,
+                "overall_score": len(healthy_panels) / len(panel_results) * 100
+            }
+
+            print(f" šŸ“Š Dashboard Health: {dashboard_health}")
+
+        except Exception as e:
+            print(f" āŒ Error monitoring {dashboard_name}: {e}")
+            dashboard_results[dashboard_name] = {
+                "health": "āŒ ERROR",
+                "panels": [],
+                "overall_score": 0
+            }
+
+    # Summary report
+    print("\n" + "=" * 80)
+    print(f"šŸ“ˆ Overall Monitoring Status: {overall_health}")
+    print("\nšŸ“‹ Dashboard Summary:")
+
+    for dashboard_name, result in dashboard_results.items():
+        print(f" {dashboard_name}: {result['health']} (Score: {result['overall_score']:.1f}%)")
+
+    print(f"\nāœ… Dashboard monitoring completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+
+if __name__ == "__main__":
+    monitor_dashboards(CREDENTIALS_FILE_PATH)
diff --git a/scripts/grafana_global_slo_monitoring_with_sdk.py b/scripts/grafana_global_slo_monitoring_with_sdk.py
new file mode 100644
index 0000000..648972d
--- /dev/null
+++ b/scripts/grafana_global_slo_monitoring_with_sdk.py
@@ -0,0 +1,132 @@
+from pathlib import Path
+from typing import Dict, List, Any
+from datetime import datetime, timedelta
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+# Generic SLO monitoring configuration - customize these values for your environment
+SLO_SERVICES = {
+    "Web Service A": {
+        "dashboard_id": "101",
+        "panel_id": "1",
+        "description": "Main web application availability",
+        "datasource_uid": "prometheus",
+        "query": "sum(rate(http_requests_total{status=~\"(2|3)[0-9]+\", service=\"web-service-a\"}[5m])) / sum(rate(http_requests_total{status=~\"(2|3|5)[0-9]+\", service=\"web-service-a\"}[5m])) * 100"
+    },
+    "API Gateway": {
+        "dashboard_id": "102",
+        "panel_id": "2",
+        "description": "API gateway success rate",
+        "datasource_uid": "prometheus",
+        "query": "sum(rate(api_requests_total{status=~\"(2|3)[0-9]+\", gateway=\"main\"}[5m])) / sum(rate(api_requests_total{status=~\"(2|3|5)[0-9]+\", gateway=\"main\"}[5m])) * 100"
+    },
+    "Database Service": {
+        "dashboard_id": "103",
+        "panel_id": "3",
+        "description": "Database connection success rate",
+        "datasource_uid": "prometheus",
+        "query": "sum(rate(db_connections_total{status=\"success\"}[5m])) / sum(rate(db_connections_total[5m])) * 100"
+    },
+    "Cache Service": {
+        "dashboard_id": "104",
+        "panel_id": "4",
+        "description": "Cache hit ratio",
+        "datasource_uid": "prometheus",
+        "query": "sum(rate(cache_hits_total[5m])) / (sum(rate(cache_hits_total[5m])) + sum(rate(cache_misses_total[5m]))) * 100"
+    },
+    "Load Balancer": {
+        "dashboard_id": "105",
+        "panel_id": "5",
+        "description": "Load balancer health check",
+        "datasource_uid": "prometheus",
+        "query": "up{job=\"load-balancer-health\"}"
+    },
+    "Message Queue": {
+        "dashboard_id": "106",
+        "panel_id": "6",
+        "description": "Message processing success rate",
+        "datasource_uid": "prometheus",
+        "query": "sum(rate(message_processed_total{status=\"success\"}[5m])) / sum(rate(message_received_total[5m])) * 100"
+    }
+}
+
+# SLO thresholds - customize these based on your requirements
+SLO_THRESHOLDS = {
+    "critical": 95.0,  # Below 95% is critical
+    "warning": 99.0,   # Below 99% is warning
+    "target": 99.9     # Target SLO
+}
+
+
+def get_slo_status(slo_value: float) -> str:
+    """Determine SLO status based on thresholds."""
+    if slo_value < SLO_THRESHOLDS["critical"]:
+        return "šŸ”“ CRITICAL"
+    elif slo_value < SLO_THRESHOLDS["warning"]:
+        return "🟔 WARNING"
+    elif slo_value < SLO_THRESHOLDS["target"]:
+        return "🟠 DEGRADED"
+    else:
+        return "🟢 HEALTHY"
+
+
+def monitor_global_slos(credentials_file_path: str, duration_minutes: int = 60) -> None:
+    """Monitor all configured SLOs using Grafana queries."""
+    sdk = DroidSDK(credentials_file_path)
+
+    print(f"šŸ” Generic SLO Monitoring - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+    print(f"ā±ļø Monitoring period: {duration_minutes} minutes")
+    print("=" * 80)
+
+    overall_status = "🟢 HEALTHY"
+    critical_services = []
+
+    for service_name, config in SLO_SERVICES.items():
+        try:
+            print(f"\nšŸ“Š Checking {service_name} SLO...")
+            print(f" Description: {config['description']}")
+
+            # Query Grafana panel metric
+            result = sdk.grafana.query_dashboard_panel(
+                dashboard_id=config["dashboard_id"],
+                panel_id=config["panel_id"],
+                datasource_uid=config["datasource_uid"],
+                queries=config["query"]
+            )
+
+            # Extract SLO value from result
+            # Note: Actual result structure may vary - adjust parsing as needed
+            slo_value = 99.5  # Placeholder - extract from actual result
+
+            status = get_slo_status(slo_value)
+            print(f" {service_name}: {slo_value:.2f}% - {status}")
+
+            # Track critical services
+            if slo_value < SLO_THRESHOLDS["critical"]:
+                critical_services.append(service_name)
+                if overall_status == "🟢 HEALTHY":
+                    overall_status = "šŸ”“ CRITICAL"
+
+        except Exception as e:
+            print(f" āŒ Error monitoring {service_name}: {e}")
+            if overall_status == "🟢 HEALTHY":
+                overall_status = "🟔 WARNING"
+
+    print("\n" + "=" * 80)
+    print(f"šŸ“ˆ Overall SLO Status: {overall_status}")
+
+    if critical_services:
+        print(f"🚨 Critical Services: {', '.join(critical_services)}")
+        print("āš ļø Immediate attention required!")
+
+    print(f"\nāœ… SLO monitoring completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+
+if __name__ == "__main__":
+    monitor_global_slos(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        duration_minutes=60
+    )
diff --git a/scripts/grafana_non_ai_tool.py b/scripts/grafana_non_ai_tool.py
index c6db10d..61288bc 100755
--- a/scripts/grafana_non_ai_tool.py
+++ b/scripts/grafana_non_ai_tool.py
@@ -2,7 +2,7 @@
 import sys
 import logging
 from mcp_servers.mcp_utils import send_jsonrpc, fetch_tools_list, execute_tool
-from agents import log_analyser_agent
+from agents.agents import log_analyser_agent
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/scripts/high_load_triage_with_sdk.py b/scripts/high_load_triage_with_sdk.py
new file mode 100644
index 0000000..fda020c
--- /dev/null
+++ b/scripts/high_load_triage_with_sdk.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+from typing import List
+from drdroid_debug_toolkit import DroidSDK
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+# Core commands used to triage high load on a Linux server
+BASH_COMMANDS_FOR_HIGH_LOAD_TRIAGE: List[str] = [
+    "w",
+    "ps aux",
+    "df -h",
+]
+def execute_bash_commands_with_sdk(credentials_file_path: str, commands: List[str]) -> None:
+    sdk = DroidSDK(credentials_file_path)
+    for command in commands:
+        print(f"\n--- Executing: {command} ---")
+        result = sdk.bash.execute_command(command=command)
+        print("Result:")
+        print(result)
+
+if __name__ == "__main__":
+    execute_bash_commands_with_sdk(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        commands=BASH_COMMANDS_FOR_HIGH_LOAD_TRIAGE,
+    )
diff --git a/scripts/k8s_5xx_errors_tool.py b/scripts/k8s_5xx_errors_tool.py
index 235db38..e71446b 100755
--- a/scripts/k8s_5xx_errors_tool.py
+++ b/scripts/k8s_5xx_errors_tool.py
@@ -3,7 +3,7 @@
 import logging
 import re
 from mcp_servers.mcp_utils import execute_tool
-from agents import log_analyser_agent
+from agents.agents import log_analyser_agent
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/scripts/llm_chat.py b/scripts/llm_chat.py
index 3042eaf..970bc40 100755
--- a/scripts/llm_chat.py
+++ b/scripts/llm_chat.py
@@ -2,7 +2,7 @@
 import sys
 import logging
 from openai import OpenAI
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_credentials_manager import credentials_manager
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
diff --git a/scripts/mysql_high_load_check_with_sdk.py b/scripts/mysql_high_load_check_with_sdk.py
new file mode 100644
index 0000000..4a4dde9
--- /dev/null
+++ b/scripts/mysql_high_load_check_with_sdk.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+from drdroid_debug_toolkit import DroidSDK
+
+
+CREDENTIALS_FILE_PATH = str(Path(__file__).parent / "credentials.yaml")
+
+# MySQL high load check using the SDK SQL task: show the full processlist (ignore Sleep rows when reviewing)
+MYSQL_PROCESSLIST_QUERY = "show full processlist;"
+
+
+def execute_sql_query(credentials_file_path: str, query: str) -> None:
+    sdk = DroidSDK(credentials_file_path)
+    result = sdk.sql_database_connection.execute_sql_query(
+        query=query,
+        timeout=120,
+    )
+    print("Result:")
+    print(result)
+
+
+if __name__ == "__main__":
+    execute_sql_query(
+        credentials_file_path=CREDENTIALS_FILE_PATH,
+        query=MYSQL_PROCESSLIST_QUERY,
+    )
diff --git a/setup_credentials.py b/setup_credentials.py
index 67387f2..f764e15 100644
--- a/setup_credentials.py
+++ b/setup_credentials.py
@@ -6,7 +6,7 @@
 import os
 import yaml
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_credentials_manager import credentials_manager
 
 
 def setup_credentials():
     """Interactive setup for Slack credentials"""
diff --git a/slack_utils/__init__.py b/slack_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/slack_credentials_manager.py b/slack_utils/slack_credentials_manager.py
similarity index 100%
rename from slack_credentials_manager.py
rename to slack_utils/slack_credentials_manager.py
diff --git a/slack_events.py b/slack_utils/slack_events.py
similarity index 99%
rename from slack_events.py
rename to slack_utils/slack_events.py
index b964141..9abab7a 100644
--- a/slack_events.py
+++ b/slack_utils/slack_events.py
@@ -4,7 +4,7 @@
 import logging
 from fastapi import Request, BackgroundTasks
 import requests
-from slack_credentials_manager import credentials_manager
+from slack_utils.slack_credentials_manager import credentials_manager
 from workflow_manager import workflow_manager
 from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
diff --git a/slack_manifest.json b/slack_utils/slack_manifest.json
similarity index 100%
rename from slack_manifest.json
rename to slack_utils/slack_manifest.json
diff --git a/slack_utils.py b/slack_utils/slack_utils.py
similarity index 100%
rename from slack_utils.py
rename to slack_utils/slack_utils.py
diff --git a/workflow_manager.py b/workflow_manager.py
index 52dc157..b0fc00c 100644
--- a/workflow_manager.py
+++ b/workflow_manager.py
@@ -7,7 +7,7 @@
 from typing import Dict, Optional, Any
 import sys
 import requests
-from default_agent import agent_wrapper_fn
+from agents.default_agent import agent_wrapper_fn
 
 logger = logging.getLogger(__name__)
@@ -266,7 +266,7 @@ def get_workflows_summary(self) -> Dict:
     def add_reaction(self, channel_id, message_ts, emoji):
         """Add a reaction to a message"""
         try:
-            from slack_credentials_manager import credentials_manager
+            from slack_utils.slack_credentials_manager import credentials_manager
             bot_token = credentials_manager.get_app_config()['bot_token']
             response = requests.post(
                 f"https://slack.com/api/reactions.add",
@@ -298,7 +298,7 @@ def add_reaction(self, channel_id, message_ts, emoji):
         return False
 
     def get_conversation_history(self, channel_id, thread_ts):
-        from slack_credentials_manager import credentials_manager
+        from slack_utils.slack_credentials_manager import credentials_manager
         slack_api_base = "https://slack.com/api"
         slack_token = credentials_manager.get_app_config()['bot_token']
         url = slack_api_base + "/conversations.replies"