123 lines
3.6 KiB
Python
123 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Health Check Script - Monitor the monitoring service
|
|
Can be used with cron or external monitoring tools
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
# Log file whose modification time check_log_recent() inspects to decide
# whether the monitor has been active recently.
LOG_FILE = "/var/log/pikvm-monitor.log"
|
|
# Presumably the monitor's persisted state file.
# NOTE(review): no check in this script reads STATE_FILE - confirm whether it
# is still needed or is used by another module.
STATE_FILE = "/var/lib/pikvm-monitor/state.txt"
|
|
|
|
def check_log_recent(log_file=None, max_age_minutes=10):
    """Check if the monitor has logged something recently.

    The log file's modification time is compared with the current local
    time; the check passes when the file was modified within the allowed
    window. A one-line result is printed in every case.

    Args:
        log_file: Path of the log file to inspect. When omitted, the
            module-level ``LOG_FILE`` is used.
        max_age_minutes: Maximum accepted age of the last modification,
            in minutes (default: 10).

    Returns:
        True if the file exists and was modified no longer than
        ``max_age_minutes`` ago; False if it is missing, stale, or could
        not be inspected.
    """
    target = LOG_FILE if log_file is None else log_file

    try:
        # stat() directly instead of exists() followed by stat(): a single
        # lookup, and no window in which the file can vanish in between.
        mtime = Path(target).stat().st_mtime
        age = datetime.now() - datetime.fromtimestamp(mtime)
    except FileNotFoundError:
        print(f"✗ Log file not found: {target}")
        return False
    except (OSError, OverflowError, ValueError) as e:
        # OSError: permission problems and similar; OverflowError/ValueError:
        # an mtime that cannot be converted to a datetime.
        print(f"✗ Error checking log: {e}")
        return False

    if age > timedelta(minutes=max_age_minutes):
        print(f"✗ Log file not updated recently (last: {age.total_seconds():.0f}s ago)")
        return False

    print(f"✓ Log file updated {age.total_seconds():.0f}s ago")
    return True
|
|
|
|
def check_container_running():
    """Check if the pikvm-monitor Docker container is running.

    Queries the local Docker daemon through the ``docker`` SDK (imported
    lazily so the script still works where the SDK is not installed) and
    looks for containers whose name contains ``"pikvm-monitor"``.

    Returns:
        True if a matching container is running, or if Docker could not be
        queried at all (the monitor may be deployed without Docker, so this
        case is deliberately not treated as a failure). False if no
        matching container exists or none of the matches is running.
    """
    try:
        import docker

        client = docker.from_env()
        # containers.list() returns only *running* containers unless
        # all=True. Without it, a stopped or exited container would be
        # reported as "not found" and the non-running report below could
        # never be reached.
        matches = [
            (container.name, container.status)
            for container in client.containers.list(all=True)
            if "pikvm-monitor" in container.name
        ]
    except Exception as e:
        print(f"⚠ Could not check container (not using Docker?): {e}")
        return True  # Don't fail if not using Docker

    if not matches:
        print("✗ Container pikvm-monitor not found")
        return False

    # Look for a running match first: with all=True a stale, stopped
    # container may be listed ahead of the live one.
    for name, status in matches:
        if status == "running":
            print(f"✓ Container {name} is running")
            return True

    for name, status in matches:
        print(f"✗ Container {name} is {status}")
    return False
|
|
|
|
def check_process_running():
    """Check if the Python monitor process is running.

    Runs ``pgrep -f`` to search the full command line of every process for
    ``monitor.py`` and prints a one-line result.

    Returns:
        True if at least one matching process other than this script is
        found, or if the check itself could not be performed (pgrep is
        missing, timed out, or reported an error); this is treated as
        inconclusive rather than as a failure. False if pgrep ran
        successfully and found no matching process.
    """
    import os
    import subprocess

    try:
        result = subprocess.run(
            # pgrep patterns are regular expressions: escape the dot so it
            # matches a literal "." rather than any character.
            ["pgrep", "-f", r"monitor\.py"],
            capture_output=True,
            text=True,
            timeout=10,  # never let a health check hang its caller
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"⚠ Could not check process: {e}")
        return True

    # pgrep exit status: 0 = match found, 1 = no match, anything else means
    # pgrep itself failed, which is inconclusive rather than "not running".
    if result.returncode not in (0, 1):
        print(f"⚠ Could not check process: pgrep exited with status {result.returncode}")
        return True

    # Ignore this script's own PID in case its command line happens to
    # contain the pattern (e.g. a file name ending in "monitor.py").
    own_pid = str(os.getpid())
    matching_pids = [pid for pid in result.stdout.split() if pid != own_pid]

    if matching_pids:
        print("✓ Monitor process is running")
        return True

    print("✗ Monitor process not running")
    return False
|
|
|
|
def check_network(host="8.8.8.8", port=53, timeout=3):
    """Check basic network connectivity with a TCP connection attempt.

    Args:
        host: Host to connect to (default: 8.8.8.8).
        port: TCP port to connect to (default: 53).
        timeout: Connection timeout in seconds (default: 3).

    Returns:
        True if the TCP connection could be established, False if the
        attempt failed or timed out. A one-line result is printed either way.
    """
    import socket

    try:
        # The context manager closes the probe socket as soon as the
        # connection has been established, so no descriptor is leaked.
        with socket.create_connection((host, port), timeout=timeout):
            pass
    except OSError:
        # OSError covers refused/unreachable/timeout/DNS errors. A bare
        # "except:" would also swallow KeyboardInterrupt and SystemExit.
        print("✗ Network connectivity issue")
        return False

    print("✓ Network connectivity OK")
    return True
|
|
|
|
def main(checks=None):
    """Run the health checks, print a report, and exit with a status code.

    Args:
        checks: Optional iterable of ``(name, callable)`` pairs. Each
            callable takes no arguments and returns a truthy value when
            the check passes. When omitted, the four built-in checks
            (process, container, log file, network) are run.

    Exits (via ``sys.exit``) with:
        0 - every check passed.
        1 - exactly one check failed ("minor issues").
        2 - two or more checks failed ("critical issues").

    A check that raises an exception is reported and counted as failed.
    """
    if checks is None:
        checks = [
            ("Process Running", check_process_running),
            ("Container Running", check_container_running),
            ("Log File Recent", check_log_recent),
            ("Network Connectivity", check_network),
        ]

    print("=" * 60)
    print("PiKVM Monitor Health Check")
    print("=" * 60)
    print()

    results = []
    for name, check_func in checks:
        print(f"Checking: {name}...")
        try:
            # bool() keeps the tally correct even if a check returns a
            # non-bool value such as None or an int.
            results.append(bool(check_func()))
        except Exception as e:
            print(f"✗ Check failed: {e}")
            results.append(False)
        print()

    passed = sum(results)
    total = len(results)

    print("=" * 60)
    print(f"Results: {passed}/{total} checks passed")

    if passed == total:
        print("✓ Monitor is healthy")
        sys.exit(0)
    elif passed >= total - 1:
        print("⚠ Monitor has minor issues")
        sys.exit(1)
    else:
        print("✗ Monitor has critical issues")
        sys.exit(2)
|
|
|
|
# Script entry point: run the health check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|