177 lines
6.0 KiB
Python
177 lines
6.0 KiB
Python
#!/usr/bin/env python3
"""
PiKVM Auto-Restart Monitor

Monitors host connectivity and performs a hard reset if downtime exceeds the threshold.
"""
|
|
|
|
import ping3
|
|
import time
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from gpiozero import Button
|
|
except ImportError:
|
|
# Fallback if gpiozero not available
|
|
import RPi.GPIO as GPIO
|
|
|
|
# Runtime settings. Every value can be overridden through the environment
# variable of the same (upper-case) name; the literals below are the defaults.
CONFIG = {
    # Addresses probed by the monitor.
    "host_ip": os.environ.get("HOST_IP", "192.168.1.10"),
    "gateway_ip": os.environ.get("GATEWAY_IP", "192.168.1.1"),
    # Seconds between connectivity checks (default: 3 minutes).
    "ping_interval": int(os.environ.get("PING_INTERVAL", "180")),
    # Minutes of accumulated downtime before a reset is triggered.
    "downtime_threshold": int(os.environ.get("DOWNTIME_THRESHOLD", "15")),
    # GPIO number (BCM numbering in the RPi.GPIO path) wired to the power button.
    "power_button_gpio": int(os.environ.get("POWER_BUTTON_GPIO", "17")),
    # Seconds the button is held to force power-off.
    "long_press_duration": float(os.environ.get("LONG_PRESS_DURATION", "5")),
    # Seconds the button is held to power back on.
    "short_press_duration": float(os.environ.get("SHORT_PRESS_DURATION", "1")),
    # Seconds to wait between power-off and power-on.
    "wait_before_reboot": int(os.environ.get("WAIT_BEFORE_REBOOT", "90")),
    # Destination of the log file handler.
    "log_file": os.environ.get("LOG_FILE", "/var/log/pikvm-monitor.log"),
}
|
|
|
|
# Setup logging: every record goes both to the configured file and to stderr.
# Only create the parent directory when the path actually has one; for a bare
# filename os.path.dirname() returns "" and os.makedirs("") would raise.
log_dir = os.path.dirname(CONFIG["log_file"])
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(CONFIG["log_file"]),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
|
|
|
# Append-only record of performed resets; make sure its directory exists
# before anything tries to write to it.
STATE_FILE = "/var/lib/pikvm-monitor/state.txt"
Path(STATE_FILE).parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
def press_power_button(duration):
    """Simulate a power button press via GPIO.

    Drives the pin configured as ``CONFIG["power_button_gpio"]`` high for
    ``duration`` seconds and then low again. gpiozero is preferred; RPi.GPIO
    is used when gpiozero cannot be imported.

    Args:
        duration: How long to hold the line high, in seconds.

    Raises:
        Exception: Any GPIO/library error is logged and re-raised.
    """
    pin = CONFIG["power_button_gpio"]
    try:
        try:
            from gpiozero import DigitalOutputDevice
        except ImportError:
            DigitalOutputDevice = None

        logger.info(f"Pressing power button for {duration} seconds")

        if DigitalOutputDevice is not None:
            # The line has to be actively driven, so an output device is used.
            # The context manager closes the device so the pin can be claimed
            # again by the next press.
            with DigitalOutputDevice(pin, initial_value=False) as line:
                line.on()
                try:
                    time.sleep(duration)
                finally:
                    # Never leave the button "held" if the sleep is interrupted.
                    line.off()
        else:
            # Fallback to RPi.GPIO
            import RPi.GPIO as GPIO

            GPIO.setmode(GPIO.BCM)
            try:
                GPIO.setup(pin, GPIO.OUT, initial=GPIO.LOW)
                GPIO.output(pin, GPIO.HIGH)
                try:
                    time.sleep(duration)
                finally:
                    GPIO.output(pin, GPIO.LOW)
            finally:
                GPIO.cleanup()

        logger.info("Power button press complete")
    except Exception as e:
        logger.error(f"Error pressing power button: {e}")
        raise
|
|
|
|
|
|
def ping_host(ip_address):
    """Ping a host once and report whether it answered.

    Args:
        ip_address: Address (or hostname) to ping.

    Returns:
        True if a reply was received within 5 seconds, False on timeout,
        on a ping error, or if the ping call raised.
    """
    try:
        response = ping3.ping(ip_address, timeout=5)
    except Exception as e:
        logger.debug(f"Ping to {ip_address} failed: {e}")
        return False

    # ping3 signals a timeout with None and other failures (e.g. destination
    # unreachable, unknown host) with False. A successful reply is the delay
    # in seconds, which may legitimately be 0.0, so compare by identity
    # instead of relying on truthiness.
    return response is not None and response is not False
|
|
|
|
|
|
def check_host_alive():
    """Report whether the monitored host should be treated as up.

    The host is pinged first; if it does not answer, the gateway is pinged
    instead and a reachable gateway is treated as "host up".

    NOTE(review): because a reachable gateway counts as success, a host that
    is down while the network is healthy never produces a failure here.
    Confirm this is the intended policy.

    Returns:
        True if the host or, failing that, the gateway answered a ping;
        False if neither did.
    """
    host = CONFIG["host_ip"]
    if not ping_host(host):
        logger.debug(f"Host {host} not responding, trying gateway fallback")

        gateway = CONFIG["gateway_ip"]
        if not ping_host(gateway):
            logger.warning(f"Both host ({host}) and gateway ({gateway}) unreachable")
            return False

        logger.debug(f"Gateway {gateway} is alive (host assumed up)")
        return True

    logger.debug(f"Host {host} is alive")
    return True
|
|
|
|
|
|
def perform_reset():
    """Hard-reset the host: long press, pause, short press, then record it.

    The press durations and the pause come from CONFIG. After the sequence a
    timestamped line is appended to STATE_FILE.

    Raises:
        Exception: Any failure during the sequence is logged and re-raised.
    """
    banner = "=" * 60
    logger.warning(banner)
    logger.warning("INITIATING HARD RESET SEQUENCE")
    logger.warning(banner)

    try:
        # Force the machine off.
        logger.info("Step 1: Long press to power down")
        press_power_button(CONFIG["long_press_duration"])

        # Give it time to shut down fully before powering back on.
        pause = CONFIG["wait_before_reboot"]
        logger.info(f"Step 2: Waiting {pause} seconds for cool-down")
        time.sleep(pause)

        # Bring it back up.
        logger.info("Step 3: Short press to power on")
        press_power_button(CONFIG["short_press_duration"])

        logger.info("Step 4: Reset sequence complete")
        logger.warning(banner)

        # Keep a persistent trace of the reset.
        with open(STATE_FILE, "a") as state:
            entry = f"{datetime.now().isoformat()}: Reset performed\n"
            state.write(entry)
    except Exception as e:
        logger.error(f"Error during reset sequence: {e}")
        raise
|
|
|
|
|
|
def main():
    """Run the monitoring loop until interrupted.

    Every ``CONFIG["ping_interval"]`` seconds the host is checked. Consecutive
    failed checks are counted, and once ``failures * ping_interval`` reaches
    ``CONFIG["downtime_threshold"]`` minutes a hard reset is performed and the
    counter starts over. Unexpected errors are logged and the loop continues.
    Ctrl-C at any point (including while sleeping after an error) exits with
    status 0.
    """
    rule = "=" * 60
    logger.info(rule)
    logger.info("PiKVM Auto-Restart Monitor Started")
    logger.info(f"Configuration: Host={CONFIG['host_ip']}, Gateway={CONFIG['gateway_ip']}")
    logger.info(f"Ping interval={CONFIG['ping_interval']}s, Threshold={CONFIG['downtime_threshold']}min")
    logger.info(rule)

    consecutive_failures = 0

    # The KeyboardInterrupt handler wraps the whole loop so that an interrupt
    # during the post-error sleep is handled the same way as one during a check.
    try:
        while True:
            try:
                if check_host_alive():
                    consecutive_failures = 0
                    logger.info("✓ Host is alive")
                else:
                    consecutive_failures += 1
                    # Downtime is estimated from the number of failed checks.
                    downtime_seconds = consecutive_failures * CONFIG["ping_interval"]
                    downtime_minutes = downtime_seconds / 60
                    logger.warning(
                        f"✗ Host unreachable ({consecutive_failures} attempts, {downtime_minutes:.1f} min downtime)"
                    )

                    # Check if downtime threshold exceeded
                    if downtime_seconds >= CONFIG["downtime_threshold"] * 60:
                        logger.error(
                            f"Downtime threshold exceeded ({downtime_minutes:.1f} min). Initiating reset."
                        )
                        perform_reset()
                        consecutive_failures = 0  # Reset counter after reboot
            except Exception as e:
                # Keep the monitor alive; the failure counter is left as-is so
                # a failed reset is retried on the next failing check.
                logger.error(f"Unexpected error: {e}", exc_info=True)

            time.sleep(CONFIG["ping_interval"])
    except KeyboardInterrupt:
        logger.info("Monitor stopped by user")
        sys.exit(0)
|
|
|
|
|
|
# Start the monitor only when this file is executed as a script.
if __name__ == "__main__":
    main()
|