first commit
This commit is contained in:
commit
0e10f1f135
31
.env.example
Normal file
31
.env.example
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# PiKVM Monitor Configuration
|
||||||
|
|
||||||
|
# IP address of the host to monitor
|
||||||
|
HOST_IP=192.168.1.10
|
||||||
|
|
||||||
|
# Fallback gateway IP (for network-level connectivity check)
|
||||||
|
GATEWAY_IP=192.168.1.1
|
||||||
|
|
||||||
|
# Interval between pings (in seconds) - default: 180 (3 minutes)
|
||||||
|
PING_INTERVAL=180
|
||||||
|
|
||||||
|
# Downtime threshold before triggering reset (in minutes) - default: 15
|
||||||
|
# 15 minutes = 5 failed pings at 3-minute intervals
|
||||||
|
DOWNTIME_THRESHOLD=15
|
||||||
|
|
||||||
|
# GPIO pin number for power button control - default: BCM pin 17
|
||||||
|
# Check your PiKVM documentation for the correct GPIO pin
|
||||||
|
POWER_BUTTON_GPIO=17
|
||||||
|
|
||||||
|
# Duration of long press to power down (in seconds) - default: 5
|
||||||
|
LONG_PRESS_DURATION=5
|
||||||
|
|
||||||
|
# Duration of short press to power on (in seconds) - default: 1
|
||||||
|
SHORT_PRESS_DURATION=1
|
||||||
|
|
||||||
|
# Wait time between power down and power on (in seconds) - default: 90 (1.5 min)
|
||||||
|
# Allows time for cooldown and system shutdown
|
||||||
|
WAIT_BEFORE_REBOOT=90
|
||||||
|
|
||||||
|
# Log file path (inside container at /var/log/pikvm-monitor.log)
|
||||||
|
LOG_FILE=/var/log/pikvm-monitor.log
|
||||||
45
.gitignore
vendored
Normal file
45
.gitignore
vendored
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# Environment variables
|
||||||
|
.env
|
||||||
|
!.env.example
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
.DS_Store
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
logs/
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
.dockerignore
|
||||||
23
Dockerfile
Normal file
23
Dockerfile
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
FROM python:3.11-bullseye

WORKDIR /app

# Install system dependencies for GPIO control.
# --no-install-recommends skips optional packages and keeps the image smaller.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3-gpiozero \
    python3-pip \
    iputils-ping \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the application so this layer
# stays cached when only monitor.py changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application
COPY monitor.py .

# Create directories for logs and state
RUN mkdir -p /var/log /var/lib/pikvm-monitor

# Run the monitor
CMD ["python3", "monitor.py"]
|
||||||
139
INSTALL_NATIVE.md
Normal file
139
INSTALL_NATIVE.md
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
# Alternative: Run without Docker
|
||||||
|
|
||||||
|
If you prefer to run directly on PiKVM without Docker, follow these steps:
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### 1. Install Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y python3 python3-pip python3-gpiozero iputils-ping
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Install Python Packages
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip3 install --user ping3==4.0.1 gpiozero==2.0.1 RPi.GPIO==0.7.0
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Copy Files
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo mkdir -p /opt/pikvm-monitor
|
||||||
|
sudo cp monitor.py /opt/pikvm-monitor/
|
||||||
|
sudo chown pikvm:pikvm /opt/pikvm-monitor/monitor.py
|
||||||
|
sudo chmod +x /opt/pikvm-monitor/monitor.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Option A: Run as systemd Service
|
||||||
|
|
||||||
|
### 1. Create systemd service file
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo tee /etc/systemd/system/pikvm-monitor.service << EOF
|
||||||
|
[Unit]
|
||||||
|
Description=PiKVM Auto-Restart Monitor
|
||||||
|
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
User=pikvm
|
||||||
|
WorkingDirectory=/opt/pikvm-monitor
|
||||||
|
ExecStart=/usr/bin/python3 /opt/pikvm-monitor/monitor.py
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
|
||||||
|
# GPIO requires this
|
||||||
|
SupplementaryGroups=gpio spi
|
||||||
|
|
||||||
|
# Environment variables
|
||||||
|
Environment="HOST_IP=192.168.1.10"
|
||||||
|
Environment="GATEWAY_IP=192.168.1.1"
|
||||||
|
Environment="POWER_BUTTON_GPIO=17"
|
||||||
|
Environment="PING_INTERVAL=180"
|
||||||
|
Environment="DOWNTIME_THRESHOLD=15"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Enable and Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable pikvm-monitor
|
||||||
|
sudo systemctl start pikvm-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Check Status
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo systemctl status pikvm-monitor
|
||||||
|
sudo journalctl -u pikvm-monitor -f
|
||||||
|
```
|
||||||
|
|
||||||
|
## Option B: Run in Screen/Tmux
|
||||||
|
|
||||||
|
```bash
|
||||||
|
screen -S pikvm-monitor python3 /opt/pikvm-monitor/monitor.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Option C: Cron-based Task
|
||||||
|
|
||||||
|
For lightweight monitoring, you could create a simple cron check, but systemd service is recommended for continuous monitoring.
|
||||||
|
|
||||||
|
## GPIO Permissions
|
||||||
|
|
||||||
|
Make sure the `pikvm` user has GPIO access:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo usermod -a -G gpio pikvm
|
||||||
|
sudo usermod -a -G spi pikvm
|
||||||
|
```
|
||||||
|
|
||||||
|
Then log out and log back in (or restart the `pikvm-monitor` service) so the new group membership takes effect.
|
||||||
|
|
||||||
|
## Logs
|
||||||
|
|
||||||
|
Logs go to `/var/log/pikvm-monitor.log`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tail -f /var/log/pikvm-monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Permission denied on GPIO
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo usermod -a -G gpio pikvm
|
||||||
|
# Log out and back in
|
||||||
|
```
|
||||||
|
|
||||||
|
### Module not found
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip3 install --user ping3 gpiozero RPi.GPIO
|
||||||
|
```
|
||||||
|
|
||||||
|
### Test GPIO locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -c "from gpiozero import Button; print('GPIO OK')"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Docker vs Native
|
||||||
|
|
||||||
|
| | Docker | Native |
|
||||||
|
|---|--------|--------|
|
||||||
|
| Isolation | ✓ | ✗ |
|
||||||
|
| Portability | ✓ | ✗ |
|
||||||
|
| Resource use | Higher | Lower |
|
||||||
|
| Setup time | Faster | More steps |
|
||||||
|
| Debugging | docker logs | journalctl |
|
||||||
|
| Recommended | ✓ | Use if Docker is unavailable |
|
||||||
100
PROJECT_GUIDE.md
Normal file
100
PROJECT_GUIDE.md
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
# Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
plex-restart/
|
||||||
|
├── monitor.py # Main monitoring agent
|
||||||
|
├── Dockerfile # Container definition
|
||||||
|
├── docker-compose.yml # Service orchestration
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── test_gpio.py # GPIO testing utility
|
||||||
|
├── health_check.py # Service health monitor
|
||||||
|
├── .env.example # Configuration template
|
||||||
|
├── INSTALL_NATIVE.md # Native installation guide
|
||||||
|
├── README.md # Complete documentation
|
||||||
|
└── .gitignore # Git ignore rules
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
### Deploy (Docker Compose)
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
# Edit .env with your host IP and GPIO pin
|
||||||
|
docker-compose up -d
|
||||||
|
docker-compose logs -f
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deploy (Native)
|
||||||
|
See `INSTALL_NATIVE.md` for systemd service or manual installation.
|
||||||
|
|
||||||
|
### Test GPIO
|
||||||
|
```bash
|
||||||
|
POWER_BUTTON_GPIO=17 python3 test_gpio.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Health
|
||||||
|
```bash
|
||||||
|
python3 health_check.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### View Logs
|
||||||
|
```bash
|
||||||
|
# Docker
|
||||||
|
docker-compose logs -f pikvm-monitor
|
||||||
|
|
||||||
|
# Native/systemd
|
||||||
|
journalctl -u pikvm-monitor -f
|
||||||
|
tail -f /var/log/pikvm-monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Parameters
|
||||||
|
|
||||||
|
- **Ping Interval**: 180 seconds (3 minutes)
|
||||||
|
- **Downtime Threshold**: 15 minutes (5 failed pings)
|
||||||
|
- **Long Press**: 5 seconds (power down)
|
||||||
|
- **Wait**: 90 seconds (cool-down)
|
||||||
|
- **Short Press**: 1 second (power on)
|
||||||
|
|
||||||
|
Adjust in `.env` as needed for faster/slower recovery.
|
||||||
|
|
||||||
|
## Files Overview
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `monitor.py` | Main monitoring loop with GPIO control |
|
||||||
|
| `Dockerfile` | Builds container image |
|
||||||
|
| `docker-compose.yml` | Defines service with volume/env mapping |
|
||||||
|
| `requirements.txt` | Python package versions |
|
||||||
|
| `test_gpio.py` | Tests GPIO pin configuration |
|
||||||
|
| `health_check.py` | Verifies monitor is running |
|
||||||
|
| `.env.example` | Configuration template |
|
||||||
|
| `INSTALL_NATIVE.md` | Systemd/manual setup |
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
PiKVM (Runs Docker Compose)
|
||||||
|
└── pikvm-monitor Container
|
||||||
|
├── Pings HOST_IP every 3 min
|
||||||
|
├── Falls back to GATEWAY_IP
|
||||||
|
├── Tracks consecutive failures
|
||||||
|
├── After 15 min downtime:
|
||||||
|
│ ├── Long press GPIO (5 sec) → Power down
|
||||||
|
│ ├── Wait 90 seconds
|
||||||
|
│ └── Short press GPIO (1 sec) → Power on
|
||||||
|
└── Logs to /var/log/pikvm-monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Configure**: Edit `.env` with your IPs and GPIO pin
|
||||||
|
2. **Test GPIO**: Run `test_gpio.py` to verify pin works
|
||||||
|
3. **Deploy**: Use `docker-compose up -d`
|
||||||
|
4. **Monitor**: Check logs with `docker-compose logs -f`
|
||||||
|
5. **Verify**: Run `health_check.py` periodically
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
- PiKVM Docs: https://docs.pikvm.org/
|
||||||
|
- gpiozero: https://gpiozero.readthedocs.io/
|
||||||
|
- Docker Compose: https://docs.docker.com/compose/
|
||||||
222
README.md
Normal file
222
README.md
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
# PiKVM Auto-Restart Monitor
|
||||||
|
|
||||||
|
Automatic monitoring and recovery agent for PiKVM that watches a connected host and performs a hard reset if the system becomes unresponsive due to thermal throttling or other issues.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Continuous Monitoring**: Pings the host every 3 minutes (configurable)
|
||||||
|
- **Fallback Detection**: Tries host IP first, falls back to gateway for network-level connectivity check
|
||||||
|
- **Auto Recovery**: Performs hard reset after 15 minutes of downtime (configurable)
|
||||||
|
- **GPIO Control**: Simulates power button presses: a long press to force the host off, followed by a short press to power it back on
|
||||||
|
- **Cool-Down Period**: Waits 90 seconds between power down and restart to allow system cooling
|
||||||
|
- **Docker Native**: Runs as a Docker Compose service on PiKVM
|
||||||
|
- **Comprehensive Logging**: Tracks all events and reboot history
|
||||||
|
|
||||||
|
## Hardware Requirements
|
||||||
|
|
||||||
|
- PiKVM with Raspberry Pi (4 or better recommended)
|
||||||
|
- GPIO pins configured for power button control
|
||||||
|
- Typically BCM GPIO 17 for power button
|
||||||
|
- Check your PiKVM documentation to confirm
|
||||||
|
- Network access to the host and gateway
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Clone or Copy to PiKVM
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <this-repo> /home/pikvm/plex-restart
|
||||||
|
cd /home/pikvm/plex-restart
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure Environment
|
||||||
|
|
||||||
|
Copy the example config and update with your values:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
Edit `.env` with your host details:
|
||||||
|
|
||||||
|
```env
|
||||||
|
HOST_IP=192.168.1.10 # Your host's IP address
|
||||||
|
GATEWAY_IP=192.168.1.1 # Your network gateway IP
|
||||||
|
POWER_BUTTON_GPIO=17 # GPIO pin (confirm with PiKVM docs)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Deploy with Docker Compose
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify it's running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose logs -f pikvm-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Options
|
||||||
|
|
||||||
|
All settings can be configured via environment variables in `.env`:
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `HOST_IP` | `192.168.1.10` | IP address of host to monitor |
|
||||||
|
| `GATEWAY_IP` | `192.168.1.1` | Fallback gateway for connectivity check |
|
||||||
|
| `PING_INTERVAL` | `180` | Seconds between pings (3 min) |
|
||||||
|
| `DOWNTIME_THRESHOLD` | `15` | Minutes of downtime before reset |
|
||||||
|
| `POWER_BUTTON_GPIO` | `17` | BCM GPIO pin for power button |
|
||||||
|
| `LONG_PRESS_DURATION` | `5` | Seconds to hold for power down |
|
||||||
|
| `SHORT_PRESS_DURATION` | `1` | Seconds to hold for power on |
|
||||||
|
| `WAIT_BEFORE_REBOOT` | `90` | Seconds to wait between power down/up |
|
||||||
|
|
||||||
|
### Example: Faster Recovery
|
||||||
|
|
||||||
|
To recover in 9 minutes instead of 15:
|
||||||
|
|
||||||
|
```env
|
||||||
|
PING_INTERVAL=180 # 3 minutes
|
||||||
|
DOWNTIME_THRESHOLD=9 # 9 minutes
|
||||||
|
```
|
||||||
|
|
||||||
|
This triggers reset after 3 failed pings (9 minutes total).
|
||||||
|
|
||||||
|
## Monitoring & Logs
|
||||||
|
|
||||||
|
### View Live Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose logs -f pikvm-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inside Container
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec pikvm-monitor tail -f /var/log/pikvm-monitor.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Reset History
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker exec pikvm-monitor cat /var/lib/pikvm-monitor/state.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Manual Control
|
||||||
|
|
||||||
|
### Stop Monitor
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose down
|
||||||
|
```
|
||||||
|
|
||||||
|
### Restart Monitor
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose restart pikvm-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
### View Status
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker-compose ps
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Monitor Not Starting
|
||||||
|
|
||||||
|
Check logs:
|
||||||
|
```bash
|
||||||
|
docker-compose logs pikvm-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
Common issues:
|
||||||
|
- GPIO pins in use by another service
|
||||||
|
- Incorrect GPIO pin number
|
||||||
|
- Network connectivity issues
|
||||||
|
|
||||||
|
### Not Detecting Host Down
|
||||||
|
|
||||||
|
Verify connectivity manually:
|
||||||
|
```bash
|
||||||
|
ping <HOST_IP>
|
||||||
|
ping <GATEWAY_IP>
|
||||||
|
```
|
||||||
|
|
||||||
|
Check:
|
||||||
|
- Host IP is correct in `.env`
|
||||||
|
- Network can reach both IPs
|
||||||
|
- PiKVM has network access
|
||||||
|
|
||||||
|
### Power Button Not Working
|
||||||
|
|
||||||
|
1. Verify GPIO pin number in PiKVM documentation
|
||||||
|
2. Update `POWER_BUTTON_GPIO` in `.env`
|
||||||
|
3. Test GPIO access:
|
||||||
|
```bash
|
||||||
|
docker exec pikvm-monitor python3 -c "from gpiozero import Button; b = Button(17); print('GPIO working')"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
The monitor runs as a single long-running process:
|
||||||
|
|
||||||
|
```
|
||||||
|
Startup
|
||||||
|
↓
|
||||||
|
Load Configuration
|
||||||
|
↓
|
||||||
|
Every 180 seconds:
|
||||||
|
├─ Ping HOST_IP
|
||||||
|
│ └─ If fails, ping GATEWAY_IP (fallback)
|
||||||
|
├─ If alive: Reset counter
|
||||||
|
└─ If down: Increment counter
|
||||||
|
└─ If counter × PING_INTERVAL (seconds) ≥ DOWNTIME_THRESHOLD × 60 (seconds):
|
||||||
|
├─ Long press power button (5 sec)
|
||||||
|
├─ Wait 90 seconds
|
||||||
|
├─ Short press power button (1 sec)
|
||||||
|
└─ Reset counter
|
||||||
|
↓
|
||||||
|
Repeat
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Considerations
|
||||||
|
|
||||||
|
- **CPU**: Minimal (~5-10% during checks)
|
||||||
|
- **Memory**: ~50-80MB
|
||||||
|
- **Network**: Single ICMP ping every 3 minutes
|
||||||
|
- **GPIO**: Brief pulses only during reset
|
||||||
|
|
||||||
|
Safe to run alongside other PiKVM services.
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
### Local Testing (without GPIO)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Mock GPIO by catching exceptions during testing
|
||||||
|
python3 monitor.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### Building Custom Image
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t pikvm-monitor:latest .
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues with PiKVM GPIO access:
|
||||||
|
- [PiKVM Documentation](https://docs.pikvm.org/)
|
||||||
|
- [gpiozero Library](https://gpiozero.readthedocs.io/)
|
||||||
|
|
||||||
|
For issues with this monitor:
|
||||||
|
- Check logs: `docker-compose logs`
|
||||||
|
- Verify `.env` configuration
|
||||||
|
- Test GPIO pin access manually
|
||||||
33
docker-compose.yml
Normal file
33
docker-compose.yml
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
services:
|
||||||
|
pikvm-monitor:
|
||||||
|
build: .
|
||||||
|
image: pikvm-monitor:latest
|
||||||
|
container_name: pikvm-monitor
|
||||||
|
restart: always
|
||||||
|
privileged: true # Required for GPIO access
|
||||||
|
volumes:
|
||||||
|
- /var/log/pikvm-monitor:/var/log
|
||||||
|
- /var/lib/pikvm-monitor:/var/lib/pikvm-monitor
|
||||||
|
environment:
|
||||||
|
- HOST_IP=${HOST_IP:-192.168.1.10}
|
||||||
|
- GATEWAY_IP=${GATEWAY_IP:-192.168.1.1}
|
||||||
|
- PING_INTERVAL=${PING_INTERVAL:-180}
|
||||||
|
- DOWNTIME_THRESHOLD=${DOWNTIME_THRESHOLD:-15}
|
||||||
|
- POWER_BUTTON_GPIO=${POWER_BUTTON_GPIO:-17}
|
||||||
|
- LONG_PRESS_DURATION=${LONG_PRESS_DURATION:-5}
|
||||||
|
- SHORT_PRESS_DURATION=${SHORT_PRESS_DURATION:-1}
|
||||||
|
- WAIT_BEFORE_REBOOT=${WAIT_BEFORE_REBOOT:-90}
|
||||||
|
- LOG_FILE=/var/log/pikvm-monitor.log
|
||||||
|
logging:
|
||||||
|
driver: "json-file"
|
||||||
|
options:
|
||||||
|
max-size: "10m"
|
||||||
|
max-file: "3"
|
||||||
|
networks:
|
||||||
|
- default
|
||||||
|
|
||||||
|
networks:
|
||||||
|
default:
|
||||||
|
driver: bridge
|
||||||
122
health_check.py
Normal file
122
health_check.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Health Check Script - Monitor the monitoring service
|
||||||
|
Can be used with cron or external monitoring tools
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
# Path of the monitor's log file. monitor.py reads the same LOG_FILE
# environment variable, so honouring it here keeps both scripts pointed at
# the same file when the default location is overridden.
LOG_FILE = os.getenv("LOG_FILE", "/var/log/pikvm-monitor.log")
# File to which monitor.py appends one line per performed reset.
STATE_FILE = "/var/lib/pikvm-monitor/state.txt"
|
||||||
|
|
||||||
|
def check_log_recent():
    """Report whether the monitor log was modified within the last 10 minutes.

    Prints a one-line status message. Returns True only when LOG_FILE exists
    and its modification time is at most 10 minutes old; a missing file or
    any error while inspecting it yields False.
    """
    log_path = Path(LOG_FILE)
    if not log_path.exists():
        print(f"✗ Log file not found: {LOG_FILE}")
        return False

    try:
        modified_at = datetime.fromtimestamp(log_path.stat().st_mtime)
        age = datetime.now() - modified_at
        is_stale = age > timedelta(minutes=10)
        if is_stale:
            print(f"✗ Log file not updated recently (last: {age.total_seconds():.0f}s ago)")
        else:
            print(f"✓ Log file updated {age.total_seconds():.0f}s ago")
        return not is_stale
    except Exception as e:
        print(f"✗ Error checking log: {e}")
        return False
|
||||||
|
|
||||||
|
def check_container_running():
    """Check whether the pikvm-monitor Docker container is running.

    Returns:
        True when a container whose name contains "pikvm-monitor" has status
        "running"; False when such a container exists in another state or no
        such container exists. If the Docker SDK or daemon cannot be used at
        all, the check is skipped and True is returned so that installations
        that do not use Docker are not reported as failing.
    """
    try:
        import docker

        client = docker.from_env()
        try:
            # all=True also returns stopped/exited containers. The default
            # listing only contains running ones, which would report a
            # stopped monitor as "not found" instead of showing its status.
            for container in client.containers.list(all=True):
                if "pikvm-monitor" in container.name:
                    if container.status == "running":
                        print(f"✓ Container {container.name} is running")
                        return True
                    else:
                        print(f"✗ Container {container.name} is {container.status}")
                        return False
            print("✗ Container pikvm-monitor not found")
            return False
        finally:
            # Release the HTTP connection pool held by the client.
            client.close()
    except Exception as e:
        print(f"⚠ Could not check container (not using Docker?): {e}")
        return True  # Don't fail if not using Docker
|
||||||
|
|
||||||
|
def check_process_running():
    """Check whether a process whose command line matches "monitor.py" exists.

    Runs ``pgrep -f monitor.py`` and treats exit status 0 as "running".
    Returns True/False accordingly; if pgrep itself cannot be executed the
    check is skipped and True is returned.
    """
    try:
        import subprocess

        pgrep = subprocess.run(["pgrep", "-f", "monitor.py"], capture_output=True)
        is_running = pgrep.returncode == 0
        if is_running:
            print("✓ Monitor process is running")
        else:
            print("✗ Monitor process not running")
        return is_running
    except Exception as e:
        print(f"⚠ Could not check process: {e}")
        return True
|
||||||
|
|
||||||
|
def check_network():
    """Check basic network connectivity.

    Attempts a TCP connection to 8.8.8.8 port 53 with a 3 second timeout.

    Returns:
        True when the connection succeeds, False when it fails or times out.
    """
    import socket

    try:
        # create_connection returns an open socket; the with-block closes it
        # so the probe does not leak a file descriptor.
        with socket.create_connection(("8.8.8.8", 53), timeout=3):
            pass
    except OSError:
        # Covers refused/unreachable/timeout errors without also swallowing
        # KeyboardInterrupt or SystemExit as a bare except would.
        print("✗ Network connectivity issue")
        return False
    print("✓ Network connectivity OK")
    return True
|
||||||
|
|
||||||
|
def main():
    """Run all health checks, print a summary and exit with a status code.

    Exit codes: 0 when every check passes, 1 when at most one check fails,
    2 otherwise. A check that raises is counted as failed.
    """
    banner = "=" * 60
    print(banner)
    print("PiKVM Monitor Health Check")
    print(banner)
    print()

    checks = (
        ("Process Running", check_process_running),
        ("Container Running", check_container_running),
        ("Log File Recent", check_log_recent),
        ("Network Connectivity", check_network),
    )

    outcomes = []
    for label, probe in checks:
        print(f"Checking: {label}...")
        try:
            outcome = probe()
        except Exception as e:
            print(f"✗ Check failed: {e}")
            outcome = False
        outcomes.append(outcome)
        print()

    passed = sum(outcomes)
    total = len(outcomes)
    failed = total - passed

    print(banner)
    print(f"Results: {passed}/{total} checks passed")

    if failed == 0:
        print("✓ Monitor is healthy")
        sys.exit(0)
    if failed <= 1:
        print("⚠ Monitor has minor issues")
        sys.exit(1)
    print("✗ Monitor has critical issues")
    sys.exit(2)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
176
monitor.py
Normal file
176
monitor.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
PiKVM Auto-Restart Monitor
|
||||||
|
Monitors host connectivity and performs hard reset if downtime exceeds threshold.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ping3
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
from gpiozero import Button
|
||||||
|
except ImportError:
|
||||||
|
# Fallback if gpiozero not available
|
||||||
|
import RPi.GPIO as GPIO
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CONFIG = {
|
||||||
|
"host_ip": os.getenv("HOST_IP", "192.168.1.10"),
|
||||||
|
"gateway_ip": os.getenv("GATEWAY_IP", "192.168.1.1"),
|
||||||
|
"ping_interval": int(os.getenv("PING_INTERVAL", 180)), # 3 minutes
|
||||||
|
"downtime_threshold": int(os.getenv("DOWNTIME_THRESHOLD", 15)), # 15 minutes
|
||||||
|
"power_button_gpio": int(os.getenv("POWER_BUTTON_GPIO", 17)),
|
||||||
|
"long_press_duration": float(os.getenv("LONG_PRESS_DURATION", 5)), # 5 seconds to power down
|
||||||
|
"short_press_duration": float(os.getenv("SHORT_PRESS_DURATION", 1)), # 1 second to power on
|
||||||
|
"wait_before_reboot": int(os.getenv("WAIT_BEFORE_REBOOT", 90)), # 90 seconds
|
||||||
|
"log_file": os.getenv("LOG_FILE", "/var/log/pikvm-monitor.log"),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
# A bare file name (no directory component) gives an empty dirname, and
# os.makedirs("") raises FileNotFoundError, so only create the directory
# when the configured path actually has one.
_log_dir = os.path.dirname(CONFIG["log_file"])
if _log_dir:
    os.makedirs(_log_dir, exist_ok=True)

# Log to both the configured file and the console stream.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(CONFIG["log_file"]),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# State file to track reboots
|
||||||
|
STATE_FILE = "/var/lib/pikvm-monitor/state.txt"
|
||||||
|
os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def press_power_button(duration):
    """Simulate a power button press by driving the configured GPIO line high.

    The pin CONFIG["power_button_gpio"] is driven high for ``duration``
    seconds and then released. gpiozero is preferred; RPi.GPIO is used when
    gpiozero cannot be imported.

    Args:
        duration: How long to hold the line high, in seconds.

    Raises:
        Exception: Any error from the GPIO backend is logged and re-raised.
    """
    pin = CONFIG["power_button_gpio"]
    try:
        try:
            # Use an output device to drive the line. Button is an input
            # device, and drive_high()/drive_low() are, as far as the
            # gpiozero docs show, provided only by its mock pins - so the
            # previous Button-based press could not work on real hardware.
            from gpiozero import OutputDevice
        except ImportError:
            OutputDevice = None

        if OutputDevice is not None:
            # The with-block closes the device and frees the pin on exit.
            with OutputDevice(pin, active_high=True, initial_value=False) as line:
                logger.info(f"Pressing power button for {duration} seconds")
                try:
                    line.on()
                    time.sleep(duration)
                finally:
                    # Release the button even if the press is interrupted,
                    # so the host never sees a stuck power button.
                    line.off()
            logger.info("Power button press complete")
        else:
            # Fallback to RPi.GPIO
            GPIO.setmode(GPIO.BCM)
            GPIO.setup(pin, GPIO.OUT, initial=GPIO.LOW)
            try:
                logger.info(f"Pressing power button for {duration} seconds")
                GPIO.output(pin, GPIO.HIGH)
                time.sleep(duration)
                GPIO.output(pin, GPIO.LOW)
                logger.info("Power button press complete")
            finally:
                # Always release the channels configured above, including
                # when the press fails or is interrupted half-way.
                GPIO.cleanup()
    except Exception as e:
        logger.error(f"Error pressing power button: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
def ping_host(ip_address):
    """Ping ``ip_address`` once and report whether it replied.

    Args:
        ip_address: Address to ping (5 second timeout).

    Returns:
        True only when ping3 reports a round-trip delay. ping3.ping()
        returns None on timeout and False on errors (for example an unknown
        or unreachable host), so both are treated as "not alive". Any
        exception raised by ping3 is logged at debug level and also yields
        False.
    """
    try:
        response = ping3.ping(ip_address, timeout=5)
    except Exception as e:
        logger.debug(f"Ping to {ip_address} failed: {e}")
        return False
    # Identity checks rather than truthiness or ==: a delay of 0.0 seconds is
    # a valid reply, and 0.0 == False in Python.
    return response is not None and response is not False
|
||||||
|
|
||||||
|
|
||||||
|
def check_host_alive():
    """Decide whether the monitored host should be treated as alive.

    The gateway ping is used to tell a dead host apart from a network problem
    on the monitor's own side:

    * Host replies: alive, return True.
    * Host silent but gateway replies: the network path works, so the host
      itself is down, return False.
    * Host and gateway both silent: the monitor cannot reach the network at
      all, so the host's state is unknown. Return True so that no downtime is
      counted and the host is not power-cycled because of a network outage.

    Previously a reachable gateway was taken as proof that the host was up,
    which meant a hung host was never reset while the gateway stayed online.

    Returns:
        False only when the host is unreachable while the gateway is
        reachable; True otherwise.
    """
    if ping_host(CONFIG["host_ip"]):
        logger.debug(f"Host {CONFIG['host_ip']} is alive")
        return True

    logger.debug(f"Host {CONFIG['host_ip']} not responding, checking gateway")
    if ping_host(CONFIG["gateway_ip"]):
        logger.warning(
            f"Host ({CONFIG['host_ip']}) unreachable while gateway ({CONFIG['gateway_ip']}) is reachable"
        )
        return False

    logger.warning(
        f"Both host ({CONFIG['host_ip']}) and gateway ({CONFIG['gateway_ip']}) unreachable; "
        "network problem on the monitor side, not counting this as host downtime"
    )
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def perform_reset():
    """Perform hard reset: long press power down, wait, short press power on.

    The press durations and the wait between them come from CONFIG. After a
    successful sequence a timestamped line is appended to STATE_FILE.

    Raises:
        Exception: Any error from the button presses is logged and re-raised.
            A failure to write STATE_FILE is only logged: the reset itself
            has already succeeded, and raising would make the caller repeat
            the power cycle on its next iteration.
    """
    separator = "=" * 60
    logger.warning(separator)
    logger.warning("INITIATING HARD RESET SEQUENCE")
    logger.warning(separator)

    try:
        # Power down
        logger.info("Step 1: Long press to power down")
        press_power_button(CONFIG["long_press_duration"])

        # Wait for shutdown to complete
        logger.info(f"Step 2: Waiting {CONFIG['wait_before_reboot']} seconds for cool-down")
        time.sleep(CONFIG["wait_before_reboot"])

        # Power on
        logger.info("Step 3: Short press to power on")
        press_power_button(CONFIG["short_press_duration"])

        logger.info("Step 4: Reset sequence complete")
        logger.warning(separator)
    except Exception as e:
        logger.error(f"Error during reset sequence: {e}")
        raise

    # Record in state file (best effort, see docstring).
    try:
        with open(STATE_FILE, "a") as f:
            f.write(f"{datetime.now().isoformat()}: Reset performed\n")
    except OSError as e:
        logger.warning(f"Reset performed but could not record it in {STATE_FILE}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Main monitoring loop.

    Checks the host every CONFIG["ping_interval"] seconds. Each failed check
    increments a counter; once counter * ping_interval reaches
    CONFIG["downtime_threshold"] minutes, perform_reset() is called and the
    counter starts again from zero. Unexpected errors are logged and the loop
    continues after the normal interval. Ctrl-C stops the monitor with exit
    status 0.
    """
    logger.info("=" * 60)
    logger.info("PiKVM Auto-Restart Monitor Started")
    logger.info(f"Configuration: Host={CONFIG['host_ip']}, Gateway={CONFIG['gateway_ip']}")
    logger.info(f"Ping interval={CONFIG['ping_interval']}s, Threshold={CONFIG['downtime_threshold']}min")
    logger.info("=" * 60)

    interval = CONFIG["ping_interval"]
    threshold_seconds = CONFIG["downtime_threshold"] * 60
    consecutive_failures = 0

    # The KeyboardInterrupt handler wraps the whole loop so that Ctrl-C is
    # handled cleanly wherever it arrives, including during the sleep that
    # follows a logged error.
    try:
        while True:
            try:
                if check_host_alive():
                    consecutive_failures = 0
                    logger.info("✓ Host is alive")
                else:
                    consecutive_failures += 1
                    downtime_seconds = consecutive_failures * interval
                    downtime_minutes = downtime_seconds / 60
                    logger.warning(
                        f"✗ Host unreachable ({consecutive_failures} attempts, {downtime_minutes:.1f} min downtime)"
                    )

                    # Check if downtime threshold exceeded
                    if downtime_seconds >= threshold_seconds:
                        logger.error(
                            f"Downtime threshold exceeded ({downtime_minutes:.1f} min). Initiating reset."
                        )
                        perform_reset()
                        consecutive_failures = 0  # Reset counter after reboot
            except Exception as e:
                # Keep the monitor alive; the counter is left untouched so a
                # failed reset is retried on the next failed check.
                logger.error(f"Unexpected error: {e}", exc_info=True)

            time.sleep(interval)
    except KeyboardInterrupt:
        logger.info("Monitor stopped by user")
        sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
ping3==4.0.1
|
||||||
|
gpiozero==2.0.1
|
||||||
|
RPi.GPIO==0.7.0
|
||||||
121
test_gpio.py
Normal file
121
test_gpio.py
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
GPIO Test Utility - Verify power button GPIO configuration
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
|
try:
    from gpiozero import Button
    GPIOZERO_AVAILABLE = True
except ImportError:
    GPIOZERO_AVAILABLE = False

# RPi.GPIO is imported whether or not gpiozero is available: main() falls
# back to the RPi.GPIO test when the gpiozero test fails, not only when
# gpiozero is missing, so the GPIO name must exist in that case too.
try:
    import RPi.GPIO as GPIO
except (ImportError, RuntimeError):
    # NOTE(review): RuntimeError is included because RPi.GPIO is reported to
    # raise it on import when not running on a Raspberry Pi - confirm.
    GPIO = None
    if not GPIOZERO_AVAILABLE:
        print("ERROR: Neither gpiozero nor RPi.GPIO could be imported")
        print("Install with: pip3 install gpiozero RPi.GPIO")
        sys.exit(1)
|
||||||
|
|
||||||
|
def test_gpio_gpiozero(pin):
|
||||||
|
"""Test GPIO with gpiozero library."""
|
||||||
|
print(f"Testing GPIO pin {pin} with gpiozero...")
|
||||||
|
try:
|
||||||
|
button = Button(pin)
|
||||||
|
print(f"✓ Button object created for pin {pin}")
|
||||||
|
|
||||||
|
# Test set high
|
||||||
|
print(f"Setting pin {pin} HIGH (simulating button press)...")
|
||||||
|
button.pin.drive_high()
|
||||||
|
time.sleep(1)
|
||||||
|
print("✓ Pin set HIGH")
|
||||||
|
|
||||||
|
# Test set low
|
||||||
|
print(f"Setting pin {pin} LOW (releasing button)...")
|
||||||
|
button.pin.drive_low()
|
||||||
|
time.sleep(0.5)
|
||||||
|
print("✓ Pin set LOW")
|
||||||
|
|
||||||
|
print("✓ GPIO test successful with gpiozero\n")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"✗ Error with gpiozero: {e}\n")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_gpio_rpi(pin):
|
||||||
|
"""Test GPIO with RPi.GPIO library."""
|
||||||
|
print(f"Testing GPIO pin {pin} with RPi.GPIO...")
|
||||||
|
try:
|
||||||
|
GPIO.setmode(GPIO.BCM)
|
||||||
|
GPIO.setup(pin, GPIO.OUT, initial=GPIO.LOW)
|
||||||
|
print(f"✓ GPIO pin {pin} configured")
|
||||||
|
|
||||||
|
# Test set high
|
||||||
|
print(f"Setting pin {pin} HIGH (simulating button press)...")
|
||||||
|
GPIO.output(pin, GPIO.HIGH)
|
||||||
|
time.sleep(1)
|
||||||
|
print("✓ Pin set HIGH")
|
||||||
|
|
||||||
|
# Test set low
|
||||||
|
print(f"Setting pin {pin} LOW (releasing button)...")
|
||||||
|
GPIO.output(pin, GPIO.LOW)
|
||||||
|
time.sleep(0.5)
|
||||||
|
print("✓ Pin set LOW")
|
||||||
|
|
||||||
|
GPIO.cleanup()
|
||||||
|
print("✓ GPIO test successful with RPi.GPIO\n")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"✗ Error with RPi.GPIO: {e}\n")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def main():
    """Interactively test the power button GPIO pin.

    Reads the pin from POWER_BUTTON_GPIO (default 17), asks for confirmation,
    then tries gpiozero (when available) followed by RPi.GPIO. Exits with
    status 0 on the first successful backend or when the user cancels, and
    with status 1 after printing troubleshooting hints when every backend
    fails.
    """
    banner = "=" * 60
    print(banner)
    print("PiKVM Power Button GPIO Test Utility")
    print(banner)
    print()

    # Get GPIO pin from environment or default
    gpio_pin = int(os.getenv("POWER_BUTTON_GPIO", 17))
    print(f"Testing GPIO pin: {gpio_pin}")
    print("This will briefly activate the GPIO pin (1 second).")
    print("Make sure this is safe before proceeding!\n")

    answer = input("Continue? (yes/no): ").strip().lower()
    if answer not in ['yes', 'y']:
        print("Cancelled.")
        sys.exit(0)

    print("\n" + "=" * 60)

    # Each attempt: (announcement printed first or None, test function,
    # message printed on success). gpiozero goes first when it is available;
    # RPi.GPIO is always the fallback.
    attempts = []
    if GPIOZERO_AVAILABLE:
        attempts.append(
            (None, test_gpio_gpiozero, "✓ GPIO is working correctly with gpiozero")
        )
    attempts.append(
        ("Trying RPi.GPIO fallback...\n", test_gpio_rpi, "✓ GPIO is working correctly with RPi.GPIO")
    )

    for announcement, run_test, success_message in attempts:
        if announcement is not None:
            print(announcement)
        if run_test(gpio_pin):
            print(success_message)
            print("\nYou can use this pin in your .env configuration:")
            print(f" POWER_BUTTON_GPIO={gpio_pin}")
            sys.exit(0)

    print("\n✗ GPIO test failed")
    print("\nTroubleshooting:")
    print("1. Verify the GPIO pin number is correct for your PiKVM")
    print("2. Check that the GPIO pins are not already in use")
    print("3. Ensure you're running with appropriate permissions (sudo)")
    print("4. Check PiKVM documentation for correct pin configuration")
    sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
x
Reference in New Issue
Block a user