Flux Operations Guide
Day-to-day operational procedures for managing Flux deployments.
Gap Recovery
Identify Missing Data
-- Check for gaps in telemetry
-- Lists every expected one-minute slot on 2024-01-01 that has no reading for
-- the given device. Adjust the series bounds and device id before running.
WITH time_series AS (
    -- One row per minute the device is expected to have reported in.
    SELECT generate_series(
        '2024-01-01 00:00:00'::timestamptz,
        '2024-01-01 23:59:00'::timestamptz,
        '1 minute'::interval
    ) AS expected_time
)
SELECT
    ts.expected_time,
    -- Every returned slot is a gap; the column is kept so the result shape
    -- stays (expected_time, is_missing).
    TRUE AS is_missing
FROM time_series AS ts
WHERE NOT EXISTS (
    SELECT 1
    FROM flux.mg_bess_readings AS r
    WHERE r.device_id = 'your-device-id'
      -- Half-open range on the raw column instead of date_trunc(r.time):
      -- same minute bucket, but the predicate can use an index on time and
      -- bounds how much of the readings table is touched.
      AND r.time >= ts.expected_time
      AND r.time < ts.expected_time + '1 minute'::interval
)
ORDER BY ts.expected_time;
Run Gap Recovery Script
# gap_recovery.py
# Example invocation: passes a source (prod), a target (mgf), a start/end
# window and a site code to scripts/gap_recovery.py.
# NOTE(review): the timestamps carry no UTC offset — confirm which time zone
# the script assumes before running it against a real gap.
python scripts/gap_recovery.py \
--source prod \
--target mgf \
--start "2024-01-01 12:00:00" \
--end "2024-01-01 18:00:00" \
--site WLCE
Site Management
Add New Device
- Update Device Registry:
-- Register the new device. The id is generated server-side, so RETURNING
-- reads it back in the same statement; it is the value that goes into the
-- site config's device `id` field.
INSERT INTO flux.mg_device_registry (id, name, type, site, metadata)
VALUES (
    gen_random_uuid(),
    'WLCE-BESS-02',
    'bess',
    'WLCE',
    '{"modbus_address": 2, "max_power": 100}'::jsonb
)
RETURNING id;
- Update Site Config:
# NOTE(review): nesting reconstructed from key order — confirm against a
# deployed config.yaml.
devices:
  bess:
    - id: "new-device-uuid"  # replace with the id stored in flux.mg_device_registry
      name: "WLCE-BESS-02"
      modbus:
        address: 2
- Deploy and Restart:
# Copy the edited config.yaml into the controller's directory on the site
# host, then restart the bess_controller service (presumably so it reloads
# the file). Replace site-ip with the site controller's address.
scp config.yaml pi@site-ip:~/bess_controller/
ssh pi@site-ip "sudo systemctl restart bess_controller"
Remove Device
- Mark as inactive in registry
- Remove from config
- Archive historical data if needed
Performance Tuning
Optimize Polling Interval
# Reduce for more responsive control
# NOTE(review): nesting reconstructed — poll_interval is assumed to live
# under controller.
controller:
  poll_interval: 30s  # Default is 60s
Batch Upload Size
# Increase for better throughput
# NOTE(review): nesting reconstructed — batch_size is assumed to live under
# data_platform.upload.
data_platform:
  upload:
    batch_size: 200  # Default is 100
Aggregation Settings
-- Refresh continuous aggregates more frequently
-- TimescaleDB 2.x does not accept a timescaledb.refresh_interval option on
-- ALTER MATERIALIZED VIEW; the refresh cadence is the schedule_interval of the
-- aggregate's refresh policy job, so reschedule that job instead.
-- Returns no rows if the aggregate has no refresh policy yet; in that case
-- create one with add_continuous_aggregate_policy(...).
-- NOTE(review): assumes TimescaleDB 2.x — confirm the installed version.
SELECT alter_job(j.job_id, schedule_interval => INTERVAL '5 minutes')
FROM timescaledb_information.jobs AS j
INNER JOIN timescaledb_information.continuous_aggregates AS ca
    -- Policy jobs are attached to the aggregate's materialization hypertable.
    ON ca.materialization_hypertable_schema = j.hypertable_schema
    AND ca.materialization_hypertable_name = j.hypertable_name
WHERE j.proc_name = 'policy_refresh_continuous_aggregate'
  AND ca.view_schema = 'flux'
  AND ca.view_name = 'mg_meter_readings_5m_intermediate';
Monitoring
Health Checks
#!/bin/bash
# health_check.sh
# Exit codes:
#   0 - service running and telemetry seen in the last 5 minutes
#   1 - bess_controller service is not active
#   2 - no telemetry rows in the last 5 minutes
#   3 - the telemetry check itself failed (database unreachable or bad output)

# Check service status
if ! systemctl is-active --quiet bess_controller; then
    echo "ERROR: Service not running"
    exit 1
fi

# Check recent telemetry
# -A drops psql's column padding so the result is a bare integer.
# A failed psql must not be mistaken for "telemetry present": previously an
# empty result made the numeric test error out and the script reported OK.
if ! RECENT=$(psql -h db.host -U user -d flux -tA -c "
SELECT COUNT(*)
FROM flux.mg_bess_readings
WHERE time > now() - interval '5 minutes'
"); then
    echo "ERROR: Telemetry query failed"
    exit 3
fi

# Reject empty or non-numeric output before the integer comparison.
case "$RECENT" in
    ''|*[!0-9]*)
        echo "ERROR: Unexpected telemetry count: '$RECENT'"
        exit 3
        ;;
esac

if [ "$RECENT" -eq 0 ]; then
    echo "WARNING: No recent telemetry"
    exit 2
fi

echo "OK: System healthy"
Alert Configuration
# Grafana alert rules
# NOTE(review): nesting reconstructed from key order — confirm against the
# deployed rule file.
- alert: BatterySOELow
  expr: bess_soe < 10
  for: 5m
  annotations:
    summary: "Battery SOE below 10%"
- alert: NoRecentTelemetry
  expr: time() - last_telemetry_time > 300
  # NOTE(review): the 300 s threshold plus the 5m hold means this fires about
  # 10 minutes after the last sample — confirm that matches the summary.
  for: 5m
  annotations:
    summary: "No telemetry for 5+ minutes"
Backup and Recovery
Database Backup
# Full backup
pg_dump -h db.host -U user -d flux --schema=flux -f flux_backup.sql
# Telemetry only (last 7 days)
# pg_dump cannot filter rows (it has no --where option), so the recent rows of
# each flux.mg_*_readings table are exported to CSV with psql's \copy instead.
# Output: one telemetry_<table>_backup.csv per table, written on this machine.
# Restore a file with: \copy flux.<table> FROM '<file>' WITH (FORMAT csv, HEADER)
tables=$(psql -h db.host -U user -d flux -At -c "
SELECT tablename
FROM pg_tables
WHERE schemaname = 'flux'
  AND tablename LIKE 'mg\_%\_readings'
ORDER BY tablename
")
for table in $tables; do
    # SELECT * is deliberate: the export must carry every column of the table.
    psql -h db.host -U user -d flux -c \
        "\copy (SELECT * FROM flux.${table} WHERE time > now() - interval '7 days') TO 'telemetry_${table}_backup.csv' WITH (FORMAT csv, HEADER)"
done
Configuration Backup
# Backup all site configs
# scp does not create the destination directory, so make sure it exists first.
mkdir -p backups
for site in wlce hmce lfce; do
    # Quoted so the expansions stay single arguments; the dated filename keeps
    # one copy per site per day.
    scp "pi@${site}-ip:~/bess_controller/config.yaml" \
        "backups/${site}_config_$(date +%Y%m%d).yaml"
done
Maintenance Windows
Scheduled Maintenance
# Disable control during maintenance
# NOTE(review): nesting reconstructed from key order — confirm against the
# controller's config schema.
controller:
  components:
    - type: "maintenance_mode"
      enabled: true
      priority: 0  # Highest priority
      schedule:
        - start: "02:00"
          end: "04:00"
          days: ["sunday"]
Emergency Stop
# Immediate stop
ssh pi@site-ip "sudo systemctl stop bess_controller"
# Safe mode (0 power)
# Writes an override file and restarts the service. The YAML inside the
# single-quoted echo is written to the remote file verbatim, so its
# indentation must be exact.
# NOTE(review): nesting reconstructed from key order — confirm against the
# controller's override schema before relying on this in an emergency.
ssh pi@site-ip "
echo 'controller:
  components:
    - type: to_soe
      config:
        target_power: 0' > ~/bess_controller/override.yaml
sudo systemctl restart bess_controller
"
Log Management
Log Rotation
# /etc/logrotate.d/bess_controller
# Rotation policy for every *.log file under /var/log/bess_controller/.
# NOTE(review): there is no missingok directive, so logrotate reports an
# error when no log file matches — confirm that is intended.
/var/log/bess_controller/*.log {
# Rotate once per day and keep 30 rotated files
daily
rotate 30
# Compress rotated logs, but leave the most recent rotation uncompressed
compress
delaycompress
# Skip rotation when the log is empty
notifempty
# Recreate the live log file with mode 640, owner pi, group pi
create 640 pi pi
}
Log Analysis
# Error summary
# -o cat prints only the message text. With the default output every line
# starts with its own timestamp, so uniq -c would count each error once.
# The final sort puts the most frequent errors first.
# NOTE(review): if the controller embeds its own timestamps in messages,
# strip them as well before counting.
journalctl -u bess_controller --since "1 day ago" -o cat | grep ERROR | sort | uniq -c | sort -rn
# Performance metrics
journalctl -u bess_controller --since "1 hour ago" | grep "cycle_time"
Security
Key Rotation
- Generate new keys in Supabase
- Update environment files on all sites
- Restart services
- Verify connectivity
Access Audit
# Check SSH access
last -n 20
# Review sudo usage
# sudo is a command, not a systemd unit, so `-u sudo` matches nothing; filter
# on the command name recorded in the journal instead.
sudo journalctl _COMM=sudo
Disaster Recovery
Site Offline
- Check VPN connectivity
- Contact site for physical inspection
- Review last known telemetry
- Prepare replacement hardware if needed
Database Recovery
# Restore from backup
psql -h db.host -U user -d flux < flux_backup.sql
# Verify data integrity
# The check is SQL, so it has to be sent through psql rather than typed into
# the shell. Compare the row count and time range with what the backup
# should contain.
psql -h db.host -U user -d flux -c "
SELECT COUNT(*), MAX(time), MIN(time)
FROM flux.mg_bess_readings;
"
Next Steps
- Deployment Guide - Initial setup
- Controller Architecture - Technical details
- Monitoring Setup - Grafana configuration