factor out health-checks/logging and add reboot.sh

This commit is contained in:
Jörg Thalheim
2025-08-10 09:42:39 +02:00
parent c0d5ea14a5
commit 8e5f29472e
5 changed files with 384 additions and 167 deletions

View File

@@ -32,6 +32,7 @@
"*.envrc.private-template"
];
programs.shellcheck.enable = true;
settings.formatter.shellcheck.options = [ "-x" ];
programs.deno.enable = !pkgs.deno.meta.broken;
};
packages = {

View File

@@ -1,7 +1,14 @@
#!/usr/bin/env bash
# shellcheck source-path=SCRIPTDIR
set -euo pipefail

# Absolute directory of this script, so sibling files can be sourced no
# matter which working directory the script is started from.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# Shared log/error/warning helpers.
source "${SCRIPT_DIR}/logging.sh"

# Deployment target: public host name, SSH login and flake attribute.
WIKI_HOST="wiki.nixos.org"
SSH_TARGET="root@${WIKI_HOST}"
FLAKE_TARGET=".#nixos-wiki-nixos-org"
@@ -23,23 +30,8 @@ ssh() {
command ssh ${SSH_OPTS} "$@"
}
# ANSI color sequences for log prefixes. They are stored as literal backslash
# escapes and only become control characters when printed with `echo -e`.
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
# Print an informational message to stdout behind a green timestamp.
# Globals:   GREEN, NC (read)
# Arguments: message words; joined with spaces, backslash escapes are
#            interpreted because the line is printed with `echo -e`
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${GREEN}[${stamp}]${NC} $*"
}
# Print an error message to stderr behind a red "[ERROR <timestamp>]" prefix.
# Globals:   RED, NC (read)
# Arguments: message words; joined with spaces, printed with `echo -e`
error() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${RED}[ERROR ${stamp}]${NC} $*" >&2
}
# Print a warning to stdout behind a yellow "[WARNING <timestamp>]" prefix.
# Globals:   YELLOW, NC (read)
# Arguments: message words; joined with spaces, printed with `echo -e`
warning() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${YELLOW}[WARNING ${stamp}]${NC} $*"
}
# Source health checks
# Loads the check_* functions, run_health_checks and wait_for_ssh from
# health_checks.sh in the same directory as this script.
source "${SCRIPT_DIR}/health_checks.sh"
nixBuild() {
if command -v nom -v &>/dev/null; then
@@ -77,7 +69,7 @@ pre_deployment_checks() {
# Build the system
# Logs a progress line, then builds the x86_64-linux test check together with
# the wiki host's system toplevel through the nixBuild wrapper.
# NOTE(review): the two nixBuild lines below differ only in the trailing
# "--log-format bar-with-logs" option. In this diff rendering they look like
# the removed/added pair of one changed line, not two real builds - confirm.
build_system() {
log "Building NixOS configuration..."
nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L --log-format bar-with-logs
nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
}
# Deploy with retries
@@ -102,155 +94,6 @@ deploy_system() {
return 1
}
# Health check functions

# Verify that nginx is active on the wiki host and that the public main page
# is served with the expected title.
# Globals:   SSH_TARGET (read), WIKI_HOST (read)
# Outputs:   progress via log, diagnostics via error; on a title mismatch the
#            first 500 characters of the response are written to stdout
# Returns:   0 when every check passes, 1 otherwise
check_nginx() {
  log "Checking nginx service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
    error "Nginx service is not active"
    # Best-effort diagnostics; their failure must not mask the real error.
    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
    return 1
  fi

  local url="https://${WIKI_HOST}/wiki/Main_Page"
  local response_code
  local response_body
  local page_title
  local curl_status=0

  # Check if main page loads with wiki content.
  # curl itself prints "000" via -w when it gets no HTTP response, so no
  # fallback may be appended to its output (that used to produce "000000").
  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "$url") || curl_status=$?
  [[ -n $response_code ]] || response_code="000"
  if [[ $curl_status -ne 0 || $response_code != "200" ]]; then
    error "Main page returned HTTP status code: $response_code"
    if [[ $response_code == "000" ]]; then
      error "Failed to connect to ${url}"
    elif [[ $curl_status -ne 0 ]]; then
      error "curl exited with status ${curl_status}"
    fi
    return 1
  fi

  # Check page content (follow redirects)
  response_body=$(curl -sfL -m 10 "$url" 2>&1) || {
    error "Failed to fetch main page content: $?"
    return 1
  }

  # A here-string is used instead of `echo | grep -q`: with `pipefail`, grep
  # exiting at the first match makes echo fail on a closed pipe for large
  # pages, which turned a successful match into a failed check.
  if ! grep -q "<title>.*NixOS Wiki.*</title>" <<<"$response_body"; then
    error "Main page does not contain expected title"
    page_title=$(grep -o '<title>[^<]*</title>' <<<"$response_body" | head -1) || true
    error "Page title: ${page_title:-Could not extract title}"
    error "First 500 chars of response:"
    printf '%s\n' "${response_body:0:500}"
    return 1
  fi
  return 0
}
# Verify that PostgreSQL is active on the wiki host and that the 'mediawiki'
# database answers a trivial query.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log, failures via error
# Returns:   0 when both checks pass, 1 otherwise
check_postgresql() {
  log "Checking PostgreSQL service..."
  # Report the reason for the failure, like the other service checks do.
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql"; then
    error "PostgreSQL service is not active"
    return 1
  fi

  # Check if database is accessible
  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
    error "PostgreSQL database 'mediawiki' is not accessible"
    return 1
  fi
  return 0
}
# Verify that Postfix is active on the wiki host and report the size of its
# mail queue.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log; warning when more than 50 mails are queued or
#            the queue summary cannot be parsed; error when inactive
# Returns:   1 when the service is not active, 0 otherwise (queue findings
#            are informational only)
check_postfix() {
  log "Checking Postfix service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
    error "Postfix service is not active"
    return 1
  fi

  # Check if postfix queue is processing (not stuck)
  local queue_status
  local queue_count
  # The summary line reads "-- <size> Kbytes in <count> Request(s)."; capture
  # the request count, not the first number on the line (the size).
  local -r count_re='in ([0-9]+) Requests?'

  # A failing ssh/postqueue ends up in the "could not determine" branch.
  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true

  if [[ $queue_status == *"Mail queue is empty"* ]]; then
    log " Postfix queue is empty (good)"
  elif [[ $queue_status =~ $count_re ]]; then
    queue_count=${BASH_REMATCH[1]}
    if ((10#${queue_count} > 50)); then
      warning " Postfix has many queued emails: $queue_status"
    else
      log " Postfix has $queue_count queued email(s) (acceptable)"
    fi
  else
    warning " Could not determine postfix queue status"
  fi
  return 0
}
# Report whether the backup timers are active on the wiki host.
# Globals:   SSH_TARGET (read)
# Outputs:   one log line per active timer, one warning per inactive timer
# Returns:   always 0; inactive timers are reported but not treated as fatal
check_backup_services() {
  log "Checking backup services..."

  # Check if backup timers are active
  local -a backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
  # Keep the loop variable local so it does not overwrite a caller's variable.
  local service
  for service in "${backup_services[@]}"; do
    # The unit name is meant to be expanded locally before it is sent.
    # shellcheck disable=SC2029
    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
      log "$service is active"
    else
      warning "$service is not active"
    fi
  done
  return 0
}
# Main health check
# Runs every check_* function, then reports the overall systemd state of the
# wiki host.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log/warning/error; the list of failed units when
#            systemd reports "degraded"
# Returns:   0 when all individual checks pass, 1 otherwise (the overall
#            systemd state is informational only)
run_health_checks() {
  log "Running post-deployment health checks..."
  local failed_checks=0
  local start_time
  local check
  local system_status
  start_time=$(date +%s)

  # Wait for system to stabilize
  log "Waiting for system to stabilize..."
  sleep 10

  # Run individual health checks
  local -a checks=(
    "check_nginx"
    "check_postgresql"
    "check_postfix"
    "check_backup_services"
  )
  for check in "${checks[@]}"; do
    if "$check"; then
      log "$check passed"
    else
      error "$check failed"
      failed_checks=$((failed_checks + 1))
    fi
  done

  # Check overall system status.
  # `systemctl is-system-running` prints the state AND exits non-zero for any
  # state but "running". Appending a fallback word remotely produced two
  # lines, so the "degraded" comparison below could never match.
  log "Checking overall system status..."
  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running") || true
  system_status=${system_status%%$'\n'*}
  system_status=${system_status:-unknown}
  if [[ $system_status == "running" ]]; then
    log "System status: running"
  else
    warning "System status: $system_status"
    if [[ $system_status == "degraded" ]]; then
      log "Failed units:"
      # Informational only; must not abort the health report.
      ssh "${SSH_TARGET}" "systemctl --failed --no-pager" || true
    fi
  fi

  local elapsed=$(($(date +%s) - start_time))
  log "Health checks completed in ${elapsed}s"
  if [ "$failed_checks" -gt 0 ]; then
    error "$failed_checks health checks failed"
    return 1
  fi
  return 0
}
# Rollback function
rollback() {
if [ -z "${CURRENT_GENERATION:-}" ]; then
@@ -293,6 +136,8 @@ main() {
fi
# Always run health checks to see current system state
log "Running post-deployment health checks..."
export WAIT_FOR_STABILIZATION=true
if ! run_health_checks; then
error "Post-deployment health checks failed"

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env bash
# shellcheck source-path=SCRIPTDIR
#
# Health check functions for NixOS Wiki.
# This file can be sourced by other scripts. Note that sourcing it also
# applies `set -euo pipefail` to the sourcing shell and (re)assigns SCRIPT_DIR.
set -euo pipefail

# Directory containing this file, used to locate logging.sh.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# Shared log/error/warning helpers.
source "${SCRIPT_DIR}/logging.sh"

# Target host settings. The calling script may set these before sourcing;
# otherwise they fall back to the production wiki.
WIKI_HOST="${WIKI_HOST:-wiki.nixos.org}"
SSH_TARGET="${SSH_TARGET:-root@${WIKI_HOST}}"
# Health check functions

# Verify that nginx is active on the wiki host and that the public main page
# is served with the expected title.
# Globals:   SSH_TARGET (read), WIKI_HOST (read)
# Outputs:   progress via log, diagnostics via error; on a title mismatch the
#            first 500 characters of the response are written to stdout
# Returns:   0 when every check passes, 1 otherwise
check_nginx() {
  log "Checking nginx service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
    error "Nginx service is not active"
    # Best-effort diagnostics; their failure must not mask the real error.
    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
    return 1
  fi

  local url="https://${WIKI_HOST}/wiki/Main_Page"
  local response_code
  local response_body
  local page_title
  local curl_status=0

  # Check if main page loads with wiki content.
  # curl itself prints "000" via -w when it gets no HTTP response, so no
  # fallback may be appended to its output (that used to produce "000000").
  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "$url") || curl_status=$?
  [[ -n $response_code ]] || response_code="000"
  if [[ $curl_status -ne 0 || $response_code != "200" ]]; then
    error "Main page returned HTTP status code: $response_code"
    if [[ $response_code == "000" ]]; then
      error "Failed to connect to ${url}"
    elif [[ $curl_status -ne 0 ]]; then
      error "curl exited with status ${curl_status}"
    fi
    return 1
  fi

  # Check page content (follow redirects)
  response_body=$(curl -sfL -m 10 "$url" 2>&1) || {
    error "Failed to fetch main page content: $?"
    return 1
  }

  # A here-string is used instead of `echo | grep -q`: with `pipefail`, grep
  # exiting at the first match makes echo fail on a closed pipe for large
  # pages, which turned a successful match into a failed check.
  if ! grep -q "<title>.*NixOS Wiki.*</title>" <<<"$response_body"; then
    error "Main page does not contain expected title"
    page_title=$(grep -o '<title>[^<]*</title>' <<<"$response_body" | head -1) || true
    error "Page title: ${page_title:-Could not extract title}"
    error "First 500 chars of response:"
    printf '%s\n' "${response_body:0:500}"
    return 1
  fi
  return 0
}
# Verify that PostgreSQL is active on the wiki host and that the 'mediawiki'
# database answers a trivial query.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log, failures via error
# Returns:   0 when both checks pass, 1 otherwise
check_postgresql() {
  log "Checking PostgreSQL service..."
  # Report the reason for the failure, like the other service checks do.
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql"; then
    error "PostgreSQL service is not active"
    return 1
  fi

  # Check if database is accessible
  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
    error "PostgreSQL database 'mediawiki' is not accessible"
    return 1
  fi
  return 0
}
# Verify that Postfix is active on the wiki host and report the size of its
# mail queue.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log; warning when more than 50 mails are queued or
#            the queue summary cannot be parsed; error when inactive
# Returns:   1 when the service is not active, 0 otherwise (queue findings
#            are informational only)
check_postfix() {
  log "Checking Postfix service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
    error "Postfix service is not active"
    return 1
  fi

  # Check if postfix queue is processing (not stuck)
  local queue_status
  local queue_count
  # The summary line reads "-- <size> Kbytes in <count> Request(s)."; capture
  # the request count, not the first number on the line (the size).
  local -r count_re='in ([0-9]+) Requests?'

  # A failing ssh/postqueue ends up in the "could not determine" branch.
  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true

  if [[ $queue_status == *"Mail queue is empty"* ]]; then
    log " Postfix queue is empty (good)"
  elif [[ $queue_status =~ $count_re ]]; then
    queue_count=${BASH_REMATCH[1]}
    if ((10#${queue_count} > 50)); then
      warning " Postfix has many queued emails: $queue_status"
    else
      log " Postfix has $queue_count queued email(s) (acceptable)"
    fi
  else
    warning " Could not determine postfix queue status"
  fi
  return 0
}
# Report whether the backup timers are active on the wiki host.
# Globals:   SSH_TARGET (read)
# Outputs:   one log line per active timer, one warning per inactive timer
# Returns:   always 0; inactive timers are reported but not treated as fatal
check_backup_services() {
  log "Checking backup services..."

  # Check if backup timers are active
  local -a backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
  # Keep the loop variable local so it does not overwrite a caller's variable.
  local service
  for service in "${backup_services[@]}"; do
    # The unit name is meant to be expanded locally before it is sent.
    # shellcheck disable=SC2029
    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
      log "$service is active"
    else
      warning "$service is not active"
    fi
  done
  return 0
}
# Main health check runner
# Runs every check_* function, then reports the overall systemd state of the
# wiki host.
# Globals:   SSH_TARGET (read)
#            WAIT_FOR_STABILIZATION (read) - unless set to something other
#            than "true", sleep 10 seconds before running the checks
# Outputs:   progress via log/warning/error; the list of failed units when
#            systemd reports "degraded"
# Returns:   0 when all individual checks pass, 1 otherwise (the overall
#            systemd state is informational only)
run_health_checks() {
  log "Running health checks..."
  local failed_checks=0
  local start_time
  local check
  local system_status
  start_time=$(date +%s)

  # Wait for system to stabilize if requested
  if [ "${WAIT_FOR_STABILIZATION:-true}" = "true" ]; then
    log "Waiting for system to stabilize..."
    sleep 10
  fi

  # Run individual health checks
  local -a checks=(
    "check_nginx"
    "check_postgresql"
    "check_postfix"
    "check_backup_services"
  )
  for check in "${checks[@]}"; do
    if "$check"; then
      log "$check passed"
    else
      error "$check failed"
      failed_checks=$((failed_checks + 1))
    fi
  done

  # Check overall system status.
  # `systemctl is-system-running` prints the state AND exits non-zero for any
  # state but "running". Appending a fallback word remotely produced two
  # lines, so the "degraded" comparison below could never match.
  log "Checking overall system status..."
  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running") || true
  system_status=${system_status%%$'\n'*}
  system_status=${system_status:-unknown}
  if [[ $system_status == "running" ]]; then
    log "System status: running"
  else
    warning "System status: $system_status"
    if [[ $system_status == "degraded" ]]; then
      log "Failed units:"
      # Informational only; must not abort the health report.
      ssh "${SSH_TARGET}" "systemctl --failed --no-pager" || true
    fi
  fi

  local elapsed=$(($(date +%s) - start_time))
  log "Health checks completed in ${elapsed}s"
  if [ "$failed_checks" -gt 0 ]; then
    error "$failed_checks health checks failed"
    return 1
  fi
  return 0
}
# Poll the wiki host until an SSH login succeeds.
# Globals:   SSH_TARGET (read), WIKI_HOST (read, log output only)
# Arguments: $1 - maximum number of connection attempts (default 30)
#            $2 - seconds to sleep between attempts (default 10)
# Outputs:   progress via log; error when every attempt failed
# Returns:   0 as soon as one attempt succeeds, 1 when all attempts failed
wait_for_ssh() {
  local max_attempts="${1:-30}"
  local wait_time="${2:-10}"
  local tries=0

  log "Waiting for SSH connectivity to ${WIKI_HOST}..."

  while [ "$tries" -lt "$max_attempts" ]; do
    if ! ssh -o ConnectTimeout=5 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'" &>/dev/null; then
      tries=$((tries + 1))
      # No point in sleeping after the final failed attempt.
      if [ "$tries" -lt "$max_attempts" ]; then
        log " Attempt $tries/$max_attempts failed, waiting ${wait_time}s..."
        sleep "$wait_time"
      fi
      continue
    fi

    # The reported time only counts the sleeps between attempts.
    log "SSH connection established after $((tries * wait_time)) seconds"
    return 0
  done

  error "Failed to establish SSH connection after $((max_attempts * wait_time)) seconds"
  return 1
}

View File

@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# shellcheck shell=bash
# Shared logging functions for NixOS Wiki scripts
# This file should be sourced by other scripts
# ANSI color sequences for log prefixes, exported for child processes.
# They are stored as literal backslash escapes and only become control
# characters when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
export RED GREEN YELLOW NC
# Basic logging functions
# Print an informational message to stdout behind a green timestamp.
# Globals:   GREEN, NC (read)
# Arguments: message words; joined with spaces, backslash escapes are
#            interpreted because the line is printed with `echo -e`
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${GREEN}[${stamp}]${NC} $*"
}
# Print an error message to stderr behind a red "[ERROR <timestamp>]" prefix.
# Globals:   RED, NC (read)
# Arguments: message words; joined with spaces, printed with `echo -e`
error() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${RED}[ERROR ${stamp}]${NC} $*" >&2
}
# Print a warning to stdout behind a yellow "[WARNING <timestamp>]" prefix.
# Globals:   YELLOW, NC (read)
# Arguments: message words; joined with spaces, printed with `echo -e`
warning() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  echo -e "${YELLOW}[WARNING ${stamp}]${NC} $*"
}

View File

@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# shellcheck source-path=SCRIPTDIR
set -euo pipefail

# Absolute directory of this script, so sibling files can be sourced no
# matter which working directory the script is started from.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"

# Shared log/error/warning helpers.
source "${SCRIPT_DIR}/logging.sh"

# Host that gets rebooted and the login used to reach it.
WIKI_HOST="wiki.nixos.org"
SSH_TARGET="root@${WIKI_HOST}"

# Polling budget while the host reboots:
# 30 attempts * 10 seconds = 5 minutes max wait.
MAX_SSH_WAIT_ATTEMPTS=30
SSH_WAIT_INTERVAL=10

# SSH options for our direct SSH calls: connections are multiplexed through a
# control socket in a private temporary directory that is removed on exit.
SSH_TMPDIR=$(mktemp -d /tmp/wiki-reboot.XXXXXX)
trap 'rm -rf "$SSH_TMPDIR"' EXIT
SSH_CONTROL_PATH="${SSH_TMPDIR}/ssh-%h"
SSH_OPTS="-o ControlMaster=auto"
SSH_OPTS+=" -o ControlPath=${SSH_CONTROL_PATH}"
SSH_OPTS+=" -o ControlPersist=30s"
# Function to use SSH with our options
# Shadows the ssh binary so every `ssh ...` call in this shell gets the
# options stored in SSH_OPTS prepended; `command` bypasses this function to
# reach the real executable.
# Globals:   SSH_OPTS (read)
# Arguments: passed through unchanged to ssh
ssh() {
# SSH_OPTS is a single space-separated string and must be word-split here,
# hence the unquoted expansion.
# shellcheck disable=SC2086
command ssh ${SSH_OPTS} "$@"
}
# Source health checks
# Loads the check_* functions, run_health_checks and wait_for_ssh from
# health_checks.sh in the same directory as this script.
source "${SCRIPT_DIR}/health_checks.sh"
# Print the wiki host's uptime in whole seconds.
# Globals:   SSH_TARGET (read)
# Outputs:   a non-negative integer on stdout; "0" when the uptime could not
#            be read (ssh failed or returned something that is not a number)
# Returns:   always 0
get_uptime_seconds() {
  local uptime
  # Take the first field of /proc/uptime and drop the fractional part.
  # ssh errors are silenced on purpose: the host may be down mid-reboot.
  uptime=$(ssh "${SSH_TARGET}" "cat /proc/uptime | cut -d' ' -f1 | cut -d'.' -f1" 2>/dev/null) || uptime=""

  # Only hand out a clean integer; callers use the value in numeric tests.
  if [[ $uptime =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$uptime"
  else
    echo "0"
  fi
}
# Pre-reboot checks: confirm the host is reachable, remember its uptime and
# take a health snapshot.
# Globals:   WIKI_HOST, SSH_TARGET (read)
#            UPTIME_BEFORE (written) - uptime in seconds before the reboot
#            WAIT_FOR_STABILIZATION (exported as "false")
# Returns:   1 when the SSH connection cannot be established, 0 otherwise;
#            failing health checks only produce a warning
pre_reboot_checks() {
  log "Running pre-reboot checks..."

  # Without SSH there is no way to trigger or observe the reboot.
  log "Checking SSH connectivity to ${WIKI_HOST}..."
  ssh -o ConnectTimeout=10 "${SSH_TARGET}" "echo 'SSH connection successful'" || {
    error "Cannot establish SSH connection to ${WIKI_HOST}"
    return 1
  }

  # Recorded so the uptime can be compared after the reboot.
  UPTIME_BEFORE=$(get_uptime_seconds)
  log "Current system uptime: ${UPTIME_BEFORE} seconds"

  # The system is already up, so skip the stabilization delay.
  log "Running quick health check before reboot..."
  export WAIT_FOR_STABILIZATION=false
  run_health_checks || warning "System has issues before reboot, but proceeding anyway"

  return 0
}
# Initiate reboot: send `shutdown -r now` to the host and pause briefly.
# Globals:   SSH_TARGET (read)
# Outputs:   progress via log
initiate_reboot() {
  log "Initiating system reboot..."

  # A non-zero ssh status is expected here: the connection may drop as soon
  # as the shutdown begins, so it is only noted, not treated as an error.
  ssh "${SSH_TARGET}" "shutdown -r now" 2>/dev/null \
    || log "Reboot command sent (connection may have dropped)"

  # Give system time to start shutting down
  sleep 5
}
# Wait for system to come back online after a reboot was requested, then
# check whether the reboot really took place.
# Globals:   SSH_TARGET, MAX_SSH_WAIT_ATTEMPTS, SSH_WAIT_INTERVAL (read)
#            UPTIME_BEFORE (read, optional) - uptime recorded before the reboot
# Outputs:   progress via log/warning/error
# Returns:   1 when SSH does not come back in time, 0 otherwise (an
#            unconfirmed reboot only produces warnings)
wait_for_system_online() {
  log "Waiting for system to come back online..."

  # First, wait for SSH to stop responding (system going down)
  local down_confirmed=false
  local _i
  for _i in {1..10}; do
    if ! ssh -o ConnectTimeout=2 -o BatchMode=yes "${SSH_TARGET}" "true" &>/dev/null; then
      down_confirmed=true
      log "System appears to be rebooting (SSH not responding)"
      break
    fi
    sleep 2
  done
  if [ "$down_confirmed" = false ]; then
    warning "System may not have rebooted properly (SSH still responding)"
  fi

  # Now wait for SSH to come back
  if ! wait_for_ssh "$MAX_SSH_WAIT_ATTEMPTS" "$SSH_WAIT_INTERVAL"; then
    error "System did not come back online within expected time"
    return 1
  fi

  # Verify reboot actually happened by checking uptime
  local uptime_after
  local uptime_before="${UPTIME_BEFORE:-}"
  local rebooted=false
  uptime_after=$(get_uptime_seconds) || uptime_after=0
  # Treat anything that is not a plain integer as "unknown" (0).
  [[ $uptime_after =~ ^[0-9]+$ ]] || uptime_after=0
  [[ $uptime_before =~ ^[0-9]+$ ]] || uptime_before=0

  if ((uptime_after == 0)); then
    # 0 is what get_uptime_seconds reports when it cannot read the uptime,
    # so it must not be mistaken for a freshly booted system.
    warning "Could not determine system uptime - unable to verify that the reboot occurred"
    warning "Continuing with health checks anyway..."
    return 0
  fi

  if ((uptime_before > 0)); then
    # An uptime below the pre-reboot value proves the counter was reset.
    ((uptime_after < uptime_before)) && rebooted=true
  elif ((uptime_after < 300)); then
    # No usable baseline: fall back to "booted within the last 5 minutes".
    rebooted=true
  fi

  if [ "$rebooted" = true ]; then
    log "System successfully rebooted (uptime: ${uptime_after} seconds)"
  else
    warning "System uptime is ${uptime_after} seconds - reboot may not have occurred"
    warning "Continuing with health checks anyway..."
  fi
  return 0
}
# Main reboot flow: pre-checks, reboot, wait for the host, post-checks.
# Globals:   WIKI_HOST (read), WAIT_FOR_STABILIZATION (exported as "true")
# Exits with status 1 when the pre-checks fail, the host does not come back,
# or the post-reboot health checks fail.
main() {
  log "Starting NixOS Wiki reboot procedure..."

  # Nothing has been changed yet, so aborting here is safe.
  pre_reboot_checks || {
    error "Pre-reboot checks failed, aborting"
    exit 1
  }

  log "Rebooting the production NixOS Wiki server..."
  initiate_reboot

  wait_for_system_online || {
    error "Failed to confirm system is back online"
    error "Manual intervention may be required!"
    exit 1
  }

  # Let services settle before judging the freshly booted system.
  log "Running post-reboot health checks..."
  export WAIT_FOR_STABILIZATION=true
  run_health_checks || {
    error "Post-reboot health checks failed!"
    error "System is online but may not be functioning correctly"
    error "Please investigate immediately!"
    exit 1
  }

  log "Reboot completed successfully! 🚀"
  log "NixOS Wiki is healthy at https://${WIKI_HOST}"
}
# Run main function
# Script entry point: forwards all command-line arguments to main.
main "$@"