mirror of
https://github.com/Mic92/nixos-wiki-infra.git
synced 2025-08-19 11:20:02 +02:00
factor out health-checks/logging and add reboot.sh
This commit is contained in:
@@ -32,6 +32,7 @@
|
|||||||
"*.envrc.private-template"
|
"*.envrc.private-template"
|
||||||
];
|
];
|
||||||
programs.shellcheck.enable = true;
|
programs.shellcheck.enable = true;
|
||||||
|
settings.formatter.shellcheck.options = [ "-x" ];
|
||||||
programs.deno.enable = !pkgs.deno.meta.broken;
|
programs.deno.enable = !pkgs.deno.meta.broken;
|
||||||
};
|
};
|
||||||
packages = {
|
packages = {
|
||||||
|
@@ -1,7 +1,14 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
# shellcheck source-path=SCRIPTDIR
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Get the directory where this script is located
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
|
# Source logging functions
|
||||||
|
source "${SCRIPT_DIR}/logging.sh"
|
||||||
|
|
||||||
WIKI_HOST="wiki.nixos.org"
|
WIKI_HOST="wiki.nixos.org"
|
||||||
SSH_TARGET="root@${WIKI_HOST}"
|
SSH_TARGET="root@${WIKI_HOST}"
|
||||||
FLAKE_TARGET=".#nixos-wiki-nixos-org"
|
FLAKE_TARGET=".#nixos-wiki-nixos-org"
|
||||||
@@ -23,23 +30,8 @@ ssh() {
|
|||||||
command ssh ${SSH_OPTS} "$@"
|
command ssh ${SSH_OPTS} "$@"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Colors for output
|
# Source health checks
|
||||||
RED='\033[0;31m'
|
source "${SCRIPT_DIR}/health_checks.sh"
|
||||||
GREEN='\033[0;32m'
|
|
||||||
YELLOW='\033[1;33m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
log() {
|
|
||||||
echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
|
|
||||||
}
|
|
||||||
|
|
||||||
error() {
|
|
||||||
echo -e "${RED}[ERROR $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*" >&2
|
|
||||||
}
|
|
||||||
|
|
||||||
warning() {
|
|
||||||
echo -e "${YELLOW}[WARNING $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
|
|
||||||
}
|
|
||||||
|
|
||||||
nixBuild() {
|
nixBuild() {
|
||||||
if command -v nom -v &>/dev/null; then
|
if command -v nom -v &>/dev/null; then
|
||||||
@@ -77,7 +69,7 @@ pre_deployment_checks() {
|
|||||||
# Build the system
|
# Build the system
|
||||||
build_system() {
|
build_system() {
|
||||||
log "Building NixOS configuration..."
|
log "Building NixOS configuration..."
|
||||||
nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L --log-format bar-with-logs
|
nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
|
||||||
}
|
}
|
||||||
|
|
||||||
# Deploy with retries
|
# Deploy with retries
|
||||||
@@ -102,155 +94,6 @@ deploy_system() {
|
|||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# Health check functions
|
|
||||||
check_nginx() {
|
|
||||||
log "Checking nginx service..."
|
|
||||||
if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
|
|
||||||
error "Nginx service is not active"
|
|
||||||
ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if main page loads with wiki content
|
|
||||||
local response_code
|
|
||||||
local response_body
|
|
||||||
response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "https://${WIKI_HOST}/wiki/Main_Page" || echo "000")
|
|
||||||
|
|
||||||
if [[ $response_code != "200" ]]; then
|
|
||||||
error "Main page returned HTTP status code: $response_code"
|
|
||||||
if [[ $response_code == "000" ]]; then
|
|
||||||
error "Failed to connect to https://${WIKI_HOST}/wiki/Main_Page"
|
|
||||||
fi
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check page content (follow redirects)
|
|
||||||
response_body=$(curl -sfL -m 10 "https://${WIKI_HOST}/wiki/Main_Page" 2>&1) || {
|
|
||||||
error "Failed to fetch main page content: $?"
|
|
||||||
return 1
|
|
||||||
}
|
|
||||||
|
|
||||||
if ! echo "$response_body" | grep -q "<title>.*NixOS Wiki.*</title>"; then
|
|
||||||
error "Main page does not contain expected title"
|
|
||||||
error "Page title: $(echo "$response_body" | grep -o '<title>[^<]*</title>' | head -1 || echo "Could not extract title")"
|
|
||||||
error "First 500 chars of response:"
|
|
||||||
echo "$response_body" | head -c 500
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_postgresql() {
|
|
||||||
log "Checking PostgreSQL service..."
|
|
||||||
ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql" || return 1
|
|
||||||
|
|
||||||
# Check if database is accessible
|
|
||||||
if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
|
|
||||||
error "PostgreSQL database 'mediawiki' is not accessible"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_postfix() {
|
|
||||||
log "Checking Postfix service..."
|
|
||||||
if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
|
|
||||||
error "Postfix service is not active"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check if postfix queue is processing (not stuck)
|
|
||||||
local queue_status
|
|
||||||
queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1)
|
|
||||||
if echo "$queue_status" | grep -q "Mail queue is empty"; then
|
|
||||||
log " Postfix queue is empty (good)"
|
|
||||||
elif echo "$queue_status" | grep -q "in .*[0-9]* Request"; then
|
|
||||||
local queue_count
|
|
||||||
queue_count=$(echo "$queue_status" | grep -o '[0-9]*' | head -1)
|
|
||||||
if [ "${queue_count:-0}" -gt 50 ]; then
|
|
||||||
warning " Postfix has many queued emails: $queue_status"
|
|
||||||
else
|
|
||||||
log " Postfix has $queue_count queued email(s) (acceptable)"
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
warning " Could not determine postfix queue status"
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_backup_services() {
|
|
||||||
log "Checking backup services..."
|
|
||||||
|
|
||||||
# Check if backup timers are active
|
|
||||||
local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
|
|
||||||
for service in "${backup_services[@]}"; do
|
|
||||||
# shellcheck disable=SC2029
|
|
||||||
if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
|
|
||||||
log " ✓ $service is active"
|
|
||||||
else
|
|
||||||
warning " ✗ $service is not active"
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# Main health check
|
|
||||||
run_health_checks() {
|
|
||||||
log "Running post-deployment health checks..."
|
|
||||||
|
|
||||||
local failed_checks=0
|
|
||||||
local start_time
|
|
||||||
start_time=$(date +%s)
|
|
||||||
|
|
||||||
# Wait for system to stabilize
|
|
||||||
log "Waiting for system to stabilize..."
|
|
||||||
sleep 10
|
|
||||||
|
|
||||||
# Run individual health checks
|
|
||||||
local checks=(
|
|
||||||
"check_nginx"
|
|
||||||
"check_postgresql"
|
|
||||||
"check_postfix"
|
|
||||||
"check_backup_services"
|
|
||||||
)
|
|
||||||
|
|
||||||
for check in "${checks[@]}"; do
|
|
||||||
if $check; then
|
|
||||||
log " ✓ $check passed"
|
|
||||||
else
|
|
||||||
error " ✗ $check failed"
|
|
||||||
failed_checks=$((failed_checks + 1))
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Check overall system status
|
|
||||||
log "Checking overall system status..."
|
|
||||||
local system_status
|
|
||||||
system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || echo 'degraded'")
|
|
||||||
|
|
||||||
if [[ $system_status == "running" ]]; then
|
|
||||||
log "System status: running"
|
|
||||||
else
|
|
||||||
warning "System status: $system_status"
|
|
||||||
if [[ $system_status == "degraded" ]]; then
|
|
||||||
log "Failed units:"
|
|
||||||
ssh "${SSH_TARGET}" "systemctl --failed --no-pager"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
local elapsed=$(($(date +%s) - start_time))
|
|
||||||
log "Health checks completed in ${elapsed}s"
|
|
||||||
|
|
||||||
if [ $failed_checks -gt 0 ]; then
|
|
||||||
error "$failed_checks health checks failed"
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
# Rollback function
|
# Rollback function
|
||||||
rollback() {
|
rollback() {
|
||||||
if [ -z "${CURRENT_GENERATION:-}" ]; then
|
if [ -z "${CURRENT_GENERATION:-}" ]; then
|
||||||
@@ -293,6 +136,8 @@ main() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Always run health checks to see current system state
|
# Always run health checks to see current system state
|
||||||
|
log "Running post-deployment health checks..."
|
||||||
|
export WAIT_FOR_STABILIZATION=true
|
||||||
if ! run_health_checks; then
|
if ! run_health_checks; then
|
||||||
error "Post-deployment health checks failed"
|
error "Post-deployment health checks failed"
|
||||||
|
|
||||||
|
193
targets/nixos-wiki.nixos.org/health_checks.sh
Normal file
193
targets/nixos-wiki.nixos.org/health_checks.sh
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# shellcheck source-path=SCRIPTDIR
|
||||||
|
|
||||||
|
# Health check functions for NixOS Wiki
|
||||||
|
# This file can be sourced by other scripts
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Get the directory where this script is located
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
|
# Source logging functions
|
||||||
|
source "${SCRIPT_DIR}/logging.sh"
|
||||||
|
|
||||||
|
# Required variables that should be set by the calling script
|
||||||
|
: "${WIKI_HOST:=wiki.nixos.org}"
|
||||||
|
: "${SSH_TARGET:=root@${WIKI_HOST}}"
|
||||||
|
|
||||||
|
# Health check functions
|
||||||
|
# Verify that nginx is up on the wiki host and that it really serves the wiki.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
#   WIKI_HOST  (read) - public host name used for the HTTPS probes
# Outputs:
#   Progress through log, diagnostics through error; on a title mismatch the
#   first 500 characters of the response body are written to stdout.
# Returns:
#   0 if nginx is active and the main page answers 200 with the expected
#   <title>, 1 otherwise.
check_nginx() {
  local main_page_url="https://${WIKI_HOST}/wiki/Main_Page"
  local response_code
  local response_body
  local page_title

  log "Checking nginx service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
    error "Nginx service is not active"
    # Best effort: the status excerpt is only extra context for the operator.
    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
    return 1
  fi

  # Check if main page loads with wiki content.
  # curl still writes its -w output ("000" when no response was received) on a
  # failed transfer, so the fallback has to replace the captured value;
  # appending a second "000" would give "000000" and hide the hint below.
  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "${main_page_url}") || response_code="000"

  if [[ "${response_code}" != "200" ]]; then
    error "Main page returned HTTP status code: ${response_code}"
    if [[ "${response_code}" == "000" ]]; then
      error "Failed to connect to ${main_page_url}"
    fi
    return 1
  fi

  # Check page content (follow redirects)
  response_body=$(curl -sfL -m 10 "${main_page_url}" 2>&1) || {
    error "Failed to fetch main page content: $?"
    return 1
  }

  # Feed grep through a here-string instead of `echo | grep -q`: grep -q exits
  # on the first match, and with `set -o pipefail` a writer killed by SIGPIPE
  # would turn a successful match into a failed check on large pages.
  if ! grep -q "<title>.*NixOS Wiki.*</title>" <<<"${response_body}"; then
    page_title=$(grep -o -m 1 '<title>[^<]*</title>' <<<"${response_body}") \
      || page_title="Could not extract title"
    # -m 1 stops after the first matching line; keep only its first match.
    page_title=${page_title%%$'\n'*}

    error "Main page does not contain expected title"
    error "Page title: ${page_title}"
    error "First 500 chars of response:"
    printf '%s\n' "${response_body:0:500}"
    return 1
  fi

  return 0
}
|
||||||
|
|
||||||
|
# Verify that PostgreSQL is running on the wiki host and that the 'mediawiki'
# database answers a trivial query.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
# Outputs:
#   Progress through log, diagnostics through error.
# Returns:
#   0 if the service is active and the query succeeds, 1 otherwise.
check_postgresql() {
  log "Checking PostgreSQL service..."

  # Report the reason like the other checks do instead of failing silently.
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql"; then
    error "PostgreSQL service is not active"
    return 1
  fi

  # Check if database is accessible
  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
    error "PostgreSQL database 'mediawiki' is not accessible"
    return 1
  fi

  return 0
}
|
||||||
|
|
||||||
|
# Verify that Postfix is running on the wiki host and report the size of its
# mail queue.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
# Outputs:
#   Progress through log, queue concerns through warning, an inactive service
#   through error.
# Returns:
#   1 if the postfix unit is not active, 0 otherwise (queue findings only
#   produce log/warning output).
check_postfix() {
  local queue_status
  local queue_count

  log "Checking Postfix service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
    error "Postfix service is not active"
    return 1
  fi

  # Check if postfix queue is processing (not stuck).
  # A failing ssh/postqueue just leaves text that matches neither pattern and
  # is reported by the final branch, so the exit status is deliberately ignored.
  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true

  if [[ "${queue_status}" == *"Mail queue is empty"* ]]; then
    log " Postfix queue is empty (good)"
  elif [[ "${queue_status}" =~ in\ ([0-9]+)\ Request ]]; then
    # The summary line reads "-- <size> Kbytes in <count> Requests."; capture
    # the request count, not the first number on the line (the size).
    queue_count=${BASH_REMATCH[1]}
    # 10# keeps a value with leading zeros from being parsed as octal.
    if ((10#${queue_count} > 50)); then
      warning " Postfix has many queued emails: ${queue_status}"
    else
      log " Postfix has ${queue_count} queued email(s) (acceptable)"
    fi
  else
    warning " Could not determine postfix queue status"
  fi

  return 0
}
|
||||||
|
|
||||||
|
# Report whether the backup timers are active on the wiki host.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
# Outputs:
#   One log line per active timer, one warning line per inactive timer.
# Returns:
#   Always 0: an inactive timer is reported but does not fail the check.
check_backup_services() {
  local -a backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
  # Keep the loop variable local so sourcing scripts do not inherit it.
  local service

  log "Checking backup services..."

  # Check if backup timers are active
  for service in "${backup_services[@]}"; do
    # shellcheck disable=SC2029 # the unit name is meant to expand locally
    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '${service}'"; then
      log " ✓ ${service} is active"
    else
      warning " ✗ ${service} is not active"
    fi
  done

  return 0
}
|
||||||
|
|
||||||
|
# Main health check runner
|
||||||
|
# Run every individual health check, then report the overall systemd state.
#
# Globals:
#   SSH_TARGET             (read) - ssh destination of the wiki host
#   WAIT_FOR_STABILIZATION (read) - "true" (the default) sleeps 10 seconds
#                                   before the first check
# Outputs:
#   Progress through log, findings through warning/error; the list of failed
#   units is printed when the system state is "degraded".
# Returns:
#   0 if all individual checks passed, 1 if at least one failed. The overall
#   systemd state is informational and does not influence the result.
run_health_checks() {
  local -a checks=(
    "check_nginx"
    "check_postgresql"
    "check_postfix"
    "check_backup_services"
  )
  # Loop variable is local so it does not leak into the sourcing script.
  local check
  local failed_checks=0
  local start_time
  local elapsed
  local system_status

  log "Running health checks..."
  start_time=$(date +%s)

  # Wait for system to stabilize if requested
  if [[ "${WAIT_FOR_STABILIZATION:-true}" == "true" ]]; then
    log "Waiting for system to stabilize..."
    sleep 10
  fi

  # Run individual health checks
  for check in "${checks[@]}"; do
    if "${check}"; then
      log " ✓ ${check} passed"
    else
      error " ✗ ${check} failed"
      failed_checks=$((failed_checks + 1))
    fi
  done

  # Check overall system status.
  # `systemctl is-system-running` prints the state itself and exits non-zero
  # for anything but "running". Appending a fallback word to that output would
  # produce two lines that never compare equal to "degraded", so only the exit
  # status is neutralised and the first output line is used.
  log "Checking overall system status..."
  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running") || true
  system_status=${system_status%%$'\n'*}
  system_status=${system_status:-unknown}

  if [[ "${system_status}" == "running" ]]; then
    log "System status: running"
  else
    warning "System status: ${system_status}"
    if [[ "${system_status}" == "degraded" ]]; then
      log "Failed units:"
      # Diagnostics only: a failure here must not abort the caller.
      ssh "${SSH_TARGET}" "systemctl --failed --no-pager" || true
    fi
  fi

  elapsed=$(($(date +%s) - start_time))
  log "Health checks completed in ${elapsed}s"

  if ((failed_checks > 0)); then
    error "${failed_checks} health checks failed"
    return 1
  fi

  return 0
}
|
||||||
|
|
||||||
|
# Function to wait for SSH connectivity
|
||||||
|
# Poll the wiki host until an ssh login succeeds or the attempts run out.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
#   WIKI_HOST  (read) - host name, used in messages only
# Arguments:
#   $1 - maximum number of connection attempts (default: 30)
#   $2 - seconds to sleep between two attempts (default: 10)
# Outputs:
#   Progress through log, the final failure through error.
# Returns:
#   0 as soon as one attempt succeeds, 1 when every attempt failed.
wait_for_ssh() {
  local max_attempts="${1:-30}"
  local wait_time="${2:-10}"
  local attempt=0
  # Measure real elapsed time: attempts * wait_time ignores the connect
  # timeouts and counts a sleep that never happens after the last attempt.
  local started=${SECONDS}

  log "Waiting for SSH connectivity to ${WIKI_HOST}..."

  while ((attempt < max_attempts)); do
    if ssh -o ConnectTimeout=5 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'" &>/dev/null; then
      log "SSH connection established after $((SECONDS - started)) seconds"
      return 0
    fi

    attempt=$((attempt + 1))
    # No point in sleeping once the last attempt has failed.
    if ((attempt < max_attempts)); then
      log " Attempt ${attempt}/${max_attempts} failed, waiting ${wait_time}s..."
      sleep "${wait_time}"
    fi
  done

  error "Failed to establish SSH connection after $((SECONDS - started)) seconds"
  return 1
}
|
24
targets/nixos-wiki.nixos.org/logging.sh
Normal file
24
targets/nixos-wiki.nixos.org/logging.sh
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# shellcheck shell=bash
|
||||||
|
|
||||||
|
# Shared logging functions for NixOS Wiki scripts
|
||||||
|
# This file should be sourced by other scripts
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
export RED='\033[0;31m'
|
||||||
|
export GREEN='\033[0;32m'
|
||||||
|
export YELLOW='\033[1;33m'
|
||||||
|
export NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Basic logging functions
|
||||||
|
# Print an informational message to stdout, prefixed with a green timestamp.
#
# Globals:
#   GREEN, NC (read) - colour escape sequences
# Arguments:
#   $* - message words, joined with single spaces
log() {
  # Only the colour codes go through %b. The message is printed verbatim with
  # %s so backslashes in it (paths, remote command output) are not
  # interpreted as escape sequences.
  printf '%b[%s]%b %s\n' "${GREEN:-}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC:-}" "$*"
}
|
||||||
|
|
||||||
|
# Print an error message to stderr, prefixed with a red "ERROR" timestamp.
#
# Globals:
#   RED, NC (read) - colour escape sequences
# Arguments:
#   $* - message words, joined with single spaces
error() {
  # Only the colour codes go through %b. The message is printed verbatim with
  # %s so backslashes in it are not interpreted as escape sequences.
  printf '%b[ERROR %s]%b %s\n' "${RED:-}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC:-}" "$*" >&2
}
|
||||||
|
|
||||||
|
# Print a warning message to stdout, prefixed with a yellow "WARNING"
# timestamp.
#
# Globals:
#   YELLOW, NC (read) - colour escape sequences
# Arguments:
#   $* - message words, joined with single spaces
warning() {
  # Only the colour codes go through %b. The message is printed verbatim with
  # %s so backslashes in it are not interpreted as escape sequences.
  printf '%b[WARNING %s]%b %s\n' "${YELLOW:-}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC:-}" "$*"
}
|
154
targets/nixos-wiki.nixos.org/reboot.sh
Executable file
154
targets/nixos-wiki.nixos.org/reboot.sh
Executable file
@@ -0,0 +1,154 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# shellcheck source-path=SCRIPTDIR
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Get the directory where this script is located
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
|
||||||
|
# Source logging functions
|
||||||
|
source "${SCRIPT_DIR}/logging.sh"
|
||||||
|
|
||||||
|
WIKI_HOST="wiki.nixos.org"
|
||||||
|
SSH_TARGET="root@${WIKI_HOST}"
|
||||||
|
MAX_SSH_WAIT_ATTEMPTS=30 # 30 attempts * 10 seconds = 5 minutes max wait
|
||||||
|
SSH_WAIT_INTERVAL=10
|
||||||
|
|
||||||
|
# SSH options for our direct SSH calls
|
||||||
|
SSH_TMPDIR=$(mktemp -d /tmp/wiki-reboot.XXXXXX)
|
||||||
|
trap 'rm -rf "$SSH_TMPDIR"' EXIT
|
||||||
|
|
||||||
|
SSH_CONTROL_PATH="${SSH_TMPDIR}/ssh-%h"
|
||||||
|
SSH_OPTS="-o ControlMaster=auto -o ControlPath=${SSH_CONTROL_PATH} -o ControlPersist=30s"
|
||||||
|
|
||||||
|
# Function to use SSH with our options
|
||||||
|
# Wrapper that runs the real ssh binary with the shared connection options.
#
# Globals:
#   SSH_OPTS (read) - whitespace-separated ssh options; may be empty or unset
# Arguments:
#   $@ - passed through to ssh unchanged, after the shared options
# Returns:
#   The exit status of ssh.
ssh() {
  local -a ssh_opts=()

  # Split SSH_OPTS on whitespace only. An unquoted ${SSH_OPTS} would also
  # apply pathname expansion to every word and abort under `set -u` when the
  # variable is unset.
  IFS=$' \t\n' read -r -a ssh_opts <<<"${SSH_OPTS:-}"

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # bash releases older than 4.4.
  command ssh ${ssh_opts[@]+"${ssh_opts[@]}"} "$@"
}
|
||||||
|
|
||||||
|
# Source health checks
|
||||||
|
source "${SCRIPT_DIR}/health_checks.sh"
|
||||||
|
|
||||||
|
# Function to get system uptime in seconds
|
||||||
|
# Print the wiki host's uptime in whole seconds.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
# Outputs:
#   A non-negative integer on stdout; "0" when the uptime could not be read.
# Returns:
#   Always 0.
get_uptime_seconds() {
  local uptime_seconds

  # The text before the first dot of /proc/uptime is the uptime in whole
  # seconds. ssh errors are silenced on purpose: the host is expected to be
  # unreachable while it reboots.
  uptime_seconds=$(ssh "${SSH_TARGET}" "cut -d. -f1 /proc/uptime" 2>/dev/null) || uptime_seconds=""

  # Callers compare the result numerically, so never hand back an empty
  # string or partial output from a dropped connection.
  if [[ "${uptime_seconds}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${uptime_seconds}"
  else
    printf '0\n'
  fi
}
|
||||||
|
|
||||||
|
# Pre-reboot checks
|
||||||
|
# Confirm the host is reachable and record its state before rebooting it.
#
# Globals:
#   SSH_TARGET, WIKI_HOST  (read)
#   UPTIME_BEFORE          (written) - uptime in seconds before the reboot
#   WAIT_FOR_STABILIZATION (written, exported) - set to "false" so the
#                          pre-reboot health check starts without a delay
# Returns:
#   1 if no ssh connection can be established, 0 otherwise. A failing health
#   check only produces a warning.
pre_reboot_checks() {
  log "Running pre-reboot checks..."

  # Without ssh access there is no way to trigger or observe the reboot.
  log "Checking SSH connectivity to ${WIKI_HOST}..."
  ssh -o ConnectTimeout=10 "${SSH_TARGET}" "echo 'SSH connection successful'" || {
    error "Cannot establish SSH connection to ${WIKI_HOST}"
    return 1
  }

  # Remember the uptime so it can be compared once the host is back.
  UPTIME_BEFORE=$(get_uptime_seconds)
  log "Current system uptime: ${UPTIME_BEFORE} seconds"

  # Snapshot of the current health; problems are reported but do not block.
  log "Running quick health check before reboot..."
  export WAIT_FOR_STABILIZATION=false
  run_health_checks \
    || warning "System has issues before reboot, but proceeding anyway"

  return 0
}
|
||||||
|
|
||||||
|
# Initiate reboot
|
||||||
|
# Ask the wiki host to reboot and give it a moment to begin shutting down.
#
# Globals:
#   SSH_TARGET (read) - ssh destination of the wiki host
initiate_reboot() {
  log "Initiating system reboot..."

  # `shutdown -r now` performs a clean reboot. The ssh session may be cut off
  # before it can report success, so a non-zero status is only logged.
  ssh "${SSH_TARGET}" "shutdown -r now" 2>/dev/null \
    || log "Reboot command sent (connection may have dropped)"

  # Give system time to start shutting down
  sleep 5
}
|
||||||
|
|
||||||
|
# Wait for system to come back online
|
||||||
|
# Wait for the host to go down and come back, then verify that it rebooted.
#
# Globals:
#   SSH_TARGET            (read) - ssh destination of the wiki host
#   MAX_SSH_WAIT_ATTEMPTS (read) - attempts handed to wait_for_ssh
#   SSH_WAIT_INTERVAL     (read) - seconds between attempts for wait_for_ssh
#   UPTIME_BEFORE         (read, optional) - uptime in seconds recorded before
#                         the reboot
# Outputs:
#   Progress through log, doubts through warning, failure through error.
# Returns:
#   1 if ssh does not come back in time, 0 otherwise. An unconfirmed reboot
#   only produces warnings.
wait_for_system_online() {
  local down_confirmed=false
  local reboot_confirmed=false
  local probe
  local uptime_after
  local uptime_before="${UPTIME_BEFORE:-}"

  log "Waiting for system to come back online..."

  # First, wait for SSH to stop responding (system going down)
  for ((probe = 0; probe < 10; probe++)); do
    if ! ssh -o ConnectTimeout=2 -o BatchMode=yes "${SSH_TARGET}" "true" &>/dev/null; then
      down_confirmed=true
      log "System appears to be rebooting (SSH not responding)"
      break
    fi
    sleep 2
  done

  if [[ "${down_confirmed}" == false ]]; then
    warning "System may not have rebooted properly (SSH still responding)"
  fi

  # Now wait for SSH to come back
  if ! wait_for_ssh "${MAX_SSH_WAIT_ATTEMPTS}" "${SSH_WAIT_INTERVAL}"; then
    error "System did not come back online within expected time"
    return 1
  fi

  # Verify reboot actually happened by checking uptime. Anything that is not
  # a plain number is treated like the helper's "0" failure value.
  uptime_after=$(get_uptime_seconds) || uptime_after=0
  [[ "${uptime_after}" =~ ^[0-9]+$ ]] || uptime_after=0

  if ((10#${uptime_after} == 0)); then
    # "0" means the uptime could not be read; that proves nothing.
    warning "Could not read system uptime - unable to verify that the reboot occurred"
  elif [[ "${uptime_before}" =~ ^[1-9][0-9]*$ ]]; then
    # Uptime only shrinks across a reboot, so compare with the recorded value.
    # A host that had only just booted before can trigger a spurious warning,
    # but never a false confirmation.
    if ((10#${uptime_after} < 10#${uptime_before})); then
      reboot_confirmed=true
    fi
  elif ((10#${uptime_after} < 300)); then
    # No usable earlier value: fall back to the "recently booted" heuristic.
    reboot_confirmed=true
  fi

  if [[ "${reboot_confirmed}" == true ]]; then
    log "System successfully rebooted (uptime: ${uptime_after} seconds)"
  else
    if ((10#${uptime_after} != 0)); then
      warning "System uptime is ${uptime_after} seconds - reboot may not have occurred"
    fi
    warning "Continuing with health checks anyway..."
  fi

  return 0
}
|
||||||
|
|
||||||
|
# Main reboot flow
|
||||||
|
# Entry point: reboot the wiki host and confirm it is healthy afterwards.
#
# Globals:
#   WIKI_HOST              (read)
#   WAIT_FOR_STABILIZATION (written, exported) - set to "true" before the
#                          post-reboot health checks
# Exits with status 1 when the pre-reboot checks fail, when the host does not
# come back, or when the post-reboot health checks fail.
main() {
  log "Starting NixOS Wiki reboot procedure..."

  # Step 1: refuse to reboot a host that cannot even be reached.
  pre_reboot_checks || {
    error "Pre-reboot checks failed, aborting"
    exit 1
  }

  # Step 2: announce and trigger the reboot.
  log "Rebooting the production NixOS Wiki server..."
  initiate_reboot

  # Step 3: wait until the host answers again.
  wait_for_system_online || {
    error "Failed to confirm system is back online"
    error "Manual intervention may be required!"
    exit 1
  }

  # Step 4: health checks, with the stabilization delay enabled.
  log "Running post-reboot health checks..."
  export WAIT_FOR_STABILIZATION=true
  run_health_checks || {
    error "Post-reboot health checks failed!"
    error "System is online but may not be functioning correctly"
    error "Please investigate immediately!"
    exit 1
  }

  log "Reboot completed successfully! 🚀"
  log "NixOS Wiki is healthy at https://${WIKI_HOST}"
}
|
||||||
|
|
||||||
|
# Run main function
|
||||||
|
main "$@"
|
Reference in New Issue
Block a user