diff --git a/formatter.nix b/formatter.nix
index c4e0f57..bbc4966 100644
--- a/formatter.nix
+++ b/formatter.nix
@@ -32,6 +32,7 @@
         "*.envrc.private-template"
       ];
       programs.shellcheck.enable = true;
+      settings.formatter.shellcheck.options = [ "-x" ];
       programs.deno.enable = !pkgs.deno.meta.broken;
     };
     packages = {
diff --git a/targets/nixos-wiki.nixos.org/deploy.sh b/targets/nixos-wiki.nixos.org/deploy.sh
index bebb0a8..abfc44c 100755
--- a/targets/nixos-wiki.nixos.org/deploy.sh
+++ b/targets/nixos-wiki.nixos.org/deploy.sh
@@ -1,7 +1,14 @@
 #!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
 
 set -euo pipefail
 
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
 WIKI_HOST="wiki.nixos.org"
 SSH_TARGET="root@${WIKI_HOST}"
 FLAKE_TARGET=".#nixos-wiki-nixos-org"
@@ -23,23 +30,8 @@ ssh() {
   command ssh ${SSH_OPTS} "$@"
 }
 
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-
-log() {
-  echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
-}
-
-error() {
-  echo -e "${RED}[ERROR $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*" >&2
-}
-
-warning() {
-  echo -e "${YELLOW}[WARNING $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
-}
+# Source health checks
+source "${SCRIPT_DIR}/health_checks.sh"
 
 nixBuild() {
   if command -v nom -v &>/dev/null; then
@@ -77,7 +69,7 @@ pre_deployment_checks() {
 # Build the system
 build_system() {
   log "Building NixOS configuration..."
-  nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L --log-format bar-with-logs
+  nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
 }
 
 # Deploy with retries
@@ -102,155 +94,6 @@ deploy_system() {
   return 1
 }
 
-# Health check functions
-check_nginx() {
-  log "Checking nginx service..."
-  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
-    error "Nginx service is not active"
-    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
-    return 1
-  fi
-
-  # Check if main page loads with wiki content
-  local response_code
-  local response_body
-  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "https://${WIKI_HOST}/wiki/Main_Page" || echo "000")
-
-  if [[ $response_code != "200" ]]; then
-    error "Main page returned HTTP status code: $response_code"
-    if [[ $response_code == "000" ]]; then
-      error "Failed to connect to https://${WIKI_HOST}/wiki/Main_Page"
-    fi
-    return 1
-  fi
-
-  # Check page content (follow redirects)
-  response_body=$(curl -sfL -m 10 "https://${WIKI_HOST}/wiki/Main_Page" 2>&1) || {
-    error "Failed to fetch main page content: $?"
-    return 1
-  }
-
-  if ! echo "$response_body" | grep -q ".*NixOS Wiki.*"; then
-    error "Main page does not contain expected title"
-    error "Page title: $(echo "$response_body" | grep -o '[^<]*' | head -1 || echo "Could not extract title")"
-    error "First 500 chars of response:"
-    echo "$response_body" | head -c 500
-    return 1
-  fi
-
-  return 0
-}
-
-check_postgresql() {
-  log "Checking PostgreSQL service..."
-  ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql" || return 1
-
-  # Check if database is accessible
-  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
-    error "PostgreSQL database 'mediawiki' is not accessible"
-    return 1
-  fi
-  return 0
-}
-
-check_postfix() {
-  log "Checking Postfix service..."
-  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
-    error "Postfix service is not active"
-    return 1
-  fi
-
-  # Check if postfix queue is processing (not stuck)
-  local queue_status
-  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1)
-  if echo "$queue_status" | grep -q "Mail queue is empty"; then
-    log " Postfix queue is empty (good)"
-  elif echo "$queue_status" | grep -q "in .*[0-9]* Request"; then
-    local queue_count
-    queue_count=$(echo "$queue_status" | grep -o '[0-9]*' | head -1)
-    if [ "${queue_count:-0}" -gt 50 ]; then
-      warning " Postfix has many queued emails: $queue_status"
-    else
-      log " Postfix has $queue_count queued email(s) (acceptable)"
-    fi
-  else
-    warning " Could not determine postfix queue status"
-  fi
-
-  return 0
-}
-
-check_backup_services() {
-  log "Checking backup services..."
-
-  # Check if backup timers are active
-  local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
-  for service in "${backup_services[@]}"; do
-    # shellcheck disable=SC2029
-    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
-      log " ✓ $service is active"
-    else
-      warning " ✗ $service is not active"
-    fi
-  done
-  return 0
-}
-
-# Main health check
-run_health_checks() {
-  log "Running post-deployment health checks..."
-
-  local failed_checks=0
-  local start_time
-  start_time=$(date +%s)
-
-  # Wait for system to stabilize
-  log "Waiting for system to stabilize..."
-  sleep 10
-
-  # Run individual health checks
-  local checks=(
-    "check_nginx"
-    "check_postgresql"
-    "check_postfix"
-    "check_backup_services"
-  )
-
-  for check in "${checks[@]}"; do
-    if $check; then
-      log " ✓ $check passed"
-    else
-      error " ✗ $check failed"
-      failed_checks=$((failed_checks + 1))
-    fi
-  done
-
-  # Check overall system status
-  log "Checking overall system status..."
-  local system_status
-  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || echo 'degraded'")
-
-  if [[ $system_status == "running" ]]; then
-    log "System status: running"
-  else
-    warning "System status: $system_status"
-    if [[ $system_status == "degraded" ]]; then
-      log "Failed units:"
-      ssh "${SSH_TARGET}" "systemctl --failed --no-pager"
-    fi
-  fi
-
-  local elapsed=$(($(date +%s) - start_time))
-  log "Health checks completed in ${elapsed}s"
-
-  if [ $failed_checks -gt 0 ]; then
-    error "$failed_checks health checks failed"
-    return 1
-  fi
-
-  return 0
-}
-
 # Rollback function
 rollback() {
   if [ -z "${CURRENT_GENERATION:-}" ]; then
@@ -293,6 +136,8 @@ main() {
   fi
 
   # Always run health checks to see current system state
+  log "Running post-deployment health checks..."
+  export WAIT_FOR_STABILIZATION=true
   if ! run_health_checks; then
     error "Post-deployment health checks failed"
 
diff --git a/targets/nixos-wiki.nixos.org/health_checks.sh b/targets/nixos-wiki.nixos.org/health_checks.sh
new file mode 100644
index 0000000..250e3c5
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/health_checks.sh
@@ -0,0 +1,217 @@
+#!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
+
+# Health check functions for NixOS Wiki
+# This file can be sourced by other scripts
+
+set -euo pipefail
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
+# Required variables that should be set by the calling script
+: "${WIKI_HOST:=wiki.nixos.org}"
+: "${SSH_TARGET:=root@${WIKI_HOST}}"
+
+# Health check functions
+
+# Check that nginx is active and that the main page is served with the
+# expected content. Returns 0 when healthy, 1 otherwise.
+check_nginx() {
+  log "Checking nginx service..."
+  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
+    error "Nginx service is not active"
+    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
+    return 1
+  fi
+
+  # Check if main page loads with wiki content
+  local response_code
+  local response_body
+  # curl prints the -w output even when the transfer fails, so replace the
+  # value on failure instead of appending a second "000" to it.
+  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "https://${WIKI_HOST}/wiki/Main_Page") || response_code="000"
+
+  if [[ $response_code != "200" ]]; then
+    error "Main page returned HTTP status code: $response_code"
+    if [[ $response_code == "000" ]]; then
+      error "Failed to connect to https://${WIKI_HOST}/wiki/Main_Page"
+    fi
+    return 1
+  fi
+
+  # Check page content (follow redirects)
+  response_body=$(curl -sfL -m 10 "https://${WIKI_HOST}/wiki/Main_Page" 2>&1) || {
+    error "Failed to fetch main page content: $?"
+    return 1
+  }
+
+  # Feed grep from a here-string: with pipefail, "echo | grep -q" can fail
+  # with SIGPIPE when grep exits at the first match of a large page.
+  if ! grep -q ".*NixOS Wiki.*" <<<"$response_body"; then
+    error "Main page does not contain expected title"
+    error "Page title: $(grep -o -m 1 '<title>[^<]*</title>' <<<"$response_body" || echo "Could not extract title")"
+    error "First 500 chars of response:"
+    printf '%s\n' "${response_body:0:500}"
+    return 1
+  fi
+
+  return 0
+}
+
+# Check that PostgreSQL is active and that the mediawiki database is reachable.
+check_postgresql() {
+  log "Checking PostgreSQL service..."
+  ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql" || return 1
+
+  # Check if database is accessible
+  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
+    error "PostgreSQL database 'mediawiki' is not accessible"
+    return 1
+  fi
+  return 0
+}
+
+# Check that Postfix is active and report the size of its mail queue.
+# Queue problems only produce warnings; returns 1 only if the service is down.
+check_postfix() {
+  log "Checking Postfix service..."
+  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
+    error "Postfix service is not active"
+    return 1
+  fi
+
+  # Check if postfix queue is processing (not stuck)
+  local queue_status
+  # The summary line looks like "-- 3 Kbytes in 2 Requests."; capture the
+  # request count after "in", not the first number (the queue size in Kbytes).
+  local queue_re='in ([0-9]+) Request'
+  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true
+  if grep -q "Mail queue is empty" <<<"$queue_status"; then
+    log " Postfix queue is empty (good)"
+  elif [[ $queue_status =~ $queue_re ]]; then
+    local queue_count="${BASH_REMATCH[1]}"
+    if [ "$queue_count" -gt 50 ]; then
+      warning " Postfix has many queued emails: $queue_status"
+    else
+      log " Postfix has $queue_count queued email(s) (acceptable)"
+    fi
+  else
+    warning " Could not determine postfix queue status"
+  fi
+
+  return 0
+}
+
+# Report whether the backup timers are active. An inactive timer only
+# produces a warning, so this check always returns 0.
+check_backup_services() {
+  log "Checking backup services..."
+
+  # Check if backup timers are active
+  local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
+  local service
+  for service in "${backup_services[@]}"; do
+    # shellcheck disable=SC2029
+    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
+      log " ✓ $service is active"
+    else
+      warning " ✗ $service is not active"
+    fi
+  done
+  return 0
+}
+
+# Main health check runner.
+# Runs every check, then reports the overall systemd state. Set
+# WAIT_FOR_STABILIZATION=false to skip the initial 10 second pause.
+# Returns 1 if any individual check failed, 0 otherwise.
+run_health_checks() {
+  log "Running health checks..."
+
+  local failed_checks=0
+  local start_time
+  start_time=$(date +%s)
+
+  # Wait for system to stabilize if requested
+  if [ "${WAIT_FOR_STABILIZATION:-true}" = "true" ]; then
+    log "Waiting for system to stabilize..."
+    sleep 10
+  fi
+
+  # Run individual health checks
+  local checks=(
+    "check_nginx"
+    "check_postgresql"
+    "check_postfix"
+    "check_backup_services"
+  )
+
+  local check
+  for check in "${checks[@]}"; do
+    if $check; then
+      log " ✓ $check passed"
+    else
+      error " ✗ $check failed"
+      failed_checks=$((failed_checks + 1))
+    fi
+  done
+
+  # Check overall system status
+  log "Checking overall system status..."
+  local system_status
+  # is-system-running prints the state itself and exits non-zero for anything
+  # but "running"; only its exit status needs to be neutralised.
+  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || true") || system_status="unknown"
+
+  if [[ $system_status == "running" ]]; then
+    log "System status: running"
+  else
+    warning "System status: $system_status"
+    if [[ $system_status == "degraded" ]]; then
+      log "Failed units:"
+      ssh "${SSH_TARGET}" "systemctl --failed --no-pager"
+    fi
+  fi
+
+  local elapsed=$(($(date +%s) - start_time))
+  log "Health checks completed in ${elapsed}s"
+
+  if [ $failed_checks -gt 0 ]; then
+    error "$failed_checks health checks failed"
+    return 1
+  fi
+
+  return 0
+}
+
+# Function to wait for SSH connectivity
+# Arguments: $1 - maximum number of attempts (default 30)
+#            $2 - seconds to wait between attempts (default 10)
+# Returns 0 once a connection succeeds, 1 if every attempt fails.
+wait_for_ssh() {
+  local max_attempts="${1:-30}"
+  local wait_time="${2:-10}"
+  local attempt=0
+
+  log "Waiting for SSH connectivity to ${WIKI_HOST}..."
+
+  while [ $attempt -lt "$max_attempts" ]; do
+    if ssh -o ConnectTimeout=5 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'" &>/dev/null; then
+      log "SSH connection established after $((attempt * wait_time)) seconds"
+      return 0
+    fi
+
+    attempt=$((attempt + 1))
+    if [ $attempt -lt "$max_attempts" ]; then
+      log " Attempt $attempt/$max_attempts failed, waiting ${wait_time}s..."
+      sleep "$wait_time"
+    fi
+  done
+
+  error "Failed to establish SSH connection after $((max_attempts * wait_time)) seconds"
+  return 1
+}
diff --git a/targets/nixos-wiki.nixos.org/logging.sh b/targets/nixos-wiki.nixos.org/logging.sh
new file mode 100644
index 0000000..5fe70af
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/logging.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# shellcheck shell=bash
+
+# Shared logging functions for NixOS Wiki scripts
+# This file should be sourced by other scripts
+
+# Colors for output
+export RED='\033[0;31m'
+export GREEN='\033[0;32m'
+export YELLOW='\033[1;33m'
+export NC='\033[0m' # No Color
+
+# Basic logging functions
+log() {
+  echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
+}
+
+error() {
+  echo -e "${RED}[ERROR $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*" >&2
+}
+
+warning() {
+  echo -e "${YELLOW}[WARNING $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
+}
diff --git a/targets/nixos-wiki.nixos.org/reboot.sh b/targets/nixos-wiki.nixos.org/reboot.sh
new file mode 100755
index 0000000..647a4dc
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/reboot.sh
@@ -0,0 +1,167 @@
+#!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
+
+set -euo pipefail
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
+WIKI_HOST="wiki.nixos.org"
+SSH_TARGET="root@${WIKI_HOST}"
+MAX_SSH_WAIT_ATTEMPTS=30 # 30 attempts * 10 seconds = 5 minutes max wait
+SSH_WAIT_INTERVAL=10
+
+# SSH options for our direct SSH calls
+SSH_TMPDIR=$(mktemp -d /tmp/wiki-reboot.XXXXXX)
+trap 'rm -rf "$SSH_TMPDIR"' EXIT
+
+SSH_CONTROL_PATH="${SSH_TMPDIR}/ssh-%h"
+SSH_OPTS="-o ControlMaster=auto -o ControlPath=${SSH_CONTROL_PATH} -o ControlPersist=30s"
+
+# Function to use SSH with our options
+ssh() {
+  # shellcheck disable=SC2086
+  command ssh ${SSH_OPTS} "$@"
+}
+
+# Source health checks
+source "${SCRIPT_DIR}/health_checks.sh"
+
+# Function to get system uptime in seconds.
+# Always prints an integer; prints 0 when the uptime could not be read
+# (SSH failure or unexpected output), so callers must treat 0 as unknown.
+get_uptime_seconds() {
+  local seconds
+  seconds=$(ssh "${SSH_TARGET}" "cat /proc/uptime | cut -d' ' -f1 | cut -d'.' -f1" 2>/dev/null) || seconds=""
+  if [[ $seconds =~ ^[0-9]+$ ]]; then
+    echo "$seconds"
+  else
+    echo "0"
+  fi
+}
+
+# Pre-reboot checks
+pre_reboot_checks() {
+  log "Running pre-reboot checks..."
+
+  # Check SSH connectivity
+  log "Checking SSH connectivity to ${WIKI_HOST}..."
+  if ! ssh -o ConnectTimeout=10 "${SSH_TARGET}" "echo 'SSH connection successful'"; then
+    error "Cannot establish SSH connection to ${WIKI_HOST}"
+    return 1
+  fi
+
+  # Get current uptime for comparison after reboot
+  UPTIME_BEFORE=$(get_uptime_seconds)
+  log "Current system uptime: ${UPTIME_BEFORE} seconds"
+
+  # Quick health check before reboot
+  log "Running quick health check before reboot..."
+  export WAIT_FOR_STABILIZATION=false
+  if ! run_health_checks; then
+    warning "System has issues before reboot, but proceeding anyway"
+  fi
+
+  return 0
+}
+
+# Initiate reboot
+initiate_reboot() {
+  log "Initiating system reboot..."
+
+  # Use shutdown -r now for a clean reboot
+  if ! ssh "${SSH_TARGET}" "shutdown -r now" 2>/dev/null; then
+    # Connection might drop immediately, which is expected
+    log "Reboot command sent (connection may have dropped)"
+  fi
+
+  # Give system time to start shutting down
+  sleep 5
+}
+
+# Wait for system to come back online
+wait_for_system_online() {
+  log "Waiting for system to come back online..."
+
+  # First, wait for SSH to stop responding (system going down)
+  local down_confirmed=false
+  for _i in {1..10}; do
+    if ! ssh -o ConnectTimeout=2 -o BatchMode=yes "${SSH_TARGET}" "true" &>/dev/null; then
+      down_confirmed=true
+      log "System appears to be rebooting (SSH not responding)"
+      break
+    fi
+    sleep 2
+  done
+
+  if [ "$down_confirmed" = false ]; then
+    warning "System may not have rebooted properly (SSH still responding)"
+  fi
+
+  # Now wait for SSH to come back
+  if ! wait_for_ssh "$MAX_SSH_WAIT_ATTEMPTS" "$SSH_WAIT_INTERVAL"; then
+    error "System did not come back online within expected time"
+    return 1
+  fi
+
+  # Verify reboot actually happened by checking uptime
+  local uptime_after
+  uptime_after=$(get_uptime_seconds)
+  local uptime_before="${UPTIME_BEFORE:-0}"
+
+  if [ "$uptime_after" -eq 0 ]; then
+    # get_uptime_seconds prints 0 when the uptime could not be read
+    warning "Could not read system uptime - unable to verify that the reboot occurred"
+    warning "Continuing with health checks anyway..."
+  elif [ "$uptime_after" -lt 300 ] || [ "$uptime_after" -lt "$uptime_before" ]; then
+    log "System successfully rebooted (uptime: ${uptime_after} seconds)"
+  else
+    warning "System uptime is ${uptime_after} seconds - reboot may not have occurred"
+    warning "Continuing with health checks anyway..."
+  fi
+
+  return 0
+}
+
+# Main reboot flow
+main() {
+  log "Starting NixOS Wiki reboot procedure..."
+
+  # Pre-reboot checks
+  if ! pre_reboot_checks; then
+    error "Pre-reboot checks failed, aborting"
+    exit 1
+  fi
+
+  # Log reboot action
+  log "Rebooting the production NixOS Wiki server..."
+
+  # Initiate reboot
+  initiate_reboot
+
+  # Wait for system to come back
+  if ! wait_for_system_online; then
+    error "Failed to confirm system is back online"
+    error "Manual intervention may be required!"
+    exit 1
+  fi
+
+  # Run post-reboot health checks
+  log "Running post-reboot health checks..."
+  export WAIT_FOR_STABILIZATION=true
+  if ! run_health_checks; then
+    error "Post-reboot health checks failed!"
+    error "System is online but may not be functioning correctly"
+    error "Please investigate immediately!"
+    exit 1
+  fi
+
+  log "Reboot completed successfully! 🚀"
+  log "NixOS Wiki is healthy at https://${WIKI_HOST}"
+}
+
+# Run main function
+main "$@"