diff --git a/formatter.nix b/formatter.nix
index c4e0f57..bbc4966 100644
--- a/formatter.nix
+++ b/formatter.nix
@@ -32,6 +32,7 @@
"*.envrc.private-template"
];
programs.shellcheck.enable = true;
+ settings.formatter.shellcheck.options = [ "-x" ];
programs.deno.enable = !pkgs.deno.meta.broken;
};
packages = {
diff --git a/targets/nixos-wiki.nixos.org/deploy.sh b/targets/nixos-wiki.nixos.org/deploy.sh
index bebb0a8..abfc44c 100755
--- a/targets/nixos-wiki.nixos.org/deploy.sh
+++ b/targets/nixos-wiki.nixos.org/deploy.sh
@@ -1,7 +1,14 @@
#!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
set -euo pipefail
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
WIKI_HOST="wiki.nixos.org"
SSH_TARGET="root@${WIKI_HOST}"
FLAKE_TARGET=".#nixos-wiki-nixos-org"
@@ -23,23 +30,8 @@ ssh() {
command ssh ${SSH_OPTS} "$@"
}
-# Colors for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No Color
-
-log() {
- echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
-}
-
-error() {
- echo -e "${RED}[ERROR $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*" >&2
-}
-
-warning() {
- echo -e "${YELLOW}[WARNING $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
-}
+# Source health checks
+source "${SCRIPT_DIR}/health_checks.sh"
nixBuild() {
if command -v nom -v &>/dev/null; then
@@ -77,7 +69,7 @@ pre_deployment_checks() {
# Build the system
build_system() {
log "Building NixOS configuration..."
- nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L --log-format bar-with-logs
+ nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
}
# Deploy with retries
@@ -102,155 +94,6 @@ deploy_system() {
return 1
}
-# Health check functions
-check_nginx() {
- log "Checking nginx service..."
- if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
- error "Nginx service is not active"
- ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
- return 1
- fi
-
- # Check if main page loads with wiki content
- local response_code
- local response_body
- response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "https://${WIKI_HOST}/wiki/Main_Page" || echo "000")
-
- if [[ $response_code != "200" ]]; then
- error "Main page returned HTTP status code: $response_code"
- if [[ $response_code == "000" ]]; then
- error "Failed to connect to https://${WIKI_HOST}/wiki/Main_Page"
- fi
- return 1
- fi
-
- # Check page content (follow redirects)
- response_body=$(curl -sfL -m 10 "https://${WIKI_HOST}/wiki/Main_Page" 2>&1) || {
- error "Failed to fetch main page content: $?"
- return 1
- }
-
- if ! echo "$response_body" | grep -q ".*NixOS Wiki.*"; then
- error "Main page does not contain expected title"
- error "Page title: $(echo "$response_body" | grep -o '[^<]*' | head -1 || echo "Could not extract title")"
- error "First 500 chars of response:"
- echo "$response_body" | head -c 500
- return 1
- fi
-
- return 0
-}
-
-check_postgresql() {
- log "Checking PostgreSQL service..."
- ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql" || return 1
-
- # Check if database is accessible
- if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
- error "PostgreSQL database 'mediawiki' is not accessible"
- return 1
- fi
- return 0
-}
-
-check_postfix() {
- log "Checking Postfix service..."
- if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
- error "Postfix service is not active"
- return 1
- fi
-
- # Check if postfix queue is processing (not stuck)
- local queue_status
- queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1)
- if echo "$queue_status" | grep -q "Mail queue is empty"; then
- log " Postfix queue is empty (good)"
- elif echo "$queue_status" | grep -q "in .*[0-9]* Request"; then
- local queue_count
- queue_count=$(echo "$queue_status" | grep -o '[0-9]*' | head -1)
- if [ "${queue_count:-0}" -gt 50 ]; then
- warning " Postfix has many queued emails: $queue_status"
- else
- log " Postfix has $queue_count queued email(s) (acceptable)"
- fi
- else
- warning " Could not determine postfix queue status"
- fi
-
- return 0
-}
-
-check_backup_services() {
- log "Checking backup services..."
-
- # Check if backup timers are active
- local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
- for service in "${backup_services[@]}"; do
- # shellcheck disable=SC2029
- if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
- log " ✓ $service is active"
- else
- warning " ✗ $service is not active"
- fi
- done
- return 0
-}
-
-# Main health check
-run_health_checks() {
- log "Running post-deployment health checks..."
-
- local failed_checks=0
- local start_time
- start_time=$(date +%s)
-
- # Wait for system to stabilize
- log "Waiting for system to stabilize..."
- sleep 10
-
- # Run individual health checks
- local checks=(
- "check_nginx"
- "check_postgresql"
- "check_postfix"
- "check_backup_services"
- )
-
- for check in "${checks[@]}"; do
- if $check; then
- log " ✓ $check passed"
- else
- error " ✗ $check failed"
- failed_checks=$((failed_checks + 1))
- fi
- done
-
- # Check overall system status
- log "Checking overall system status..."
- local system_status
- system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || echo 'degraded'")
-
- if [[ $system_status == "running" ]]; then
- log "System status: running"
- else
- warning "System status: $system_status"
- if [[ $system_status == "degraded" ]]; then
- log "Failed units:"
- ssh "${SSH_TARGET}" "systemctl --failed --no-pager"
- fi
- fi
-
- local elapsed=$(($(date +%s) - start_time))
- log "Health checks completed in ${elapsed}s"
-
- if [ $failed_checks -gt 0 ]; then
- error "$failed_checks health checks failed"
- return 1
- fi
-
- return 0
-}
-
# Rollback function
rollback() {
if [ -z "${CURRENT_GENERATION:-}" ]; then
@@ -293,6 +136,8 @@ main() {
fi
# Always run health checks to see current system state
+ log "Running post-deployment health checks..."
+ export WAIT_FOR_STABILIZATION=true
if ! run_health_checks; then
error "Post-deployment health checks failed"
diff --git a/targets/nixos-wiki.nixos.org/health_checks.sh b/targets/nixos-wiki.nixos.org/health_checks.sh
new file mode 100644
index 0000000..250e3c5
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/health_checks.sh
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
+
+# Health check functions for NixOS Wiki
+# This file can be sourced by other scripts
+
+set -euo pipefail
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
+# Required variables that should be set by the calling script
+: "${WIKI_HOST:=wiki.nixos.org}"
+: "${SSH_TARGET:=root@${WIKI_HOST}}"
+
+# Health check functions
+# Verify that nginx is running on the target host and that the wiki's main
+# page is served correctly.
+# Globals:   SSH_TARGET, WIKI_HOST (read)
+# Outputs:   progress via log, failures via error; on a title mismatch the
+#            first 500 bytes of the response are printed to stdout
+# Returns:   0 if nginx is active and the main page answers with HTTP 200 and
+#            the expected <title>, 1 otherwise
+check_nginx() {
+  local main_page_url="https://${WIKI_HOST}/wiki/Main_Page"
+
+  log "Checking nginx service..."
+  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
+    error "Nginx service is not active"
+    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
+    return 1
+  fi
+
+  # Check if main page loads with wiki content
+  local response_code
+  local response_body
+  # curl's --write-out already prints "000" when no HTTP response was
+  # received, so only its exit status is ignored here. Echoing another "000"
+  # on failure would produce "000000" and defeat the comparison below.
+  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "$main_page_url") || true
+  # Empty output means curl could not even run; treat it like "no response"
+  response_code=${response_code:-000}
+
+  if [[ $response_code != "200" ]]; then
+    error "Main page returned HTTP status code: $response_code"
+    if [[ $response_code == "000" ]]; then
+      error "Failed to connect to ${main_page_url}"
+    fi
+    return 1
+  fi
+
+  # Check page content (follow redirects)
+  response_body=$(curl -sfL -m 10 "$main_page_url" 2>&1) || {
+    error "Failed to fetch main page content: $?"
+    return 1
+  }
+
+  # Feed the page through here-strings rather than `echo ... |`: grep -q exits
+  # at the first match, and with `set -o pipefail` a SIGPIPE in the writer
+  # would make the whole pipeline fail and report a missing title even though
+  # it was found.
+  if ! grep -q "<title>.*NixOS Wiki.*</title>" <<<"$response_body"; then
+    error "Main page does not contain expected title"
+    error "Page title: $(grep -o '<title>[^<]*</title>' <<<"$response_body" | head -1 || echo "Could not extract title")"
+    error "First 500 chars of response:"
+    head -c 500 <<<"$response_body"
+    return 1
+  fi
+
+  return 0
+}
+
+check_postgresql() {
+ log "Checking PostgreSQL service..."
+ ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql" || return 1
+
+ # Check if database is accessible
+ if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
+ error "PostgreSQL database 'mediawiki' is not accessible"
+ return 1
+ fi
+ return 0
+}
+
+check_postfix() {
+ log "Checking Postfix service..."
+ if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
+ error "Postfix service is not active"
+ return 1
+ fi
+
+ # Check if postfix queue is processing (not stuck)
+ local queue_status
+ queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1)
+ if echo "$queue_status" | grep -q "Mail queue is empty"; then
+ log " Postfix queue is empty (good)"
+ elif echo "$queue_status" | grep -q "in .*[0-9]* Request"; then
+ local queue_count
+ queue_count=$(echo "$queue_status" | grep -o '[0-9]*' | head -1)
+ if [ "${queue_count:-0}" -gt 50 ]; then
+ warning " Postfix has many queued emails: $queue_status"
+ else
+ log " Postfix has $queue_count queued email(s) (acceptable)"
+ fi
+ else
+ warning " Could not determine postfix queue status"
+ fi
+
+ return 0
+}
+
+# Report whether the backup timers are active on the target host.
+# Inactive timers are reported as warnings only; they never fail the check.
+# Globals:   SSH_TARGET (read)
+# Returns:   0 always
+check_backup_services() {
+  log "Checking backup services..."
+
+  # Check if backup timers are active
+  local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
+  # Keep the loop variable local so scripts sourcing this file do not end up
+  # with a stray global named "service".
+  local service
+  for service in "${backup_services[@]}"; do
+    # The unit name is expanded locally on purpose before the command is sent
+    # shellcheck disable=SC2029
+    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
+      log " ✓ $service is active"
+    else
+      warning " ✗ $service is not active"
+    fi
+  done
+  return 0
+}
+
+# Main health check runner
+run_health_checks() {
+ log "Running health checks..."
+
+ local failed_checks=0
+ local start_time
+ start_time=$(date +%s)
+
+ # Wait for system to stabilize if requested
+ if [ "${WAIT_FOR_STABILIZATION:-true}" = "true" ]; then
+ log "Waiting for system to stabilize..."
+ sleep 10
+ fi
+
+ # Run individual health checks
+ local checks=(
+ "check_nginx"
+ "check_postgresql"
+ "check_postfix"
+ "check_backup_services"
+ )
+
+ for check in "${checks[@]}"; do
+ if $check; then
+ log " ✓ $check passed"
+ else
+ error " ✗ $check failed"
+ failed_checks=$((failed_checks + 1))
+ fi
+ done
+
+ # Check overall system status
+ log "Checking overall system status..."
+ local system_status
+ system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || echo 'degraded'")
+
+ if [[ $system_status == "running" ]]; then
+ log "System status: running"
+ else
+ warning "System status: $system_status"
+ if [[ $system_status == "degraded" ]]; then
+ log "Failed units:"
+ ssh "${SSH_TARGET}" "systemctl --failed --no-pager"
+ fi
+ fi
+
+ local elapsed=$(($(date +%s) - start_time))
+ log "Health checks completed in ${elapsed}s"
+
+ if [ $failed_checks -gt 0 ]; then
+ error "$failed_checks health checks failed"
+ return 1
+ fi
+
+ return 0
+}
+
+# Function to wait for SSH connectivity
+wait_for_ssh() {
+ local max_attempts="${1:-30}"
+ local wait_time="${2:-10}"
+ local attempt=0
+
+ log "Waiting for SSH connectivity to ${WIKI_HOST}..."
+
+ while [ $attempt -lt "$max_attempts" ]; do
+ if ssh -o ConnectTimeout=5 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'" &>/dev/null; then
+ log "SSH connection established after $((attempt * wait_time)) seconds"
+ return 0
+ fi
+
+ attempt=$((attempt + 1))
+ if [ $attempt -lt "$max_attempts" ]; then
+ log " Attempt $attempt/$max_attempts failed, waiting ${wait_time}s..."
+ sleep "$wait_time"
+ fi
+ done
+
+ error "Failed to establish SSH connection after $((max_attempts * wait_time)) seconds"
+ return 1
+}
diff --git a/targets/nixos-wiki.nixos.org/logging.sh b/targets/nixos-wiki.nixos.org/logging.sh
new file mode 100644
index 0000000..5fe70af
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/logging.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# shellcheck shell=bash
+
+# Shared logging functions for NixOS Wiki scripts
+# This file should be sourced by other scripts
+
+# Colors for output
+export RED='\033[0;31m'
+export GREEN='\033[0;32m'
+export YELLOW='\033[1;33m'
+export NC='\033[0m' # No Color
+
+# Basic logging functions
+log() {
+ echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
+}
+
+error() {
+ echo -e "${RED}[ERROR $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*" >&2
+}
+
+warning() {
+ echo -e "${YELLOW}[WARNING $(date '+%Y-%m-%d %H:%M:%S')]${NC} $*"
+}
diff --git a/targets/nixos-wiki.nixos.org/reboot.sh b/targets/nixos-wiki.nixos.org/reboot.sh
new file mode 100755
index 0000000..647a4dc
--- /dev/null
+++ b/targets/nixos-wiki.nixos.org/reboot.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# shellcheck source-path=SCRIPTDIR
+
+set -euo pipefail
+
+# Get the directory where this script is located
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Source logging functions
+source "${SCRIPT_DIR}/logging.sh"
+
+WIKI_HOST="wiki.nixos.org"
+SSH_TARGET="root@${WIKI_HOST}"
+MAX_SSH_WAIT_ATTEMPTS=30 # 30 attempts * 10 seconds = 5 minutes max wait
+SSH_WAIT_INTERVAL=10
+
+# SSH options for our direct SSH calls
+SSH_TMPDIR=$(mktemp -d /tmp/wiki-reboot.XXXXXX)
+trap 'rm -rf "$SSH_TMPDIR"' EXIT
+
+SSH_CONTROL_PATH="${SSH_TMPDIR}/ssh-%h"
+SSH_OPTS="-o ControlMaster=auto -o ControlPath=${SSH_CONTROL_PATH} -o ControlPersist=30s"
+
+# Wrapper that shadows the ssh binary so that every ssh call made after this
+# point - including those in the health check functions sourced below - uses
+# the connection-sharing options from SSH_OPTS.
+# Arguments: passed through unchanged to the real ssh
+ssh() {
+ # SSH_OPTS is a space-separated option string and is left unquoted on
+ # purpose so the shell splits it into separate arguments; `command` skips
+ # this function and runs the real ssh binary.
+ # shellcheck disable=SC2086
+ command ssh ${SSH_OPTS} "$@"
+}
+
+# Source health checks
+source "${SCRIPT_DIR}/health_checks.sh"
+
+# Print the target host's uptime in whole seconds on stdout.
+# Globals:   SSH_TARGET (read)
+# Outputs:   the integer part of the first field of /proc/uptime, or "0" when
+#            the ssh command fails
+get_uptime_seconds() {
+  if ! ssh "${SSH_TARGET}" "cat /proc/uptime | cut -d' ' -f1 | cut -d'.' -f1" 2>/dev/null; then
+    echo "0"
+  fi
+}
+
+# Pre-reboot checks
+pre_reboot_checks() {
+ log "Running pre-reboot checks..."
+
+ # Check SSH connectivity
+ log "Checking SSH connectivity to ${WIKI_HOST}..."
+ if ! ssh -o ConnectTimeout=10 "${SSH_TARGET}" "echo 'SSH connection successful'"; then
+ error "Cannot establish SSH connection to ${WIKI_HOST}"
+ return 1
+ fi
+
+ # Get current uptime for comparison after reboot
+ UPTIME_BEFORE=$(get_uptime_seconds)
+ log "Current system uptime: ${UPTIME_BEFORE} seconds"
+
+ # Quick health check before reboot
+ log "Running quick health check before reboot..."
+ export WAIT_FOR_STABILIZATION=false
+ if ! run_health_checks; then
+ warning "System has issues before reboot, but proceeding anyway"
+ fi
+
+ return 0
+}
+
+# Initiate reboot
+initiate_reboot() {
+ log "Initiating system reboot..."
+
+ # Use shutdown -r now for a clean reboot
+ if ! ssh "${SSH_TARGET}" "shutdown -r now" 2>/dev/null; then
+ # Connection might drop immediately, which is expected
+ log "Reboot command sent (connection may have dropped)"
+ fi
+
+ # Give system time to start shutting down
+ sleep 5
+}
+
+# Wait for system to come back online
+wait_for_system_online() {
+ log "Waiting for system to come back online..."
+
+ # First, wait for SSH to stop responding (system going down)
+ local down_confirmed=false
+ for _i in {1..10}; do
+ if ! ssh -o ConnectTimeout=2 -o BatchMode=yes "${SSH_TARGET}" "true" &>/dev/null; then
+ down_confirmed=true
+ log "System appears to be rebooting (SSH not responding)"
+ break
+ fi
+ sleep 2
+ done
+
+ if [ "$down_confirmed" = false ]; then
+ warning "System may not have rebooted properly (SSH still responding)"
+ fi
+
+ # Now wait for SSH to come back
+ if ! wait_for_ssh "$MAX_SSH_WAIT_ATTEMPTS" "$SSH_WAIT_INTERVAL"; then
+ error "System did not come back online within expected time"
+ return 1
+ fi
+
+ # Verify reboot actually happened by checking uptime
+ local uptime_after
+ uptime_after=$(get_uptime_seconds)
+
+ if [ "$uptime_after" -lt 300 ]; then
+ log "System successfully rebooted (uptime: ${uptime_after} seconds)"
+ else
+ warning "System uptime is ${uptime_after} seconds - reboot may not have occurred"
+ warning "Continuing with health checks anyway..."
+ fi
+
+ return 0
+}
+
+# Main reboot flow
+main() {
+ log "Starting NixOS Wiki reboot procedure..."
+
+ # Pre-reboot checks
+ if ! pre_reboot_checks; then
+ error "Pre-reboot checks failed, aborting"
+ exit 1
+ fi
+
+ # Log reboot action
+ log "Rebooting the production NixOS Wiki server..."
+
+ # Initiate reboot
+ initiate_reboot
+
+ # Wait for system to come back
+ if ! wait_for_system_online; then
+ error "Failed to confirm system is back online"
+ error "Manual intervention may be required!"
+ exit 1
+ fi
+
+ # Run post-reboot health checks
+ log "Running post-reboot health checks..."
+ export WAIT_FOR_STABILIZATION=true
+ if ! run_health_checks; then
+ error "Post-reboot health checks failed!"
+ error "System is online but may not be functioning correctly"
+ error "Please investigate immediately!"
+ exit 1
+ fi
+
+ log "Reboot completed successfully! 🚀"
+ log "NixOS Wiki is healthy at https://${WIKI_HOST}"
+}
+
+# Run main function
+main "$@"