Mirror of https://github.com/easydiffusion/easydiffusion.git
Start on multiple GPUs by default (top 75th percentile by free_mem); UI selection of 'cpu', 'auto', or a list of specific GPUs, now linked to the backend; dynamically start/stop render threads for the devices without requiring a full program restart.
Commit: ea03fd22db (parent: a19ba40672)
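The new device selection accepts three forms for `render_devices` in config.json, as validated in `device_manager.get_device_delta()` below: {"render_devices": "cpu"}, {"render_devices": "auto"}, or {"render_devices": ["cuda:0", "cuda:1"]}. The sketch below shows roughly how the backend turns that setting into render threads to start and stop; the import paths and the bare `config` dict are simplified assumptions, only the function names come from this commit.

# Rough sketch (not part of this commit): resolve the configured render devices
# and compute which render threads to start or stop, mirroring
# server.update_render_threads() -> task_manager.update_render_threads()
# -> device_manager.get_device_delta(). Import paths are assumed.
from sd_internal import device_manager, task_manager

config = {'render_devices': ['cuda:0', 'cuda:1']}   # or 'cpu', or 'auto'
render_devices = config.get('render_devices', 'auto')
active_devices = task_manager.get_devices()['active'].keys()

devices_to_start, devices_to_stop = device_manager.get_device_delta(render_devices, active_devices)
for device in devices_to_stop:
    task_manager.stop_render_thread(device)
for device in devices_to_start:
    task_manager.start_render_thread(device)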
@@ -7,7 +7,7 @@
<link rel="icon" type="image/png" href="/media/images/favicon-32x32.png" sizes="32x32">
<link rel="stylesheet" href="/media/css/fonts.css?v=1">
<link rel="stylesheet" href="/media/css/themes.css?v=2">
<link rel="stylesheet" href="/media/css/main.css?v=10">
<link rel="stylesheet" href="/media/css/main.css?v=11">
<link rel="stylesheet" href="/media/css/auto-save.css?v=5">
<link rel="stylesheet" href="/media/css/modifier-thumbnails.css?v=4">
<link rel="stylesheet" href="/media/css/fontawesome-all.min.css?v=1">
@@ -19,7 +19,7 @@
<div id="container">
<div id="top-nav">
<div id="logo">
<h1>Stable Diffusion UI <small>v2.3.14 <span id="updateBranchLabel"></span></small></h1>
<h1>Stable Diffusion UI <small>v2.4 <span id="updateBranchLabel"></span></small></h1>
</div>
<div id="server-status">
<div id="server-status-color">●</div>
@@ -35,6 +35,9 @@
<span id="tab-about" class="tab">
<span><i class="fa fa-comments icon"></i> Help & Community</span>
</span>
<!-- <span id="tab-system-info" class="tab">
<span><i class="fa fa-microchip icon"></i> System Info</span>
</span> -->
</div>
</div>
@@ -245,7 +248,7 @@
<div class="tab-content-inner">
<div class="float-container">
<div class="float-child">
<h1>Help</h1>
<h1>Help</h1>
<ul id="help-links">
<li><span class="help-section">Using the software</span>
<ul>
@@ -270,7 +273,7 @@
</div>

<div class="float-child">
<h1>Community</h1>
<h1>Community</h1>
<ul id="community-links">
<li><a href="https://discord.com/invite/u9yhsFmEkB" target="_blank"><i class="fa-brands fa-discord fa-fw"></i> Discord user community</a></li>
<li><a href="https://www.reddit.com/r/StableDiffusionUI/" target="_blank"><i class="fa-brands fa-reddit fa-fw"></i> Reddit community</a></li>
@@ -280,6 +283,18 @@
</div>
</div>
</div>
<!-- <div id="tab-content-system-info" class="tab-content">
<div id="system-info" class="tab-content-inner">
<h1>System Info</h1>
<table>
<tr><td><label>Processor:</label></td><td id="system-info-cpu" class="value">Dingus</td></tr>
<tr><td><label>RAM:</label></td><td id="system-info-ram" class="value">Dingus Another</td></tr>
<tr><td><label>Compatible Graphics Cards (all):</label></td><td id="system-info-all-gpus" class="value">Dingus</td></tr>
<tr><td></td><td> </td></tr>
<tr><td><label>What's being used for rendering 🔥:</label></td><td id="system-info-active-gpus" class="value">Dingus<br/>Intel Graphics SOmething<br/>Another thing</td></tr>
</table>
</div>
</div> -->
</div>
@@ -317,13 +332,13 @@
</div>
</body>

<script src="media/js/parameters.js?v=4"></script>
<script src="media/js/parameters.js?v=5"></script>
<script src="media/js/plugins.js?v=1"></script>
<script src="media/js/utils.js?v=6"></script>
<script src="media/js/inpainting-editor.js?v=1"></script>
<script src="media/js/image-modifiers.js?v=6"></script>
<script src="media/js/auto-save.js?v=7"></script>
<script src="media/js/main.js?v=13"></script>
<script src="media/js/main.js?v=14"></script>
<script src="media/js/themes.js?v=4"></script>
<script src="media/js/dnd.js?v=8"></script>
<script>
@@ -123,7 +123,7 @@ label {
padding: 16px;
display: flex;
flex-direction: column;
flex: 0 0 350pt;
flex: 0 0 370pt;
}
#editor label {
font-weight: normal;
@@ -887,3 +887,9 @@ input::file-selector-button {
margin-bottom: 15px;
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.15), 0 6px 20px 0 rgba(0, 0, 0, 0.15);
}
#system-info .value {
text-align: left;
}
#system-info label {
float: right;
}
@@ -27,6 +27,7 @@ let maskImageSelector = document.querySelector("#mask")
let maskImagePreview = document.querySelector("#mask_preview")
let turboField = document.querySelector('#turbo')
let useCPUField = document.querySelector('#use_cpu')
let autoPickGPUsField = document.querySelector('#auto_pick_gpus')
let useGPUsField = document.querySelector('#use_gpus')
let useFullPrecisionField = document.querySelector('#use_full_precision')
let saveToDiskField = document.querySelector('#save_to_disk')
@@ -810,14 +811,15 @@ function getCurrentUserRequest() {
}

function getCurrentRenderDeviceSelection() {
if (useCPUField.checked) {
let selectedGPUs = $('#use_gpus').val()

if (useCPUField.checked && !autoPickGPUsField.checked) {
return 'cpu'
}

let selectedGPUs = $(useGPUsField).val()
if (selectedGPUs.length == 0) {
selectedGPUs = ['auto']
if (autoPickGPUsField.checked || selectedGPUs.length == 0) {
return 'auto'
}

return selectedGPUs.join(',')
}
@@ -1136,10 +1138,27 @@ updatePromptStrength()

useCPUField.addEventListener('click', function() {
let gpuSettingEntry = getParameterSettingsEntry('use_gpus')
let autoPickGPUSettingEntry = getParameterSettingsEntry('auto_pick_gpus')
if (this.checked) {
gpuSettingEntry.style.display = 'none'
autoPickGPUSettingEntry.style.display = 'none'
autoPickGPUsField.setAttribute('data-old-value', autoPickGPUsField.checked)
autoPickGPUsField.checked = false
} else if (useGPUsField.options.length >= MIN_GPUS_TO_SHOW_SELECTION) {
gpuSettingEntry.style.display = ''
autoPickGPUSettingEntry.style.display = ''
autoPickGPUsField.checked = (autoPickGPUsField.getAttribute('data-old-value') === 'true')
}
})

useGPUsField.addEventListener('click', function() {
let selectedGPUs = $('#use_gpus').val()
autoPickGPUsField.checked = (selectedGPUs.length === 0)
})

autoPickGPUsField.addEventListener('click', function() {
if (this.checked) {
$('#use_gpus').val([])
}
})
@@ -1360,6 +1379,8 @@ async function getDevices() {
if (allDeviceIds.length < MIN_GPUS_TO_SHOW_SELECTION) {
let gpuSettingEntry = getParameterSettingsEntry('use_gpus')
gpuSettingEntry.style.display = 'none'
let autoPickGPUSettingEntry = getParameterSettingsEntry('auto_pick_gpus')
autoPickGPUSettingEntry.style.display = 'none'

if (allDeviceIds.length === 0) {
useCPUField.checked = true
@@ -1367,14 +1388,18 @@ async function getDevices() {
}
}

useGPUsField.innerHTML = ''
autoPickGPUsField.checked = (res['config'] === 'auto')

useGPUsField.innerHTML = ''
allDeviceIds.forEach(device => {
let deviceName = res['all'][device]
let selected = (activeDeviceIds.includes(device) ? 'selected' : '')
let deviceOption = `<option value="${device}" ${selected}>${deviceName}</option>`
let deviceOption = `<option value="${device}">${deviceName}</option>`
useGPUsField.insertAdjacentHTML('beforeend', deviceOption)
})

if (!autoPickGPUsField.checked) {
$('#use_gpus').val(activeDeviceIds)
}
}
} catch (e) {
console.log('error fetching devices', e)
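The getDevices() handler above reads res['all'], the active device ids, and res['config'] from the devices endpoint. A Python sketch of the payload shape it appears to expect is shown below; the shape is inferred from this diff (see task_manager.get_devices() and the 'devices' key handling in ui/server.py further down), and the device names are placeholders.

# Inferred shape of the 'devices' payload consumed by the UI above (illustrative only).
devices_response = {
    'all': {                 # every compatible render device -> display name
        'cuda:0': 'Example GPU 0',
        'cuda:1': 'Example GPU 1',
        'cpu': 'Example CPU',
    },
    'active': {              # devices that currently have a render thread
        'cuda:0': 'Example GPU 0',
    },
    'config': 'auto',        # raw 'render_devices' value from config.json ('cpu', 'auto', or a list)
}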
@@ -73,6 +73,12 @@ var PARAMETERS = [
note: "warning: this will be *very* slow",
default: false,
},
{
id: "auto_pick_gpus",
type: ParameterType.checkbox,
label: "Automatically pick the GPUs",
default: false,
},
{
id: "use_gpus",
type: ParameterType.select_multiple,
ui/sd_internal/device_manager.py (new file, 156 lines)
@@ -0,0 +1,156 @@
import os
import torch
import traceback
import re

COMPARABLE_GPU_PERCENTILE = 0.75 # if a GPU's free_mem is within this % of the GPU with the most free_mem, it will be picked

def get_device_delta(render_devices, active_devices):
    '''
    render_devices: 'cpu', or 'auto' or ['cuda:N'...]
    active_devices: ['cpu', 'cuda:N'...]
    '''

    if render_devices is not None:
        if render_devices in ('cpu', 'auto'):
            render_devices = [render_devices]
        elif isinstance(render_devices, list) and len(render_devices) > 0:
            render_devices = list(filter(lambda x: x.startswith('cuda:'), render_devices))
            if len(render_devices) == 0:
                raise Exception('Invalid render_devices value in config.json. Valid: {"render_devices": ["cuda:0", "cuda:1"...]}, or {"render_devices": "cpu"} or {"render_devices": "auto"}')

            render_devices = list(filter(lambda x: is_device_compatible(x), render_devices))
            if len(render_devices) == 0:
                raise Exception('Sorry, none of the render_devices configured in config.json are compatible with Stable Diffusion')
        else:
            raise Exception('Invalid render_devices value in config.json. Valid: {"render_devices": ["cuda:0", "cuda:1"...]}, or {"render_devices": "cpu"} or {"render_devices": "auto"}')
    else:
        render_devices = ['auto']

    if 'auto' in render_devices:
        render_devices = auto_pick_devices(active_devices)
        if 'cpu' in render_devices:
            print('WARNING: Could not find a compatible GPU. Using the CPU, but this will be very slow!')

    active_devices = set(active_devices)
    render_devices = set(render_devices)

    devices_to_start = render_devices - active_devices
    devices_to_stop = active_devices - render_devices

    return devices_to_start, devices_to_stop

def auto_pick_devices(currently_active_devices):
    if not torch.cuda.is_available(): return ['cpu']

    device_count = torch.cuda.device_count()
    if device_count == 1:
        return ['cuda:0'] if is_device_compatible('cuda:0') else ['cpu']

    print('Autoselecting GPU. Using most free memory.')
    devices = []
    for device in range(device_count):
        device = f'cuda:{device}'
        if not is_device_compatible(device):
            continue

        mem_free, mem_total = torch.cuda.mem_get_info(device)
        mem_free /= float(10**9)
        mem_total /= float(10**9)
        device_name = torch.cuda.get_device_name(device)
        print(f'{device} detected: {device_name} - Memory: {round(mem_total - mem_free, 2)}Gb / {round(mem_total, 2)}Gb')
        devices.append({'device': device, 'device_name': device_name, 'mem_free': mem_free})

    devices.sort(key=lambda x:x['mem_free'], reverse=True)
    max_free_mem = devices[0]['mem_free']
    free_mem_threshold = COMPARABLE_GPU_PERCENTILE * max_free_mem

    # Auto-pick algorithm:
    # 1. Pick the top 75 percentile of the GPUs, sorted by free_mem.
    # 2. Also include already-running devices (GPU-only), otherwise their free_mem will
    #    always be very low (since their VRAM contains the model).
    #    These already-running devices probably aren't terrible, since they were picked in the past.
    #    Worst case, the user can restart the program and that'll get rid of them.
    devices = list(filter((lambda x: x['mem_free'] > free_mem_threshold or x['device'] in currently_active_devices), devices))
    return devices

def device_init(thread_data, device):
    '''
    This function assumes the 'device' has already been verified to be compatible.
    `get_device_delta()` has already filtered out incompatible devices.
    '''

    validate_device_id(device, log_prefix='device_init')

    if device == 'cpu':
        thread_data.device = 'cpu'
        thread_data.device_name = get_processor_name()
        print('Render device CPU available as', thread_data.device_name)
        return

    thread_data.device_name = torch.cuda.get_device_name(device)
    thread_data.device = device

    # Force full precision on 1660 and 1650 NVIDIA cards to avoid creating green images
    device_name = thread_data.device_name.lower()
    thread_data.force_full_precision = ('nvidia' in device_name or 'geforce' in device_name) and (' 1660' in device_name or ' 1650' in device_name)
    if thread_data.force_full_precision:
        print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', thread_data.device_name)
        # Apply force_full_precision now before models are loaded.
        thread_data.precision = 'full'

    print(f'Setting {device} as active')
    torch.cuda.device(device)

    return

def validate_device_id(device, log_prefix=''):
    def is_valid():
        if not isinstance(device, str):
            return False
        if device == 'cpu':
            return True
        if not device.startswith('cuda:') or not device[5:].isnumeric():
            return False
        return True

    if not is_valid():
        raise EnvironmentError(f"{log_prefix}: device id should be 'cpu', or 'cuda:N' (where N is an integer index for the GPU). Got: {device}")

def is_device_compatible(device):
    '''
    Returns True/False, and prints any compatibility errors
    '''
    validate_device_id(device, log_prefix='is_device_compatible')

    if device == 'cpu': return True
    # Memory check
    try:
        _, mem_total = torch.cuda.mem_get_info(device)
        mem_total /= float(10**9)
        if mem_total < 3.0:
            print(f'GPU {device} with less than 3 GB of VRAM is not compatible with Stable Diffusion')
            return False
    except RuntimeError as e:
        print(str(e))
        return False
    return True

def get_processor_name():
    try:
        import platform, subprocess
        if platform.system() == "Windows":
            return platform.processor()
        elif platform.system() == "Darwin":
            os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
            command = "sysctl -n machdep.cpu.brand_string"
            return subprocess.check_output(command).strip()
        elif platform.system() == "Linux":
            command = "cat /proc/cpuinfo"
            all_info = subprocess.check_output(command, shell=True).decode().strip()
            for line in all_info.split("\n"):
                if "model name" in line:
                    return re.sub(".*model name.*:", "", line, 1).strip()
    except:
        print(traceback.format_exc())
    return "cpu"
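To make the auto-pick threshold above concrete, here is a small worked example with made-up numbers: with GPUs reporting 7.5, 6.0 and 4.0 GB free, the threshold is 0.75 × 7.5 = 5.625 GB, so the first two are picked and the third survives the filter only if it already has a render thread (and therefore a loaded model depressing its free memory).

# Worked example of the filter in auto_pick_devices() above (numbers are hypothetical).
COMPARABLE_GPU_PERCENTILE = 0.75

devices = [
    {'device': 'cuda:0', 'mem_free': 7.5},   # GB free
    {'device': 'cuda:1', 'mem_free': 6.0},
    {'device': 'cuda:2', 'mem_free': 4.0},
]
currently_active_devices = ['cuda:2']        # already running, so its free memory looks low

devices.sort(key=lambda x: x['mem_free'], reverse=True)
free_mem_threshold = COMPARABLE_GPU_PERCENTILE * devices[0]['mem_free']   # 5.625

picked = [x['device'] for x in devices
          if x['mem_free'] > free_mem_threshold or x['device'] in currently_active_devices]
print(picked)   # ['cuda:0', 'cuda:1', 'cuda:2']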
@@ -37,6 +37,7 @@ config_yaml = "optimizedSD/v1-inference.yaml"
filename_regex = re.compile('[^a-zA-Z0-9]')

# api stuff
from sd_internal import device_manager
from . import Request, Response, Image as ResponseImage
import base64
from io import BytesIO
@@ -45,73 +46,7 @@ from io import BytesIO
from threading import local as LocalThreadVars
thread_data = LocalThreadVars()

def get_processor_name():
    try:
        import platform, subprocess
        if platform.system() == "Windows":
            return platform.processor()
        elif platform.system() == "Darwin":
            os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin'
            command = "sysctl -n machdep.cpu.brand_string"
            return subprocess.check_output(command).strip()
        elif platform.system() == "Linux":
            command = "cat /proc/cpuinfo"
            all_info = subprocess.check_output(command, shell=True).decode().strip()
            for line in all_info.split("\n"):
                if "model name" in line:
                    return re.sub(".*model name.*:", "", line, 1).strip()
    except:
        print(traceback.format_exc())
    return "cpu"

def validate_device_id(device, allow_auto=False, log_prefix=''):
    device_names = ['cpu', 'auto'] if allow_auto else ['cpu']
    if not isinstance(device, str) or (device not in device_names and (len(device) <= len('cuda:') or device[:5] != 'cuda:' or not device[5:].isnumeric())):
        raise EnvironmentError(f"{log_prefix}: device id should be {', '.join(device_names)}, or 'cuda:N' (where N is an integer index for the GPU). Got: {device}")

'''
Returns True/False, and prints any compatibility errors
'''
def is_device_compatible(device):
    validate_device_id(device, allow_auto=False, log_prefix='is_device_compatible')

    if device == 'cpu': return True
    # Memory check
    try:
        mem_free, mem_total = torch.cuda.mem_get_info(device)
        mem_total /= float(10**9)
        if mem_total < 3.0:
            print('GPUs with less than 3 GB of VRAM are not compatible with Stable Diffusion')
            return False
    except RuntimeError as e:
        print(str(e))
        return False
    return True

def device_select(device):
    validate_device_id(device, allow_auto=False, log_prefix='device_select')

    if device == 'cpu': return True
    if not torch.cuda.is_available(): return False
    if not is_device_compatible(device):
        return False

    thread_data.device_name = torch.cuda.get_device_name(device)
    thread_data.device = device

    # Force full precision on 1660 and 1650 NVIDIA cards to avoid creating green images
    device_name = thread_data.device_name.lower()
    thread_data.force_full_precision = ('nvidia' in device_name or 'geforce' in device_name) and (' 1660' in device_name or ' 1650' in device_name)
    if thread_data.force_full_precision:
        print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', thread_data.device_name)
        # Apply force_full_precision now before models are loaded.
        thread_data.precision = 'full'

    return True

def device_init(device_selection):
    validate_device_id(device_selection, allow_auto=True, log_prefix='device_init')

def thread_init(device):
    # Thread bound properties
    thread_data.stop_processing = False
    thread_data.temp_images = {}
@@ -140,50 +75,7 @@ def device_init(device_selection):
    thread_data.force_full_precision = False
    thread_data.reduced_memory = True

    if device_selection == 'cpu':
        thread_data.device = 'cpu'
        thread_data.device_name = get_processor_name()
        print('Render device CPU available as', thread_data.device_name)
        return True
    if not torch.cuda.is_available():
        if device_selection == 'auto':
            print('WARNING: Could not find a compatible GPU. Using the CPU, but this will be very slow!')
            thread_data.device = 'cpu'
            thread_data.device_name = get_processor_name()
            return True
        else:
            raise EnvironmentError(f'Could not find a compatible GPU for the requested device_selection: {device_selection}!')

    if device_selection == 'auto':
        device_count = torch.cuda.device_count()
        if device_count == 1 and device_select('cuda:0'):
            torch.cuda.device('cuda:0')
            return True

        print('Autoselecting GPU. Using most free memory.')
        max_mem_free = 0
        best_device = None
        for device in range(device_count):
            device = f'cuda:{device}'
            mem_free, mem_total = torch.cuda.mem_get_info(device)
            mem_free /= float(10**9)
            mem_total /= float(10**9)
            device_name = torch.cuda.get_device_name(device)
            print(f'{device} detected: {device_name} - Memory: {round(mem_total - mem_free, 2)}Gb / {round(mem_total, 2)}Gb')
            if max_mem_free < mem_free:
                max_mem_free = mem_free
                best_device = device
        if best_device and device_select(best_device):
            print(f'Setting {device} as active')
            torch.cuda.device(device)
            return True

    if device_selection != 'auto' and device_select(device_selection):
        print(f'Setting {device_selection} as active')
        torch.cuda.device(device_selection)
        return True

    return False
    device_manager.device_init(thread_data, device)

def load_model_ckpt():
    if not thread_data.ckpt_file: raise ValueError(f'Thread ckpt_file is undefined.')
@@ -296,6 +188,8 @@ def unload_filters():
        del thread_data.model_real_esrgan
    thread_data.model_real_esrgan = None

    gc()

def unload_models():
    if thread_data.model is not None:
        print('Unloading models...')
@@ -313,6 +207,8 @@ def unload_models():
    thread_data.modelCS = None
    thread_data.modelFS = None

    gc()

def wait_model_move_to(model, target_device): # Send to target_device and wait until complete.
    if thread_data.device == target_device: return
    start_mem = torch.cuda.memory_allocated(thread_data.device) / 1e6
@@ -518,7 +414,6 @@ def do_mk_img(req: Request):
    if needs_model_reload:
        unload_models()
        unload_filters()
        gc()
        load_model_ckpt()

    if thread_data.turbo != req.turbo:
@@ -14,7 +14,7 @@ import queue, threading, time, weakref
from typing import Any, Generator, Hashable, Optional, Union

from pydantic import BaseModel
from sd_internal import Request, Response, runtime
from sd_internal import Request, Response, runtime, device_manager

THREAD_NAME_PREFIX = 'Runtime-Render/'
ERR_LOCK_FAILED = ' failed to acquire lock within timeout.'
@@ -253,11 +253,7 @@ def thread_render(device):
    global current_state, current_state_error, current_model_path, current_vae_path
    from . import runtime
    try:
        if not runtime.device_init(device):
            weak_thread_data[threading.current_thread()] = {
                'error': f'Could not start on the selected device: {device}'
            }
            return
        runtime.thread_init(device)
    except Exception as e:
        print(traceback.format_exc())
        weak_thread_data[threading.current_thread()] = {
@@ -266,13 +262,19 @@
        return
    weak_thread_data[threading.current_thread()] = {
        'device': runtime.thread_data.device,
        'device_name': runtime.thread_data.device_name
        'device_name': runtime.thread_data.device_name,
        'alive': True
    }
    if runtime.thread_data.device != 'cpu' or is_alive() == 1:
        preload_model()
        current_state = ServerStates.Online
    while True:
        task_cache.clean()
        if not weak_thread_data[threading.current_thread()]['alive']:
            print(f'Shutting down thread for device {runtime.thread_data.device}')
            runtime.unload_models()
            runtime.unload_filters()
            return
        if isinstance(current_state_error, SystemExit):
            current_state = ServerStates.Unavailable
            return
@@ -371,12 +373,12 @@ def get_devices():
    gpu_count = torch.cuda.device_count()
    for device in range(gpu_count):
        device = f'cuda:{device}'
        if not runtime.is_device_compatible(device):
        if not device_manager.is_device_compatible(device):
            continue

        devices['all'].update({device: torch.cuda.get_device_name(device)})

    devices['all'].update({'cpu': runtime.get_processor_name()})
    devices['all'].update({'cpu': device_manager.get_processor_name()})

    # list the activated devices
    if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('get_devices' + ERR_LOCK_FAILED)
@@ -411,13 +413,13 @@ def is_alive(device=None):
    finally:
        manager_lock.release()

def start_render_thread(device='auto'):
    if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('start_render_threads' + ERR_LOCK_FAILED)
def start_render_thread(device):
    if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('start_render_thread' + ERR_LOCK_FAILED)
    print('Start new Rendering Thread on device', device)
    try:
        rthread = threading.Thread(target=thread_render, kwargs={'device': device})
        rthread.daemon = True
        rthread.name = THREAD_NAME_PREFIX + str(device)
        rthread.name = THREAD_NAME_PREFIX + device
        rthread.start()
        render_threads.append(rthread)
    finally:
@@ -425,6 +427,7 @@ def start_render_thread(device='auto'):
    timeout = DEVICE_START_TIMEOUT
    while not rthread.is_alive() or not rthread in weak_thread_data or not 'device' in weak_thread_data[rthread]:
        if rthread in weak_thread_data and 'error' in weak_thread_data[rthread]:
            print(rthread, device, 'error:', weak_thread_data[rthread]['error'])
            return False
        if timeout <= 0:
            return False
@@ -432,6 +435,59 @@ def start_render_thread(device='auto'):
        time.sleep(1)
    return True

def stop_render_thread(device):
    try:
        device_manager.validate_device_id(device, log_prefix='stop_render_thread')
    except:
        print(traceback.format_exec())
        return False

    if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('stop_render_thread' + ERR_LOCK_FAILED)
    print('Stopping Rendering Thread on device', device)

    try:
        thread_to_remove = None
        for rthread in render_threads:
            weak_data = weak_thread_data.get(rthread)
            if weak_data is None or not 'device' in weak_data or weak_data['device'] is None:
                continue
            thread_device = weak_data['device']
            if thread_device == device:
                weak_data['alive'] = False
                thread_to_remove = rthread
                break
        if thread_to_remove is not None:
            render_threads.remove(rthread)
            return True
    finally:
        manager_lock.release()

    return False

def update_render_threads(render_devices, active_devices):
    devices_to_start, devices_to_stop = device_manager.get_device_delta(render_devices, active_devices)
    print('devices_to_start', devices_to_start)
    print('devices_to_stop', devices_to_stop)

    for device in devices_to_stop:
        if is_alive(device) <= 0:
            print(device, 'is not alive')
            continue
        if not stop_render_thread(device):
            print(device, 'could not stop render thread')

    for device in devices_to_start:
        if is_alive(device) >= 1:
            print(device, 'already registered.')
            continue
        if not start_render_thread(device):
            print(device, 'failed to start.')

    if is_alive() <= 0: # No running devices, probably invalid user config.
        raise EnvironmentError('ERROR: No active render devices! Please verify the "render_devices" value in config.json')

    print('active devices', get_devices()['active'])

def shutdown_event(): # Signal render thread to close on shutdown
    global current_state_error
    current_state_error = SystemExit('Application shutting down.')
@@ -478,7 +534,6 @@ def render(req : ImageRequest):
    r.stream_image_progress = False

    new_task = RenderTask(r)
    new_task.render_device = req.render_device

    if task_cache.put(r.session_id, new_task, TASK_TTL):
        # Use twice the normal timeout for adding user requests.
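The start/stop bookkeeping in update_render_threads() above acts only on the difference between the requested and currently active device sets, so changing the selection from one GPU to two starts a thread for the new device and leaves the existing one running. A small illustration with assumed values:

# Illustration of the set arithmetic in device_manager.get_device_delta(),
# as consumed by update_render_threads() above. Values are assumed.
active_devices = {'cuda:0'}                    # threads currently running
render_devices = {'cuda:0', 'cuda:1'}          # newly requested selection

devices_to_start = render_devices - active_devices   # {'cuda:1'} -> start_render_thread('cuda:1')
devices_to_stop = active_devices - render_devices    # set()      -> nothing to stop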
ui/server.py (80 lines changed)
@@ -224,7 +224,10 @@ def read_web_data(key:str=None):
            raise HTTPException(status_code=500, detail="Config file is missing or unreadable")
        return JSONResponse(config, headers=NOCACHE_HEADERS)
    elif key == 'devices':
        return JSONResponse(task_manager.get_devices(), headers=NOCACHE_HEADERS)
        config = getConfig()
        devices = task_manager.get_devices()
        devices['config'] = config.get('render_devices', "auto")
        return JSONResponse(devices, headers=NOCACHE_HEADERS)
    elif key == 'models':
        return JSONResponse(getModels(), headers=NOCACHE_HEADERS)
    elif key == 'modifiers': return FileResponse(os.path.join(SD_UI_DIR, 'modifiers.json'), headers=NOCACHE_HEADERS)
@@ -272,17 +275,41 @@ def save_model_to_config(ckpt_model_name, vae_model_name):

    setConfig(config)

@app.post('/render')
def render(req : task_manager.ImageRequest):
def save_render_devices_to_config(render_devices):
    config = getConfig()
    if 'render_devices' not in config:
        config['render_devices'] = {}

    config['render_devices'] = render_devices
    if render_devices is None or len(render_devices) == 0:
        del config['render_devices']

    setConfig(config)

def update_render_threads_on_request(req : task_manager.ImageRequest):
    if req.use_cpu: # TODO Remove after transition.
        print('WARNING Replace {use_cpu: true} by {render_device: "cpu"}')
        req.render_device = 'cpu'
        del req.use_cpu
    if req.render_device != 'cpu':
        req.render_device = 'cuda:0' # temp hack to get beta working
    if req.render_device and task_manager.is_alive(req.render_device) <= 0: raise HTTPException(status_code=403, detail=f'{req.render_device} rendering is not enabled in config.json or the thread has died...') # HTTP403 Forbidden

    if req.render_device not in ('cpu', 'auto') and not req.render_device.startswith('cuda:'):
        raise HTTPException(status_code=400, detail=f'Invalid render device requested: {req.render_device}')

    if req.render_device.startswith('cuda:'):
        req.render_device = req.render_device.split(',')

    save_render_devices_to_config(req.render_device)
    del req.render_device

    update_render_threads()

@app.post('/render')
def render(req : task_manager.ImageRequest):
    update_render_threads_on_request(req)

    if req.use_face_correction and task_manager.is_alive('cuda:0') <= 0: #TODO Remove when GFPGANer is fixed upstream.
        raise HTTPException(status_code=412, detail=f'GFPGANer only works GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') # HTTP412 Precondition Failed
        raise HTTPException(status_code=412, detail=f'The "Fix incorrect faces" feature works only on cuda:0. Disable "Fix incorrect faces" (in Image Settings), or use the CUDA_VISIBLE_DEVICES environment variable.')

    try:
        save_model_to_config(req.use_stable_diffusion_model, req.use_vae_model)
        req.use_stable_diffusion_model = resolve_ckpt_to_use(req.use_stable_diffusion_model)
@@ -359,44 +386,19 @@ class LogSuppressFilter(logging.Filter):
        return True
logging.getLogger('uvicorn.access').addFilter(LogSuppressFilter())

config = getConfig()

# Start the task_manager
task_manager.default_model_to_load = resolve_ckpt_to_use()
task_manager.default_vae_to_load = resolve_vae_to_use()
if 'render_devices' in config: # Start a new thread for each device.
    if not isinstance(config['render_devices'], list):
        raise Exception('Invalid render_devices value in config. Should be a list')
    config['render_devices'] = set(config['render_devices']) # de-duplicate
    for device in config['render_devices']:
        if task_manager.is_alive(device) >= 1:
            print(device, 'already registered.')
            continue
        if not task_manager.start_render_thread(device):
            print(device, 'failed to start.')
    if task_manager.is_alive() <= 0: # No running devices, probably invalid user config.
        print('WARNING: No active render devices after loading config. Validate "render_devices" in config.json')
        print('Loading default render devices to replace invalid render_devices field from config', config['render_devices'])

if task_manager.is_alive() <= 0: # Either no defaults or no devices after loading config.
    # Select best GPU device using free memory, if more than one device.
    if task_manager.start_render_thread('auto'): # Detect best device for renders
        # if cuda:0 is missing, another cuda device is better. try to start it...
        if task_manager.is_alive('cuda:0') <= 0 and task_manager.is_alive('cpu') <= 0 and not task_manager.start_render_thread('cuda:0'):
            print('Failed to start GPU:0...')
    else:
        print('Failed to start gpu device.')
    if task_manager.is_alive('cpu') <= 0 and not task_manager.start_render_thread('cpu'): # Allow CPU to be used for renders
        print('Failed to start CPU render device...')
def update_render_threads():
    config = getConfig()
    render_devices = config.get('render_devices', "auto")
    active_devices = task_manager.get_devices()['active'].keys()

is_using_a_gpu = (task_manager.is_alive() > task_manager.is_alive('cpu'))
if is_using_a_gpu and task_manager.is_alive('cuda:0') <= 0:
    print('WARNING: GFPGANer only works on GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.')
    print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer')
    print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat')
    print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh')
    print('requesting for render_devices', render_devices)
    task_manager.update_render_threads(render_devices, active_devices)

print('active devices', task_manager.get_devices()['active'])
update_render_threads()

# start the browser ui
import webbrowser; webbrowser.open('http://localhost:9000')
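For reference, save_render_devices_to_config() above either stores the selection or removes the key entirely, so an empty selection falls back to the "auto" default the next time update_render_threads() reads the config. A small sketch of the resulting config dict (illustrative only; other config keys are omitted):

# Sketch of what save_render_devices_to_config() leaves in the config (illustrative only).
def save_render_devices_sketch(config, render_devices):
    config['render_devices'] = render_devices
    if render_devices is None or len(render_devices) == 0:
        del config['render_devices']   # empty selection -> fall back to the "auto" default
    return config

print(save_render_devices_sketch({}, ['cuda:0', 'cuda:1']))   # {'render_devices': ['cuda:0', 'cuda:1']}
print(save_render_devices_sketch({}, []))                     # {} -> later read as "auto"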