From 7c72608e1c8f9b36f5eb76509915b4eec4f6aaea Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 16 Oct 2022 21:41:39 -0400 Subject: [PATCH 001/221] First draft for Multi-GPU support --- ui/sd_internal/runtime.py | 450 +++++++++++++++++++-------------- ui/sd_internal/task_manager.py | 186 +++++++++----- ui/server.py | 324 ++++++++++++++---------- 3 files changed, 584 insertions(+), 376 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 0b0a3003..81baa103 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -35,63 +35,144 @@ import base64 from io import BytesIO #from colorama import Fore -# local -stop_processing = False -temp_images = {} +from threading import local as LocalThreadVars +thread_data = LocalThreadVars() -ckpt_file = None -gfpgan_file = None -real_esrgan_file = None +def device_would_fail(device): + if device == 'cpu': return None + # Returns None when no issues found, otherwise returns the detected error str. + # Memory check + try: + mem_free, mem_total = torch.cuda.mem_get_info(device) + mem_total /= float(10**9) + if mem_total < 3.0: + return 'GPUs with less than 3 GB of VRAM are not compatible with Stable Diffusion' + except RuntimeError as e: + return str(e) # Return cuda errors from mem_get_info as strings + return None -model = None -modelCS = None -modelFS = None -model_gfpgan = None -model_real_esrgan = None +def device_select(device): + if device == 'cpu': return True + if not torch.cuda.is_available(): return False + failure_msg = device_would_fail(device) + if failure_msg: + if 'invalid device' in failure_msg: + raise NameError(f'GPU "{device}" could not be found. Remove this device from config.render_devices or use one of "auto" or "cuda".') + print(failure_msg) + return False -model_is_half = False -model_fs_is_half = False -device = None -unet_bs = 1 -precision = 'autocast' -sampler_plms = None -sampler_ddim = None + device_name = torch.cuda.get_device_name(device) -has_valid_gpu = False -force_full_precision = False -try: - gpu = torch.cuda.current_device() - gpu_name = torch.cuda.get_device_name(gpu) - print('GPU detected: ', gpu_name) - - force_full_precision = ('nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower()) and (' 1660' in gpu_name or ' 1650' in gpu_name) # otherwise these NVIDIA cards create green images - if force_full_precision: + # otherwise these NVIDIA cards create green images + thread_data.force_full_precision = ('nvidia' in device_name.lower() or 'geforce' in device_name.lower()) and (' 1660' in device_name or ' 1650' in device_name) + if thread_data.force_full_precision: print('forcing full precision on NVIDIA 16xx cards, to avoid green images. 
GPU detected: ', gpu_name) - mem_free, mem_total = torch.cuda.mem_get_info(gpu) - mem_total /= float(10**9) - if mem_total < 3.0: - print("GPUs with less than 3 GB of VRAM are not compatible with Stable Diffusion") - raise Exception() + thread_data.device = device + thread_data.has_valid_gpu = True + return True - has_valid_gpu = True -except: +def device_init(device_selection=None): + # Thread bound properties + thread_data.stop_processing = False + thread_data.temp_images = {} + + thread_data.ckpt_file = None + thread_data.gfpgan_file = None + thread_data.real_esrgan_file = None + + thread_data.model = None + thread_data.modelCS = None + thread_data.modelFS = None + thread_data.model_gfpgan = None + thread_data.model_real_esrgan = None + + thread_data.model_is_half = False + thread_data.model_fs_is_half = False + thread_data.device = None + thread_data.unet_bs = 1 + thread_data.precision = 'autocast' + thread_data.sampler_plms = None + thread_data.sampler_ddim = None + + thread_data.turbo = False + thread_data.has_valid_gpu = False + thread_data.force_full_precision = False + + if device_selection.lower() == 'cpu': + print('CPU requested, skipping gpu init.') + thread_data.device = 'cpu' + return + if not torch.cuda.is_available(): + print('WARNING: torch.cuda is not available. Using the CPU, but this will be very slow!') + return + device_count = torch.cuda.device_count() + if device_count <= 1 and device_selection == 'auto': + device_selection = 'current' # Use 'auto' only when there is more than one compatible device found. + if device_selection == 'auto': + print('Autoselecting GPU. Using most free memory.') + max_mem_free = 0 + best_device = None + for device in range(device_count): + mem_free, mem_total = torch.cuda.mem_get_info(device) + mem_free /= float(10**9) + mem_total /= float(10**9) + device_name = torch.cuda.get_device_name(device) + print(f'GPU:{device} detected: {device_name} - Memory: {round(mem_total - mem_free, 2)}Go / {round(mem_total, 2)}Go') + if max_mem_free < mem_free: + max_mem_free = mem_free + best_device = device + if best_device and device_select(device): + print(f'Setting GPU:{device} as active') + torch.cuda.device(device) + return + if isinstance(device_selection, str): + device_selection = device_selection.lower() + if device_selection.startswith('gpu:'): + device_selection = int(device_selection[4:]) + if device_selection != 'cuda' and device_selection != 'current': + if device_select(device_selection): + if isinstance(device_selection, int): + print(f'Setting GPU:{device_selection} as active') + else: + print(f'Setting {device_selection} as active') + torch.cuda.device(device_selection) + return + # By default use current device. + print('Checking current GPU...') + device = torch.cuda.current_device() + device_name = torch.cuda.get_device_name(device) + print(f'GPU:{device} detected: {device_name}') + if device_select(device): + return print('WARNING: No compatible GPU found. 
Using the CPU, but this will be very slow!') - pass + thread_data.device = 'cpu' -def load_model_ckpt(ckpt_to_use, device_to_use='cuda', turbo=False, unet_bs_to_use=1, precision_to_use='autocast'): - global ckpt_file, model, modelCS, modelFS, model_is_half, device, unet_bs, precision, model_fs_is_half +def is_first_cuda_device(device): + if thread_data.device == 0 or thread_data.device == '0': + return True + if thread_data.device == 'cuda' or thread_data.device == 'cuda:0': + return True + if thread_data.device == torch.device(0): + return True + return False - device = device_to_use if has_valid_gpu else 'cpu' - precision = precision_to_use if not force_full_precision else 'full' - unet_bs = unet_bs_to_use +def load_model_ckpt(): + if not thread_data.ckpt_file: raise ValueError(f'Thread ckpt_file is undefined.') + if not os.path.exists(thread_data.ckpt_file + '.ckpt'): raise FileNotFoundError(f'Cannot find {thread_data.ckpt_file}.ckpt') + + if not thread_data.precision: + thread_data.precision = 'full' if thread_data.force_full_precision else 'autocast' + if not thread_data.unet_bs: + thread_data.unet_bs = 1 unload_model() - if device == 'cpu': - precision = 'full' + if thread_data.device == 'cpu': + thread_data.precision = 'full' - sd = load_model_from_config(f"{ckpt_to_use}.ckpt") + print('loading', thread_data.ckpt_file, 'to', thread_data.device, 'using precision', thread_data.precision) + sd = load_model_from_config(thread_data.ckpt_file + '.ckpt') li, lo = [], [] for key, value in sd.items(): sp = key.split(".") @@ -114,88 +195,84 @@ def load_model_ckpt(ckpt_to_use, device_to_use='cuda', turbo=False, unet_bs_to_u model = instantiate_from_config(config.modelUNet) _, _ = model.load_state_dict(sd, strict=False) model.eval() - model.cdevice = device - model.unet_bs = unet_bs - model.turbo = turbo + model.cdevice = torch.device(thread_data.device) + model.unet_bs = thread_data.unet_bs + model.turbo = thread_data.turbo + if thread_data.device != 'cpu': + model.to(thread_data.device) + thread_data.model = model modelCS = instantiate_from_config(config.modelCondStage) _, _ = modelCS.load_state_dict(sd, strict=False) modelCS.eval() - modelCS.cond_stage_model.device = device + modelCS.cond_stage_model.device = torch.device(thread_data.device) + if thread_data.device != 'cpu': + modelCS.to(thread_data.device) + thread_data.modelCS = modelCS modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() + if thread_data.device != 'cpu': + modelFS.to(thread_data.device) + thread_data.modelFS = modelFS del sd - if device != "cpu" and precision == "autocast": - model.half() - modelCS.half() - modelFS.half() - model_is_half = True - model_fs_is_half = True + if thread_data.device != "cpu" and thread_data.precision == "autocast": + thread_data.model.half() + thread_data.modelCS.half() + thread_data.modelFS.half() + thread_data.model_is_half = True + thread_data.model_fs_is_half = True else: - model_is_half = False - model_fs_is_half = False + thread_data.model_is_half = False + thread_data.model_fs_is_half = False - ckpt_file = ckpt_to_use - - print('loaded ', ckpt_file, 'to', device, 'precision', precision) + print('loaded', thread_data.ckpt_file, 'as', model.device, '->', modelCS.cond_stage_model.device, '->', thread_data.modelFS.device, 'using precision', thread_data.precision) def unload_model(): - global model, modelCS, modelFS + if thread_data.model is not None: + print('Unloading models...') + del thread_data.model + del 
thread_data.modelCS + del thread_data.modelFS + thread_data.model = None + thread_data.modelCS = None + thread_data.modelFS = None - if model is not None: - del model - del modelCS - del modelFS - - model = None - modelCS = None - modelFS = None - -def load_model_gfpgan(gfpgan_to_use): - global gfpgan_file, model_gfpgan - - if gfpgan_to_use is None: +def load_model_gfpgan(): + if thread_data.gfpgan_file is None: + print('load_model_gfpgan called without setting gfpgan_file') return + if thread_data.device != 'cpu' and not is_first_cuda_device(thread_data.device): + #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + raise Exception(f'Current device {torch.device(thread_data.device)} is not {torch.device(0)}.') + model_path = thread_data.gfpgan_file + ".pth" + thread_data.model_gfpgan = GFPGANer(device=torch.device(thread_data.device), model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None) + print('loaded', thread_data.gfpgan_file, 'to', thread_data.model_gfpgan.device, 'precision', thread_data.precision) - gfpgan_file = gfpgan_to_use - model_path = gfpgan_to_use + ".pth" - - if device == 'cpu': - model_gfpgan = GFPGANer(model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=torch.device('cpu')) - else: - model_gfpgan = GFPGANer(model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=torch.device('cuda')) - - print('loaded ', gfpgan_to_use, 'to', device, 'precision', precision) - -def load_model_real_esrgan(real_esrgan_to_use): - global real_esrgan_file, model_real_esrgan - - if real_esrgan_to_use is None: +def load_model_real_esrgan(): + if thread_data.real_esrgan_file is None: + print('load_model_real_esrgan called without setting real_esrgan_file') return - - real_esrgan_file = real_esrgan_to_use - model_path = real_esrgan_to_use + ".pth" + model_path = thread_data.real_esrgan_file + ".pth" RealESRGAN_models = { 'RealESRGAN_x4plus': RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4), 'RealESRGAN_x4plus_anime_6B': RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4) } - model_to_use = RealESRGAN_models[real_esrgan_to_use] + model_to_use = RealESRGAN_models[thread_data.real_esrgan_file] - if device == 'cpu': - model_real_esrgan = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=False) # cpu does not support half - model_real_esrgan.device = torch.device('cpu') - model_real_esrgan.model.to('cpu') + if thread_data.device == 'cpu': + thread_data.model_real_esrgan = RealESRGANer(device=torch.device(thread_data.device), scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=False) # cpu does not support half + #thread_data.model_real_esrgan.device = torch.device(thread_data.device) + thread_data.model_real_esrgan.model.to('cpu') else: - model_real_esrgan = RealESRGANer(scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=model_is_half) + thread_data.model_real_esrgan = RealESRGANer(device=torch.device(thread_data.device), scale=2, model_path=model_path, model=model_to_use, pre_pad=0, half=thread_data.model_is_half) - model_real_esrgan.model.name = real_esrgan_to_use - - print('loaded ', real_esrgan_to_use, 'to', device, 'precision', precision) + thread_data.model_real_esrgan.model.name = thread_data.real_esrgan_file + print('loaded ', thread_data.real_esrgan_file, 'to', 
thread_data.model_real_esrgan.device, 'precision', thread_data.precision) def get_base_path(disk_path, session_id, prompt, ext, suffix=None): if disk_path is None: return None @@ -214,14 +291,22 @@ def get_base_path(disk_path, session_id, prompt, ext, suffix=None): def apply_filters(filter_name, image_data): print(f'Applying filter {filter_name}...') + if isinstance(image_data, torch.Tensor): + print(image_data) + image_data.to(thread_data.device) + gc() if filter_name == 'gfpgan': - _, _, output = model_gfpgan.enhance(image_data[:,:,::-1], has_aligned=False, only_center_face=False, paste_back=True) + if thread_data.model_gfpgan is None: raise Exception('Model "gfpgan" not loaded.') + print('enhance with', thread_data.gfpgan_file, 'on', thread_data.model_gfpgan.device, 'precision', thread_data.precision) + _, _, output = thread_data.model_gfpgan.enhance(image_data[:,:,::-1], has_aligned=False, only_center_face=False, paste_back=True) image_data = output[:,:,::-1] if filter_name == 'real_esrgan': - output, _ = model_real_esrgan.enhance(image_data[:,:,::-1]) + if thread_data.model_real_esrgan is None: raise Exception('Model "gfpgan" not loaded.') + print('enhance with', thread_data.real_esrgan_file, 'on', thread_data.model_real_esrgan.device, 'precision', thread_data.precision) + output, _ = thread_data.model_real_esrgan.enhance(image_data[:,:,::-1]) image_data = output[:,:,::-1] return image_data @@ -234,12 +319,12 @@ def mk_img(req: Request): gc() - if device != "cpu": - modelFS.to("cpu") - modelCS.to("cpu") + if thread_data.device != "cpu": + thread_data.modelFS.to("cpu") + thread_data.modelCS.to("cpu") - model.model1.to("cpu") - model.model2.to("cpu") + thread_data.model.model1.to("cpu") + thread_data.model.model2.to("cpu") gc() @@ -249,66 +334,55 @@ def mk_img(req: Request): }) def do_mk_img(req: Request): - global ckpt_file - global model, modelCS, modelFS, device - global model_gfpgan, model_real_esrgan - global stop_processing - - stop_processing = False + thread_data.stop_processing = False res = Response() res.request = req res.images = [] - temp_images.clear() + thread_data.temp_images.clear() # custom model support: # the req.use_stable_diffusion_model needs to be a valid path # to the ckpt file (without the extension). 
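     #  e.g. 'sd-v1-4' or 'models/stable-diffusion/my-model'; the '.ckpt'
     #  suffix is appended by the loading code just below.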
+ if not os.path.exists(req.use_stable_diffusion_model + '.ckpt'): raise FileNotFoundError(f'Cannot find {req.use_stable_diffusion_model}.ckpt') needs_model_reload = False - ckpt_to_use = ckpt_file - if ckpt_to_use != req.use_stable_diffusion_model: - ckpt_to_use = req.use_stable_diffusion_model + if thread_data.ckpt_file != req.use_stable_diffusion_model: + thread_data.ckpt_file = req.use_stable_diffusion_model needs_model_reload = True - model.turbo = req.turbo if req.use_cpu: - if device != 'cpu': - device = 'cpu' - - if model_is_half: - load_model_ckpt(ckpt_to_use, device) + if thread_data.device != 'cpu': + thread_data.device = 'cpu' + if thread_data.model_is_half: + load_model_ckpt() needs_model_reload = False - - load_model_gfpgan(gfpgan_file) - load_model_real_esrgan(real_esrgan_file) + load_model_gfpgan() + load_model_real_esrgan() else: - if has_valid_gpu: - prev_device = device - device = 'cuda' - - if (precision == 'autocast' and (req.use_full_precision or not model_is_half)) or \ - (precision == 'full' and not req.use_full_precision and not force_full_precision): - - load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, ('full' if req.use_full_precision else 'autocast')) + if thread_data.has_valid_gpu: + if (thread_data.precision == 'autocast' and (req.use_full_precision or not thread_data.model_is_half)) or \ + (thread_data.precision == 'full' and not req.use_full_precision and not thread_data.force_full_precision): + thread_data.precision = 'full' if req.use_full_precision else 'autocast' + load_model_ckpt() + load_model_gfpgan() + load_model_real_esrgan() needs_model_reload = False - if prev_device != device: - load_model_gfpgan(gfpgan_file) - load_model_real_esrgan(real_esrgan_file) - if needs_model_reload: - load_model_ckpt(ckpt_to_use, device, req.turbo, unet_bs, precision) + load_model_ckpt() - if req.use_face_correction != gfpgan_file: - load_model_gfpgan(req.use_face_correction) + if req.use_face_correction != thread_data.gfpgan_file: + thread_data.gfpgan_file = req.use_face_correction + load_model_gfpgan() + if req.use_upscale != thread_data.real_esrgan_file: + thread_data.real_esrgan_file = req.use_upscale + load_model_real_esrgan() - if req.use_upscale != real_esrgan_file: - load_model_real_esrgan(req.use_upscale) - - model.cdevice = device - modelCS.cond_stage_model.device = device + if thread_data.turbo != req.turbo: + thread_data.turbo = req.turbo + thread_data.model.turbo = req.turbo opt_prompt = req.prompt opt_seed = req.seed @@ -318,9 +392,8 @@ def do_mk_img(req: Request): opt_ddim_eta = 0.0 opt_init_img = req.init_image - print(req.to_string(), '\n device', device) - - print('\n\n Using precision:', precision) + print(req.to_string(), '\n device', thread_data.device) + print('\n\n Using precision:', thread_data.precision) seed_everything(opt_seed) @@ -329,7 +402,7 @@ def do_mk_img(req: Request): assert prompt is not None data = [batch_size * [prompt]] - if precision == "autocast" and device != "cpu": + if thread_data.precision == "autocast" and thread_data.device != "cpu": precision_scope = autocast else: precision_scope = nullcontext @@ -345,22 +418,22 @@ def do_mk_img(req: Request): handler = _img2img init_image = load_img(req.init_image, req.width, req.height) - init_image = init_image.to(device) + init_image = init_image.to(thread_data.device) - if device != "cpu" and precision == "autocast": + if thread_data.device != "cpu" and thread_data.precision == "autocast": init_image = init_image.half() - modelFS.to(device) + 
thread_data.modelFS.to(thread_data.device) init_image = repeat(init_image, '1 ... -> b ...', b=batch_size) - init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space + init_latent = thread_data.modelFS.get_first_stage_encoding(thread_data.modelFS.encode_first_stage(init_image)) # move to latent space if req.mask is not None: - mask = load_mask(req.mask, req.width, req.height, init_latent.shape[2], init_latent.shape[3], True).to(device) + mask = load_mask(req.mask, req.width, req.height, init_latent.shape[2], init_latent.shape[3], True).to(thread_data.device) mask = mask[0][0].unsqueeze(0).repeat(4, 1, 1).unsqueeze(0) mask = repeat(mask, '1 ... -> b ...', b=batch_size) - if device != "cpu" and precision == "autocast": + if thread_data.device != "cpu" and thread_data.precision == "autocast": mask = mask.half() move_fs_to_cpu() @@ -381,10 +454,10 @@ def do_mk_img(req: Request): for prompts in tqdm(data, desc="data"): with precision_scope("cuda"): - modelCS.to(device) + thread_data.modelCS.to(thread_data.device) uc = None if req.guidance_scale != 1.0: - uc = modelCS.get_learned_conditioning(batch_size * [req.negative_prompt]) + uc = thread_data.modelCS.get_learned_conditioning(batch_size * [req.negative_prompt]) if isinstance(prompts, tuple): prompts = list(prompts) @@ -397,11 +470,11 @@ def do_mk_img(req: Request): weight = weights[i] # if not skip_normalize: weight = weight / totalWeight - c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) + c = torch.add(c, thread_data.modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) else: - c = modelCS.get_learned_conditioning(prompts) + c = thread_data.modelCS.get_learned_conditioning(prompts) - modelFS.to(device) + thread_data.modelFS.to(thread_data.device) partial_x_samples = None def img_callback(x_samples, i): @@ -417,7 +490,7 @@ def do_mk_img(req: Request): partial_images = [] for i in range(batch_size): - x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) + x_samples_ddim = thread_data.modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") x_sample = x_sample.astype(np.uint8) @@ -429,18 +502,19 @@ def do_mk_img(req: Request): del img, x_sample, x_samples_ddim # don't delete x_samples, it is used in the code that called this callback - temp_images[str(req.session_id) + '/' + str(i)] = buf + thread_data.temp_images[str(req.session_id) + '/' + str(i)] = buf partial_images.append({'path': f'/image/tmp/{req.session_id}/{i}'}) progress['output'] = partial_images yield json.dumps(progress) - if stop_processing: + if thread_data.stop_processing: raise UserInitiatedStop("User requested that we stop processing") # run the handler try: + print('Running handler...') if handler == _txt2img: x_samples = _txt2img(req.width, req.height, req.num_outputs, req.num_inference_steps, req.guidance_scale, None, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, req.sampler) else: @@ -458,7 +532,7 @@ def do_mk_img(req: Request): print("saving images") for i in range(batch_size): - x_samples_ddim = modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) + x_samples_ddim = thread_data.modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") x_sample = 
x_sample.astype(np.uint8) @@ -469,7 +543,7 @@ def do_mk_img(req: Request): return_orig_img = not has_filters or not req.show_only_filtered_image - if stop_processing: + if thread_data.stop_processing: return_orig_img = True if req.save_to_disk_path is not None: @@ -489,7 +563,7 @@ def do_mk_img(req: Request): del img - if has_filters and not stop_processing: + if has_filters and not thread_data.stop_processing: filters_applied = [] if req.use_face_correction: x_sample = apply_filters('gfpgan', x_sample) @@ -514,7 +588,7 @@ def do_mk_img(req: Request): move_fs_to_cpu() gc() del x_samples, x_samples_ddim, x_sample - print("memory_final = ", torch.cuda.memory_allocated() / 1e6) + print(f'memory_final = {round(torch.cuda.memory_allocated(thread_data.device) / 1e6, 2)}Mo') print('Task completed') @@ -527,7 +601,7 @@ def save_image(img, img_out_path): print('could not save the file', traceback.format_exc()) def save_metadata(meta_out_path, req, prompt, opt_seed): - metadata = f"""{prompt} + metadata = f'''{prompt} Width: {req.width} Height: {req.height} Seed: {opt_seed} @@ -538,8 +612,8 @@ Use Face Correction: {req.use_face_correction} Use Upscaling: {req.use_upscale} Sampler: {req.sampler} Negative Prompt: {req.negative_prompt} -Stable Diffusion Model: {req.use_stable_diffusion_model + '.ckpt'} -""" +Stable Diffusion model: {req.use_stable_diffusion_model + '.ckpt'} +''' try: with open(meta_out_path, 'w') as f: f.write(metadata) @@ -549,16 +623,19 @@ Stable Diffusion Model: {req.use_stable_diffusion_model + '.ckpt'} def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, sampler_name): shape = [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f] - if device != "cpu": - mem = torch.cuda.memory_allocated() / 1e6 - modelCS.to("cpu") - while torch.cuda.memory_allocated() / 1e6 >= mem: + if thread_data.device != "cpu": + mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 + print('Device:', thread_data.device, 'CS_Model, Memory transfer starting. Memory Used:', round(mem, 2), 'Mo') + thread_data.modelCS.to("cpu") + while torch.cuda.memory_allocated(thread_data.device) / 1e6 >= mem and mem > 0: + print('Device:', thread_data.device, 'Waiting Memory transfer. 
Memory Used:', round(mem, 2), 'Mo') time.sleep(1) + print('Transfered', round(mem - torch.cuda.memory_allocated(thread_data.device) / 1e6, 2), 'Mo') if sampler_name == 'ddim': - model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False) + thread_data.model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False) - samples_ddim = model.sample( + samples_ddim = thread_data.model.sample( S=opt_ddim_steps, conditioning=c, seed=opt_seed, @@ -572,14 +649,13 @@ def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, mask=mask, sampler = sampler_name, ) - yield from samples_ddim def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, opt_ddim_eta, opt_seed, img_callback, mask): # encode (scaled latent) - z_enc = model.stochastic_encode( + z_enc = thread_data.model.stochastic_encode( init_latent, - torch.tensor([t_enc] * batch_size).to(device), + torch.tensor([t_enc] * batch_size).to(thread_data.device), opt_seed, opt_ddim_eta, opt_ddim_steps, @@ -587,7 +663,7 @@ def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, o x_T = None if mask is None else init_latent # decode it - samples_ddim = model.sample( + samples_ddim = thread_data.model.sample( t_enc, c, z_enc, @@ -602,16 +678,19 @@ def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, o yield from samples_ddim def move_fs_to_cpu(): - if device != "cpu": - mem = torch.cuda.memory_allocated() / 1e6 - modelFS.to("cpu") - while torch.cuda.memory_allocated() / 1e6 >= mem: + if thread_data.device != "cpu": + mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 + print('Device:', thread_data.device, 'FS_Model, Memory transfer starting. Memory Used:', round(mem, 2), 'Mo') + thread_data.modelFS.to("cpu") + while torch.cuda.memory_allocated(thread_data.device) / 1e6 >= mem and mem > 0: + print('Device:', thread_data.device, 'Waiting for Memory transfer. Memory Used:', round(mem, 2), 'Mo') time.sleep(1) + print('Transfered', round(mem - torch.cuda.memory_allocated(thread_data.device) / 1e6, 2), 'Mo') def gc(): - if device == 'cpu': + #gc.collect() + if thread_data.device == 'cpu': return - torch.cuda.empty_cache() torch.cuda.ipc_collect() @@ -621,7 +700,6 @@ def chunk(it, size): it = iter(it) return iter(lambda: tuple(islice(it, size)), ()) - def load_model_from_config(ckpt, verbose=False): print(f"Loading model from {ckpt}") pl_sd = torch.load(ckpt, map_location="cpu") diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 00364e0c..75e1d262 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -9,6 +9,10 @@ from typing import Any, Generator, Hashable, Optional, Union from pydantic import BaseModel from sd_internal import Request, Response +ERR_LOCK_FAILED = ' failed to acquire lock within timeout.' +LOCK_TIMEOUT = 15 # Maximum locking time in seconds before failing a task. +# It's better to get an exception than a deadlock... ALWAYS use timeout in critical paths. + class SymbolClass(type): # Print nicely formatted Symbol names. 
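     # Metaclass: lets bare Symbol classes print as their names in status logs.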
def __repr__(self): return self.__qualname__ def __str__(self): return self.__name__ @@ -66,17 +70,30 @@ class ImageRequest(BaseModel): stream_progress_updates: bool = False stream_image_progress: bool = False +class FilterRequest(BaseModel): + session_id: str = "session" + model: str = None + name: str = "" + init_image: str = None # base64 + width: int = 512 + height: int = 512 + save_to_disk_path: str = None + turbo: bool = True + use_cpu: bool = False + use_full_precision: bool = False + output_format: str = "jpeg" # or "png" + # Temporary cache to allow to query tasks results for a short time after they are completed. class TaskCache(): def __init__(self): self._base = dict() - self._lock: threading.Lock = threading.RLock() + self._lock: threading.Lock = threading.Lock() def _get_ttl_time(self, ttl: int) -> int: return int(time.time()) + ttl def _is_expired(self, timestamp: int) -> bool: return int(time.time()) >= timestamp def clean(self) -> None: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.clean failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.clean' + ERR_LOCK_FAILED) try: # Create a list of expired keys to delete to_delete = [] @@ -91,11 +108,11 @@ class TaskCache(): finally: self._lock.release() def clear(self) -> None: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.clear failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.clear' + ERR_LOCK_FAILED) try: self._base.clear() finally: self._lock.release() def delete(self, key: Hashable) -> bool: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.delete failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.delete' + ERR_LOCK_FAILED) try: if key not in self._base: return False @@ -104,7 +121,7 @@ class TaskCache(): finally: self._lock.release() def keep(self, key: Hashable, ttl: int) -> bool: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.keep failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.keep' + ERR_LOCK_FAILED) try: if key in self._base: _, value = self._base.get(key) @@ -114,7 +131,7 @@ class TaskCache(): finally: self._lock.release() def put(self, key: Hashable, value: Any, ttl: int) -> bool: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.put failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.put' + ERR_LOCK_FAILED) try: self._base[key] = ( self._get_ttl_time(ttl), value @@ -128,21 +145,23 @@ class TaskCache(): finally: self._lock.release() def tryGet(self, key: Hashable) -> Any: - if not self._lock.acquire(blocking=True, timeout=10): raise Exception('TaskCache.tryGet failed to acquire lock within timeout.') + if not self._lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('TaskCache.tryGet' + ERR_LOCK_FAILED) try: ttl, value = self._base.get(key, (None, None)) if ttl is not None and self._is_expired(ttl): print(f'Session {key} expired. 
Discarding data.') - self.delete(key) + del self._base[key] return None return value finally: self._lock.release() +manager_lock = threading.Lock() +render_threads = [] current_state = ServerStates.Init current_state_error:Exception = None current_model_path = None -tasks_queue = queue.Queue() +tasks_queue = [] task_cache = TaskCache() default_model_to_load = None @@ -155,7 +174,8 @@ def preload_model(file_path=None): current_state = ServerStates.LoadingModel try: from . import runtime - runtime.load_model_ckpt(ckpt_to_use=file_path) + runtime.thread_data.ckpt_file = file_path + runtime.load_model_ckpt() current_model_path = file_path current_state_error = None current_state = ServerStates.Online @@ -165,72 +185,96 @@ def preload_model(file_path=None): current_state = ServerStates.Unavailable print(traceback.format_exc()) -def thread_render(): +def thread_render(device): global current_state, current_state_error, current_model_path from . import runtime - current_state = ServerStates.Online + try: + runtime.device_init(device) + except: + print(traceback.format_exc()) + return preload_model() + current_state = ServerStates.Online while True: task_cache.clean() if isinstance(current_state_error, SystemExit): current_state = ServerStates.Unavailable return task = None - try: - task = tasks_queue.get(timeout=1) - except queue.Empty as e: - if isinstance(current_state_error, SystemExit): - current_state = ServerStates.Unavailable - return - else: continue + if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): + print('Render thread on device', runtime.thread_data.device, 'failed to acquire manager lock.') + time.sleep(1) + continue + if len(tasks_queue) <= 0: + manager_lock.release() + time.sleep(1) + continue + try: # Select a render task. + for queued_task in tasks_queue: + if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu': + continue # Cuda Tasks + if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': + continue # CPU Tasks + if queued_task.request.use_face_correction and not runtime.is_first_cuda_device(runtime.thread_data.device): + continue #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + task = queued_task + break + if task is not None: + del tasks_queue[tasks_queue.index(task)] + finally: + manager_lock.release() + if task is None: + time.sleep(1) + continue #if current_model_path != task.request.use_stable_diffusion_model: # preload_model(task.request.use_stable_diffusion_model) if current_state_error: task.error = current_state_error continue print(f'Session {task.request.session_id} starting task {id(task)}') + if not task.lock.acquire(blocking=False): raise Exception('Got locked task from queue.') try: - task.lock.acquire(blocking=False) + # Open data generator. res = runtime.mk_img(task.request) if current_model_path == task.request.use_stable_diffusion_model: current_state = ServerStates.Rendering else: current_state = ServerStates.LoadingModel + # Start reading from generator. 
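+            # runtime.mk_img returns a generator: each yielded item is either a
+            # JSON progress string or the final response. Every result is pushed
+            # to the task's buffer_queue when streaming, and string results are
+            # also parsed so that task.response and task.temp_images stay
+            # current for polling clients.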
+ dataQueue = None + if task.request.stream_progress_updates: + dataQueue = task.buffer_queue + for result in res: + if current_state == ServerStates.LoadingModel: + current_state = ServerStates.Rendering + current_model_path = task.request.use_stable_diffusion_model + if isinstance(current_state_error, SystemExit) or isinstance(current_state_error, StopAsyncIteration) or isinstance(task.error, StopAsyncIteration): + runtime.stop_processing = True + if isinstance(current_state_error, StopAsyncIteration): + task.error = current_state_error + current_state_error = None + print(f'Session {task.request.session_id} sent cancel signal for task {id(task)}') + if dataQueue: + dataQueue.put(result) + if isinstance(result, str): + result = json.loads(result) + task.response = result + if 'output' in result: + for out_obj in result['output']: + if 'path' in out_obj: + img_id = out_obj['path'][out_obj['path'].rindex('/') + 1:] + task.temp_images[int(img_id)] = runtime.thread_data.temp_images[out_obj['path'][11:]] + elif 'data' in out_obj: + task.temp_images[result['output'].index(out_obj)] = out_obj['data'] + # Before looping back to the generator, mark cache as still alive. + task_cache.keep(task.request.session_id, TASK_TTL) except Exception as e: task.error = e - task.lock.release() - tasks_queue.task_done() print(traceback.format_exc()) continue - dataQueue = None - if task.request.stream_progress_updates: - dataQueue = task.buffer_queue - for result in res: - if current_state == ServerStates.LoadingModel: - current_state = ServerStates.Rendering - current_model_path = task.request.use_stable_diffusion_model - if isinstance(current_state_error, SystemExit) or isinstance(current_state_error, StopAsyncIteration) or isinstance(task.error, StopAsyncIteration): - runtime.stop_processing = True - if isinstance(current_state_error, StopAsyncIteration): - task.error = current_state_error - current_state_error = None - print(f'Session {task.request.session_id} sent cancel signal for task {id(task)}') - if dataQueue: - dataQueue.put(result) - if isinstance(result, str): - result = json.loads(result) - task.response = result - if 'output' in result: - for out_obj in result['output']: - if 'path' in out_obj: - img_id = out_obj['path'][out_obj['path'].rindex('/') + 1:] - task.temp_images[int(img_id)] = runtime.temp_images[out_obj['path'][11:]] - elif 'data' in out_obj: - task.temp_images[result['output'].index(out_obj)] = out_obj['data'] - task_cache.keep(task.request.session_id, TASK_TTL) - # Task completed - task.lock.release() - tasks_queue.task_done() + finally: + # Task completed + task.lock.release() task_cache.keep(task.request.session_id, TASK_TTL) if isinstance(task.error, StopAsyncIteration): print(f'Session {task.request.session_id} task {id(task)} cancelled!') @@ -240,19 +284,37 @@ def thread_render(): print(f'Session {task.request.session_id} task {id(task)} completed.') current_state = ServerStates.Online -render_thread = threading.Thread(target=thread_render) +def is_alive(name=None): + if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('is_alive' + ERR_LOCK_FAILED) + nbr_alive = 0 + try: + for rthread in render_threads: + if name and not rthread.name.endswith(name): + continue + if rthread.is_alive(): + nbr_alive += 1 + return nbr_alive + finally: + manager_lock.release() -def start_render_thread(): - # Start Rendering Thread - render_thread.daemon = True - render_thread.start() +def start_render_thread(device='auto'): + if not manager_lock.acquire(blocking=True, 
timeout=LOCK_TIMEOUT): raise Exception('start_render_threads' + ERR_LOCK_FAILED) + print('Start new Rendering Thread on device', device) + try: + rthread = threading.Thread(target=thread_render, kwargs={'device': device}) + rthread.daemon = True + rthread.name = 'Runner/' + device + rthread.start() + render_threads.append(rthread) + finally: + manager_lock.release() def shutdown_event(): # Signal render thread to close on shutdown global current_state_error current_state_error = SystemExit('Application shutting down.') def render(req : ImageRequest): - if not render_thread.is_alive(): # Render thread is dead + if not is_alive(): # Render thread is dead raise ChildProcessError('Rendering thread has died.') # Alive, check if task in cache task = task_cache.tryGet(req.session_id) @@ -293,6 +355,12 @@ def render(req : ImageRequest): new_task = RenderTask(r) if task_cache.put(r.session_id, new_task, TASK_TTL): - tasks_queue.put(new_task, block=True, timeout=30) - return new_task + # Use twice the normal timeout for adding user requests. + # Tries to force task_cache.put to fail before tasks_queue.put would. + if manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT * 2): + try: + tasks_queue.append(new_task) + return new_task + finally: + manager_lock.release() raise RuntimeError('Failed to add task to cache.') diff --git a/ui/server.py b/ui/server.py index a803ceb9..2592a8c0 100644 --- a/ui/server.py +++ b/ui/server.py @@ -15,14 +15,24 @@ MODELS_DIR = os.path.abspath(os.path.join(SD_DIR, '..', 'models')) OUTPUT_DIRNAME = "Stable Diffusion UI" # in the user's home folder TASK_TTL = 15 * 60 # Discard last session's task timeout +APP_CONFIG_DEFAULTS = { + # auto: selects the cuda device with the most free memory, cuda: use the currently active cuda device. + 'render_devices': ['auto'], # ['cuda'] or ['CPU', 'GPU:0', 'GPU:1', ...] or ['cpu'] + 'update_branch': 'main', +} +APP_CONFIG_DEFAULT_MODELS = [ + # needed to support the legacy installations + 'custom-model', # Check if user has a custom model, use it first. + 'sd-v1-4', # Default fallback. 
+] from fastapi import FastAPI, HTTPException from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, JSONResponse, StreamingResponse from pydantic import BaseModel import logging -import queue, threading, time -from typing import Any, Generator, Hashable, Optional, Union +#import queue, threading, time +from typing import Any, Generator, Hashable, List, Optional, Union from sd_internal import Request, Response, task_manager @@ -37,52 +47,173 @@ ACCESS_LOG_SUPPRESS_PATH_PREFIXES = ['/ping', '/image', '/modifier-thumbnails'] NOCACHE_HEADERS={"Cache-Control": "no-cache, no-store, must-revalidate", "Pragma": "no-cache", "Expires": "0"} app.mount('/media', StaticFiles(directory=os.path.join(SD_UI_DIR, 'media/')), name="media") +config_cached = None +config_last_mod_time = 0 +def getConfig(default_val=APP_CONFIG_DEFAULTS): + global config_cached, config_last_mod_time + try: + config_json_path = os.path.join(CONFIG_DIR, 'config.json') + if not os.path.exists(config_json_path): + return default_val + if config_last_mod_time > 0 and config_cached is not None: + # Don't read if file was not modified + mtime = os.path.getmtime(config_json_path) + if mtime <= config_last_mod_time: + return config_cached + with open(config_json_path, 'r') as f: + config_cached = json.load(f) + config_last_mod_time = os.path.getmtime(config_json_path) + return config_cached + except Exception as e: + print(str(e)) + print(traceback.format_exc()) + return default_val + +def setConfig(config): + try: # config.json + config_json_path = os.path.join(CONFIG_DIR, 'config.json') + with open(config_json_path, 'w') as f: + return json.dump(config, f) + except: + print(traceback.format_exc()) + + if 'render_devices' in config: + gpu_devices = filter(lambda dev: dev.startswith('GPU:'), config['render_devices']) + else: + gpu_devices = [] + + try: # config.bat + config_bat = [ + f"@set update_branch={config['update_branch']}" + ] + if len(gpu_devices) > 0: + config_sh.append(f"@set CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") + config_bat_path = os.path.join(CONFIG_DIR, 'config.bat') + with open(config_bat_path, 'w') as f: + f.write(f.write('\r\n'.join(config_bat))) + except Exception as e: + print(traceback.format_exc()) + + try: # config.sh + config_sh = [ + '#!/bin/bash' + f"export update_branch={config['update_branch']}" + ] + if len(gpu_devices) > 0: + config_sh.append(f"CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") + config_sh_path = os.path.join(CONFIG_DIR, 'config.sh') + with open(config_sh_path, 'w') as f: + f.write('\n'.join(config_sh)) + except Exception as e: + print(traceback.format_exc()) + +def resolve_model_to_use(model_name:str=None): + if not model_name: # When None try user configured model. + config = getConfig() + if 'model' in config and 'stable-diffusion' in config['model']: + model_name = config['model']['stable-diffusion'] + if model_name: + if os.path.exists(model_name + '.ckpt'): + # Direct Path to file + return model_name + # Check models directory + models_dir_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name) + if os.path.exists(models_dir_path + '.ckpt'): + return models_dir_path + # Default locations + if model_name in APP_CONFIG_DEFAULT_MODELS: + default_model_path = os.path.join(SD_DIR, model_name) + if os.path.exists(default_model_path + '.ckpt'): + return default_model_path + # Can't find requested model, check the default paths. 
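+    # Lookup order above: direct path, then MODELS_DIR/stable-diffusion/<name>,
+    # then the legacy SD_DIR copy of a bundled default. From here the requested
+    # name is abandoned and the first default model found on disk wins.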
+ for default_model in APP_CONFIG_DEFAULT_MODELS: + default_model_path = os.path.join(SD_DIR, default_model + '.ckpt') + if os.path.exists(default_model_path): + print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', default_model_path + '.ckpt') + return default_model_path + raise Exception('No valid models found.') + class SetAppConfigRequest(BaseModel): - update_branch: str = "main" + update_branch: str = None + render_devices: Union[List[str], List[int], str, int] = None -# needs to support the legacy installations -def get_initial_model_to_load(): +@app.post('/app_config') +async def setAppConfig(req : SetAppConfigRequest): + config = getConfig() + if req.update_branch: + config['update_branch'] = req.update_branch + if req.render_devices and hasattr(req.render_devices, "__len__"): # strings, array of strings or numbers. + render_devices = [] + if isinstance(req.render_devices, str): + req.render_devices = req.render_devices.split(',') + if isinstance(req.render_devices, list): + for gpu in req.render_devices: + if isinstance(req.render_devices, int): + render_devices.append('GPU:' + gpu) + else: + render_devices.append(gpu) + if isinstance(req.render_devices, int): + render_devices.append('GPU:' + req.render_devices) + if len(render_devices) > 0: + config['render_devices'] = render_devices + try: + setConfig(config) + return JSONResponse({'status': 'OK'}, headers=NOCACHE_HEADERS) + except Exception as e: + print(traceback.format_exc()) + return HTTPException(status_code=500, detail=str(e)) + +def getModels(): + models = { + 'active': { + 'stable-diffusion': 'sd-v1-4', + }, + 'options': { + 'stable-diffusion': ['sd-v1-4'], + }, + } + + # custom models + sd_models_dir = os.path.join(MODELS_DIR, 'stable-diffusion') + for file in os.listdir(sd_models_dir): + if file.endswith('.ckpt'): + model_name = os.path.splitext(file)[0] + models['options']['stable-diffusion'].append(model_name) + + # legacy custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt') - ckpt_to_use = "sd-v1-4" if not os.path.exists(custom_weight_path) else "custom-model" - - ckpt_to_use = os.path.join(SD_DIR, ckpt_to_use) + if os.path.exists(custom_weight_path): + models['active']['stable-diffusion'] = 'custom-model' + models['options']['stable-diffusion'].append('custom-model') config = getConfig() if 'model' in config and 'stable-diffusion' in config['model']: - model_name = config['model']['stable-diffusion'] - model_path = resolve_model_to_use(model_name) + models['active']['stable-diffusion'] = config['model']['stable-diffusion'] - if os.path.exists(model_path + '.ckpt'): - ckpt_to_use = model_path - else: - print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', ckpt_to_use + '.ckpt') - return ckpt_to_use + return models -def resolve_model_to_use(model_name): - if model_name in ('sd-v1-4', 'custom-model'): - model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name) - - legacy_model_path = os.path.join(SD_DIR, model_name) - if not os.path.exists(model_path + '.ckpt') and os.path.exists(legacy_model_path + '.ckpt'): - model_path = legacy_model_path +@app.get('/get/{key:path}') +def read_web_data(key:str=None): + if not key: # /get without parameters, stable-diffusion easter egg. 
+ return HTTPException(status_code=418, detail="StableDiffusion is drawing a teapot!") # HTTP418 I'm a teapot + elif key == 'app_config': + config = getConfig(default_val=None) + if config is None: + return HTTPException(status_code=500, detail="Config file is missing or unreadable") + return JSONResponse(config, headers=NOCACHE_HEADERS) + elif key == 'models': + return JSONResponse(getModels(), headers=NOCACHE_HEADERS) + elif key == 'modifiers': return FileResponse(os.path.join(SD_UI_DIR, 'modifiers.json'), headers=NOCACHE_HEADERS) + elif key == 'output_dir': return JSONResponse({ 'output_dir': outpath }, headers=NOCACHE_HEADERS) else: - model_path = os.path.join(MODELS_DIR, 'stable-diffusion', model_name) - return model_path - -@app.on_event("shutdown") -def shutdown_event(): # Signal render thread to close on shutdown - task_manager.current_state_error = SystemExit('Application shutting down.') - -@app.get('/') -def read_root(): - return FileResponse(os.path.join(SD_UI_DIR, 'index.html'), headers=NOCACHE_HEADERS) + return HTTPException(status_code=404, detail=f'Request for unknown {key}') # HTTP404 Not Found @app.get('/ping') # Get server and optionally session status. def ping(session_id:str=None): - if not task_manager.render_thread.is_alive(): # Render thread is dead. - if task_manager.current_state_error: return HTTPException(status_code=500, detail=str(current_state_error)) + if task_manager.is_alive() <= 0: # Check that render threads are alive. + if task_manager.current_state_error: return HTTPException(status_code=500, detail=str(task_manager.current_state_error)) return HTTPException(status_code=500, detail='Render thread is dead.') - if task_manager.current_state_error and not isinstance(task_manager.current_state_error, StopAsyncIteration): return HTTPException(status_code=500, detail=str(current_state_error)) + if task_manager.current_state_error and not isinstance(task_manager.current_state_error, StopAsyncIteration): return HTTPException(status_code=500, detail=str(task_manager.current_state_error)) # Alive response = {'status': str(task_manager.current_state)} if session_id: @@ -119,7 +250,7 @@ def render(req : task_manager.ImageRequest): new_task = task_manager.render(req) response = { 'status': str(task_manager.current_state), - 'queue': task_manager.tasks_queue.qsize(), + 'queue': len(task_manager.tasks_queue), 'stream': f'/image/stream/{req.session_id}/{id(new_task)}', 'task': id(new_task) } @@ -172,100 +303,13 @@ def get_image(session_id, img_id): except KeyError as e: return HTTPException(status_code=500, detail=str(e)) -@app.post('/app_config') -async def setAppConfig(req : SetAppConfigRequest): - try: - config = { - 'update_branch': req.update_branch - } +@app.get('/') +def read_root(): + return FileResponse(os.path.join(SD_UI_DIR, 'index.html'), headers=NOCACHE_HEADERS) - config_json_str = json.dumps(config) - config_bat_str = f'@set update_branch={req.update_branch}' - config_sh_str = f'export update_branch={req.update_branch}' - - config_json_path = os.path.join(CONFIG_DIR, 'config.json') - config_bat_path = os.path.join(CONFIG_DIR, 'config.bat') - config_sh_path = os.path.join(CONFIG_DIR, 'config.sh') - - with open(config_json_path, 'w') as f: - f.write(config_json_str) - - with open(config_bat_path, 'w') as f: - f.write(config_bat_str) - - with open(config_sh_path, 'w') as f: - f.write(config_sh_str) - - return {'OK'} - except Exception as e: - print(traceback.format_exc()) - return HTTPException(status_code=500, detail=str(e)) - -def 
getConfig(default_val={}): - try: - config_json_path = os.path.join(CONFIG_DIR, 'config.json') - if not os.path.exists(config_json_path): - return default_val - with open(config_json_path, 'r') as f: - return json.load(f) - except Exception as e: - print(str(e)) - print(traceback.format_exc()) - return default_val - -def setConfig(config): - try: - config_json_path = os.path.join(CONFIG_DIR, 'config.json') - with open(config_json_path, 'w') as f: - return json.dump(config, f) - except: - print(str(e)) - print(traceback.format_exc()) - -def getModels(): - models = { - 'active': { - 'stable-diffusion': 'sd-v1-4', - }, - 'options': { - 'stable-diffusion': ['sd-v1-4'], - }, - } - - # custom models - sd_models_dir = os.path.join(MODELS_DIR, 'stable-diffusion') - for file in os.listdir(sd_models_dir): - if file.endswith('.ckpt'): - model_name = os.path.splitext(file)[0] - models['options']['stable-diffusion'].append(model_name) - - # legacy - custom_weight_path = os.path.join(SD_DIR, 'custom-model.ckpt') - if os.path.exists(custom_weight_path): - models['active']['stable-diffusion'] = 'custom-model' - models['options']['stable-diffusion'].append('custom-model') - - config = getConfig() - if 'model' in config and 'stable-diffusion' in config['model']: - models['active']['stable-diffusion'] = config['model']['stable-diffusion'] - - return models - -@app.get('/get/{key:path}') -def read_web_data(key:str=None): - if not key: # /get without parameters, stable-diffusion easter egg. - return HTTPException(status_code=418, detail="StableDiffusion is drawing a teapot!") # HTTP418 I'm a teapot - elif key == 'app_config': - config = getConfig(default_val=None) - if config is None: - return HTTPException(status_code=500, detail="Config file is missing or unreadable") - return JSONResponse(config, headers=NOCACHE_HEADERS) - elif key == 'models': - return JSONResponse(getModels(), headers=NOCACHE_HEADERS) - elif key == 'modifiers': return FileResponse(os.path.join(SD_UI_DIR, 'modifiers.json'), headers=NOCACHE_HEADERS) - elif key == 'output_dir': return JSONResponse({ 'output_dir': outpath }, headers=NOCACHE_HEADERS) - else: - return HTTPException(status_code=404, detail=f'Request for unknown {key}') # HTTP404 Not Found +@app.on_event("shutdown") +def shutdown_event(): # Signal render thread to close on shutdown + task_manager.current_state_error = SystemExit('Application shutting down.') # don't log certain requests class LogSuppressFilter(logging.Filter): @@ -277,8 +321,26 @@ class LogSuppressFilter(logging.Filter): return True logging.getLogger('uvicorn.access').addFilter(LogSuppressFilter()) -task_manager.default_model_to_load = get_initial_model_to_load() -task_manager.start_render_thread() +config = getConfig() +# Start the task_manager +task_manager.default_model_to_load = resolve_model_to_use() +if 'render_devices' in config: # Start a new thread for each device. + if isinstance(config['render_devices'], str): + config['render_devices'] = config['render_devices'].split(',') + if not isinstance(config['render_devices'], list): + raise Exception('Invalid render_devices value in config.') + for device in config['render_devices']: + task_manager.start_render_thread(device) + +allow_cpu = False +if task_manager.is_alive() <= 0: # No running devices, apply defaults. + # Select best device GPU device using free memory if more than one device. + task_manager.start_render_thread('auto') + allow_cpu = True + +# Allow CPU to be used for renders if not already enabled in current config. 
+if task_manager.is_alive('cpu') <= 0 and allow_cpu: + task_manager.start_render_thread('cpu') # start the browser ui import webbrowser; webbrowser.open('http://localhost:9000') \ No newline at end of file From 994d62ac650efe63c5411520d80f61dc1be981e0 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 16 Oct 2022 22:24:26 -0400 Subject: [PATCH 002/221] Added a clear error message when targeting CPU if not enabled in config. --- ui/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/server.py b/ui/server.py index 2592a8c0..64fbbeca 100644 --- a/ui/server.py +++ b/ui/server.py @@ -244,6 +244,7 @@ def save_model_to_config(model_name): @app.post('/render') def render(req : task_manager.ImageRequest): + if req.use_cpu and task_manager.is_alive('cpu') <= 0: return HTTPException(status_code=403, detail=f'CPU rendering is not enabled in config.json or the thread has died...') # HTTP403 Forbidden try: save_model_to_config(req.use_stable_diffusion_model) req.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model) From 41bfb96b6b20d06b8b721d952369e12409b4c4ac Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 16 Oct 2022 23:06:41 -0400 Subject: [PATCH 003/221] Fixed bug in task_manager.is_alive and added way to check for first device. --- ui/sd_internal/runtime.py | 10 ++++------ ui/sd_internal/task_manager.py | 10 +++++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 81baa103..e9131b4d 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -149,12 +149,10 @@ def device_init(device_selection=None): thread_data.device = 'cpu' def is_first_cuda_device(device): - if thread_data.device == 0 or thread_data.device == '0': - return True - if thread_data.device == 'cuda' or thread_data.device == 'cuda:0': - return True - if thread_data.device == torch.device(0): - return True + if device is None: return False + if device == 0 or device == '0': return True + if device == 'cuda' or device == 'cuda:0': return True + if device == torch.device(0): return True return False def load_model_ckpt(): diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 75e1d262..8e61b7ea 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -9,6 +9,7 @@ from typing import Any, Generator, Hashable, Optional, Union from pydantic import BaseModel from sd_internal import Request, Response +THREAD_NAME_PREFIX = 'Runtime-Render/' ERR_LOCK_FAILED = ' failed to acquire lock within timeout.' LOCK_TIMEOUT = 15 # Maximum locking time in seconds before failing a task. # It's better to get an exception than a deadlock... ALWAYS use timeout in critical paths. @@ -285,12 +286,15 @@ def thread_render(device): current_state = ServerStates.Online def is_alive(name=None): + from . import runtime # When calling runtime from here DO NOT USE thread specific attributes or functions. 
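+    # Render threads are named 'Runtime-Render/<device>'; matching below is done
+    # on the name suffix after THREAD_NAME_PREFIX.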
if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('is_alive' + ERR_LOCK_FAILED) nbr_alive = 0 try: for rthread in render_threads: - if name and not rthread.name.endswith(name): - continue + thread_name = rthread.name[len(THREAD_NAME_PREFIX):] + if name and thread_name != name: + if not runtime.is_first_cuda_device(name) and not runtime.is_first_cuda_device(thread_name): + continue if rthread.is_alive(): nbr_alive += 1 return nbr_alive @@ -303,7 +307,7 @@ def start_render_thread(device='auto'): try: rthread = threading.Thread(target=thread_render, kwargs={'device': device}) rthread.daemon = True - rthread.name = 'Runner/' + device + rthread.name = THREAD_NAME_PREFIX + device rthread.start() render_threads.append(rthread) finally: From 1d4c5cc96ff957c76c7c34f460b5f239ca720ac6 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 16 Oct 2022 23:07:55 -0400 Subject: [PATCH 004/221] Added clear error response when submitting tasks that requires GFPGANer if cuda:0 and cpu rendering is disabled. --- ui/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ui/server.py b/ui/server.py index 64fbbeca..55786906 100644 --- a/ui/server.py +++ b/ui/server.py @@ -245,6 +245,8 @@ def save_model_to_config(model_name): @app.post('/render') def render(req : task_manager.ImageRequest): if req.use_cpu and task_manager.is_alive('cpu') <= 0: return HTTPException(status_code=403, detail=f'CPU rendering is not enabled in config.json or the thread has died...') # HTTP403 Forbidden + if req.use_face_correction and task_manager.is_alive(0) <= 0 and task_manager.is_alive('cpu') <= 0: #TODO Remove when GFPGANer is fixed upstream. + return HTTPException(status_code=412, detail=f'GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') # HTTP412 Precondition Failed try: save_model_to_config(req.use_stable_diffusion_model) req.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model) From d4a348a2b2ac64530b4435873a69bebe2ee897e0 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 16 Oct 2022 23:12:46 -0400 Subject: [PATCH 005/221] Process GFPGANer on cuda:0 when possible, otherwise use cpu. --- ui/sd_internal/task_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 8e61b7ea..4136f7f0 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -217,7 +217,8 @@ def thread_render(device): if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': continue # CPU Tasks if queued_task.request.use_face_correction and not runtime.is_first_cuda_device(runtime.thread_data.device): - continue #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + if not runtime.thread_data.device == 'cpu' and is_alive(0) > 0: # Allows GFPGANer on cuda:0 and use cpu only when cuda:0 is not available. + continue #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. task = queued_task break if task is not None: From 012243a8804d3a122d7fc66b28e1ff898dd0bcbf Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 01:05:27 -0400 Subject: [PATCH 006/221] Process GPU tasks on CPU when there are no cuda devices at all. 
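The queue is no longer a plain FIFO pop: each worker scans for the first
task its device is allowed to take. CPU tasks wait for the CPU worker,
GPU tasks stay off the CPU worker unless it is the only worker left
alive, and tasks needing GFPGANer are steered to cuda:0, forced onto the
CPU, or stripped of the filter when neither is available. A rough
distillation of the selection rules as a standalone predicate (a sketch
for illustration only; should_take() is not part of the patch, and the
*_alive counts come from is_alive()):

    def should_take(task, device, cpu_alive, gpu0_alive, total_alive):
        if task.use_face_correction and device != 'cpu':
            # GFPGANer is pinned to the first cuda device for now.
            if gpu0_alive and not is_first_cuda_device(device):
                return False
        if task.use_cpu and device != 'cpu' and cpu_alive > 0:
            return False  # CPU task: leave it for the CPU worker.
        if not task.use_cpu and device == 'cpu' and total_alive > 1:
            return False  # GPU task: CPU takes it only as the last worker alive.
        return True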
--- ui/sd_internal/task_manager.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 4136f7f0..55c6320c 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -157,7 +157,7 @@ class TaskCache(): finally: self._lock.release() -manager_lock = threading.Lock() +manager_lock = threading.RLock() render_threads = [] current_state = ServerStates.Init current_state_error:Exception = None @@ -212,13 +212,26 @@ def thread_render(device): continue try: # Select a render task. for queued_task in tasks_queue: - if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu': - continue # Cuda Tasks - if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': - continue # CPU Tasks - if queued_task.request.use_face_correction and not runtime.is_first_cuda_device(runtime.thread_data.device): - if not runtime.thread_data.device == 'cpu' and is_alive(0) > 0: # Allows GFPGANer on cuda:0 and use cpu only when cuda:0 is not available. - continue #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + cpu_alive = is_alive('cpu') + if queued_task.request.use_face_correction: #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + # Allows GFPGANer on cuda:0 and use cpu only when cuda:0 is not available. + first_device_alive = True if is_alive(0) >= 1 else False + if cpu_alive <= 0 and not first_device_alive: + queued_task.request.use_face_correction = False + print('cuda:0 and cpu are not available with the current config. Removed GFPGANer filter to run task.') + continue + if not queued_task.request.use_cpu: + if first_device_alive: + if not runtime.is_first_cuda_device(runtime.thread_data.device): + continue # Wait for cuda:0 + elif cpu_alive > 0: + print('cuda:0 is not available with the current config. Forcing task requiring GFPGANer to cpu.') + queued_task.request.use_cpu = True + continue + if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu' and cpu_alive > 0: + continue # CPU Tasks, Skip GPU device + if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu' and is_alive() > 1: # cpu is alive, so need more than one. + continue # GPU Tasks, don't run on CPU unless there is nothing else. task = queued_task break if task is not None: From 554b67a2f03e4a535d9e007b99f46fb1f7f0c91b Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 01:05:51 -0400 Subject: [PATCH 007/221] Fixing bug in is_alive. 
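is_alive() compared thread names against the queried device, but several spellings all mean the first CUDA device, so a running cuda:0 thread could be reported dead. is_first_cuda_device() now recognizes every alias, and is_alive() lowercases the thread name and checks the alias match on both sides before falling back to exact comparison. Illustration of the accepted aliases (assuming the import path matches this tree's layout):

    from sd_internal.runtime import is_first_cuda_device

    for alias in (0, '0', 'cuda', 'cuda:0', 'gpu', 'gpu:0', 'current'):
        assert is_first_cuda_device(alias)
    assert not is_first_cuda_device('cuda:1')
    assert not is_first_cuda_device(None)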
--- ui/sd_internal/runtime.py | 4 +++- ui/sd_internal/task_manager.py | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index e9131b4d..5756aaf2 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -130,7 +130,7 @@ def device_init(device_selection=None): device_selection = device_selection.lower() if device_selection.startswith('gpu:'): device_selection = int(device_selection[4:]) - if device_selection != 'cuda' and device_selection != 'current': + if device_selection != 'cuda' and device_selection != 'current' and device_selection != 'gpu': if device_select(device_selection): if isinstance(device_selection, int): print(f'Setting GPU:{device_selection} as active') @@ -152,6 +152,8 @@ def is_first_cuda_device(device): if device is None: return False if device == 0 or device == '0': return True if device == 'cuda' or device == 'cuda:0': return True + if device == 'gpu' or device == 'gpu:0': return True + if device == 'current': return True if device == torch.device(0): return True return False diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 55c6320c..a47e4263 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -305,9 +305,12 @@ def is_alive(name=None): nbr_alive = 0 try: for rthread in render_threads: - thread_name = rthread.name[len(THREAD_NAME_PREFIX):] - if name and thread_name != name: - if not runtime.is_first_cuda_device(name) and not runtime.is_first_cuda_device(thread_name): + thread_name = rthread.name[len(THREAD_NAME_PREFIX):].lower() + if name is not None: + if runtime.is_first_cuda_device(name): + if not runtime.is_first_cuda_device(thread_name): + continue + elif thread_name != name: continue if rthread.is_alive(): nbr_alive += 1 From c92129ac632ae8f245726b1c8dbb743424cfc411 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 02:27:30 -0400 Subject: [PATCH 008/221] Improved detection of missing cuda:0 and added warning to console about how to fix. --- ui/sd_internal/runtime.py | 2 +- ui/sd_internal/task_manager.py | 9 ++++++--- ui/server.py | 29 ++++++++++++++++++++++++----- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 5756aaf2..0a667417 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -130,7 +130,7 @@ def device_init(device_selection=None): device_selection = device_selection.lower() if device_selection.startswith('gpu:'): device_selection = int(device_selection[4:]) - if device_selection != 'cuda' and device_selection != 'current' and device_selection != 'gpu': + if device_selection != 'current' and device_selection != 'gpu': if device_select(device_selection): if isinstance(device_selection, int): print(f'Setting GPU:{device_selection} as active') diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index a47e4263..22146c4d 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -299,16 +299,19 @@ def thread_render(device): print(f'Session {task.request.session_id} task {id(task)} completed.') current_state = ServerStates.Online +def is_first_cuda_device(device): + from . import runtime # When calling runtime from outside thread_render DO NOT USE thread specific attributes or functions. + return runtime.is_first_cuda_device(device) + def is_alive(name=None): - from . 
import runtime # When calling runtime from here DO NOT USE thread specific attributes or functions. if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): raise Exception('is_alive' + ERR_LOCK_FAILED) nbr_alive = 0 try: for rthread in render_threads: thread_name = rthread.name[len(THREAD_NAME_PREFIX):].lower() if name is not None: - if runtime.is_first_cuda_device(name): - if not runtime.is_first_cuda_device(thread_name): + if is_first_cuda_device(name): + if not is_first_cuda_device(thread_name): continue elif thread_name != name: continue diff --git a/ui/server.py b/ui/server.py index 55786906..8c64c59b 100644 --- a/ui/server.py +++ b/ui/server.py @@ -78,16 +78,27 @@ def setConfig(config): print(traceback.format_exc()) if 'render_devices' in config: - gpu_devices = filter(lambda dev: dev.startswith('GPU:'), config['render_devices']) + gpu_devices = filter(lambda dev: dev.lower().startswith('gpu') or dev.lower().startswith('cuda'), config['render_devices']) else: gpu_devices = [] + has_first_cuda_device = False + for device in gpu_devices: + if not task_manager.is_first_cuda_device(device): continue + has_first_cuda_device = True + break + if len(gpu_devices) > 0 and not has_first_cuda_device: + print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') + try: # config.bat config_bat = [ f"@set update_branch={config['update_branch']}" ] - if len(gpu_devices) > 0: - config_sh.append(f"@set CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") + if len(gpu_devices) > 0 and not has_first_cuda_device: + config_sh.append('::Set the devices visible inside SD-UI here') + config_bat.append(f"::@set CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") # Needs better detection for edge cases, add as a comment for now. + print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') config_bat_path = os.path.join(CONFIG_DIR, 'config.bat') with open(config_bat_path, 'w') as f: f.write(f.write('\r\n'.join(config_bat))) @@ -99,8 +110,10 @@ def setConfig(config): '#!/bin/bash' f"export update_branch={config['update_branch']}" ] - if len(gpu_devices) > 0: - config_sh.append(f"CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") + if len(gpu_devices) > 0 and not has_first_cuda_device: + config_sh.append('#Set the devices visible inside SD-UI here') + config_sh.append(f"#CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") # Needs better detection for edge cases, add as a comment for now. + print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') config_sh_path = os.path.join(CONFIG_DIR, 'config.sh') with open(config_sh_path, 'w') as f: f.write('\n'.join(config_sh)) @@ -345,5 +358,11 @@ if task_manager.is_alive() <= 0: # No running devices, apply defaults. if task_manager.is_alive('cpu') <= 0 and allow_cpu: task_manager.start_render_thread('cpu') +if task_manager.is_alive(0) <= 0: # Missing cuda:0, warn the user. 
+ print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') + print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') + print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') + # start the browser ui import webbrowser; webbrowser.open('http://localhost:9000') \ No newline at end of file From 87f93b34a30b0411d345ebf6210617f32b739425 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 14:44:53 -0400 Subject: [PATCH 009/221] Fixed a typo when adding a comment. --- ui/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/server.py b/ui/server.py index 8c64c59b..96a49737 100644 --- a/ui/server.py +++ b/ui/server.py @@ -96,7 +96,7 @@ def setConfig(config): f"@set update_branch={config['update_branch']}" ] if len(gpu_devices) > 0 and not has_first_cuda_device: - config_sh.append('::Set the devices visible inside SD-UI here') + config_bat.append('::Set the devices visible inside SD-UI here') config_bat.append(f"::@set CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") # Needs better detection for edge cases, add as a comment for now. print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') config_bat_path = os.path.join(CONFIG_DIR, 'config.bat') From 4d3358ba6601c86eaa610c7d8f69d6ce2b57d86a Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 21:27:15 -0400 Subject: [PATCH 010/221] Fixed file path bugs introduced by mistake and made img_id sequential based on time for better sorting of renders. --- ui/sd_internal/runtime.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 0a667417..bb15e588 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -274,7 +274,7 @@ def load_model_real_esrgan(): thread_data.model_real_esrgan.model.name = thread_data.real_esrgan_file print('loaded ', thread_data.real_esrgan_file, 'to', thread_data.model_real_esrgan.device, 'precision', thread_data.precision) -def get_base_path(disk_path, session_id, prompt, ext, suffix=None): +def get_base_path(disk_path, session_id, prompt, img_id, ext, suffix=None): if disk_path is None: return None if session_id is None: return None if ext is None: raise Exception('Missing ext') @@ -283,7 +283,6 @@ def get_base_path(disk_path, session_id, prompt, ext, suffix=None): os.makedirs(session_out_path, exist_ok=True) prompt_flattened = filename_regex.sub('_', prompt)[:50] - img_id = str(uuid.uuid4())[-8:] if suffix is not None: return os.path.join(session_out_path, f"{prompt_flattened}_{img_id}_{suffix}.{ext}") @@ -391,6 +390,8 @@ def do_mk_img(req: Request): opt_f = 8 opt_ddim_eta = 0.0 opt_init_img = req.init_image + img_id = base64.b64encode(int(time.time()).to_bytes(8, 'big')).decode() # Generate unique ID based on time. + img_id = img_id.translate({43:None, 47:None, 61:None})[-8:] # Remove + / = and keep last 8 chars. 
print(req.to_string(), '\n device', thread_data.device) print('\n\n Using precision:', thread_data.precision) @@ -548,9 +549,9 @@ def do_mk_img(req: Request): if req.save_to_disk_path is not None: if return_orig_img: - img_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], req.output_format) + img_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], img_id, req.output_format) save_image(img, img_out_path) - meta_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], 'txt') + meta_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], img_id, 'txt') save_metadata(meta_out_path, req, prompts[0], opt_seed) if return_orig_img: @@ -577,7 +578,7 @@ def do_mk_img(req: Request): response_image = ResponseImage(data=filtered_img_data, seed=req.seed) res.images.append(response_image) if req.save_to_disk_path is not None: - filtered_img_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], req.output_format, "_".join(filters_applied)) + filtered_img_out_path = get_base_path(req.save_to_disk_path, req.session_id, prompts[0], img_id, req.output_format, "_".join(filters_applied)) save_image(filtered_image, filtered_img_out_path) response_image.path_abs = filtered_img_out_path del filtered_image From 578b3ba4f4f18ceaa9a23e0cc0eadfb4d99a6a99 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 17 Oct 2022 23:15:36 -0400 Subject: [PATCH 011/221] Force encoding to utf-8 on text file operations Fixes #332 # Conflicts: # ui/server.py --- ui/sd_internal/runtime.py | 2 +- ui/server.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index bb15e588..7b56a3fd 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -616,7 +616,7 @@ Negative Prompt: {req.negative_prompt} Stable Diffusion model: {req.use_stable_diffusion_model + '.ckpt'} ''' try: - with open(meta_out_path, 'w') as f: + with open(meta_out_path, 'w', encoding='utf-8') as f: f.write(metadata) except: print('could not save the file', traceback.format_exc()) diff --git a/ui/server.py b/ui/server.py index 96a49737..42ae3512 100644 --- a/ui/server.py +++ b/ui/server.py @@ -60,7 +60,7 @@ def getConfig(default_val=APP_CONFIG_DEFAULTS): mtime = os.path.getmtime(config_json_path) if mtime <= config_last_mod_time: return config_cached - with open(config_json_path, 'r') as f: + with open(config_json_path, 'r', encoding='utf-8') as f: config_cached = json.load(f) config_last_mod_time = os.path.getmtime(config_json_path) return config_cached @@ -72,7 +72,7 @@ def getConfig(default_val=APP_CONFIG_DEFAULTS): def setConfig(config): try: # config.json config_json_path = os.path.join(CONFIG_DIR, 'config.json') - with open(config_json_path, 'w') as f: + with open(config_json_path, 'w', encoding='utf-8') as f: return json.dump(config, f) except: print(traceback.format_exc()) @@ -100,7 +100,7 @@ def setConfig(config): config_bat.append(f"::@set CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") # Needs better detection for edge cases, add as a comment for now. 
print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') config_bat_path = os.path.join(CONFIG_DIR, 'config.bat') - with open(config_bat_path, 'w') as f: + with open(config_bat_path, 'w', encoding='utf-8') as f: f.write(f.write('\r\n'.join(config_bat))) except Exception as e: print(traceback.format_exc()) @@ -115,7 +115,7 @@ def setConfig(config): config_sh.append(f"#CUDA_VISIBLE_DEVICES={','.join(gpu_devices)}") # Needs better detection for edge cases, add as a comment for now. print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') config_sh_path = os.path.join(CONFIG_DIR, 'config.sh') - with open(config_sh_path, 'w') as f: + with open(config_sh_path, 'w', encoding='utf-8') as f: f.write('\n'.join(config_sh)) except Exception as e: print(traceback.format_exc()) From 5e461e9b6b707d80716e27f6d01106ac39dafe4b Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Tue, 18 Oct 2022 13:21:15 -0400 Subject: [PATCH 012/221] Fixed is_alive with render_threads that can update the device name after starting. --- ui/sd_internal/task_manager.py | 14 ++++++++++-- ui/server.py | 40 +++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 52a6b183..917387fe 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -3,7 +3,7 @@ import traceback TASK_TTL = 15 * 60 # Discard last session's task timeout -import queue, threading, time +import queue, threading, time, weakref from typing import Any, Generator, Hashable, Optional, Union from pydantic import BaseModel @@ -165,6 +165,7 @@ current_model_path = None tasks_queue = [] task_cache = TaskCache() default_model_to_load = None +weak_thread_data = weakref.WeakKeyDictionary() def preload_model(file_path=None): global current_state, current_state_error, current_model_path @@ -189,11 +190,17 @@ def preload_model(file_path=None): def thread_render(device): global current_state, current_state_error, current_model_path from . import runtime + weak_thread_data[threading.current_thread()] = { + 'device': device + } try: runtime.device_init(device) except: print(traceback.format_exc()) return + weak_thread_data[threading.current_thread()] = { + 'device': runtime.thread_data.device + } preload_model() current_state = ServerStates.Online while True: @@ -308,8 +315,11 @@ def is_alive(name=None): nbr_alive = 0 try: for rthread in render_threads: - thread_name = rthread.name[len(THREAD_NAME_PREFIX):].lower() if name is not None: + weak_data = weak_thread_data.get(rthread) + if weak_data is None or weak_data['device'] is None: + continue + thread_name = str(weak_data['device']).lower() if is_first_cuda_device(name): if not is_first_cuda_device(thread_name): continue diff --git a/ui/server.py b/ui/server.py index c913364f..acb5719b 100644 --- a/ui/server.py +++ b/ui/server.py @@ -26,6 +26,7 @@ APP_CONFIG_DEFAULT_MODELS = [ 'sd-v1-4', # Default fallback. ] +import asyncio from fastapi import FastAPI, HTTPException from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, JSONResponse, StreamingResponse @@ -36,6 +37,7 @@ from typing import Any, Generator, Hashable, List, Optional, Union from sd_internal import Request, Response, task_manager +LOOP = asyncio.get_event_loop() app = FastAPI() modifiers_cache = None @@ -348,21 +350,29 @@ if 'render_devices' in config: # Start a new thread for each device. 
for device in config['render_devices']: task_manager.start_render_thread(device) -allow_cpu = False -if task_manager.is_alive() <= 0: # No running devices, apply defaults. - # Select best device GPU device using free memory if more than one device. - task_manager.start_render_thread('auto') - allow_cpu = True - -# Allow CPU to be used for renders if not already enabled in current config. -if task_manager.is_alive('cpu') <= 0 and allow_cpu: - task_manager.start_render_thread('cpu') - -if task_manager.is_alive(0) <= 0: # Missing cuda:0, warn the user. - print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') - print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') - print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') - print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') +async def check_status(): + device_count = 0 + for i in range(10): # Wait for devices to register and/or change names. + new_count = task_manager.is_alive() + if device_count != new_count: + device_count = new_count + await asyncio.sleep(3) + else: + break; + allow_cpu = False + if task_manager.is_alive() <= 0: # No running devices, apply defaults. + # Select best device GPU device using free memory if more than one device. + task_manager.start_render_thread('auto') + allow_cpu = True + # Allow CPU to be used for renders if not already enabled in current config. + if task_manager.is_alive('cpu') <= 0 and allow_cpu: + task_manager.start_render_thread('cpu') + if not task_manager.is_alive(0) <= 0: + print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') + print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') + print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') +LOOP.create_task(check_status()) # start the browser ui import webbrowser; webbrowser.open('http://localhost:9000') \ No newline at end of file From fcdb086daf56d3525a55c14faf1a7ee3715feb95 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Tue, 18 Oct 2022 20:33:37 -0400 Subject: [PATCH 013/221] Fixed is_alive to work with devices that can change name after init. 
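Threads started with a generic selector ('auto', 'cuda') only know their real device once runtime.device_init() has run, so matching on the thread's name is unreliable. is_alive() now reads the device recorded in weak_thread_data and logs a warning when a registered thread has not stored one yet; startup validation moves into check_status(), which waits for the configured threads to register before deciding whether fallback threads are needed. Rough sketch of the registration life cycle this relies on (init_device is a hypothetical stand-in for runtime.device_init):

    import threading, weakref

    weak_thread_data = weakref.WeakKeyDictionary()

    def init_device(requested: str) -> str:
        # May resolve to a different GPU than the one that was asked for.
        return 'cuda:0' if requested in ('auto', 'cuda') else requested

    def thread_render(device):
        me = threading.current_thread()
        weak_thread_data[me] = {'device': device}    # requested name, e.g. 'auto'
        resolved = init_device(device)
        weak_thread_data[me] = {'device': resolved}  # actual name, e.g. 'cuda:0'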
--- ui/sd_internal/runtime.py | 2 +- ui/sd_internal/task_manager.py | 1 + ui/server.py | 56 +++++++++++++++++++++------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 6637d5cd..f278ec5f 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -130,7 +130,7 @@ def device_init(device_selection=None): device_selection = device_selection.lower() if device_selection.startswith('gpu:'): device_selection = int(device_selection[4:]) - if device_selection != 'current' and device_selection != 'gpu': + if device_selection != 'cuda' and device_selection != 'current' and device_selection != 'gpu': if device_select(device_selection): if isinstance(device_selection, int): print(f'Setting GPU:{device_selection} as active') diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 917387fe..4bd1d549 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -318,6 +318,7 @@ def is_alive(name=None): if name is not None: weak_data = weak_thread_data.get(rthread) if weak_data is None or weak_data['device'] is None: + print('The thread', rthread.name, 'is registered but has no data store in the task manager.') continue thread_name = str(weak_data['device']).lower() if is_first_cuda_device(name): diff --git a/ui/server.py b/ui/server.py index acb5719b..60d37693 100644 --- a/ui/server.py +++ b/ui/server.py @@ -340,6 +340,34 @@ class LogSuppressFilter(logging.Filter): logging.getLogger('uvicorn.access').addFilter(LogSuppressFilter()) config = getConfig() + +async def check_status(): # Task to Validate user config shortly after startup. + # Check that the loaded config.json yielded a server in a known valid state. + # Issues found, try to fix and warn the user. + device_count = 0 + for i in range(10): # Wait for devices to register and/or change names. + new_count = task_manager.is_alive() + if device_count == new_count: break; + device_count = new_count + await asyncio.sleep(3) + + if 'render_devices' in config and task_manager.is_alive() <= 0: # No running devices, probably invalid user config. Try to apply defaults. + task_manager.start_render_thread('auto') # Detect best device for renders + task_manager.start_render_thread('cpu') # Allow CPU to be used for renders + await asyncio.sleep(10) # delay message after thread start. + + display_warning = False + if not 'render_devices' in config and task_manager.is_alive(0) <= 0: # No config set, is on auto mode and without cuda:0 + task_manager.start_render_thread('cuda') # An other cuda device is better and cuda:0 is missing, start it... + display_warning = True # And warn user to update settings... + await asyncio.sleep(10) # delay message after thread start. + + if display_warning or task_manager.is_alive(0) <= 0: + print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') + print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') + print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') + # Start the task_manager task_manager.default_model_to_load = resolve_model_to_use() if 'render_devices' in config: # Start a new thread for each device. @@ -349,29 +377,13 @@ if 'render_devices' in config: # Start a new thread for each device. 
raise Exception('Invalid render_devices value in config.') for device in config['render_devices']: task_manager.start_render_thread(device) +else: + # Select best device GPU device using free memory if more than one device. + #task_manager.start_render_thread('cuda') # Starts silently on cuda:0 + task_manager.start_render_thread('auto') # Detect best device for renders + task_manager.start_render_thread('cpu') # Allow CPU to be used for renders -async def check_status(): - device_count = 0 - for i in range(10): # Wait for devices to register and/or change names. - new_count = task_manager.is_alive() - if device_count != new_count: - device_count = new_count - await asyncio.sleep(3) - else: - break; - allow_cpu = False - if task_manager.is_alive() <= 0: # No running devices, apply defaults. - # Select best device GPU device using free memory if more than one device. - task_manager.start_render_thread('auto') - allow_cpu = True - # Allow CPU to be used for renders if not already enabled in current config. - if task_manager.is_alive('cpu') <= 0 and allow_cpu: - task_manager.start_render_thread('cpu') - if not task_manager.is_alive(0) <= 0: - print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') - print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') - print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') - print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') +# Task to Validate user config shortly after startup. LOOP.create_task(check_status()) # start the browser ui From 53cdeeff037289472c8174d28f7a3c6d87d60b5f Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Tue, 18 Oct 2022 21:08:04 -0400 Subject: [PATCH 014/221] More fixes to devices changing names. 
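With CPU rendering on its own thread, do_mk_img() no longer special-cases req.use_cpu: only the precision check decides whether the models must be reloaded, and the final torch.cuda.memory_allocated() report is skipped on CPU, where it would fail. The reload test reduces to the following (hypothetical helper, extracted here for readability):

    def precision_needs_reload(precision: str, use_full: bool,
                               model_is_half: bool, force_full: bool) -> bool:
        # Mirrors the test that flips between 'full' and 'autocast'.
        if precision == 'autocast' and (use_full or not model_is_half):
            return True
        if precision == 'full' and not use_full and not force_full:
            return True
        return False

check_status() also sleeps before its first is_alive() poll, so the device count is not sampled before any render thread has had a chance to register.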
--- ui/sd_internal/runtime.py | 24 ++++++++---------------- ui/server.py | 9 ++++----- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index f278ec5f..34bb8c7a 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -351,23 +351,14 @@ def do_mk_img(req: Request): thread_data.ckpt_file = req.use_stable_diffusion_model needs_model_reload = True - if req.use_cpu: - if thread_data.device != 'cpu': - thread_data.device = 'cpu' - if thread_data.model_is_half: - load_model_ckpt() - needs_model_reload = False + if thread_data.has_valid_gpu: + if (thread_data.precision == 'autocast' and (req.use_full_precision or not thread_data.model_is_half)) or \ + (thread_data.precision == 'full' and not req.use_full_precision and not thread_data.force_full_precision): + thread_data.precision = 'full' if req.use_full_precision else 'autocast' + load_model_ckpt() load_model_gfpgan() load_model_real_esrgan() - else: - if thread_data.has_valid_gpu: - if (thread_data.precision == 'autocast' and (req.use_full_precision or not thread_data.model_is_half)) or \ - (thread_data.precision == 'full' and not req.use_full_precision and not thread_data.force_full_precision): - thread_data.precision = 'full' if req.use_full_precision else 'autocast' - load_model_ckpt() - load_model_gfpgan() - load_model_real_esrgan() - needs_model_reload = False + needs_model_reload = False if needs_model_reload: load_model_ckpt() @@ -593,7 +584,8 @@ def do_mk_img(req: Request): move_fs_to_cpu() gc() del x_samples, x_samples_ddim, x_sample - print(f'memory_final = {round(torch.cuda.memory_allocated(thread_data.device) / 1e6, 2)}Mo') + if thread_data.device != 'cpu': + print(f'memory_final = {round(torch.cuda.memory_allocated(thread_data.device) / 1e6, 2)}Mo') print('Task completed') diff --git a/ui/server.py b/ui/server.py index 60d37693..68d22ac5 100644 --- a/ui/server.py +++ b/ui/server.py @@ -346,21 +346,21 @@ async def check_status(): # Task to Validate user config shortly after startup. # Issues found, try to fix and warn the user. device_count = 0 for i in range(10): # Wait for devices to register and/or change names. + await asyncio.sleep(3) new_count = task_manager.is_alive() if device_count == new_count: break; device_count = new_count - await asyncio.sleep(3) if 'render_devices' in config and task_manager.is_alive() <= 0: # No running devices, probably invalid user config. Try to apply defaults. task_manager.start_render_thread('auto') # Detect best device for renders task_manager.start_render_thread('cpu') # Allow CPU to be used for renders - await asyncio.sleep(10) # delay message after thread start. + await asyncio.sleep(3) # delay message after thread start. display_warning = False if not 'render_devices' in config and task_manager.is_alive(0) <= 0: # No config set, is on auto mode and without cuda:0 task_manager.start_render_thread('cuda') # An other cuda device is better and cuda:0 is missing, start it... display_warning = True # And warn user to update settings... - await asyncio.sleep(10) # delay message after thread start. + await asyncio.sleep(3) # delay message after thread start. if display_warning or task_manager.is_alive(0) <= 0: print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') @@ -378,8 +378,7 @@ if 'render_devices' in config: # Start a new thread for each device. 
     for device in config['render_devices']:
         task_manager.start_render_thread(device)
 else:
-    # Select best device GPU device using free memory if more than one device.
-    #task_manager.start_render_thread('cuda') # Starts silently on cuda:0
+    # Select best GPU device using free memory, if more than one device.
     task_manager.start_render_thread('auto') # Detect best device for renders
     task_manager.start_render_thread('cpu') # Allow CPU to be used for renders

From 6098b196dc080f5a9ca21f2e2b5f6303648d9d21 Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Tue, 18 Oct 2022 23:58:55 -0400
Subject: [PATCH 015/221] Text headers, comments and better validations.

---
 ui/sd_internal/runtime.py      |  6 ++++++
 ui/sd_internal/task_manager.py | 11 +++++++++++
 ui/server.py                   |  6 +++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py
index 34bb8c7a..9ed8404f 100644
--- a/ui/sd_internal/runtime.py
+++ b/ui/sd_internal/runtime.py
@@ -1,3 +1,9 @@
+"""runtime.py: torch device owned by a thread.
+Notes:
+    Avoid device switching; transferring all models would get too complex.
+    To use a different device, signal the current render device to exit
+    and then start a new clean thread for the new device.
+"""
 import json
 import os, re
 import traceback
diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py
index 4bd1d549..10d66fd4 100644
--- a/ui/sd_internal/task_manager.py
+++ b/ui/sd_internal/task_manager.py
@@ -1,3 +1,9 @@
+"""task_manager.py: manage task dispatching and render threads.
+Notes:
+    render_threads should be the only hard reference held by the manager to the threads.
+    Use weak_thread_data to store all other data using weak keys.
+    This will allow for garbage collection after the thread dies.
+"""
 import json
 import traceback
@@ -340,6 +346,11 @@ def start_render_thread(device='auto'):
         rthread.daemon = True
         rthread.name = THREAD_NAME_PREFIX + device
         rthread.start()
+        timeout = LOCK_TIMEOUT
+        while not rthread.is_alive():
+            if timeout <= 0: raise Exception('render_thread', rthread.name, 'failed to start before timeout or has crashed.')
+            timeout -= 1
+            time.sleep(1)
         render_threads.append(rthread)
     finally:
         manager_lock.release()
diff --git a/ui/server.py b/ui/server.py
index 68d22ac5..2d7f7bd8 100644
--- a/ui/server.py
+++ b/ui/server.py
@@ -1,3 +1,7 @@
+"""server.py: FastAPI SD-UI Web Host.
+Notes:
+    async endpoints always run on the main thread. Otherwise they run on the thread pool.
+"""
 import json
 import traceback

From 3fc66ec525e89464d738ba88b78eefd27ed449b6 Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Wed, 19 Oct 2022 00:27:51 -0400
Subject: [PATCH 016/221] Removed empty lines left over from merge.
--- ui/sd_internal/runtime.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 80d2d35f..7ff2459d 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -289,7 +289,6 @@ def get_base_path(disk_path, session_id, prompt, img_id, ext, suffix=None): os.makedirs(session_out_path, exist_ok=True) prompt_flattened = filename_regex.sub('_', prompt)[:50] - if suffix is not None: return os.path.join(session_out_path, f"{prompt_flattened}_{img_id}_{suffix}.{ext}") @@ -388,7 +387,6 @@ def do_mk_img(req: Request): opt_f = 8 opt_ddim_eta = 0.0 opt_init_img = req.init_image - print(req.to_string(), '\n device', thread_data.device) print('\n\n Using precision:', thread_data.precision) From 21afe077d728a16906eed548fc5399ee3c5e9bca Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 03:02:26 -0400 Subject: [PATCH 017/221] Removed Cpu from the devices allowed to run GFPGANer. Added clear error for the user. --- ui/media/js/main.js | 14 +++++++++----- ui/sd_internal/runtime.py | 8 ++++---- ui/sd_internal/task_manager.py | 30 +++++++++++++++--------------- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/ui/media/js/main.js b/ui/media/js/main.js index 403c1252..d7a30f79 100644 --- a/ui/media/js/main.js +++ b/ui/media/js/main.js @@ -524,12 +524,16 @@ async function doMakeImage(task) { throw new Error('Connexion with server lost.') } } while (serverState.time > (Date.now() - (10 * 1000)) && serverState.task !== renderRequest.task) - if (serverState.session !== 'pending' && serverState.session !== 'running' && serverState.session !== 'buffer') { - if (serverState.session === 'stopped') { + switch(serverState.session) { + case 'pending': + case 'running': + case 'buffer': + case 'error': // Still valid, Update UI with error message + break + case 'stopped': return false - } - - throw new Error('Unexpected server task state: ' + serverState.session || 'Undefined') + default: + throw new Error('Unexpected server task state: ' + serverState.session || 'Undefined') } while (serverState.task === renderRequest.task && serverState.session === 'pending') { // Wait for task to start on server. diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 7ff2459d..ec6961b4 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -250,9 +250,9 @@ def load_model_gfpgan(): if thread_data.gfpgan_file is None: print('load_model_gfpgan called without setting gfpgan_file') return - if thread_data.device != 'cpu' and not is_first_cuda_device(thread_data.device): + if not is_first_cuda_device(thread_data.device): #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. - raise Exception(f'Current device {torch.device(thread_data.device)} is not {torch.device(0)}.') + raise Exception(f'Current device {torch.device(thread_data.device)} is not {torch.device(0)}. 
Cannot run GFPGANer.') model_path = thread_data.gfpgan_file + ".pth" thread_data.model_gfpgan = GFPGANer(device=torch.device(thread_data.device), model_path=model_path, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None) print('loaded', thread_data.gfpgan_file, 'to', thread_data.model_gfpgan.device, 'precision', thread_data.precision) @@ -369,10 +369,10 @@ def do_mk_img(req: Request): if needs_model_reload: load_model_ckpt() - if req.use_face_correction != thread_data.gfpgan_file: + if req.use_face_correction is not None and req.use_face_correction != thread_data.gfpgan_file: thread_data.gfpgan_file = req.use_face_correction load_model_gfpgan() - if req.use_upscale != thread_data.real_esrgan_file: + if req.use_upscale is not None and req.use_upscale != thread_data.real_esrgan_file: thread_data.real_esrgan_file = req.use_upscale load_model_real_esrgan() diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 10d66fd4..9348fe18 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -225,23 +225,18 @@ def thread_render(device): continue try: # Select a render task. for queued_task in tasks_queue: - cpu_alive = is_alive('cpu') if queued_task.request.use_face_correction: #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. - # Allows GFPGANer on cuda:0 and use cpu only when cuda:0 is not available. - first_device_alive = True if is_alive(0) >= 1 else False - if cpu_alive <= 0 and not first_device_alive: - queued_task.request.use_face_correction = False - print('cuda:0 and cpu are not available with the current config. Removed GFPGANer filter to run task.') + if is_alive(0) <= 0: # Allows GFPGANer only on cuda:0. + queued_task.error = Exception('cuda:0 is not available with the current config. Remove GFPGANer filter to run task.') + task = queued_task continue - if not queued_task.request.use_cpu: - if first_device_alive: - if not runtime.is_first_cuda_device(runtime.thread_data.device): - continue # Wait for cuda:0 - elif cpu_alive > 0: - print('cuda:0 is not available with the current config. Forcing task requiring GFPGANer to cpu.') - queued_task.request.use_cpu = True - continue - if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu' and cpu_alive > 0: + if queued_task.request.use_cpu: + queued_task.error = Exception('Cpu cannot be used to run this task. Remove GFPGANer filter to run task.') + task = queued_task + continue + if not runtime.is_first_cuda_device(runtime.thread_data.device): + continue # Wait for cuda:0 + if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu' and is_alive('cpu') > 0: continue # CPU Tasks, Skip GPU device if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu' and is_alive() > 1: # cpu is alive, so need more than one. continue # GPU Tasks, don't run on CPU unless there is nothing else. @@ -254,6 +249,11 @@ def thread_render(device): if task is None: time.sleep(1) continue + if task.error is not None: + print(task.error) + task.response = { "status": 'failed', "detail": str(task.error) } + task.buffer_queue.put(json.dumps(task.response)) + continue #if current_model_path != task.request.use_stable_diffusion_model: # preload_model(task.request.use_stable_diffusion_model) if current_state_error: From ef68e5b13dfbd2f0bedc757c3e99154218bb2f55 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 04:16:46 -0400 Subject: [PATCH 018/221] Added warning about validating config. 
--- ui/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/server.py b/ui/server.py index 2d7f7bd8..564faf42 100644 --- a/ui/server.py +++ b/ui/server.py @@ -356,6 +356,7 @@ async def check_status(): # Task to Validate user config shortly after startup. device_count = new_count if 'render_devices' in config and task_manager.is_alive() <= 0: # No running devices, probably invalid user config. Try to apply defaults. + print('WARNING: No active render devices after loading config. Validate "render_devices" in config.json') task_manager.start_render_thread('auto') # Detect best device for renders task_manager.start_render_thread('cpu') # Allow CPU to be used for renders await asyncio.sleep(3) # delay message after thread start. From 8fae83dab7b084ff41be8e121a52cf45d98e405e Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 04:26:09 -0400 Subject: [PATCH 019/221] Print value to console for better debug from logs. --- ui/server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/server.py b/ui/server.py index 564faf42..590e3b37 100644 --- a/ui/server.py +++ b/ui/server.py @@ -360,6 +360,7 @@ async def check_status(): # Task to Validate user config shortly after startup. task_manager.start_render_thread('auto') # Detect best device for renders task_manager.start_render_thread('cpu') # Allow CPU to be used for renders await asyncio.sleep(3) # delay message after thread start. + print('Default render devices loaded to replace missing render_devices', config['render_devices']) display_warning = False if not 'render_devices' in config and task_manager.is_alive(0) <= 0: # No config set, is on auto mode and without cuda:0 From a036b2981a2af81ceb0057d40ae1c6a333a17ad5 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 04:31:57 -0400 Subject: [PATCH 020/221] Removed forgotten mention of CPU in message to user. --- ui/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/server.py b/ui/server.py index 590e3b37..df1b27d0 100644 --- a/ui/server.py +++ b/ui/server.py @@ -94,7 +94,7 @@ def setConfig(config): has_first_cuda_device = True break if len(gpu_devices) > 0 and not has_first_cuda_device: - print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('WARNING: GFPGANer only works on GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') try: # config.bat @@ -369,7 +369,7 @@ async def check_status(): # Task to Validate user config shortly after startup. await asyncio.sleep(3) # delay message after thread start. if display_warning or task_manager.is_alive(0) <= 0: - print('WARNING: GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') + print('WARNING: GFPGANer only works on GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') print('Using CUDA_VISIBLE_DEVICES will remap the selected devices starting at GPU:0 fixing GFPGANer') print('Add the line "@set CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.bat') print('Add the line "CUDA_VISIBLE_DEVICES=N" where N is the GPUs to use to config.sh') From 3bdc90451ac9ce462c445a45ec1352e1a722c2ba Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 04:34:54 -0400 Subject: [PATCH 021/221] Dont preload on cpu. 
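Preloading the multi-gigabyte checkpoint on the CPU fallback thread wastes system RAM while GPU threads normally handle every task. The load is now deferred: do_mk_img() already reloads whenever the thread holds no model for the requested checkpoint, so the first task the CPU thread actually claims pays the cost instead. A self-contained sketch of that existing guard (thread_data and the loader are stubbed; ensure_model is a hypothetical name):

    from types import SimpleNamespace

    thread_data = SimpleNamespace(ckpt_file=None)   # state after a skipped preload

    def load_model_ckpt():                          # stub for the real loader
        print('loading', thread_data.ckpt_file)

    def ensure_model(req):
        # Mirrors the reload guard in do_mk_img(): ckpt_file is still None,
        # so the first request triggers the load.
        if thread_data.ckpt_file != req.use_stable_diffusion_model:
            thread_data.ckpt_file = req.use_stable_diffusion_model
            load_model_ckpt()

    ensure_model(SimpleNamespace(use_stable_diffusion_model='sd-v1-4'))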
--- ui/sd_internal/task_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 9348fe18..da88cfc9 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -207,7 +207,8 @@ def thread_render(device): weak_thread_data[threading.current_thread()] = { 'device': runtime.thread_data.device } - preload_model() + if runtime.thread_data.device != 'cpu': + preload_model() current_state = ServerStates.Online while True: task_cache.clean() From 4e5ddca3bdc29f745097f382f78cf66e1e41172a Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 05:10:37 -0400 Subject: [PATCH 022/221] Display the failure detail when there is one at that step. Was checking the json object, not the server response. --- ui/media/js/main.js | 4 ++-- ui/sd_internal/task_manager.py | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/ui/media/js/main.js b/ui/media/js/main.js index d7a30f79..f22831ef 100644 --- a/ui/media/js/main.js +++ b/ui/media/js/main.js @@ -509,10 +509,10 @@ async function doMakeImage(task) { }) renderRequest = await res.json() // status_code 503, already a task running. - } while (renderRequest.status_code === 503 && await asyncDelay(30 * 1000)) + } while (res.status === 503 && await asyncDelay(30 * 1000)) if (typeof renderRequest?.stream !== 'string') { console.log('Endpoint response: ', renderRequest) - throw new Error('Endpoint response does not contains a response stream url.') + throw new Error(renderRequest.detail || 'Endpoint response does not contains a response stream url.') } task['taskStatusLabel'].innerText = "Waiting" task['taskStatusLabel'].classList.add('waitingTaskLabel') diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index da88cfc9..e2922049 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -237,10 +237,20 @@ def thread_render(device): continue if not runtime.is_first_cuda_device(runtime.thread_data.device): continue # Wait for cuda:0 - if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu' and is_alive('cpu') > 0: - continue # CPU Tasks, Skip GPU device - if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu' and is_alive() > 1: # cpu is alive, so need more than one. - continue # GPU Tasks, don't run on CPU unless there is nothing else. + if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu': + if is_alive('cpu') > 0: + continue # CPU Tasks, Skip GPU device + else: + queued_task.error = Exception('Cpu is not enabled in render_devices.') + task = queued_task + continue + if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': + if is_alive() > 1: # cpu is alive, so need more than one. + continue # GPU Tasks, don't run on CPU unless there is nothing else. + else: + queued_task.error = Exception('No active gpu found. Please check the error message in the command-line window at startup.') + task = queued_task + continue task = queued_task break if task is not None: From fc8660df78e1f7e89b1efd6d96f7503bb82f3921 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 19 Oct 2022 05:18:36 -0400 Subject: [PATCH 023/221] Faster response on invalid settings when CPU was specified with GFPGANer. 
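Requests needing GFPGANer used to be accepted and then bounced by the render threads, which the UI only surfaced after polling. Together with the previous commit's client-side fix (checking res.status rather than the JSON body), render() now rejects them immediately. A sketch of the resulting guards (assuming FastAPI's HTTPException; the helper and argument names are illustrative):

    from fastapi import HTTPException

    def validate_render_request(req, cpu_alive: int, cuda0_alive: int):
        if req.use_cpu and cpu_alive <= 0:
            raise HTTPException(status_code=403,   # Forbidden
                detail='CPU rendering is not enabled in config.json or the thread has died...')
        if req.use_face_correction and cuda0_alive <= 0:
            raise HTTPException(status_code=412,   # Precondition Failed
                detail='GFPGANer only works on GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.')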
--- ui/server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/server.py b/ui/server.py index df1b27d0..fc3738d6 100644 --- a/ui/server.py +++ b/ui/server.py @@ -264,8 +264,8 @@ def save_model_to_config(model_name): @app.post('/render') def render(req : task_manager.ImageRequest): if req.use_cpu and task_manager.is_alive('cpu') <= 0: raise HTTPException(status_code=403, detail=f'CPU rendering is not enabled in config.json or the thread has died...') # HTTP403 Forbidden - if req.use_face_correction and task_manager.is_alive(0) <= 0 and task_manager.is_alive('cpu') <= 0: #TODO Remove when GFPGANer is fixed upstream. - raise HTTPException(status_code=412, detail=f'GFPGANer only works on CPU or GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') # HTTP412 Precondition Failed + if req.use_face_correction and task_manager.is_alive(0) <= 0: #TODO Remove when GFPGANer is fixed upstream. + raise HTTPException(status_code=412, detail=f'GFPGANer only works GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') # HTTP412 Precondition Failed try: save_model_to_config(req.use_stable_diffusion_model) req.use_stable_diffusion_model = resolve_model_to_use(req.use_stable_diffusion_model) From 56ed4fe6f2818ce01711a76ad9b46ed620f4599f Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 01:30:49 -0400 Subject: [PATCH 024/221] Fix VisualStudio Type Warning. --- ui/sd_internal/task_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 21fcf4e0..bdfbe4e5 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -36,7 +36,7 @@ class RenderTask(): # Task with output queue and completion lock. def __init__(self, req: Request): self.request: Request = req # Initial Request self.response: Any = None # Copy of the last reponse - self.temp_images:[] = [None] * req.num_outputs * (1 if req.show_only_filtered_image else 2) + self.temp_images:list = [None] * req.num_outputs * (1 if req.show_only_filtered_image else 2) self.error: Exception = None self.lock: threading.Lock = threading.Lock() # Locks at task start and unlocks when task is completed self.buffer_queue: queue.Queue = queue.Queue() # Queue of JSON string segments From 1442748f5887949ee156ab469e7cb591d2668081 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 03:53:26 -0400 Subject: [PATCH 025/221] When starting with profiler cuda devices are slower to init. --- ui/server.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ui/server.py b/ui/server.py index bd42ec58..0dd0adff 100644 --- a/ui/server.py +++ b/ui/server.py @@ -364,24 +364,27 @@ async def check_status(): # Task to Validate user config shortly after startup. # Check that the loaded config.json yielded a server in a known valid state. # When issues are found, try to fix them when possible and warn the user. device_count = 0 - for i in range(10): # Wait for devices to register and/or change names. - await asyncio.sleep(3) + # Wait for devices to register and/or change names. + THREAD_START_DELAY = 5 # seconds - Give time for devices/threads to start. + for i in range(10): # Maximum number of retry. + await asyncio.sleep(THREAD_START_DELAY) new_count = task_manager.is_alive() - if device_count == new_count: break; + # Stops retry once no more devices show up. 
+ if new_count > 0 and device_count == new_count: break device_count = new_count if 'render_devices' in config and task_manager.is_alive() <= 0: # No running devices, probably invalid user config. Try to apply defaults. print('WARNING: No active render devices after loading config. Validate "render_devices" in config.json') task_manager.start_render_thread('auto') # Detect best device for renders task_manager.start_render_thread('cpu') # Allow CPU to be used for renders - await asyncio.sleep(3) # delay message after thread start. + await asyncio.sleep(THREAD_START_DELAY) # delay message after thread start. print('Default render devices loaded to replace missing render_devices', config['render_devices']) display_warning = False if not 'render_devices' in config and task_manager.is_alive(0) <= 0: # No config set, is on auto mode and without cuda:0 task_manager.start_render_thread('cuda') # An other cuda device is better and cuda:0 is missing, start it... display_warning = True # And warn user to update settings... - await asyncio.sleep(3) # delay message after thread start. + await asyncio.sleep(THREAD_START_DELAY) # delay message after thread start. if display_warning or task_manager.is_alive(0) <= 0: print('WARNING: GFPGANer only works on GPU:0, use CUDA_VISIBLE_DEVICES if GFPGANer is needed on a specific GPU.') From ccb7a553c2bc3a1660f77498a54929605e9c2574 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 03:53:43 -0400 Subject: [PATCH 026/221] Memory improvements --- ui/sd_internal/runtime.py | 254 +++++++++++++++++++++++--------------- 1 file changed, 154 insertions(+), 100 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index b88af09f..132658e1 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -9,6 +9,7 @@ import os, re import traceback import torch import numpy as np +from gc import collect as gc_collect from omegaconf import OmegaConf from PIL import Image, ImageOps from tqdm import tqdm, trange @@ -104,6 +105,7 @@ def device_init(device_selection=None): thread_data.turbo = False thread_data.has_valid_gpu = False thread_data.force_full_precision = False + thread_data.reduced_memory = True if device_selection.lower() == 'cpu': print('CPU requested, skipping gpu init.') @@ -172,7 +174,6 @@ def load_model_ckpt(): if not thread_data.unet_bs: thread_data.unet_bs = 1 - unload_model() if thread_data.device == 'cpu': thread_data.precision = 'full' @@ -213,14 +214,20 @@ def load_model_ckpt(): modelCS.eval() modelCS.cond_stage_model.device = torch.device(thread_data.device) if thread_data.device != 'cpu': - modelCS.to(thread_data.device) + if thread_data.reduced_memory: + modelCS.to('cpu') + else: + modelCS.to(thread_data.device) # Preload on device if not already there. thread_data.modelCS = modelCS modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() if thread_data.device != 'cpu': - modelFS.to(thread_data.device) + if thread_data.reduced_memory: + modelFS.to('cpu') + else: + modelFS.to(thread_data.device) # Preload on device if not already there. 
thread_data.modelFS = modelFS del sd @@ -236,20 +243,55 @@ def load_model_ckpt(): print('loaded', thread_data.ckpt_file, 'as', model.device, '->', modelCS.cond_stage_model.device, '->', thread_data.modelFS.device, 'using precision', thread_data.precision) -def unload_model(): +def unload_filters(): + if thread_data.model_gfpgan is not None: + del thread_data.model_gfpgan + thread_data.model_gfpgan = None + + if thread_data.model_real_esrgan is not None: + del thread_data.model_real_esrgan + thread_data.model_real_esrgan = None + +def unload_models(): if thread_data.model is not None: print('Unloading models...') del thread_data.model del thread_data.modelCS del thread_data.modelFS + thread_data.model = None thread_data.modelCS = None thread_data.modelFS = None +def wait_move(model, target_device=None): # Send to target_device and wait until complete. + if thread_data.device == "cpu" or thread_data.device == target_device: return + if target_device is None: target_device = 'cpu' + start_mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 + if start_mem <= 0: return + model_name = model.__class__.__name__ + print(f'Device:{thread_data.device} - Sending model {model_name} to {target_device} | Memory transfer starting. Memory Used: {round(start_mem)}Mo') + model.to(target_device) + start_time = time.time() + time_step = start_time + WARNING_TIMEOUT = 1.5 # seconds - Show activity in console after timeout. + last_mem = start_mem + is_transfering = True + while is_transfering: + time.sleep(0.5) # 500ms + mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 + is_transfering = bool(mem > 0 and mem < last_mem) # still stuff loaded, but less than last time. + last_mem = mem + if not is_transfering: + break; + if time.time() - time_step > WARNING_TIMEOUT: # Long delay, print to console to show activity. + print(f'Device:{thread_data.device} - Waiting for Memory transfer. Memory Used: {round(mem)}Mo, Transfered: {round(start_mem - mem)}Mo') + time_step = time.time() + print(f'Device:{thread_data.device} - {model_name} Moved: {round(start_mem - mem)}Mo in {round(time.time() - start_time, 3)} seconds to {target_device}') + def load_model_gfpgan(): - if thread_data.gfpgan_file is None: - print('load_model_gfpgan called without setting gfpgan_file') - return + if thread_data.gfpgan_file is None: raise ValueError(f'Thread gfpgan_file is undefined.') + #print('load_model_gfpgan called without setting gfpgan_file') + #return if not is_first_cuda_device(thread_data.device): #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. raise Exception(f'Current device {torch.device(thread_data.device)} is not {torch.device(0)}. 
Cannot run GFPGANer.') @@ -258,9 +300,9 @@ def load_model_gfpgan(): print('loaded', thread_data.gfpgan_file, 'to', thread_data.model_gfpgan.device, 'precision', thread_data.precision) def load_model_real_esrgan(): - if thread_data.real_esrgan_file is None: - print('load_model_real_esrgan called without setting real_esrgan_file') - return + if thread_data.real_esrgan_file is None: raise ValueError(f'Thread real_esrgan_file is undefined.') + #print('load_model_real_esrgan called without setting real_esrgan_file') + #return model_path = thread_data.real_esrgan_file + ".pth" RealESRGAN_models = { @@ -294,7 +336,7 @@ def get_base_path(disk_path, session_id, prompt, img_id, ext, suffix=None): return os.path.join(session_out_path, f"{prompt_flattened}_{img_id}_{suffix}.{ext}") return os.path.join(session_out_path, f"{prompt_flattened}_{img_id}.{ext}") -def apply_filters(filter_name, image_data): +def apply_filters(filter_name, image_data, model_path=None): print(f'Applying filter {filter_name}...') if isinstance(image_data, torch.Tensor): print(image_data) @@ -303,12 +345,22 @@ def apply_filters(filter_name, image_data): gc() if filter_name == 'gfpgan': + if model_path is not None and model_path != thread_data.gfpgan_file: + thread_data.gfpgan_file = model_path + load_model_gfpgan() + elif not thread_data.model_gfpgan: + load_model_gfpgan() if thread_data.model_gfpgan is None: raise Exception('Model "gfpgan" not loaded.') print('enhance with', thread_data.gfpgan_file, 'on', thread_data.model_gfpgan.device, 'precision', thread_data.precision) _, _, output = thread_data.model_gfpgan.enhance(image_data[:,:,::-1], has_aligned=False, only_center_face=False, paste_back=True) image_data = output[:,:,::-1] if filter_name == 'real_esrgan': + if model_path is not None and model_path != thread_data.real_esrgan_file: + thread_data.real_esrgan_file = model_path + load_model_real_esrgan() + elif not thread_data.model_real_esrgan: + load_model_real_esrgan() if thread_data.model_real_esrgan is None: raise Exception('Model "gfpgan" not loaded.') print('enhance with', thread_data.real_esrgan_file, 'on', thread_data.model_real_esrgan.device, 'precision', thread_data.precision) output, _ = thread_data.model_real_esrgan.enhance(image_data[:,:,::-1]) @@ -338,6 +390,53 @@ def mk_img(req: Request): "detail": str(e) }) +def update_temp_img(req, x_samples): + partial_images = [] + for i in range(req.num_outputs): + x_sample_ddim = thread_data.modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) + x_sample = torch.clamp((x_sample_ddim + 1.0) / 2.0, min=0.0, max=1.0) + x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") + x_sample = x_sample.astype(np.uint8) + img = Image.fromarray(x_sample) + buf = BytesIO() + img.save(buf, format='JPEG') + buf.seek(0) + + del img, x_sample, x_sample_ddim + # don't delete x_samples, it is used in the code that called this callback + + thread_data.temp_images[str(req.session_id) + '/' + str(i)] = buf + partial_images.append({'path': f'/image/tmp/{req.session_id}/{i}'}) + return partial_images + +# Build and return the apropriate generator for do_mk_img +def get_image_progess_generator(req, extra_props=None): + if not req.stream_progress_updates: + def empty_callback(x_samples, i): return x_samples + return empty_callback + + thread_data.partial_x_samples = None + last_callback_time = -1 + def img_callback(x_samples, i): + nonlocal last_callback_time + + thread_data.partial_x_samples = x_samples + step_time = time.time() - last_callback_time if last_callback_time != -1 
else -1 + last_callback_time = time.time() + + progress = {"step": i, "step_time": step_time} + if extra_props is not None: + progress.update(extra_props) + + if req.stream_image_progress and i % 5 == 0: + progress['output'] = update_temp_img(req, x_samples) + + yield json.dumps(progress) + + if thread_data.stop_processing: + raise UserInitiatedStop("User requested that we stop processing") + return img_callback + def do_mk_img(req: Request): thread_data.stop_processing = False @@ -353,7 +452,7 @@ def do_mk_img(req: Request): if not os.path.exists(req.use_stable_diffusion_model + '.ckpt'): raise FileNotFoundError(f'Cannot find {req.use_stable_diffusion_model}.ckpt') needs_model_reload = False - if thread_data.ckpt_file != req.use_stable_diffusion_model: + if not thread_data.model or thread_data.ckpt_file != req.use_stable_diffusion_model: thread_data.ckpt_file = req.use_stable_diffusion_model needs_model_reload = True @@ -361,25 +460,19 @@ def do_mk_img(req: Request): if (thread_data.precision == 'autocast' and (req.use_full_precision or not thread_data.model_is_half)) or \ (thread_data.precision == 'full' and not req.use_full_precision and not thread_data.force_full_precision): thread_data.precision = 'full' if req.use_full_precision else 'autocast' - load_model_ckpt() - load_model_gfpgan() - load_model_real_esrgan() - needs_model_reload = False + needs_model_reload = True if needs_model_reload: + unload_models() + unload_filters() load_model_ckpt() - if req.use_face_correction is not None and req.use_face_correction != thread_data.gfpgan_file: - thread_data.gfpgan_file = req.use_face_correction - load_model_gfpgan() - if req.use_upscale is not None and req.use_upscale != thread_data.real_esrgan_file: - thread_data.real_esrgan_file = req.use_upscale - load_model_real_esrgan() - if thread_data.turbo != req.turbo: thread_data.turbo = req.turbo thread_data.model.turbo = req.turbo + gc() + opt_prompt = req.prompt opt_seed = req.seed opt_n_iter = 1 @@ -432,7 +525,7 @@ def do_mk_img(req: Request): if thread_data.device != "cpu" and thread_data.precision == "autocast": mask = mask.half() - move_fs_to_cpu() + wait_move(thread_data.modelFS) # Send to CPU and wait until complete. assert 0. 
<= req.prompt_strength <= 1., 'can only work with strength in [0.0, 1.0]' t_enc = int(req.prompt_strength * req.num_inference_steps) @@ -450,7 +543,8 @@ def do_mk_img(req: Request): for prompts in tqdm(data, desc="data"): with precision_scope("cuda"): - thread_data.modelCS.to(thread_data.device) + if thread_data.reduced_memory: + thread_data.modelCS.to(thread_data.device) uc = None if req.guidance_scale != 1.0: uc = thread_data.modelCS.get_learned_conditioning(batch_size * [req.negative_prompt]) @@ -470,47 +564,11 @@ def do_mk_img(req: Request): else: c = thread_data.modelCS.get_learned_conditioning(prompts) - thread_data.modelFS.to(thread_data.device) + if thread_data.reduced_memory: + thread_data.modelFS.to(thread_data.device) - partial_x_samples = None - last_callback_time = -1 - def img_callback(x_samples, i): - nonlocal partial_x_samples, last_callback_time - - partial_x_samples = x_samples - - if req.stream_progress_updates: - n_steps = req.num_inference_steps if req.init_image is None else t_enc - step_time = time.time() - last_callback_time if last_callback_time != -1 else -1 - last_callback_time = time.time() - - progress = {"step": i, "total_steps": n_steps, "step_time": step_time} - - if req.stream_image_progress and i % 5 == 0: - partial_images = [] - - for i in range(batch_size): - x_samples_ddim = thread_data.modelFS.decode_first_stage(x_samples[i].unsqueeze(0)) - x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) - x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") - x_sample = x_sample.astype(np.uint8) - img = Image.fromarray(x_sample) - buf = BytesIO() - img.save(buf, format='JPEG') - buf.seek(0) - - del img, x_sample, x_samples_ddim - # don't delete x_samples, it is used in the code that called this callback - - thread_data.temp_images[str(req.session_id) + '/' + str(i)] = buf - partial_images.append({'path': f'/image/tmp/{req.session_id}/{i}'}) - - progress['output'] = partial_images - - yield json.dumps(progress) - - if thread_data.stop_processing: - raise UserInitiatedStop("User requested that we stop processing") + n_steps = req.num_inference_steps if req.init_image is None else t_enc + img_callback = get_image_progess_generator(req, {"total_steps": n_steps}) # run the handler try: @@ -520,16 +578,23 @@ def do_mk_img(req: Request): else: x_samples = _img2img(init_latent, t_enc, batch_size, req.guidance_scale, c, uc, req.num_inference_steps, opt_ddim_eta, opt_seed, img_callback, mask) - yield from x_samples - - x_samples = partial_x_samples + if req.stream_progress_updates: + yield from x_samples + if hasattr(thread_data, 'partial_x_samples'): + if thread_data.partial_x_samples is not None: + x_samples = thread_data.partial_x_samples + del thread_data.partial_x_samples except UserInitiatedStop: - if partial_x_samples is None: + if not hasattr(thread_data, 'partial_x_samples'): continue + if thread_data.partial_x_samples is None: + del thread_data.partial_x_samples + continue + x_samples = thread_data.partial_x_samples + del thread_data.partial_x_samples - x_samples = partial_x_samples - - print("saving images") + print("decoding images") + img_data = [None] * batch_size for i in range(batch_size): img_id = base64.b64encode(int(time.time()+i).to_bytes(8, 'big')).decode() # Generate unique ID based on time. img_id = img_id.translate({43:None, 47:None, 61:None})[-8:] # Remove + / = and keep last 8 chars. 
@@ -538,7 +603,15 @@ def do_mk_img(req: Request): x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") x_sample = x_sample.astype(np.uint8) - img = Image.fromarray(x_sample) + img_data[i] = x_sample + del x_samples, x_samples_ddim, x_sample + + if thread_data.reduced_memory: + wait_move(thread_data.modelFS) # Send to CPU and wait until complete. + + print("saving images") + for i in range(batch_size): + img = Image.fromarray(img_data[i]) has_filters = (req.use_face_correction is not None and req.use_face_correction.startswith('GFPGAN')) or \ (req.use_upscale is not None and req.use_upscale.startswith('RealESRGAN')) @@ -562,19 +635,18 @@ def do_mk_img(req: Request): if req.save_to_disk_path is not None: res_image_orig.path_abs = img_out_path - del img if has_filters and not thread_data.stop_processing: filters_applied = [] if req.use_face_correction: - x_sample = apply_filters('gfpgan', x_sample) + img_data[i] = apply_filters('gfpgan', img_data[i], req.use_face_correction) filters_applied.append(req.use_face_correction) if req.use_upscale: - x_sample = apply_filters('real_esrgan', x_sample) + img_data[i] = apply_filters('real_esrgan', img_data[i], req.use_upscale) filters_applied.append(req.use_upscale) if (len(filters_applied) > 0): - filtered_image = Image.fromarray(x_sample) + filtered_image = Image.fromarray(img_data[i]) filtered_img_data = img_to_base64_str(filtered_image, req.output_format) response_image = ResponseImage(data=filtered_img_data, seed=opt_seed) res.images.append(response_image) @@ -587,9 +659,10 @@ def do_mk_img(req: Request): seeds += str(opt_seed) + "," opt_seed += 1 - move_fs_to_cpu() + if thread_data.reduced_memory: + unload_filters() + del img_data gc() - del x_samples, x_samples_ddim, x_sample if thread_data.device != 'cpu': print(f'memory_final = {round(torch.cuda.memory_allocated(thread_data.device) / 1e6, 2)}Mo') @@ -626,15 +699,7 @@ Stable Diffusion model: {req.use_stable_diffusion_model + '.ckpt'} def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, sampler_name): shape = [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f] - if thread_data.device != "cpu": - mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 - print('Device:', thread_data.device, 'CS_Model, Memory transfer starting. Memory Used:', round(mem, 2), 'Mo') - thread_data.modelCS.to("cpu") - while torch.cuda.memory_allocated(thread_data.device) / 1e6 >= mem and mem > 0: - print('Device:', thread_data.device, 'Waiting Memory transfer. Memory Used:', round(mem, 2), 'Mo') - time.sleep(1) - print('Transfered', round(mem - torch.cuda.memory_allocated(thread_data.device) / 1e6, 2), 'Mo') - + wait_move(thread_data.modelCS) # Send to CPU and wait until complete. if sampler_name == 'ddim': thread_data.model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False) @@ -677,21 +742,10 @@ def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, o x_T=x_T, sampler = 'ddim' ) - yield from samples_ddim -def move_fs_to_cpu(): - if thread_data.device != "cpu": - mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 - print('Device:', thread_data.device, 'FS_Model, Memory transfer starting. 
Memory Used:', round(mem, 2), 'Mo') - thread_data.modelFS.to("cpu") - while torch.cuda.memory_allocated(thread_data.device) / 1e6 >= mem and mem > 0: - print('Device:', thread_data.device, 'Waiting for Memory transfer. Memory Used:', round(mem, 2), 'Mo') - time.sleep(1) - print('Transfered', round(mem - torch.cuda.memory_allocated(thread_data.device) / 1e6, 2), 'Mo') - def gc(): - #gc.collect() + gc_collect() if thread_data.device == 'cpu': return torch.cuda.empty_cache() From 88ef1a3c5becc3aa35347bb26babf65c8d99a58a Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 20:22:34 -0400 Subject: [PATCH 027/221] Moved time before model.to --- ui/sd_internal/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 132658e1..9b052397 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -270,8 +270,8 @@ def wait_move(model, target_device=None): # Send to target_device and wait until if start_mem <= 0: return model_name = model.__class__.__name__ print(f'Device:{thread_data.device} - Sending model {model_name} to {target_device} | Memory transfer starting. Memory Used: {round(start_mem)}Mo') - model.to(target_device) start_time = time.time() + model.to(target_device) time_step = start_time WARNING_TIMEOUT = 1.5 # seconds - Show activity in console after timeout. last_mem = start_mem From 7befa94e6d85083d10a531e0839c841ebb2ed235 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 20:56:24 -0400 Subject: [PATCH 028/221] More comments and cleanup. --- ui/sd_internal/runtime.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 9b052397..f6ea55b7 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -207,6 +207,9 @@ def load_model_ckpt(): model.turbo = thread_data.turbo if thread_data.device != 'cpu': model.to(thread_data.device) + #if thread_data.reduced_memory: + #model.model1.to("cpu") + #model.model2.to("cpu") thread_data.model = model modelCS = instantiate_from_config(config.modelCondStage) @@ -263,9 +266,8 @@ def unload_models(): thread_data.modelCS = None thread_data.modelFS = None -def wait_move(model, target_device=None): # Send to target_device and wait until complete. - if thread_data.device == "cpu" or thread_data.device == target_device: return - if target_device is None: target_device = 'cpu' +def wait_model_move_to(model, target_device): # Send to target_device and wait until complete. + if thread_data.device == target_device: return start_mem = torch.cuda.memory_allocated(thread_data.device) / 1e6 if start_mem <= 0: return model_name = model.__class__.__name__ @@ -338,12 +340,11 @@ def get_base_path(disk_path, session_id, prompt, img_id, ext, suffix=None): def apply_filters(filter_name, image_data, model_path=None): print(f'Applying filter {filter_name}...') + gc() # Free space before loading new data. 
if isinstance(image_data, torch.Tensor): print(image_data) image_data.to(thread_data.device) - gc() - if filter_name == 'gfpgan': if model_path is not None and model_path != thread_data.gfpgan_file: thread_data.gfpgan_file = model_path @@ -373,18 +374,10 @@ def mk_img(req: Request): yield from do_mk_img(req) except Exception as e: print(traceback.format_exc()) - - gc() - - if thread_data.device != "cpu": - thread_data.modelFS.to("cpu") - thread_data.modelCS.to("cpu") - - thread_data.model.model1.to("cpu") - thread_data.model.model2.to("cpu") - - gc() - + # Model crashed, release all resources in unknown state. + unload_models() + unload_filters() + gc() # Release from memory. yield json.dumps({ "status": 'failed', "detail": str(e) @@ -471,6 +464,7 @@ def do_mk_img(req: Request): thread_data.turbo = req.turbo thread_data.model.turbo = req.turbo + # Start by cleaning memory, loading and unloading things can leave memory allocated. gc() opt_prompt = req.prompt @@ -525,7 +519,8 @@ def do_mk_img(req: Request): if thread_data.device != "cpu" and thread_data.precision == "autocast": mask = mask.half() - wait_move(thread_data.modelFS) # Send to CPU and wait until complete. + # Send to CPU and wait until complete. + wait_model_move_to(thread_data.modelFS, 'cpu') assert 0. <= req.prompt_strength <= 1., 'can only work with strength in [0.0, 1.0]' t_enc = int(req.prompt_strength * req.num_inference_steps) @@ -607,7 +602,8 @@ def do_mk_img(req: Request): del x_samples, x_samples_ddim, x_sample if thread_data.reduced_memory: - wait_move(thread_data.modelFS) # Send to CPU and wait until complete. + # Send to CPU and wait until complete. + wait_model_move_to(thread_data.modelFS, 'cpu') print("saving images") for i in range(batch_size): @@ -699,7 +695,8 @@ Stable Diffusion model: {req.use_stable_diffusion_model + '.ckpt'} def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, opt_C, opt_f, opt_ddim_eta, c, uc, opt_seed, img_callback, mask, sampler_name): shape = [opt_n_samples, opt_C, opt_H // opt_f, opt_W // opt_f] - wait_move(thread_data.modelCS) # Send to CPU and wait until complete. + # Send to CPU and wait until complete. + wait_model_move_to(thread_data.modelCS, 'cpu') if sampler_name == 'ddim': thread_data.model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False) From ce2b711b1f9524fe7dcb692121bef4fdb5e86aee Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 21:44:15 -0400 Subject: [PATCH 029/221] Newlines... --- ui/sd_internal/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index f6ea55b7..a87b2071 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -663,7 +663,6 @@ def do_mk_img(req: Request): print(f'memory_final = {round(torch.cuda.memory_allocated(thread_data.device) / 1e6, 2)}Mo') print('Task completed') - yield json.dumps(res.json()) def save_image(img, img_out_path): @@ -697,6 +696,7 @@ def _txt2img(opt_W, opt_H, opt_n_samples, opt_ddim_steps, opt_scale, start_code, # Send to CPU and wait until complete. wait_model_move_to(thread_data.modelCS, 'cpu') + if sampler_name == 'ddim': thread_data.model.make_schedule(ddim_num_steps=opt_ddim_steps, ddim_eta=opt_ddim_eta, verbose=False) From 3b5f96a13396143fb969f6fdf7e4204d107e3c83 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Fri, 21 Oct 2022 22:45:19 -0400 Subject: [PATCH 030/221] Fixed stopping tasks and more cleaning. 
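
The task selection logic moves out of thread_render into its own
thread_get_next_task() helper, and the failure paths now push a JSON
response onto the task's buffer_queue instead of silently dropping the
task. Stopping also works again: the flag is set on
runtime.thread_data.stop_processing, which the render loop actually
checks, rather than on the stale module-level runtime.stop_processing.
The core pattern is a lock-guarded scan of the shared queue; roughly
(a simplified, self-contained sketch, not the exact code -- the real
helper also applies the GFPGAN/CPU compatibility rules shown below):

    import threading
    from collections import deque

    LOCK_TIMEOUT = 1.0
    manager_lock = threading.Lock()
    tasks_queue = deque()  # filled by the HTTP layer; dict tasks in this sketch

    def thread_get_next_task(device):
        # Never let a render thread block forever on the shared queue.
        if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT):
            return None
        try:
            for task in list(tasks_queue):
                if task.get('device') not in (None, device):
                    continue  # Leave it for a compatible render thread.
                tasks_queue.remove(task)
                return task
            return None
        finally:
            manager_lock.release()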
--- ui/sd_internal/task_manager.py | 95 ++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index bdfbe4e5..1216c2c0 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -193,6 +193,50 @@ def preload_model(file_path=None): current_state = ServerStates.Unavailable print(traceback.format_exc()) +def thread_get_next_task(): + if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): + print('Render thread on device', runtime.thread_data.device, 'failed to acquire manager lock.') + return None + if len(tasks_queue) <= 0: + manager_lock.release() + return None + from . import runtime + task = None + try: # Select a render task. + for queued_task in tasks_queue: + if queued_task.request.use_face_correction: # TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. + if is_alive(0) <= 0: # Allows GFPGANer only on cuda:0. + queued_task.error = Exception('cuda:0 is not available with the current config. Remove GFPGANer filter to run task.') + task = queued_task + break + if queued_task.request.use_cpu: + queued_task.error = Exception('Cpu cannot be used to run this task. Remove GFPGANer filter to run task.') + task = queued_task + break + if not runtime.is_first_cuda_device(runtime.thread_data.device): + continue # Wait for cuda:0 + if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu': + if is_alive('cpu') > 0: + continue # CPU Tasks, Skip GPU device + else: + queued_task.error = Exception('Cpu is not enabled in render_devices.') + task = queued_task + break + if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': + if is_alive() > 1: # cpu is alive, so need more than one. + continue # GPU Tasks, don't run on CPU unless there is nothing else. + else: + queued_task.error = Exception('No active gpu found. Please check the error message in the command-line window at startup.') + task = queued_task + break + task = queued_task + break + if task is not None: + del tasks_queue[tasks_queue.index(task)] + return task + finally: + manager_lock.release() + def thread_render(device): global current_state, current_state_error, current_model_path from . import runtime @@ -215,60 +259,19 @@ def thread_render(device): if isinstance(current_state_error, SystemExit): current_state = ServerStates.Unavailable return - task = None - if not manager_lock.acquire(blocking=True, timeout=LOCK_TIMEOUT): - print('Render thread on device', runtime.thread_data.device, 'failed to acquire manager lock.') - time.sleep(1) - continue - if len(tasks_queue) <= 0: - manager_lock.release() - time.sleep(1) - continue - try: # Select a render task. - for queued_task in tasks_queue: - if queued_task.request.use_face_correction: #TODO Remove when fixed - A bug with GFPGANer and facexlib needs to be fixed before use on other devices. - if is_alive(0) <= 0: # Allows GFPGANer only on cuda:0. - queued_task.error = Exception('cuda:0 is not available with the current config. Remove GFPGANer filter to run task.') - task = queued_task - continue - if queued_task.request.use_cpu: - queued_task.error = Exception('Cpu cannot be used to run this task. 
Remove GFPGANer filter to run task.') - task = queued_task - continue - if not runtime.is_first_cuda_device(runtime.thread_data.device): - continue # Wait for cuda:0 - if queued_task.request.use_cpu and runtime.thread_data.device != 'cpu': - if is_alive('cpu') > 0: - continue # CPU Tasks, Skip GPU device - else: - queued_task.error = Exception('Cpu is not enabled in render_devices.') - task = queued_task - continue - if not queued_task.request.use_cpu and runtime.thread_data.device == 'cpu': - if is_alive() > 1: # cpu is alive, so need more than one. - continue # GPU Tasks, don't run on CPU unless there is nothing else. - else: - queued_task.error = Exception('No active gpu found. Please check the error message in the command-line window at startup.') - task = queued_task - continue - task = queued_task - break - if task is not None: - del tasks_queue[tasks_queue.index(task)] - finally: - manager_lock.release() + task = thread_get_next_task() if task is None: time.sleep(1) continue if task.error is not None: print(task.error) - task.response = { "status": 'failed', "detail": str(task.error) } + task.response = {"status": 'failed', "detail": str(task.error)} task.buffer_queue.put(json.dumps(task.response)) continue - #if current_model_path != task.request.use_stable_diffusion_model: - # preload_model(task.request.use_stable_diffusion_model) if current_state_error: task.error = current_state_error + task.response = {"status": 'failed', "detail": str(task.error)} + task.buffer_queue.put(json.dumps(task.response)) continue print(f'Session {task.request.session_id} starting task {id(task)}') if not task.lock.acquire(blocking=False): raise Exception('Got locked task from queue.') @@ -288,7 +291,7 @@ def thread_render(device): current_state = ServerStates.Rendering current_model_path = task.request.use_stable_diffusion_model if isinstance(current_state_error, SystemExit) or isinstance(current_state_error, StopAsyncIteration) or isinstance(task.error, StopAsyncIteration): - runtime.stop_processing = True + runtime.thread_data.stop_processing = True if isinstance(current_state_error, StopAsyncIteration): task.error = current_state_error current_state_error = None From 8a10fcf7ea3705f5f8717f81816e37bcd33b479d Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 00:34:33 -0400 Subject: [PATCH 031/221] updated print statement. --- ui/sd_internal/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index a87b2071..521c9562 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -288,7 +288,7 @@ def wait_model_move_to(model, target_device): # Send to target_device and wait u if time.time() - time_step > WARNING_TIMEOUT: # Long delay, print to console to show activity. print(f'Device:{thread_data.device} - Waiting for Memory transfer. 
Memory Used: {round(mem)}Mo, Transfered: {round(start_mem - mem)}Mo') time_step = time.time() - print(f'Device:{thread_data.device} - {model_name} Moved: {round(start_mem - mem)}Mo in {round(time.time() - start_time, 3)} seconds to {target_device}') + print(f'Device:{thread_data.device} - {model_name} Moved: {round(start_mem - last_mem)}Mo in {round(time.time() - start_time, 3)} seconds to {target_device}') def load_model_gfpgan(): if thread_data.gfpgan_file is None: raise ValueError(f'Thread gfpgan_file is undefined.') From cd6d49860fbdaabd8b43adf9118b28560880a376 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 01:23:39 -0400 Subject: [PATCH 032/221] Missing a 'r' in progress --- ui/sd_internal/runtime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 521c9562..f58e39e8 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -403,7 +403,7 @@ def update_temp_img(req, x_samples): return partial_images # Build and return the apropriate generator for do_mk_img -def get_image_progess_generator(req, extra_props=None): +def get_image_progress_generator(req, extra_props=None): if not req.stream_progress_updates: def empty_callback(x_samples, i): return x_samples return empty_callback @@ -563,7 +563,7 @@ def do_mk_img(req: Request): thread_data.modelFS.to(thread_data.device) n_steps = req.num_inference_steps if req.init_image is None else t_enc - img_callback = get_image_progess_generator(req, {"total_steps": n_steps}) + img_callback = get_image_progress_generator(req, {"total_steps": n_steps}) # run the handler try: From 344dd92c851f0167931cc58b10cebbc5ba1ff4a5 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 12:29:01 -0400 Subject: [PATCH 033/221] Improved checks on '/render' requests --- ui/media/js/main.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ui/media/js/main.js b/ui/media/js/main.js index e7fbe9ff..6eabc088 100644 --- a/ui/media/js/main.js +++ b/ui/media/js/main.js @@ -538,9 +538,16 @@ async function doMakeImage(task) { case 'pending': case 'running': case 'buffer': - case 'error': // Still valid, Update UI with error message + // Normal expected messages. break + case 'completed': + console.warn('Server %o render request %o completed unexpectedly', serverState, renderRequest) + break // Continue anyway to try to read cached result. + case 'error': + console.error('Server %o render request %o has failed', serverState, renderRequest) + break // Still valid, Update UI with error message case 'stopped': + console.log('Server %o render request %o was stopped', serverState, renderRequest) return false default: throw new Error('Unexpected server task state: ' + serverState.session || 'Undefined') From 2c1a897c4e2b1d645a4af70abda94de6194811e4 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 12:40:33 -0400 Subject: [PATCH 034/221] Missing newline. --- ui/media/js/main.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ui/media/js/main.js b/ui/media/js/main.js index 6eabc088..8d3c041a 100644 --- a/ui/media/js/main.js +++ b/ui/media/js/main.js @@ -519,6 +519,7 @@ async function doMakeImage(task) { renderRequest = await res.json() // status_code 503, already a task running. 
        } while (res.status === 503 && await asyncDelay(RETRY_DELAY_IF_SERVER_IS_BUSY))
+
         if (typeof renderRequest?.stream !== 'string') {
             console.log('Endpoint response: ', renderRequest)
             throw new Error(renderRequest.detail || 'Endpoint response does not contain a response stream url.')
@@ -534,6 +535,7 @@ async function doMakeImage(task) {
                 throw new Error('Connection with server lost.')
             }
         } while (Date.now() < (serverState.time + SERVER_STATE_VALIDITY_DURATION) && serverState.task !== renderRequest.task)
+
         switch(serverState.session) {

From 46a46877ed96de8fbc85d6ab6a8290485cd89112 Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Sat, 22 Oct 2022 13:49:23 -0400
Subject: [PATCH 035/221] Missing model_path replaced by model_name

---
 ui/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/server.py b/ui/server.py
index 0dd0adff..a91d87ce 100644
--- a/ui/server.py
+++ b/ui/server.py
@@ -153,7 +153,7 @@ def resolve_model_to_use(model_name:str=None):
     for default_model in APP_CONFIG_DEFAULT_MODELS:
         default_model_path = os.path.join(SD_DIR, default_model + '.ckpt')
         if os.path.exists(default_model_path):
-            print('Could not find the configured custom model at:', model_path + '.ckpt', '. Using the default one:', default_model_path + '.ckpt')
+            print(f'Could not find the configured custom model {model_name}.ckpt. Using the default one: {default_model_path}.ckpt')
             return default_model_path
     raise Exception('No valid models found.')

From 364e364429ad1ca4e082f7e23c31b06876a02715 Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Sat, 22 Oct 2022 13:52:13 -0400
Subject: [PATCH 036/221] Added get_cached_task to replace task_cache.tryGet in
 server.py

Now updates the cache TTL on the /stream and temp image endpoints.
This keeps images alive longer while the browser keeps reading them.
---
 ui/sd_internal/task_manager.py | 7 +++++++
 ui/server.py                   | 8 ++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py
index 1216c2c0..0d26327e 100644
--- a/ui/sd_internal/task_manager.py
+++ b/ui/sd_internal/task_manager.py
@@ -326,6 +326,13 @@ def thread_render(device):
         print(f'Session {task.request.session_id} task {id(task)} completed.')
         current_state = ServerStates.Online
 
+def get_cached_task(session_id:str, update_ttl:bool=False):
+    # Call keep before tryGet so an entry that expired is not discarded.
+    if update_ttl and not task_cache.keep(session_id, TASK_TTL):
+        # Failed to keep task, already gone.
+        return None
+    return task_cache.tryGet(session_id)
+
 def is_first_cuda_device(device):
     from . import runtime # When calling runtime from outside thread_render DO NOT USE thread specific attributes or functions.
     return runtime.is_first_cuda_device(device)
diff --git a/ui/server.py b/ui/server.py
index a91d87ce..dc76a3bf 100644
--- a/ui/server.py
+++ b/ui/server.py
@@ -251,7 +251,7 @@ def ping(session_id:str=None):
     # Alive
     response = {'status': str(task_manager.current_state)}
     if session_id:
-        task = task_manager.task_cache.tryGet(session_id)
+        task = task_manager.get_cached_task(session_id)
         if task:
             response['task'] = id(task)
             if task.lock.locked():
@@ -302,7 +302,7 @@ def render(req : task_manager.ImageRequest):
 
 @app.get('/image/stream/{session_id:str}/{task_id:int}')
 def stream(session_id:str, task_id:int): #TODO Move to WebSockets ??
- task = task_manager.task_cache.tryGet(session_id) + task = task_manager.get_cached_task(session_id, update_ttl=True) if not task: raise HTTPException(status_code=410, detail='No request received.') # HTTP410 Gone if (id(task) != task_id): raise HTTPException(status_code=409, detail=f'Wrong task id received. Expected:{id(task)}, Received:{task_id}') # HTTP409 Conflict if task.buffer_queue.empty() and not task.lock.locked(): @@ -320,7 +320,7 @@ def stop(session_id:str=None): raise HTTPException(status_code=409, detail='Not currently running any tasks.') # HTTP409 Conflict task_manager.current_state_error = StopAsyncIteration('') return {'OK'} - task = task_manager.task_cache.tryGet(session_id) + task = task_manager.get_cached_task(session_id) if not task: raise HTTPException(status_code=404, detail=f'Session {session_id} has no active task.') # HTTP404 Not Found if isinstance(task.error, StopAsyncIteration): raise HTTPException(status_code=409, detail=f'Session {session_id} task is already stopped.') # HTTP409 Conflict task.error = StopAsyncIteration('') @@ -328,7 +328,7 @@ def stop(session_id:str=None): @app.get('/image/tmp/{session_id}/{img_id:int}') def get_image(session_id, img_id): - task = task_manager.task_cache.tryGet(session_id) + task = task_manager.get_cached_task(session_id, update_ttl=True) if not task: raise HTTPException(status_code=410, detail=f'Session {session_id} has not submitted a task.') # HTTP410 Gone if not task.temp_images[img_id]: raise HTTPException(status_code=425, detail='Too Early, task data is not available yet.') # HTTP425 Too Early try: From 8adf965d0befe6763a229656b9a25690776b90bf Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 19:02:02 -0400 Subject: [PATCH 037/221] Formatting changes. --- ui/sd_internal/runtime.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index f58e39e8..f0e5c57a 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -171,10 +171,10 @@ def load_model_ckpt(): if not thread_data.precision: thread_data.precision = 'full' if thread_data.force_full_precision else 'autocast' + if not thread_data.unet_bs: thread_data.unet_bs = 1 - if thread_data.device == 'cpu': thread_data.precision = 'full' @@ -207,9 +207,9 @@ def load_model_ckpt(): model.turbo = thread_data.turbo if thread_data.device != 'cpu': model.to(thread_data.device) - #if thread_data.reduced_memory: - #model.model1.to("cpu") - #model.model2.to("cpu") + #if thread_data.reduced_memory: + #model.model1.to("cpu") + #model.model2.to("cpu") thread_data.model = model modelCS = instantiate_from_config(config.modelCondStage) @@ -464,7 +464,7 @@ def do_mk_img(req: Request): thread_data.turbo = req.turbo thread_data.model.turbo = req.turbo - # Start by cleaning memory, loading and unloading things can leave memory allocated. + # Start by cleaning memory, loading and unloading things can leave memory allocated. 
gc() opt_prompt = req.prompt From 0d62123a0b0da7ec3dfe758e5509e9e021d9c71b Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sat, 22 Oct 2022 21:28:12 -0400 Subject: [PATCH 038/221] Replaced missing gpu_name by device_name --- ui/sd_internal/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index f0e5c57a..9d4ee0d9 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -73,7 +73,7 @@ def device_select(device): # otherwise these NVIDIA cards create green images thread_data.force_full_precision = ('nvidia' in device_name.lower() or 'geforce' in device_name.lower()) and (' 1660' in device_name or ' 1650' in device_name) if thread_data.force_full_precision: - print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', gpu_name) + print('forcing full precision on NVIDIA 16xx cards, to avoid green images. GPU detected: ', device_name) thread_data.device = device thread_data.has_valid_gpu = True From fc875651d3e3701948bc0bfb2f167f108442ba13 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Sun, 23 Oct 2022 05:00:21 -0400 Subject: [PATCH 039/221] Removed unused vars --- ui/sd_internal/runtime.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 9d4ee0d9..6c32f587 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -473,7 +473,6 @@ def do_mk_img(req: Request): opt_C = 4 opt_f = 8 opt_ddim_eta = 0.0 - opt_init_img = req.init_image print(req.to_string(), '\n device', thread_data.device) print('\n\n Using precision:', thread_data.precision) @@ -532,7 +531,6 @@ def do_mk_img(req: Request): else: session_out_path = None - seeds = "" with torch.no_grad(): for n in trange(opt_n_iter, desc="Sampling"): for prompts in tqdm(data, desc="data"): @@ -651,8 +649,7 @@ def do_mk_img(req: Request): save_image(filtered_image, filtered_img_out_path) response_image.path_abs = filtered_img_out_path del filtered_image - - seeds += str(opt_seed) + "," + # Filter Applied, move to next seed opt_seed += 1 if thread_data.reduced_memory: From 189d31cc29f4d12c2e87797df2f66f57472d246a Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Mon, 24 Oct 2022 05:12:08 -0400 Subject: [PATCH 040/221] Specify update_ttl on all get_cached_task calls. 
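
Every call site now states its intent explicitly: endpoints the browser
polls while waiting on results (/ping, /image/stream, /image/tmp) pass
update_ttl=True so the cached task stays alive, while /stop passes
update_ttl=False and leaves the expiry untouched. A minimal sketch of
the keep/tryGet contract this relies on (an illustrative cache, not the
actual task_cache implementation):

    import time

    class TaskCache:
        def __init__(self):
            self._entries = {}  # key -> (value, expires_at)

        def put(self, key, value, ttl):
            self._entries[key] = (value, time.time() + ttl)

        def keep(self, key, ttl):
            # Refresh the expiry, but only if the entry is still live.
            entry = self._entries.get(key)
            if entry is None or entry[1] < time.time():
                self._entries.pop(key, None)
                return False
            self._entries[key] = (entry[0], time.time() + ttl)
            return True

        def tryGet(self, key):
            # Return the value if present and not expired, else None.
            entry = self._entries.get(key)
            if entry is None or entry[1] < time.time():
                return None
            return entry[0]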
---
 ui/server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ui/server.py b/ui/server.py
index dc76a3bf..5521600b 100644
--- a/ui/server.py
+++ b/ui/server.py
@@ -251,7 +251,7 @@ def ping(session_id:str=None):
     # Alive
     response = {'status': str(task_manager.current_state)}
     if session_id:
-        task = task_manager.get_cached_task(session_id)
+        task = task_manager.get_cached_task(session_id, update_ttl=True)
         if task:
             response['task'] = id(task)
             if task.lock.locked():
@@ -320,7 +320,7 @@ def stop(session_id:str=None):
         raise HTTPException(status_code=409, detail='Not currently running any tasks.') # HTTP409 Conflict
         task_manager.current_state_error = StopAsyncIteration('')
         return {'OK'}
-    task = task_manager.get_cached_task(session_id)
+    task = task_manager.get_cached_task(session_id, update_ttl=False)
     if not task: raise HTTPException(status_code=404, detail=f'Session {session_id} has no active task.') # HTTP404 Not Found
     if isinstance(task.error, StopAsyncIteration): raise HTTPException(status_code=409, detail=f'Session {session_id} task is already stopped.') # HTTP409 Conflict
     task.error = StopAsyncIteration('')

From c41baf3aeb74d97740c1f1d7cf610b96fbb61a4f Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Tue, 25 Oct 2022 02:10:52 -0400
Subject: [PATCH 041/221] Moved img_id creation inside save image loop.

---
 ui/sd_internal/runtime.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py
index 6c32f587..972a6062 100644
--- a/ui/sd_internal/runtime.py
+++ b/ui/sd_internal/runtime.py
@@ -589,9 +589,6 @@ def do_mk_img(req: Request):
     print("decoding images")
     img_data = [None] * batch_size
     for i in range(batch_size):
-        img_id = base64.b64encode(int(time.time()+i).to_bytes(8, 'big')).decode() # Generate unique ID based on time.
-        img_id = img_id.translate({43:None, 47:None, 61:None})[-8:] # Remove + / = and keep last 8 chars.
         x_samples_ddim = thread_data.modelFS.decode_first_stage(x_samples[i].unsqueeze(0))
         x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
         x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c")
@@ -606,6 +603,8 @@ def do_mk_img(req: Request):
     print("saving images")
     for i in range(batch_size):
         img = Image.fromarray(img_data[i])
+        img_id = base64.b64encode(int(time.time()+i).to_bytes(8, 'big')).decode() # Generate unique ID based on time.
+        img_id = img_id.translate({43:None, 47:None, 61:None})[-8:] # Remove + / = and keep last 8 chars.
         has_filters = (req.use_face_correction is not None and req.use_face_correction.startswith('GFPGAN')) or \
             (req.use_upscale is not None and req.use_upscale.startswith('RealESRGAN'))

From ae40b6ba8c6a0f38ba00c65b3690485cd89112 Mon Sep 17 00:00:00 2001
From: Marc-Andre Ferland
Date: Tue, 25 Oct 2022 03:00:50 -0400
Subject: [PATCH 042/221] Missed an is_alive check in the conversion.
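
Since the multi-GPU conversion, is_alive() returns a count of live
render threads rather than a boolean, so the leftover truthiness test
in render() is rewritten as an explicit is_alive() <= 0 comparison to
match the other call sites. Roughly how such a counting helper behaves
(a reconstruction for illustration, not the exact implementation):

    import threading

    render_threads: list[threading.Thread] = []  # one worker per render device

    def is_alive(device=None) -> int:
        # Count live render threads, optionally filtered to one device.
        count = 0
        for thread in render_threads:
            if device is not None and getattr(thread, 'device', None) != device:
                continue
            if thread.is_alive():
                count += 1
        return count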
--- ui/sd_internal/task_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/sd_internal/task_manager.py b/ui/sd_internal/task_manager.py index 0d26327e..efc53c08 100644 --- a/ui/sd_internal/task_manager.py +++ b/ui/sd_internal/task_manager.py @@ -381,7 +381,7 @@ def shutdown_event(): # Signal render thread to close on shutdown current_state_error = SystemExit('Application shutting down.') def render(req : ImageRequest): - if not is_alive(): # Render thread is dead + if is_alive() <= 0: # Render thread is dead raise ChildProcessError('Rendering thread has died.') # Alive, check if task in cache task = task_cache.tryGet(req.session_id) From 06c2ab045a963a0a180ac103a9900618627685f1 Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 26 Oct 2022 16:14:29 -0400 Subject: [PATCH 043/221] Fix TypeError: string indices must be integers --- ui/sd_internal/runtime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 972a6062..4c75b60a 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -622,8 +622,8 @@ def do_mk_img(req: Request): save_metadata(meta_out_path, req, prompts[0], opt_seed) if return_orig_img: - img_data = img_to_base64_str(img, req.output_format) - res_image_orig = ResponseImage(data=img_data, seed=opt_seed) + img_str = img_to_base64_str(img, req.output_format) + res_image_orig = ResponseImage(data=img_str, seed=opt_seed) res.images.append(res_image_orig) if req.save_to_disk_path is not None: From d3df113fb07edb370e0f74b1e4e3b7b5a14a88dd Mon Sep 17 00:00:00 2001 From: Marc-Andre Ferland Date: Wed, 26 Oct 2022 16:52:31 -0400 Subject: [PATCH 044/221] When reduced_memory is True, on crash only move model back to Cpu. --- ui/sd_internal/runtime.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ui/sd_internal/runtime.py b/ui/sd_internal/runtime.py index 972a6062..3e0c6d34 100644 --- a/ui/sd_internal/runtime.py +++ b/ui/sd_internal/runtime.py @@ -374,9 +374,17 @@ def mk_img(req: Request): yield from do_mk_img(req) except Exception as e: print(traceback.format_exc()) - # Model crashed, release all resources in unknown state. - unload_models() - unload_filters() + + if thread_data.reduced_memory: + thread_data.modelFS.to('cpu') + thread_data.modelCS.to('cpu') + thread_data.model.model1.to("cpu") + thread_data.model.model2.to("cpu") + else: + # Model crashed, release all resources in unknown state. + unload_models() + unload_filters() + gc() # Release from memory. yield json.dumps({ "status": 'failed', From 7c5bbca2fa220f015894467bf832da100ff5f72e Mon Sep 17 00:00:00 2001 From: cmdr2 Date: Thu, 27 Oct 2022 20:49:05 +0530 Subject: [PATCH 045/221] Bump version --- ui/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/index.html b/ui/index.html index 111dad6f..5a37d765 100644 --- a/ui/index.html +++ b/ui/index.html @@ -18,7 +18,7 @@