Update to the latest commit on basujindal's SD fork; More VRAM garbage-collection; Speed up live preview by displaying only every 5th step

This commit is contained in:
cmdr2 2022-09-22 22:14:25 +05:30
parent f98225cdb6
commit 7b520942dc
3 changed files with 56 additions and 27 deletions

View File

@ -15,7 +15,7 @@
@call git reset --hard
@call git pull
@call git checkout d154155d4c0b43e13ec1f00eb72b7ff9d522fcf9
@call git checkout f6cfebffa752ee11a7b07497b8529d5971de916c
@call git apply ..\ui\sd_internal\ddim_callback.patch
@ -32,7 +32,7 @@
)
@cd stable-diffusion
@call git checkout d154155d4c0b43e13ec1f00eb72b7ff9d522fcf9
@call git checkout f6cfebffa752ee11a7b07497b8529d5971de916c
@call git apply ..\ui\sd_internal\ddim_callback.patch

View File

@ -1,7 +1,16 @@
diff --git a/optimizedSD/ddpm.py b/optimizedSD/ddpm.py
index dcf7901..4028a70 100644
index b967b55..75ddd8b 100644
--- a/optimizedSD/ddpm.py
+++ b/optimizedSD/ddpm.py
@@ -22,7 +22,7 @@ from ldm.util import exists, default, instantiate_from_config
from ldm.modules.diffusionmodules.util import make_beta_schedule
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
-from samplers import CompVisDenoiser, get_ancestral_step, to_d, append_dims,linear_multistep_coeff
+from .samplers import CompVisDenoiser, get_ancestral_step, to_d, append_dims,linear_multistep_coeff
def disabled_train(self):
"""Overwrite model.train with this function to make sure train/eval mode
@@ -485,6 +485,7 @@ class UNet(DDPM):
log_every_t=100,
unconditional_guidance_scale=1.,
@ -25,11 +34,11 @@ index dcf7901..4028a70 100644
+ callback=callback, img_callback=img_callback,
+ streaming_callbacks=streaming_callbacks)
# elif sampler == "euler":
# cvd = CompVisDenoiser(self.alphas_cumprod)
@@ -536,11 +540,15 @@ class UNet(DDPM):
# samples = self.heun_sampling(noise, sig, conditioning, unconditional_conditioning=unconditional_conditioning,
# unconditional_guidance_scale=unconditional_guidance_scale)
elif sampler == "euler":
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=False)
@@ -555,11 +559,15 @@ class UNet(DDPM):
samples = self.lms_sampling(self.alphas_cumprod,x_latent, S, conditioning, unconditional_conditioning=unconditional_conditioning,
unconditional_guidance_scale=unconditional_guidance_scale)
+ if streaming_callbacks: # this line needs to be right after the sampling() call
+ yield from samples
@ -44,7 +53,7 @@ index dcf7901..4028a70 100644
@torch.no_grad()
def plms_sampling(self, cond,b, img,
@@ -548,7 +556,8 @@ class UNet(DDPM):
@@ -567,7 +575,8 @@ class UNet(DDPM):
callback=None, quantize_denoised=False,
mask=None, x0=None, img_callback=None, log_every_t=100,
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
@ -54,13 +63,13 @@ index dcf7901..4028a70 100644
device = self.betas.device
timesteps = self.ddim_timesteps
@@ -580,10 +589,22 @@ class UNet(DDPM):
@@ -599,10 +608,21 @@ class UNet(DDPM):
old_eps.append(e_t)
if len(old_eps) >= 4:
old_eps.pop(0)
- if callback: callback(i)
- if img_callback: img_callback(pred_x0, i)
-
- return img
+ if callback:
+ if streaming_callbacks:
@ -80,7 +89,7 @@ index dcf7901..4028a70 100644
@torch.no_grad()
def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
@@ -687,7 +708,9 @@ class UNet(DDPM):
@@ -706,7 +726,9 @@ class UNet(DDPM):
@torch.no_grad()
def ddim_sampling(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
@ -91,11 +100,10 @@ index dcf7901..4028a70 100644
timesteps = self.ddim_timesteps
timesteps = timesteps[:t_start]
@@ -710,11 +733,25 @@ class UNet(DDPM):
x_dec = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
@@ -730,10 +752,24 @@ class UNet(DDPM):
unconditional_guidance_scale=unconditional_guidance_scale,
unconditional_conditioning=unconditional_conditioning)
+
+ if callback:
+ if streaming_callbacks:
+ yield from callback(i)
@ -106,7 +114,7 @@ index dcf7901..4028a70 100644
+ yield from img_callback(x_dec, i)
+ else:
+ img_callback(x_dec, i)
+
if mask is not None:
- return x0 * mask + (1. - mask) * x_dec
+ x_dec = x0 * mask + (1. - mask) * x_dec
@ -119,3 +127,16 @@ index dcf7901..4028a70 100644
@torch.no_grad()
diff --git a/optimizedSD/openaimodelSplit.py b/optimizedSD/openaimodelSplit.py
index abc3098..7a32ffe 100644
--- a/optimizedSD/openaimodelSplit.py
+++ b/optimizedSD/openaimodelSplit.py
@@ -13,7 +13,7 @@ from ldm.modules.diffusionmodules.util import (
normalization,
timestep_embedding,
)
-from splitAttention import SpatialTransformer
+from .splitAttention import SpatialTransformer
class AttentionPool2d(nn.Module):

View File

@ -193,6 +193,15 @@ def mk_img(req: Request):
gc()
if device != "cpu":
modelFS.to("cpu")
modelCS.to("cpu")
model.model1.to("cpu")
model.model2.to("cpu")
gc()
yield json.dumps({
"status": 'failed',
"detail": str(e)
@ -312,11 +321,7 @@ def do_mk_img(req: Request):
if device != "cpu" and precision == "autocast":
mask = mask.half()
if device != "cpu":
mem = torch.cuda.memory_allocated() / 1e6
modelFS.to("cpu")
while torch.cuda.memory_allocated() / 1e6 >= mem:
time.sleep(1)
move_fs_to_cpu()
assert 0. <= opt_strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt_strength * opt_ddim_steps)
@ -365,7 +370,7 @@ def do_mk_img(req: Request):
if req.stream_progress_updates:
progress = {"step": i, "total_steps": opt_ddim_steps}
if req.stream_image_progress:
if req.stream_image_progress and i % 5 == 0:
partial_images = []
for i in range(batch_size):
@ -484,12 +489,8 @@ def do_mk_img(req: Request):
seeds += str(opt_seed) + ","
opt_seed += 1
move_fs_to_cpu()
gc()
if device != "cpu":
mem = torch.cuda.memory_allocated() / 1e6
modelFS.to("cpu")
while torch.cuda.memory_allocated() / 1e6 >= mem:
time.sleep(1)
del x_samples, x_samples_ddim, x_sample
print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
@ -575,6 +576,13 @@ def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, o
else:
return samples_ddim
def move_fs_to_cpu(max_wait_seconds=30.0):
    """Move the module-level ``modelFS`` to the CPU and wait for CUDA memory to drop.

    Does nothing when the module-level ``device`` is ``"cpu"``. Otherwise the
    amount of CUDA memory allocated before the move is recorded, ``modelFS`` is
    moved to the CPU, and the function polls once per second until the
    allocated amount falls below the recorded value.

    Args:
        max_wait_seconds: Upper bound, in seconds, on how long to poll.
            Without a bound the loop never exits when the move frees nothing
            (e.g. ``modelFS`` was already on the CPU, or no CUDA memory was
            allocated to begin with), which would hang the caller.
    """
    if device == "cpu":
        return

    # Values are in MB; only the before/after comparison matters.
    mem_before = torch.cuda.memory_allocated() / 1e6
    modelFS.to("cpu")

    deadline = time.monotonic() + max_wait_seconds
    while torch.cuda.memory_allocated() / 1e6 >= mem_before:
        if time.monotonic() >= deadline:
            # Give up instead of blocking forever; the move itself has
            # already been issued, so continuing is safe.
            print("move_fs_to_cpu: CUDA memory did not decrease within",
                  max_wait_seconds, "seconds; continuing")
            break
        time.sleep(1)
def gc():
if device == 'cpu':
return