Update to the latest commit on basujindal's SD fork; more VRAM garbage collection; speed up the live preview by displaying only every 5th step

This commit is contained in:
cmdr2
2022-09-22 22:14:25 +05:30
parent f98225cdb6
commit 7b520942dc
3 changed files with 56 additions and 27 deletions

View File

@ -193,6 +193,15 @@ def mk_img(req: Request):
gc()
if device != "cpu":
modelFS.to("cpu")
modelCS.to("cpu")
model.model1.to("cpu")
model.model2.to("cpu")
gc()
yield json.dumps({
"status": 'failed',
"detail": str(e)
@ -312,11 +321,7 @@ def do_mk_img(req: Request):
if device != "cpu" and precision == "autocast":
mask = mask.half()
if device != "cpu":
mem = torch.cuda.memory_allocated() / 1e6
modelFS.to("cpu")
while torch.cuda.memory_allocated() / 1e6 >= mem:
time.sleep(1)
move_fs_to_cpu()
assert 0. <= opt_strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt_strength * opt_ddim_steps)
@ -365,7 +370,7 @@ def do_mk_img(req: Request):
if req.stream_progress_updates:
progress = {"step": i, "total_steps": opt_ddim_steps}
if req.stream_image_progress:
if req.stream_image_progress and i % 5 == 0:
partial_images = []
for i in range(batch_size):
@ -484,12 +489,8 @@ def do_mk_img(req: Request):
seeds += str(opt_seed) + ","
opt_seed += 1
move_fs_to_cpu()
gc()
if device != "cpu":
mem = torch.cuda.memory_allocated() / 1e6
modelFS.to("cpu")
while torch.cuda.memory_allocated() / 1e6 >= mem:
time.sleep(1)
del x_samples, x_samples_ddim, x_sample
print("memory_final = ", torch.cuda.memory_allocated() / 1e6)
@ -575,6 +576,13 @@ def _img2img(init_latent, t_enc, batch_size, opt_scale, c, uc, opt_ddim_steps, o
else:
return samples_ddim
def move_fs_to_cpu():
    """Offload modelFS to system RAM and wait until the GPU actually frees VRAM.

    No-op when running on CPU. After calling ``.to("cpu")`` the function polls
    ``torch.cuda.memory_allocated()`` once per second until reported usage has
    dropped below the level recorded before the move — presumably because the
    allocator releases the tensors asynchronously (NOTE(review): confirm; if
    usage never drops this loop will not terminate).
    """
    if device == "cpu":
        return
    vram_before_mb = torch.cuda.memory_allocated() / 1e6
    modelFS.to("cpu")
    # Poll until the CUDA allocator reports less memory than before the move.
    while torch.cuda.memory_allocated() / 1e6 >= vram_before_mb:
        time.sleep(1)
def gc():
if device == 'cpu':
return