refactor video file create and save
Signed-off-by: Vladimir Mandic <[email protected]>
vladmandic committed Jan 15, 2025
1 parent 0ac2e3c commit 5a59054
Showing 21 changed files with 166 additions and 265 deletions.
24 changes: 19 additions & 5 deletions CHANGELOG.md
@@ -1,16 +1,20 @@
# Change Log for SD.Next

## Update for 2025-01-13
## Update for 2025-01-15

### Highlights for 2025-01-15

Two weeks since the last release, time for an update!
This time a slightly shorter highlight reel as this is primarily a service release, but there are still more than a few updates

*What's New?"
- Large [Wiki](https://github.com/vladmandic/automatic/wiki)/[Docs](https://vladmandic.github.io/sdnext-docs/) updates
- New models: **Allegro Video**, new pipelines: **PixelSmith**, updates: **Hunyuan-Video**, **LTX-Video**
- New schedulers (TDD)
- Improvements to **Detailer**, **XYZ grid**, **Sysinfo**, **Logging**
- New models: **Allegro Video**, new pipelines: **PixelSmith**, updates: **Hunyuan-Video**, **LTX-Video**, **Sana 4k**
- New version for **ZLUDA**
- New features in **Detailer**, **XYZ grid**, **Sysinfo**, **Logging**, **Schedulers**, **Video save/create**
- And tons of hotfixes...

### Details for 2025-01-13
### Details for 2025-01-15

- [Wiki/Docs](https://vladmandic.github.io/sdnext-docs/):
- updated: Detailer, Install, Update, Debug, Control-HowTo, ZLUDA
@@ -26,6 +30,10 @@ Two weeks since the last release, time for an update!
- example: <https://huggingface.co/Cseti/HunyuanVideo-LoRA-Arcane_Jinx-v1>
- [LTX Video](https://github.com/Lightricks/LTX-Video) framewise decoding
- enabled by default, allows generating longer videos with reduced memory requirements
- [Sana 4k](https://huggingface.co/Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers)
- new Sana variant with support for directly generating 4k images
- simply select from *networks -> models -> reference*
- tip: enable VAE tiling when generating very large images (see the sketch below)
- **Logging**:
- reverted enabling debug logging by default
- updated [debug wiki](https://github.com/vladmandic/automatic/wiki/debug)
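
Editor's note (not part of this commit): for readers outside the UI, a hypothetical diffusers equivalent of the Sana 4k tip above; it assumes the standard `SanaPipeline` API and that the Sana VAE exposes `enable_tiling()`, with the model path taken from the changelog entry.

```python
import torch
from diffusers import SanaPipeline

# Load the 4k Sana variant referenced above (BF16 weights)
pipe = SanaPipeline.from_pretrained(
    "Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers",
    torch_dtype=torch.bfloat16,
).to("cuda")
pipe.vae.enable_tiling()  # VAE tiling keeps decode memory bounded at 4k

image = pipe("a mountain lake at dawn", height=4096, width=4096).images[0]
image.save("sana-4k.png")
```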
@@ -38,6 +46,7 @@ Two weeks since the last release, time for an update!
- startup tracing and optimizations
- threading load locks on model loads
- refactor native vs legacy model loader
- video save/create
- **Schedulers**:
- [TDD](https://github.com/RedAIGC/Target-Driven-Distillation): new super-fast scheduler that can generate images in 4-8 steps
  recommended for use with [TDD LoRA](https://huggingface.co/RED-AIGC/TDD/tree/main)
@@ -51,6 +60,11 @@ Two weeks since the last release, time for an update!
- since different TAESD versions produce different results and the latest is not necessarily the greatest
  you can choose the TAESD version in settings -> live preview
  also added support for another finetuned version of TAESD: [Hybrid TinyVAE](https://huggingface.co/cqyan/hybrid-sd-tinyvae-xl)
- **Video**:
- all video create/save code is now unified
- add support for video formats: GIF, PNG, MP4/MP4V, MP4/AVC1, MP4/JVT3, MKV/H264, AVI/DIVX, AVI/RGBA, MJPEG/MJPG, MPG/MPG1, AVR/AVR1
- *note*: video format support is platform dependent and not all formats may be available on all platforms (see the probe sketch after this diff)
- *note*: AVC1 and H264 require a custom OpenCV build due to OSS licensing issues
- **ZLUDA** v3.8.7
- new runtime compiler implementation: complex types and JIT are now available
- fast Fourier transform is implemented
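Editor's note (not part of this commit): since codec availability depends on the platform's OpenCV build, as the changelog notes above, a minimal probe sketch follows; the codec list and probe filename are illustrative.

```python
import os
import cv2

def fourcc_available(fourcc: str, container: str = ".mp4") -> bool:
    """Return True when this OpenCV build can open a writer for the codec."""
    path = f"probe{container}"
    writer = cv2.VideoWriter(path, cv2.VideoWriter_fourcc(*fourcc), 10.0, (64, 64))
    ok = writer.isOpened()  # False when the codec/container pair is unsupported
    writer.release()
    if os.path.exists(path):
        os.remove(path)  # discard the empty probe file
    return ok

for codec in ("mp4v", "avc1", "MJPG", "DIVX"):
    print(codec, fourcc_available(codec))  # avc1 typically needs a custom build
```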
1 change: 1 addition & 0 deletions html/previews.json
@@ -11,6 +11,7 @@
"THUDM--CogVideoX-5b-I2V": "models/Reference/THUDM--CogView3-Plus-3B.jpg",
"Efficient-Large-Model--Sana_1600M_1024px_BF16_diffusers": "models/Reference/Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"Efficient-Large-Model--Sana_1600M_2Kpx_BF16_diffusers": "models/Reference/Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"Efficient-Large-Model--Sana_1600M_4Kpx_BF16_diffusers": "models/Reference/Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"Efficient-Large-Model--Sana_600M_1024px_diffusers": "models/Reference/Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"stabilityai--stable-video-diffusion-img2vid-xt-1-1": "models/Reference/stabilityai--stable-video-diffusion-img2vid-xt.jpg",
"shuttleai--shuttle-3-diffusion": "models/Reference/shuttleai--shuttle-3-diffusion.jpg"
12 changes: 9 additions & 3 deletions html/reference.json
@@ -180,19 +180,25 @@
"extras": "sampler: Default, cfg_scale: 3.5"
},

"NVLabs Sana 1.6B 2048px": {
"NVLabs Sana 1.6B 4k": {
"path": "Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers",
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"skip": true
},
"NVLabs Sana 1.6B 2k": {
"path": "Efficient-Large-Model/Sana_1600M_2Kpx_BF16_diffusers",
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"skip": true
},
"NVLabs Sana 1.6B 1024px": {
"NVLabs Sana 1.6B 1k": {
"path": "Efficient-Large-Model/Sana_1600M_1024px_diffusers",
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
"skip": true
},
"NVLabs Sana 0.6B 512px": {
"NVLabs Sana 0.6B 0.5k": {
"path": "Efficient-Large-Model/Sana_600M_512px_diffusers",
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
71 changes: 1 addition & 70 deletions modules/images.py
@@ -15,6 +15,7 @@
from modules.images_grid import image_grid, get_grid_size, split_grid, combine_grid, check_grid_size, get_font, draw_grid_annotations, draw_prompt_matrix, GridAnnotation, Grid # pylint: disable=unused-import
from modules.images_resize import resize_image # pylint: disable=unused-import
from modules.images_namegen import FilenameGenerator, get_next_sequence_number # pylint: disable=unused-import
from modules.video import save_video # pylint: disable=unused-import


debug = errors.log.trace if os.environ.get('SD_PATH_DEBUG', None) is not None else lambda *args, **kwargs: None
@@ -190,76 +191,6 @@ def save_image(image,
return params.filename, filename_txt, exifinfo


def save_video_atomic(images, filename, video_type: str = 'none', duration: float = 2.0, loop: bool = False, interpolate: int = 0, scale: float = 1.0, pad: int = 1, change: float = 0.3):
try:
import cv2
except Exception as e:
shared.log.error(f'Save video: cv2: {e}')
return
os.makedirs(os.path.dirname(filename), exist_ok=True)
if video_type.lower() == 'mp4':
frames = images
if interpolate > 0:
try:
import modules.rife
frames = modules.rife.interpolate(images, count=interpolate, scale=scale, pad=pad, change=change)
except Exception as e:
shared.log.error(f'RIFE interpolation: {e}')
errors.display(e, 'RIFE interpolation')
video_frames = [np.array(frame) for frame in frames]
fourcc = "mp4v"
h, w, _c = video_frames[0].shape
video_writer = cv2.VideoWriter(filename, fourcc=cv2.VideoWriter_fourcc(*fourcc), fps=len(frames)/duration, frameSize=(w, h))
for i in range(len(video_frames)):
img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
video_writer.write(img)
size = os.path.getsize(filename)
shared.log.info(f'Save video: file="{filename}" frames={len(frames)} duration={duration} fourcc={fourcc} size={size}')
if video_type.lower() == 'gif' or video_type.lower() == 'png':
append = images.copy()
image = append.pop(0)
if loop:
append += append[::-1]
frames=len(append) + 1
image.save(
filename,
save_all = True,
append_images = append,
optimize = False,
duration = 1000.0 * duration / frames,
loop = 0 if loop else 1,
)
size = os.path.getsize(filename)
shared.log.info(f'Save video: file="{filename}" frames={len(append) + 1} duration={duration} loop={loop} size={size}')


def save_video(p, images, filename = None, video_type: str = 'none', duration: float = 2.0, loop: bool = False, interpolate: int = 0, scale: float = 1.0, pad: int = 1, change: float = 0.3, sync: bool = False):
if images is None or len(images) < 2 or video_type is None or video_type.lower() == 'none':
return None
image = images[0]
if p is not None:
seed = p.all_seeds[0] if getattr(p, 'all_seeds', None) is not None else p.seed
prompt = p.all_prompts[0] if getattr(p, 'all_prompts', None) is not None else p.prompt
namegen = FilenameGenerator(p, seed=seed, prompt=prompt, image=image)
else:
namegen = FilenameGenerator(None, seed=0, prompt='', image=image)
if filename is None and p is not None:
filename = namegen.apply(shared.opts.samples_filename_pattern if shared.opts.samples_filename_pattern and len(shared.opts.samples_filename_pattern) > 0 else "[seq]-[prompt_words]")
filename = os.path.join(shared.opts.outdir_video, filename)
filename = namegen.sequence(filename, shared.opts.outdir_video, '')
else:
if os.pathsep not in filename:
filename = os.path.join(shared.opts.outdir_video, filename)
if not filename.lower().endswith(video_type.lower()):
filename += f'.{video_type.lower()}'
filename = namegen.sanitize(filename)
if not sync:
threading.Thread(target=save_video_atomic, args=(images, filename, video_type, duration, loop, interpolate, scale, pad, change)).start()
else:
save_video_atomic(images, filename, video_type, duration, loop, interpolate, scale, pad, change)
return filename


def safe_decode_string(s: bytes):
remove_prefix = lambda text, prefix: text[len(prefix):] if text.startswith(prefix) else text # pylint: disable=unnecessary-lambda-assignment
for encoding in ['utf-8', 'utf-16', 'ascii', 'latin_1', 'cp1252', 'cp437']: # try different encodings
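Editor's note (not part of this commit): the helpers removed above now live in `modules/video`, per the new import at the top of this file. Assuming the refactor kept the old call signature shown in the removed code, usage might look like this; the filename and frames are illustrative.

```python
from PIL import Image
from modules.video import save_video  # relocated helper, per the import above

# save_video needs at least two frames; it returns the resolved filename
frames = [Image.new("RGB", (512, 512), color) for color in ("red", "navy")]
filename = save_video(None, frames, filename="example", video_type="GIF",
                      duration=2.0, loop=True, sync=True)  # sync: write before returning
```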
3 changes: 3 additions & 0 deletions modules/processing_args.py
@@ -179,6 +179,9 @@ def set_pipeline_args(p, model, prompts:list, negative_prompts:list, prompts_2:t
p.extra_generation_params["CHI"] = chi
if not chi:
args['complex_human_instruction'] = None
if 'use_resolution_binning' in possible:
args['use_resolution_binning'] = True
p.extra_generation_params["Binning"] = True
if prompt_parser_diffusers.embedder is not None and not prompt_parser_diffusers.embedder.scheduled_prompt: # not scheduled so we dont need it anymore
prompt_parser_diffusers.embedder = None

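Editor's note (not part of this commit): `possible` in the diff above is presumably the set of keyword arguments the loaded pipeline's `__call__` accepts, so `use_resolution_binning` is only passed to pipelines (such as Sana) that support it. A self-contained sketch of that gating pattern, under that assumption:

```python
import inspect

class DummyPipeline:
    """Stand-in for a loaded diffusers pipeline."""
    def __call__(self, prompt: str, use_resolution_binning: bool = True):
        return prompt

def accepted_kwargs(pipe) -> set:
    """Keyword arguments the pipeline's __call__ will accept."""
    return set(inspect.signature(pipe.__call__).parameters)

possible = accepted_kwargs(DummyPipeline())
args = {}
if "use_resolution_binning" in possible:  # only set the flag when supported
    args["use_resolution_binning"] = True
print(args)  # {'use_resolution_binning': True}
```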
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -768,6 +768,7 @@ def get_default_modes():
"autolaunch": OptionInfo(False, "Autolaunch browser upon startup"),
"font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}),
"aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"),
"logmonitor_show": OptionInfo(True, "Show log view"),
"motd": OptionInfo(False, "Show MOTD"),
"compact_view": OptionInfo(False, "Compact view"),
"return_grid": OptionInfo(True, "Show grid in results"),
@@ -787,7 +788,6 @@ def get_default_modes():
"taesd_layers": OptionInfo(3, "TAESD decode layers", gr.Slider, {"minimum": 1, "maximum": 3, "step": 1}),
"live_preview_downscale": OptionInfo(True, "Downscale high resolution live previews"),

"logmonitor_show": OptionInfo(True, "Show log view"),
"logmonitor_refresh_period": OptionInfo(5000, "Log view update period", gr.Slider, {"minimum": 0, "maximum": 30000, "step": 25}),
"notification_audio_enable": OptionInfo(False, "Play a notification upon completion"),
"notification_audio_path": OptionInfo("html/notification.mp3","Path to notification sound", component_args=hide_dirs, folder=True),
9 changes: 2 additions & 7 deletions modules/ui_control.py
@@ -168,13 +168,8 @@ def create_ui(_blocks: gr.Blocks=None):
with gr.Row():
video_skip_frames = gr.Slider(minimum=0, maximum=100, step=1, label='Skip input frames', value=0, elem_id="control_video_skip_frames")
with gr.Row():
video_type = gr.Dropdown(label='Video file', choices=['None', 'GIF', 'PNG', 'MP4'], value='None', elem_id="control_video_type")
video_duration = gr.Slider(label='Duration', minimum=0.25, maximum=300, step=0.25, value=2, visible=False, elem_id="control_video_duration")
with gr.Row():
video_loop = gr.Checkbox(label='Loop', value=True, visible=False, elem_id="control_video_loop")
video_pad = gr.Slider(label='Pad frames', minimum=0, maximum=24, step=1, value=1, visible=False, elem_id="control_video_pad")
video_interpolate = gr.Slider(label='Interpolate frames', minimum=0, maximum=24, step=1, value=0, visible=False, elem_id="control_video_interpolate")
video_type.change(fn=helpers.video_type_change, inputs=[video_type], outputs=[video_duration, video_loop, video_pad, video_interpolate])
from modules.ui_sections import create_video_inputs
video_type, video_duration, video_loop, video_pad, video_interpolate = create_video_inputs()

enable_hr, hr_sampler_index, hr_denoising_strength, hr_resize_mode, hr_resize_context, hr_upscaler, hr_force, hr_second_pass_steps, hr_scale, hr_resize_x, hr_resize_y, refiner_steps, refiner_start, refiner_prompt, refiner_negative = ui_sections.create_hires_inputs('control')
detailer_enabled, detailer_prompt, detailer_negative, detailer_steps, detailer_strength = shared.yolo.ui('control')
9 changes: 0 additions & 9 deletions modules/ui_control_helpers.py
@@ -181,15 +181,6 @@ def select_input(input_mode, input_image, init_image, init_type, input_resize, i
return res


def video_type_change(video_type):
return [
gr.update(visible=video_type != 'None'),
gr.update(visible=video_type == 'GIF' or video_type == 'PNG'),
gr.update(visible=video_type == 'MP4'),
gr.update(visible=video_type == 'MP4'),
]


def copy_input(mode_from, mode_to, input_image, input_resize, input_inpaint):
debug_log(f'Control transfter input: from={mode_from} to={mode_to} image={input_image} resize={input_resize} inpaint={input_inpaint}')
def getimg(ctrl):
20 changes: 20 additions & 0 deletions modules/ui_sections.py
@@ -131,6 +131,26 @@ def create_seed_inputs(tab, reuse_visible=True):
return seed, reuse_seed, subseed, reuse_subseed, subseed_strength, seed_resize_from_h, seed_resize_from_w


def create_video_inputs():
def video_type_change(video_type):
return [
gr.update(visible=video_type != 'None'),
gr.update(visible=video_type in ['GIF', 'PNG']),
gr.update(visible=video_type not in ['None', 'GIF', 'PNG']),
gr.update(visible=video_type not in ['None', 'GIF', 'PNG']),
]
with gr.Column():
video_codecs = ['None', 'GIF', 'PNG', 'MP4/MP4V', 'MP4/AVC1', 'MP4/JVT3', 'MKV/H264', 'AVI/DIVX', 'AVI/RGBA', 'MJPEG/MJPG', 'MPG/MPG1', 'AVR/AVR1']
video_type = gr.Dropdown(label='Video type', choices=video_codecs, value='None')
with gr.Column():
video_duration = gr.Slider(label='Duration', minimum=0.25, maximum=300, step=0.25, value=2, visible=False)
video_loop = gr.Checkbox(label='Loop', value=True, visible=False, elem_id="control_video_loop")
video_pad = gr.Slider(label='Pad frames', minimum=0, maximum=24, step=1, value=1, visible=False)
video_interpolate = gr.Slider(label='Interpolate frames', minimum=0, maximum=24, step=1, value=0, visible=False)
video_type.change(fn=video_type_change, inputs=[video_type], outputs=[video_duration, video_loop, video_pad, video_interpolate])
return video_type, video_duration, video_loop, video_pad, video_interpolate


def create_cfg_inputs(tab):
with gr.Row():
cfg_scale = gr.Slider(minimum=0.0, maximum=30.0, step=0.1, label='Guidance scale', value=6.0, elem_id=f"{tab}_cfg_scale")
(diffs for the remaining 12 changed files were not loaded)
