Stitchedvideos #191

Open
wants to merge 8 commits into base: main
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
imageio_ffmpeg
av
moviepy
numexpr
numexpr
167 changes: 99 additions & 68 deletions scripts/modelscope/process_modelscope.py
@@ -1,5 +1,5 @@
# Function calls referenced from https://github.com/modelscope/modelscope/tree/master/modelscope/pipelines/multi_modal

import shutil
# Copyright (C) 2023 by Artem Khrapov (kabachuha)
# Read LICENSE for usage terms.

@@ -17,7 +17,7 @@
from types import SimpleNamespace
from t2v_helpers.general_utils import get_t2v_version, get_model_location
import time, math
from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name
from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, ffmpeg_reverse_frames, ffmpeg_combine_videos, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name
from t2v_helpers.args import get_outdir, process_args
import t2v_helpers.args as t2v_helpers_args
from modules import shared, sd_hijack, lowvram
@@ -76,87 +76,90 @@ def process_modelscope(args_dict, extra_args=None):

mask = None

if args.do_vid2vid:
if args.vid2vid_frames is None and args.vid2vid_frames_path == "":
raise FileNotFoundError("Please upload a video :()")
print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode')

# Overrides
if args.vid2vid_frames is not None:
vid2vid_frames_path = args.vid2vid_frames.name
# Start the batch count loop
pbar = tqdm(range(args.batch_count), leave=False)
if args.batch_count == 1:
pbar.disable = True

print("got a request to *vid2vid* an existing video.")
vids_to_pack = []

in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path)
folder_name = clean_folder_name(Path(vid2vid_frames_path).stem)
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name)
i = 1
while os.path.exists(outdir_no_tmp):
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i))
i += 1
state.job_count = args.batch_count

outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames')
os.makedirs(outdir_v2v, exist_ok=True)
for batch in pbar:
if args.do_vid2vid:
if args.vid2vid_frames is None and args.vid2vid_frames_path == "":
raise FileNotFoundError("Please upload a video :()")

vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=args.vid2vid_startFrame + args.frames,
numeric_files_output=True, out_img_format='png')
# Overrides
if args.vid2vid_frames is not None:
vid2vid_frames_path = args.vid2vid_frames.name

temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder")
duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name)
print("got a request to *vid2vid* an existing video.")

videogen = []
for f in os.listdir(temp_convert_raw_png_path):
# double check for old _depth_ files, not really needed probably but keeping it for now
if '_depth_' not in f:
videogen.append(f)
in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path)
folder_name = clean_folder_name(Path(vid2vid_frames_path).stem)
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name)
i = 1
while os.path.exists(outdir_no_tmp):
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i))
i += 1

videogen.sort(key=lambda x: int(x.split('.')[0]))
outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames')
os.makedirs(outdir_v2v, exist_ok=True)

images = []
for file in tqdm(videogen, desc="Loading frames"):
image = Image.open(os.path.join(temp_convert_raw_png_path, file))
image = image.resize((args.width, args.height), Image.ANTIALIAS)
array = np.array(image)
images += [array]
extract_to_frame=args.vid2vid_startFrame + args.frames
print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}')
vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame,
numeric_files_output=True, out_img_format='png')

# print(images)
temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder")
duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name)

images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w
videogen = []
for f in os.listdir(temp_convert_raw_png_path):
# double check for old _depth_ files, not really needed probably but keeping it for now
if '_depth_' not in f:
videogen.append(f)

print(f"Converted the frames to tensor {bfchw.shape}")
videogen.sort(key=lambda x: int(x.split('.')[0]))

vd_out = torch.from_numpy(bcfhw).to("cuda")
images = []
for file in tqdm(videogen, desc="Loading frames"):
image = Image.open(os.path.join(temp_convert_raw_png_path, file))
image = image.resize((args.width, args.height))#, Image.ANTIALIAS)
array = np.array(image)
images += [array]

# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1
# print(images)

# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
print("Computing latents")
latents = pipe.compute_latents(vd_out).to(device)
images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w

skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1))))
else:
latents = None
args.strength = 1
skip_steps = 0
print(f"Converted the frames to tensor {bfchw.shape}")

print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode')
vd_out = torch.from_numpy(bcfhw).to("cuda")

# Start the batch count loop
pbar = tqdm(range(args.batch_count), leave=False)
if args.batch_count == 1:
pbar.disable = True
# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1

vids_to_pack = []
# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
print("Computing latents")
latents = pipe.compute_latents(vd_out).to(device)

state.job_count = args.batch_count
skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1))))
else:
latents = None
args.strength = 1
skip_steps = 0
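# Illustrative example (hypothetical numbers, not from the diff): with steps = 30
# and strength = 0.75, skip_steps = floor(30 * (1 - 0.75)) = 7, so the first 7
# denoising steps are skipped and only the remaining 23 are sampled starting from
# the vid2vid latents; strength = 1 skips nothing and runs the full schedule.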

for batch in pbar:
#do txt2vid
state.job_no = batch
if state.skipped:
state.skipped = False
@@ -172,7 +175,7 @@ def process_modelscope(args_dict, extra_args=None):
print("Received an image for inpainting", args.inpainting_image.name)
for i in range(args.frames):
image = Image.open(args.inpainting_image.name).convert("RGB")
image = image.resize((args.width, args.height), Image.ANTIALIAS)
image = image.resize((args.width, args.height))
array = np.array(image)
images += [array]

@@ -220,8 +223,10 @@
samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + batch if args.seed != -1 else -1, args.cfg_scale,
args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler)

if batch > 0:
if args.batch_count > 1:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}")
else:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}")
print(f'text2video finished, saving frames to {outdir_current}')

# just deleted the folder so we need to make it again
@@ -230,19 +235,45 @@
cv2.imwrite(outdir_current + os.path.sep +
f"{i:06}.png", samples[i])

gc.collect()
devices.torch_gc()

# TODO: add params to the GUI
if not video_args.skip_video_creation:
ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current,
"%06d.png"),
stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack,
audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset)
if video_args.do_stitch_videos and video_args.stitched_video_strength > 0.35:
reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4"
ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset)
args.do_vid2vid = True
args.vid2vid_startFrame = 0
print(f"vid2vid start frame: {args.vid2vid_startFrame}")
print(f'strength: {args.strength}')
args.strength = video_args.stitched_video_strength
args.vid2vid_frames = open(reverse_video_path, 'rb')
print(f't2v complete, result saved at {outdir_current}')

mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read()
dataurl = "data:video/mp4;base64," + b64encode(mp4).decode()
if video_args.do_stitch_videos:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_final")

os.makedirs(outdir_current, exist_ok=True)

combined_video_path = os.path.join(get_outdir(), f"{init_timestring}_final")
combined_video_list = [os.path.join(get_outdir(), f"{init_timestring}_{i}", "vid.mp4").replace('/', os.path.sep) for i in range(0, args.batch_count)]

ffmpeg_combine_videos(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=combined_video_path + os.path.sep + f"vid.mp4",
input_videos=combined_video_list, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset, add_soundtrack=video_args.add_soundtrack,
audio_path=video_args.soundtrack_path)
else:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}")

mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read()
dataurl = "data:video/mp4;base64," + b64encode(mp4).decode()

if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack:
vids_to_pack.append(dataurl)
if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack:
vids_to_pack.append(dataurl)
t2v_helpers_args.i1_store_t2v = f'<p style=\"font-weight:bold;margin-bottom:0em\">text2video extension for auto1111 — version 1.2b </p>'
for dataurl in vids_to_pack:
t2v_helpers_args.i1_store_t2v += f'<video controls loop><source src="{dataurl}" type="video/mp4"></video><br>'
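
Taken together, the changes above form the stitching loop. The sketch below is a rough outline of that control flow, not the extension's actual API: generate_segment, reverse_video, and concat_videos are hypothetical stand-ins for pipe.infer plus frame saving, ffmpeg_reverse_frames, and ffmpeg_combine_videos.

import os

def stitch_segments_sketch(batch_count, stitched_strength, outdir,
                           generate_segment, reverse_video, concat_videos):
    """Rough outline of the stitched-video flow: each segment after the first
    is seeded from the reversed previous segment via vid2vid."""
    init_video = None       # first segment is plain txt2vid
    strength = 1.0
    segment_paths = []
    for batch in range(batch_count):
        seg_dir = os.path.join(outdir, f"segment_{batch}")
        seg_path = generate_segment(seg_dir, init_video=init_video, strength=strength)
        segment_paths.append(seg_path)
        # Reverse the segment just rendered and feed it back as the next
        # segment's vid2vid input; stitched_strength controls how far the next
        # segment may drift (the PR only re-seeds when it is above 0.35).
        init_video = reverse_video(seg_path)
        strength = stitched_strength
    # Join every per-segment vid.mp4 into one final video.
    return concat_videos(segment_paths, os.path.join(outdir, "final", "vid.mp4"))
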
12 changes: 11 additions & 1 deletion scripts/t2v_helpers/args.py
@@ -101,6 +101,14 @@ def refresh_all_models(model):
# TODO: make it how it's done in Deforum/WebUI, so we won't have to track individual vars
prompt, n_prompt, sampler, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d)
model_type.change(fn=enable_sampler_dropdown, inputs=[model_type], outputs=[sampler])
gr.Markdown('''`Stitch videos` allows you to generate multiple videos consecutively and combine them into
one video when they're done. Use stitched video denoising strength to adjust the continuity between videos.
Uses the batch count parameter to determine how many videos to generate and stitch together.

Currently only works with ModelScope''')
with gr.Row():
do_stitch_videos = gr.Checkbox(label="Stitch videos", value=d.do_stitch_videos, interactive=True)
stitched_video_strength = gr.Slider(label="Stitched video denoising strength", value=d.stitched_video_strength, minimum=0, maximum=1, step=0.01, interactive=True)
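# Illustrative scenario (hypothetical values): with batch_count = 3 and
# stitched_video_strength = 0.6, segment 0 is generated from the prompt alone,
# segments 1 and 2 are generated via vid2vid from the reversed previous segment
# at denoising strength 0.6, and the three per-batch vid.mp4 files are then
# concatenated into <outdir>/<timestring>_final/vid.mp4.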
with gr.Accordion('img2vid', open=False):
inpainting_image = gr.File(label="Inpainting image", interactive=True, file_count="single", file_types=["image"], elem_id="inpainting_chosen_file")
# TODO: should be tied to the total frame count dynamically
@@ -155,7 +163,7 @@ def refresh_all_models(model):

return locals()

t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path').replace("\n", "").replace("\r", "").replace(" ", "").split(',')
t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, do_stitch_videos, stitched_video_strength').replace("\n", "").replace("\r", "").replace(" ", "").split(',')

common_values_names = str('''prompt, n_prompt, sampler, steps, frames, seed, cfg_scale, width, height, eta, batch_count''').replace("\n", "").replace("\r", "").replace(" ", "").split(',')

@@ -205,6 +213,8 @@ def T2VArgs():
prompt = ""
n_prompt = "text, watermark, copyright, blurry, nsfw"
strength = 0.75
do_stitch_videos = False
stitched_video_strength = 0.0
vid2vid_startFrame = 0
inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)' # linear growth weights (as they used to be in the original variant)
inpainting_frames = 0