Stitchedvideos #191

Open
wants to merge 8 commits into base: main
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
imageio_ffmpeg
av
moviepy
numexpr
numexpr
167 changes: 99 additions & 68 deletions scripts/modelscope/process_modelscope.py
@@ -1,5 +1,5 @@
# Function calls referenced from https://github.com/modelscope/modelscope/tree/master/modelscope/pipelines/multi_modal

import shutil
# Copyright (C) 2023 by Artem Khrapov (kabachuha)
# Read LICENSE for usage terms.

@@ -17,7 +17,7 @@
from types import SimpleNamespace
from t2v_helpers.general_utils import get_t2v_version, get_model_location
import time, math
from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name
from t2v_helpers.video_audio_utils import ffmpeg_stitch_video, ffmpeg_reverse_frames, ffmpeg_combine_videos, get_quick_vid_info, vid2frames, duplicate_pngs_from_folder, clean_folder_name
from t2v_helpers.args import get_outdir, process_args
import t2v_helpers.args as t2v_helpers_args
from modules import shared, sd_hijack, lowvram
@@ -76,87 +76,90 @@ def process_modelscope(args_dict, extra_args=None):

mask = None

if args.do_vid2vid:
if args.vid2vid_frames is None and args.vid2vid_frames_path == "":
raise FileNotFoundError("Please upload a video :()")
print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode')

# Overrides
if args.vid2vid_frames is not None:
vid2vid_frames_path = args.vid2vid_frames.name
# Start the batch count loop
pbar = tqdm(range(args.batch_count), leave=False)
if args.batch_count == 1:
pbar.disable = True

print("got a request to *vid2vid* an existing video.")
vids_to_pack = []

in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path)
folder_name = clean_folder_name(Path(vid2vid_frames_path).stem)
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name)
i = 1
while os.path.exists(outdir_no_tmp):
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i))
i += 1
state.job_count = args.batch_count

outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames')
os.makedirs(outdir_v2v, exist_ok=True)
for batch in pbar:
if args.do_vid2vid:
if args.vid2vid_frames is None and args.vid2vid_frames_path == "":
raise FileNotFoundError("Please upload a video :()")

vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=args.vid2vid_startFrame + args.frames,
numeric_files_output=True, out_img_format='png')
# Overrides
if args.vid2vid_frames is not None:
vid2vid_frames_path = args.vid2vid_frames.name

temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder")
duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name)
print("got a request to *vid2vid* an existing video.")

videogen = []
for f in os.listdir(temp_convert_raw_png_path):
# double check for old _depth_ files, not really needed probably but keeping it for now
if '_depth_' not in f:
videogen.append(f)
in_vid_fps, _, _ = get_quick_vid_info(vid2vid_frames_path)
folder_name = clean_folder_name(Path(vid2vid_frames_path).stem)
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name)
i = 1
while os.path.exists(outdir_no_tmp):
outdir_no_tmp = os.path.join(os.getcwd(), 'outputs', 'frame-vid2vid', folder_name + '_' + str(i))
i += 1

videogen.sort(key=lambda x: int(x.split('.')[0]))
outdir_v2v = os.path.join(outdir_no_tmp, 'tmp_input_frames')
os.makedirs(outdir_v2v, exist_ok=True)

images = []
for file in tqdm(videogen, desc="Loading frames"):
image = Image.open(os.path.join(temp_convert_raw_png_path, file))
image = image.resize((args.width, args.height), Image.ANTIALIAS)
array = np.array(image)
images += [array]
extract_to_frame=args.vid2vid_startFrame + args.frames
print(f'vid2vid_frames_path: {vid2vid_frames_path} outdir_v2v: {outdir_v2v} extract_from_frame: {args.vid2vid_startFrame} extract_to_frame: {extract_to_frame}')
vid2frames(video_path=vid2vid_frames_path, video_in_frame_path=outdir_v2v, overwrite=True, extract_from_frame=args.vid2vid_startFrame, extract_to_frame=extract_to_frame,
numeric_files_output=True, out_img_format='png')

# print(images)
temp_convert_raw_png_path = os.path.join(outdir_v2v, "tmp_vid2vid_folder")
duplicate_pngs_from_folder(outdir_v2v, temp_convert_raw_png_path, None, folder_name)

images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w
videogen = []
for f in os.listdir(temp_convert_raw_png_path):
# double check for old _depth_ files, not really needed probably but keeping it for now
if '_depth_' not in f:
videogen.append(f)

print(f"Converted the frames to tensor {bfchw.shape}")
videogen.sort(key=lambda x: int(x.split('.')[0]))

vd_out = torch.from_numpy(bcfhw).to("cuda")
images = []
for file in tqdm(videogen, desc="Loading frames"):
image = Image.open(os.path.join(temp_convert_raw_png_path, file))
image = image.resize((args.width, args.height))#, Image.ANTIALIAS)
array = np.array(image)
images += [array]

# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1
# print(images)

# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
print("Computing latents")
latents = pipe.compute_latents(vd_out).to(device)
images = np.stack(images) # f h w c
batches = 1
n_images = np.tile(images[np.newaxis, ...], (batches, 1, 1, 1, 1)) # n f h w c
bcfhw = n_images.transpose(0, 4, 1, 2, 3)
# convert to 0-1 float
bcfhw = bcfhw.astype(np.float32) / 255
bfchw = bcfhw.transpose(0, 2, 1, 3, 4) # b c f h w

skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1))))
else:
latents = None
args.strength = 1
skip_steps = 0
print(f"Converted the frames to tensor {bfchw.shape}")

print('Working in txt2vid mode' if not args.do_vid2vid else 'Working in vid2vid mode')
vd_out = torch.from_numpy(bcfhw).to("cuda")

# Start the batch count loop
pbar = tqdm(range(args.batch_count), leave=False)
if args.batch_count == 1:
pbar.disable = True
# should be -1,1, not 0,1
vd_out = 2 * vd_out - 1

vids_to_pack = []
# latents should have shape num_sample, 4, max_frames, latent_h,latent_w
print("Computing latents")
latents = pipe.compute_latents(vd_out).to(device)

state.job_count = args.batch_count
skip_steps = int(math.floor(args.steps * max(0, min(1 - args.strength, 1))))
else:
latents = None
args.strength = 1
skip_steps = 0
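# Illustrative example (hypothetical numbers, not from the diff): with steps = 30
# and strength = 0.75, skip_steps = floor(30 * (1 - 0.75)) = 7, so the first 7
# denoising steps are skipped and only the remaining 23 are sampled starting from
# the vid2vid latents; strength = 1 skips nothing and runs the full schedule.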

for batch in pbar:
#do txt2vid
state.job_no = batch
if state.skipped:
state.skipped = False
@@ -172,7 +175,7 @@ def process_modelscope(args_dict, extra_args=None):
print("Received an image for inpainting", args.inpainting_image.name)
for i in range(args.frames):
image = Image.open(args.inpainting_image.name).convert("RGB")
image = image.resize((args.width, args.height), Image.ANTIALIAS)
image = image.resize((args.width, args.height))
array = np.array(image)
images += [array]

@@ -220,8 +223,10 @@
samples, _ = pipe.infer(args.prompt, args.n_prompt, args.steps, args.frames, args.seed + batch if args.seed != -1 else -1, args.cfg_scale,
args.width, args.height, args.eta, cpu_vae, device, latents, strength=args.strength, skip_steps=skip_steps, mask=mask, is_vid2vid=args.do_vid2vid, sampler=args.sampler)

if batch > 0:
if args.batch_count > 1:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}")
else:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}")
print(f'text2video finished, saving frames to {outdir_current}')

# just deleted the folder so we need to make it again
@@ -230,19 +235,45 @@
cv2.imwrite(outdir_current + os.path.sep +
f"{i:06}.png", samples[i])

gc.collect()
devices.torch_gc()

# TODO: add params to the GUI
if not video_args.skip_video_creation:
ffmpeg_stitch_video(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=outdir_current + os.path.sep + f"vid.mp4", imgs_path=os.path.join(outdir_current,
"%06d.png"),
stitch_from_frame=0, stitch_to_frame=-1, add_soundtrack=video_args.add_soundtrack,
audio_path=vid2vid_frames_path if video_args.add_soundtrack == 'Init Video' else video_args.soundtrack_path, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset)
if video_args.do_stitch_videos and video_args.stitched_video_strength > 0.35:
reverse_video_path = outdir_current + os.path.sep + f"vid_reversed.mp4"
ffmpeg_reverse_frames(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=reverse_video_path, input_path=os.path.join(outdir_current, "%06d.png"), crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset)
args.do_vid2vid = True
args.vid2vid_startFrame = 0
print(f"vid2vid start frame: {args.vid2vid_startFrame}")
print(f'strength: {args.strength}')
args.strength = video_args.stitched_video_strength
args.vid2vid_frames = open(reverse_video_path, 'rb')
print(f't2v complete, result saved at {outdir_current}')

mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read()
dataurl = "data:video/mp4;base64," + b64encode(mp4).decode()
if video_args.do_stitch_videos:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_final")

os.makedirs(outdir_current, exist_ok=True)

combined_video_path = os.path.join(get_outdir(), f"{init_timestring}_final")
combined_video_list = [os.path.join(get_outdir(), f"{init_timestring}_{i}", "vid.mp4").replace('/', os.path.sep) for i in range(0, args.batch_count)]

ffmpeg_combine_videos(ffmpeg_location=video_args.ffmpeg_location, fps=video_args.fps, outmp4_path=combined_video_path + os.path.sep + f"vid.mp4",
input_videos=combined_video_list, crf=video_args.ffmpeg_crf, preset=video_args.ffmpeg_preset, add_soundtrack=video_args.add_soundtrack,
audio_path=video_args.soundtrack_path)
else:
outdir_current = os.path.join(get_outdir(), f"{init_timestring}_{batch}")

mp4 = open(outdir_current + os.path.sep + f"vid.mp4", 'rb').read()
dataurl = "data:video/mp4;base64," + b64encode(mp4).decode()

if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack:
vids_to_pack.append(dataurl)
if max_vids_to_pack == -1 or len(vids_to_pack) < max_vids_to_pack:
vids_to_pack.append(dataurl)
t2v_helpers_args.i1_store_t2v = f'<p style=\"font-weight:bold;margin-bottom:0em\">text2video extension for auto1111 — version 1.2b </p>'
for dataurl in vids_to_pack:
t2v_helpers_args.i1_store_t2v += f'<video controls loop><source src="{dataurl}" type="video/mp4"></video><br>'
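
Taken together, the changes above form the stitching loop. The sketch below is a rough outline of that control flow, not the extension's actual API: generate_segment, reverse_video, and concat_videos are hypothetical stand-ins for pipe.infer plus frame saving, ffmpeg_reverse_frames, and ffmpeg_combine_videos.

import os

def stitch_segments_sketch(batch_count, stitched_strength, outdir,
                           generate_segment, reverse_video, concat_videos):
    """Rough outline of the stitched-video flow: each segment after the first
    is seeded from the reversed previous segment via vid2vid."""
    init_video = None       # first segment is plain txt2vid
    strength = 1.0
    segment_paths = []
    for batch in range(batch_count):
        seg_dir = os.path.join(outdir, f"segment_{batch}")
        seg_path = generate_segment(seg_dir, init_video=init_video, strength=strength)
        segment_paths.append(seg_path)
        # Reverse the segment just rendered and feed it back as the next
        # segment's vid2vid input; stitched_strength controls how far the next
        # segment may drift (the PR only re-seeds when it is above 0.35).
        init_video = reverse_video(seg_path)
        strength = stitched_strength
    # Join every per-segment vid.mp4 into one final video.
    return concat_videos(segment_paths, os.path.join(outdir, "final", "vid.mp4"))
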
12 changes: 11 additions & 1 deletion scripts/t2v_helpers/args.py
@@ -101,6 +101,14 @@ def refresh_all_models(model):
# TODO: make it how it's done in Deforum/WebUI, so we won't have to track individual vars
prompt, n_prompt, sampler, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d)
model_type.change(fn=enable_sampler_dropdown, inputs=[model_type], outputs=[sampler])
gr.Markdown('''`Stitch videos` allows you to generate multiple videos consecutively and combine them into
one video when they're done. Use stitched video denoising strength to adjust the continuity between videos.
Uses the batch count parameter to determine how many videos to generate and stitch together.

Currently only works with ModelScope''')
with gr.Row():
do_stitch_videos = gr.Checkbox(label="Stitch videos", value=d.do_stitch_videos, interactive=True)
stitched_video_strength = gr.Slider(label="Stitched video denoising strength", value=d.stitched_video_strength, minimum=0, maximum=1, step=0.01, interactive=True)
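# Illustrative scenario (hypothetical values): with batch_count = 3 and
# stitched_video_strength = 0.6, segment 0 is generated from the prompt alone,
# segments 1 and 2 are generated via vid2vid from the reversed previous segment
# at denoising strength 0.6, and the three per-batch vid.mp4 files are then
# concatenated into <outdir>/<timestring>_final/vid.mp4.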
with gr.Accordion('img2vid', open=False):
inpainting_image = gr.File(label="Inpainting image", interactive=True, file_count="single", file_types=["image"], elem_id="inpainting_chosen_file")
# TODO: should be tied to the total frame count dynamically
@@ -155,7 +163,7 @@ def refresh_all_models(model):

return locals()

t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path').replace("\n", "").replace("\r", "").replace(" ", "").split(',')
t2v_video_args_names = str('skip_video_creation, ffmpeg_location, ffmpeg_crf, ffmpeg_preset, fps, add_soundtrack, soundtrack_path, do_stitch_videos, stitched_video_strength').replace("\n", "").replace("\r", "").replace(" ", "").split(',')

common_values_names = str('''prompt, n_prompt, sampler, steps, frames, seed, cfg_scale, width, height, eta, batch_count''').replace("\n", "").replace("\r", "").replace(" ", "").split(',')

@@ -205,6 +213,8 @@ def T2VArgs():
prompt = ""
n_prompt = "text, watermark, copyright, blurry, nsfw"
strength = 0.75
do_stitch_videos = False
stitched_video_strength = 0.0
vid2vid_startFrame = 0
inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)' # linear growth weights (as they used to be in the original variant)
inpainting_frames = 0