In the past few years, there has been a marked increase in the popularity of generative models that utilize various data modalities. One of the most challenging undertakings in this regard is synthesizing videos from text, which is both time-consuming and resource-intensive. The core of the proposed solution (the animation approach) is an extension of Kandinsky with Deforum features. This opens up new generative opportunities for the text2image model.
# Fetch the repository, enter it, and install the packages listed in requirements.txt.
git clone https://github.com/ai-forever/deforum-kandinsky.git
cd deforum-kandinsky
pip install -r requirements.txt
from IPython.display import Video
from deforum_kandinsky import KandinskyV22Img2ImgPipeline, DeforumKandinsky
from diffusers import KandinskyV22PriorPipeline
from transformers import CLIPVisionModelWithProjection
from diffusers.models import UNet2DConditionModel
import imageio.v2 as iio
from PIL import Image
import numpy as np
import torch
import datetime
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython import display
# create video from generated frames
def frames2video(frames, output_path="video.mp4", fps=24, display=False):
    """Encode a sequence of frames into a video file.

    Args:
        frames: Iterable of frames; each one is converted with ``np.array``
            before being appended to the writer.
        output_path: Destination path handed to ``imageio.get_writer``.
        fps: Frame rate handed to ``imageio.get_writer``.
        display: When true, render the written video in the notebook.
    """
    # The context manager closes the writer even if a frame fails to encode.
    with iio.get_writer(output_path, fps=fps) as writer:
        for frame in tqdm(frames):
            writer.append_data(np.array(frame))

    if display:
        # The ``display`` flag shadows the module-level ``IPython.display``
        # import inside this function, so import the helpers under distinct
        # local names and render the video explicitly (a bare ``Video(...)``
        # expression inside a function is never shown).
        from IPython.display import Video, display as show_in_notebook

        show_in_notebook(Video(url=output_path))
from diffusers import KandinskyV22PriorPipeline
from deforum_kandinsky import (
KandinskyV22Img2ImgPipeline,
DeforumKandinsky,
KandinskyImg2ImgPipeline,
DeforumKandinsky
)
# load models
# Select which Kandinsky release to load (2.1 or 2.2) and the target device.
model_version = 2.2
device = "cuda"

if model_version == 2.2:
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        'kandinsky-community/kandinsky-2-2-prior',
        subfolder='image_encoder'
    ).to(torch.float16).to(device)
    unet = UNet2DConditionModel.from_pretrained(
        'kandinsky-community/kandinsky-2-2-decoder',
        subfolder='unet'
    ).to(torch.float16).to(device)
    prior = KandinskyV22PriorPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-2-prior',
        image_encoder=image_encoder,
        torch_dtype=torch.float16
    ).to(device)
    decoder = KandinskyV22Img2ImgPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-2-decoder',
        unet=unet,
        torch_dtype=torch.float16
    ).to(device)
elif model_version == 2.1:
    # KandinskyPriorPipeline is not among the file-level imports, so this
    # branch would otherwise fail with a NameError; import it here.
    from diffusers import KandinskyPriorPipeline

    # NOTE(review): image_encoder is loaded but, unlike the 2.2 branch, is not
    # passed to the prior pipeline below — confirm whether that is intended.
    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
        "kandinsky-community/kandinsky-2-1-prior",
        subfolder='image_encoder',
        torch_dtype=torch.float16
    ).to(device)
    unet = UNet2DConditionModel.from_pretrained(
        "kandinsky-community/kandinsky-2-1",
        subfolder='unet',
        torch_dtype=torch.float16
    ).to(device)
    prior = KandinskyPriorPipeline.from_pretrained(
        "kandinsky-community/kandinsky-2-1-prior",
        torch_dtype=torch.float16
    ).to(device)
    decoder = KandinskyImg2ImgPipeline.from_pretrained(
        'kandinsky-community/kandinsky-2-1',
        unet=unet,
        torch_dtype=torch.float16
    ).to(device)
else:
    # Fail early with a clear message instead of a later NameError on
    # the undefined ``prior`` / ``decoder``.
    raise ValueError(
        f"Unsupported model_version: {model_version!r} (expected 2.1 or 2.2)"
    )
# Wrap the prior and the img2img decoder in the Deforum animation driver.
# Reuse the ``device`` chosen during model loading rather than a second
# hard-coded 'cuda', so changing the device in one place stays consistent.
deforum = DeforumKandinsky(
    prior=prior,
    decoder_img2img=decoder,
    device=device
)
# Keyframes of the animation: four text prompts followed by one image prompt.
# "path/to/image.jpg" is a placeholder path to be replaced with a real file.
scene_prompts = [
    "winter forest, snowflakes, Van Gogh style",
    "spring forest, flowers, sun rays, Van Gogh style",
    "summer forest, lake, reflections on the water, summer sun, Van Gogh style",
    "autumn forest, rain, Van Gogh style",
    Image.open("path/to/image.jpg"),
]
# One animation name and one duration per keyframe above (five entries each).
# NOTE(review): the unit of the durations is not visible here — confirm.
scene_motions = ['live', 'right', 'right', 'right', 'live']
scene_durations = [1, 1, 1, 1, 1]

# Configure the animation at 640x640 and 24 fps, without saving samples.
animation = deforum(
    prompts=scene_prompts,
    animations=scene_motions,
    prompt_durations=scene_durations,
    H=640,
    W=640,
    fps=24,
    save_samples=False,
)
# Iterate over the animation, previewing each frame live in an output widget
# and collecting the frames for the final video.
frames = []
out = widgets.Output()
pbar = tqdm(animation, total=len(deforum))
display.display(out)

with out:
    for item in pbar:
        # Each item is a dict; the frame is stored under 'image' and the
        # remaining entries are printed as key/value pairs.
        frame = item.pop('image', None)
        display.clear_output(wait=True)
        if frame is not None:
            # Skip missing frames so a None never reaches the video encoder.
            frames.append(frame)
            display.display(frame)
        for key, value in item.items():
            print(f"{key}: {value}")

    # save and display video
    display.clear_output(wait=True)
    frames2video(frames, "output_2_2.mp4", fps=24)
    # A bare expression inside a ``with`` block is not auto-rendered by the
    # notebook, so the video has to be displayed explicitly.
    display.display(display.Video(url="output_2_2.mp4"))
Deforum web-page
Quick Guide to Deforum v06
GitHub repository: deforum-stable-diffusion