From b2e7b9ca047cad22af235072eeabff08dba90309 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Mon, 27 Feb 2023 01:36:52 -0600
Subject: [PATCH 1/7] feat: initial implementation of storybook generation

---
 storybook_generation/README.md   |  6 ++++
 storybook_generation/combiner.py | 40 ++++++++++++++++++++++++
 storybook_generation/splitter.py | 30 ++++++++++++++++++
 storybook_generation/walker.py   | 49 +++++++++++++++++++++++++++++
 storybook_generation/workflow.py | 53 ++++++++++++++++++++++++++++++++
 5 files changed, 178 insertions(+)
 create mode 100644 storybook_generation/README.md
 create mode 100644 storybook_generation/combiner.py
 create mode 100644 storybook_generation/splitter.py
 create mode 100644 storybook_generation/walker.py
 create mode 100644 storybook_generation/workflow.py

diff --git a/storybook_generation/README.md b/storybook_generation/README.md
new file mode 100644
index 0000000..4f82932
--- /dev/null
+++ b/storybook_generation/README.md
@@ -0,0 +1,6 @@
+1. call LLM on a paragraph to generate a script
+2. split script into sentences
+3. call stable diffusion walker on each pair of sentences
+4. split each walker output into frames
+5. combine all frames into one video
+6. (optional) etch the script onto the video
\ No newline at end of file
diff --git a/storybook_generation/combiner.py b/storybook_generation/combiner.py
new file mode 100644
index 0000000..2a5085a
--- /dev/null
+++ b/storybook_generation/combiner.py
@@ -0,0 +1,40 @@
+import sieve
+
+@sieve.function(
+    name="frame-combiner",
+    gpu = False,
+    python_packages=[
+        "uuid==1.30",
+        "ffmpeg-python==0.2.0"
+    ],
+    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg"],
+    python_version="3.8",
+    iterator_input=True,
+    persist_output=True
+)
+def frame_combine(it: sieve.Image) -> sieve.Video:
+    import uuid
+    import ffmpeg
+    l = []
+    for i in it:
+        l.append(i)
+        print(i.path, i.frame_number)
+    sorted_by_frame_number = sorted(l, key=lambda k: k.frame_number)
+    image_paths = [i.path for i in sorted_by_frame_number]
+
+    if hasattr(l[0], 'fps'):
+        fps = l[0].fps
+    else:
+        fps = 30
+
+    video_path = f"{uuid.uuid4()}.mp4"
+    process = ffmpeg.input('pipe:', r=str(fps), f='image2pipe').output(video_path, vcodec='libx264', pix_fmt='yuv420p').overwrite_output().run_async(pipe_stdin=True)
+    for in_file in image_paths:
+        with open(in_file, 'rb') as f:
+            jpeg_data = f.read()
+            process.stdin.write(jpeg_data)
+
+    process.stdin.close()
+    process.wait()
+
+    return sieve.Video(path=video_path)
\ No newline at end of file
diff --git a/storybook_generation/splitter.py b/storybook_generation/splitter.py
new file mode 100644
index 0000000..22f74e5
--- /dev/null
+++ b/storybook_generation/splitter.py
@@ -0,0 +1,30 @@
+import sieve
+
+@sieve.function(
+    name="video-splitter",
+    gpu = False,
+    python_packages=[
+        "ffmpeg-python==0.2.0"
+    ],
+    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg"],
+    python_version="3.8"
+)
+def VideoSplitter(video: sieve.Video) -> sieve.Image:
+    # use ffmpeg to extract all frames in video as bmp files and return the path to the folder
+    video_fps = video.fps
+
+    import tempfile
+    temp_dir = tempfile.mkdtemp()
+
+    import subprocess
+    subprocess.call([
+        'ffmpeg',
+        '-i', video.path,
+        f'{temp_dir}/%09d.jpg'
+    ])
+    import os
+    filenames = os.listdir(temp_dir)
+    filenames.sort()
+    for i, filename in enumerate(filenames):
+        print(os.path.join(temp_dir, filename), i)
+        yield sieve.Image(path=os.path.join(temp_dir, filename), frame_number=i, fps=video_fps)
diff --git a/storybook_generation/walker.py b/storybook_generation/walker.py
new file mode 100644
index 0000000..eefd7fc
--- /dev/null
+++ b/storybook_generation/walker.py
@@ -0,0 +1,49 @@
+import sieve
+
+@sieve.Model(
+    name="run_stable_diff_walk",
+    python_packages=[
+        "torch==1.8.1",
+        "stable_diffusion_videos==0.8.1",
+        "accelerate==0.16.0"
+    ],
+    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg", "libavcodec58", "libsndfile1", "git-lfs"],
+    gpu=True,
+    machine_type="a100",
+    run_commands=[
+        "mkdir -p /root/.cache/models/stable-diffusion-v1-4",
+        "git lfs install",
+        "git clone https://huggingface.co/CompVis/stable-diffusion-v1-4 /root/.cache/models/stable-diffusion-v1-4",
+    ],
+    iterator_input=True,
+    persist_output=True
+)
+class StableDiffusionVideo:
+    def __setup__(self):
+        import torch
+        from stable_diffusion_videos import StableDiffusionWalkPipeline
+
+        # load stable diffusion model from local cache
+        self.pipeline = StableDiffusionWalkPipeline.from_pretrained(
+            "/root/.cache/models/stable-diffusion-v1-4",
+            torch_dtype=torch.float16,
+            revision="fp16",
+        ).to("cuda")
+
+    def __predict__(self, prompt1: str, prompt2: str) -> sieve.Video:
+        import torch
+        from stable_diffusion_videos import StableDiffusionWalkPipeline
+
+        prompt1, prompt2 = list(prompt1)[0], list(prompt2)[0] # current workaround for iterator inputs
+
+        # generate and store video output
+        video_path = self.pipeline.walk(
+            [prompt1, prompt2],
+            [42, 1337],
+            fps=5,
+            num_interpolation_steps=15,
+            height=512,
+            width=512,
+        )
+
+        return sieve.Video(path=video_path)
\ No newline at end of file
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
new file mode 100644
index 0000000..ec6016f
--- /dev/null
+++ b/storybook_generation/workflow.py
@@ -0,0 +1,53 @@
+'''
+Storybook generation workflow
+'''
+
+import sieve
+from walker import StableDiffusionVideo
+from combiner import frame_combine
+from splitter import VideoSplitter
+
+# TODO: make this use an LLM
+@sieve.function(name="prompt-to-script")
+def prompt_to_script(prompt: str) -> list:
+    script = prompt.split(".")
+    script = [s.strip() for s in script if s.strip() != ""]
+    script = [s + "." for s in script]
+    return script
+
+@sieve.function(
+    name="script-to-video",
+    python_packages=[
+        "torch==1.8.1",
+        "stable_diffusion_videos==0.8.1",
+        "accelerate==0.16.0",
+        "opencv-python==4.6.0.66",
+        "moviepy==1.0.3",
+        "uuid==1.30",
+        "ffmpeg-python==0.2.0",
+    ],
+    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg", "libavcodec58", "libsndfile1", "git-lfs"],
+    gpu=True,
+    machine_type="a100",
+    run_commands=[
+        "mkdir -p /root/.cache/models/stable-diffusion-v1-4",
+        "git lfs install",
+        "git clone https://huggingface.co/CompVis/stable-diffusion-v1-4 /root/.cache/models/stable-diffusion-v1-4",
+    ],
+    iterator_input=True,
+    persist_output=True,
+    python_version="3.8",
+)
+def script_to_video(script: list) -> sieve.Video:
+    images = []
+    for i in range(len(script) - 1):
+        prompt1, prompt2 = script[i], script[i + 1]
+        video = StableDiffusionVideo()(prompt1, prompt2)
+        images.append(VideoSplitter(video))
+    return frame_combine(images)
+
+@sieve.workflow(name="storybook_generation")
+def storybook_generation(prompt: str) -> sieve.Video:
+    script = prompt_to_script(prompt)
+    video = script_to_video(script)
+    return video
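For reference before the next revision: the `frame-combiner` above streams still frames into a single H.264 file through ffmpeg's `image2pipe` demuxer. A minimal standalone sketch of that same pattern outside of Sieve (the frame paths and output name here are placeholders):

```
import ffmpeg  # ffmpeg-python

def combine_frames(image_paths, out_path="combined.mp4", fps=30):
    # Pipe JPEG bytes into ffmpeg over stdin and encode them as one H.264 video.
    process = (
        ffmpeg
        .input("pipe:", r=str(fps), f="image2pipe")
        .output(out_path, vcodec="libx264", pix_fmt="yuv420p")
        .overwrite_output()
        .run_async(pipe_stdin=True)
    )
    for path in image_paths:
        with open(path, "rb") as f:
            process.stdin.write(f.read())
    process.stdin.close()
    process.wait()
    return out_path
```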
From a991f5e4f367b8db5eef350d9e0c72f75f4bdbc5 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Mon, 27 Feb 2023 19:54:28 -0600
Subject: [PATCH 2/7] feat: working storybook with proper parallelization
 without concat

---
 storybook_generation/splitter.py | 30 ----------------------
 storybook_generation/walker.py   | 10 +++-----
 storybook_generation/workflow.py | 43 ++++++++------------------------
 3 files changed, 14 insertions(+), 69 deletions(-)
 delete mode 100644 storybook_generation/splitter.py

diff --git a/storybook_generation/splitter.py b/storybook_generation/splitter.py
deleted file mode 100644
index 22f74e5..0000000
--- a/storybook_generation/splitter.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import sieve
-
-@sieve.function(
-    name="video-splitter",
-    gpu = False,
-    python_packages=[
-        "ffmpeg-python==0.2.0"
-    ],
-    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg"],
-    python_version="3.8"
-)
-def VideoSplitter(video: sieve.Video) -> sieve.Image:
-    # use ffmpeg to extract all frames in video as bmp files and return the path to the folder
-    video_fps = video.fps
-
-    import tempfile
-    temp_dir = tempfile.mkdtemp()
-
-    import subprocess
-    subprocess.call([
-        'ffmpeg',
-        '-i', video.path,
-        f'{temp_dir}/%09d.jpg'
-    ])
-    import os
-    filenames = os.listdir(temp_dir)
-    filenames.sort()
-    for i, filename in enumerate(filenames):
-        print(os.path.join(temp_dir, filename), i)
-        yield sieve.Image(path=os.path.join(temp_dir, filename), frame_number=i, fps=video_fps)
diff --git a/storybook_generation/walker.py b/storybook_generation/walker.py
index eefd7fc..03e4d9f 100644
--- a/storybook_generation/walker.py
+++ b/storybook_generation/walker.py
@@ -3,7 +3,7 @@
 @sieve.Model(
     name="run_stable_diff_walk",
     python_packages=[
-        "torch==1.8.1",
+        "torch==1.13.1",
         "stable_diffusion_videos==0.8.1",
         "accelerate==0.16.0"
     ],
@@ -15,7 +15,6 @@
         "git lfs install",
         "git clone https://huggingface.co/CompVis/stable-diffusion-v1-4 /root/.cache/models/stable-diffusion-v1-4",
     ],
-    iterator_input=True,
     persist_output=True
 )
 class StableDiffusionVideo:
@@ -30,11 +29,11 @@ def __setup__(self):
             revision="fp16",
         ).to("cuda")
 
-    def __predict__(self, prompt1: str, prompt2: str) -> sieve.Video:
+    def __predict__(self, prompt_pair: tuple) -> sieve.Video:
         import torch
         from stable_diffusion_videos import StableDiffusionWalkPipeline
 
-        prompt1, prompt2 = list(prompt1)[0], list(prompt2)[0] # current workaround for iterator inputs
+        prompt1, prompt2 = prompt_pair[0], prompt_pair[1]
 
         # generate and store video output
         video_path = self.pipeline.walk(
@@ -45,5 +44,4 @@ def __predict__(self, prompt_pair: tuple) -> sieve.Video:
             height=512,
             width=512,
         )
-
-        return sieve.Video(path=video_path)
\ No newline at end of file
+        yield sieve.Video(path=video_path)
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
index ec6016f..70bd39c 100644
--- a/storybook_generation/workflow.py
+++ b/storybook_generation/workflow.py
@@ -4,8 +4,7 @@
 
 import sieve
 from walker import StableDiffusionVideo
-from combiner import frame_combine
-from splitter import VideoSplitter
+from combiner import combiner
 
 # TODO: make this use an LLM
 @sieve.function(name="prompt-to-script")
@@ -15,39 +14,17 @@ def prompt_to_script(prompt: str) -> list:
     script = [s + "." for s in script]
     return script
 
-@sieve.function(
-    name="script-to-video",
-    python_packages=[
-        "torch==1.8.1",
-        "stable_diffusion_videos==0.8.1",
-        "accelerate==0.16.0",
-        "opencv-python==4.6.0.66",
-        "moviepy==1.0.3",
-        "uuid==1.30",
-        "ffmpeg-python==0.2.0",
-    ],
-    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg", "libavcodec58", "libsndfile1", "git-lfs"],
-    gpu=True,
-    machine_type="a100",
-    run_commands=[
-        "mkdir -p /root/.cache/models/stable-diffusion-v1-4",
-        "git lfs install",
-        "git clone https://huggingface.co/CompVis/stable-diffusion-v1-4 /root/.cache/models/stable-diffusion-v1-4",
-    ],
-    iterator_input=True,
-    persist_output=True,
-    python_version="3.8",
-)
-def script_to_video(script: list) -> sieve.Video:
-    images = []
+
+@sieve.function(name="create-prompt-pairs")
+def create_prompt_pairs(script: list) -> tuple:
     for i in range(len(script) - 1):
-        prompt1, prompt2 = script[i], script[i + 1]
-        video = StableDiffusionVideo()(prompt1, prompt2)
-        images.append(VideoSplitter(video))
-    return frame_combine(images)
+        yield (script[i], script[i + 1])
+
 
 @sieve.workflow(name="storybook_generation")
 def storybook_generation(prompt: str) -> sieve.Video:
     script = prompt_to_script(prompt)
-    video = script_to_video(script)
-    return video
+    prompt_pair = create_prompt_pairs(script)
+    videos = StableDiffusionVideo()(prompt_pair)
+    combined_video = videos #combiner(videos)
+    return combined_video

From a630bde1d27cfc2203851861b3e2481193605a9f Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Mon, 27 Feb 2023 21:17:42 -0600
Subject: [PATCH 3/7] feat: combine videos after generation

---
 storybook_generation/combiner.py | 35 ++++++++------------------------
 storybook_generation/workflow.py |  2 +-
 2 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/storybook_generation/combiner.py b/storybook_generation/combiner.py
index 2a5085a..babbdab 100644
--- a/storybook_generation/combiner.py
+++ b/storybook_generation/combiner.py
@@ -1,40 +1,23 @@
 import sieve
 
 @sieve.function(
-    name="frame-combiner",
+    name="video-combiner",
     gpu = False,
     python_packages=[
+        "moviepy==1.0.3",
         "uuid==1.30",
-        "ffmpeg-python==0.2.0"
     ],
-    system_packages=["libgl1-mesa-glx", "libglib2.0-0", "ffmpeg"],
     python_version="3.8",
     iterator_input=True,
     persist_output=True
 )
-def frame_combine(it: sieve.Image) -> sieve.Video:
+def combiner(videos) -> sieve.Video:
+    from moviepy.editor import VideoFileClip, concatenate_videoclips
     import uuid
-    import ffmpeg
-    l = []
-    for i in it:
-        l.append(i)
-        print(i.path, i.frame_number)
-    sorted_by_frame_number = sorted(l, key=lambda k: k.frame_number)
-    image_paths = [i.path for i in sorted_by_frame_number]
-
-    if hasattr(l[0], 'fps'):
-        fps = l[0].fps
-    else:
-        fps = 30
 
+    videos = [VideoFileClip(video.path) for video in videos]
+    video = concatenate_videoclips(videos)
     video_path = f"{uuid.uuid4()}.mp4"
-    process = ffmpeg.input('pipe:', r=str(fps), f='image2pipe').output(video_path, vcodec='libx264', pix_fmt='yuv420p').overwrite_output().run_async(pipe_stdin=True)
-    for in_file in image_paths:
-        with open(in_file, 'rb') as f:
-            jpeg_data = f.read()
-            process.stdin.write(jpeg_data)
-
-    process.stdin.close()
-    process.wait()
-
-    return sieve.Video(path=video_path)
\ No newline at end of file
+    video.write_videofile(video_path)
+    return sieve.Video(path=video_path)
+    
\ No newline at end of file
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
index 70bd39c..cb13e4d 100644
--- a/storybook_generation/workflow.py
+++ b/storybook_generation/workflow.py
@@ -26,5 +26,5 @@ def storybook_generation(prompt: str) -> sieve.Video:
     script = prompt_to_script(prompt)
     prompt_pair = create_prompt_pairs(script)
     videos = StableDiffusionVideo()(prompt_pair)
-    combined_video = videos #combiner(videos)
+    combined_video = combiner(videos)
     return combined_video
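The pairing step introduced above is plain Python; a quick illustration of what `prompt-to-script` and `create-prompt-pairs` produce for a short input, run outside of Sieve:

```
def prompt_to_script(prompt):
    # Split on periods, drop empty pieces, and re-attach the period.
    script = [s.strip() for s in prompt.split(".") if s.strip() != ""]
    return [s + "." for s in script]

def create_prompt_pairs(script):
    # Each consecutive sentence pair drives one walker call.
    for i in range(len(script) - 1):
        yield (script[i], script[i + 1])

script = prompt_to_script("A bird flies over a field. It lands on a flower.")
print(list(create_prompt_pairs(script)))
# [('A bird flies over a field.', 'It lands on a flower.')]
```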
From 19000c8eab2c15f451b29e45f23759cd38a18094 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Mon, 27 Feb 2023 23:10:19 -0600
Subject: [PATCH 4/7] fix: sort videos by video number

---
 storybook_generation/combiner.py |  6 ++++++
 storybook_generation/walker.py   | 17 +++++++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/storybook_generation/combiner.py b/storybook_generation/combiner.py
index babbdab..b470191 100644
--- a/storybook_generation/combiner.py
+++ b/storybook_generation/combiner.py
@@ -15,8 +15,14 @@ def combiner(videos) -> sieve.Video:
     from moviepy.editor import VideoFileClip, concatenate_videoclips
     import uuid
 
+    # Sort videos by global id
+    videos = sorted(videos, key=lambda video: video.video_number)
+
+    # Combine videos
     videos = [VideoFileClip(video.path) for video in videos]
     video = concatenate_videoclips(videos)
+
+    # Save video
     video_path = f"{uuid.uuid4()}.mp4"
     video.write_videofile(video_path)
     return sieve.Video(path=video_path)
diff --git a/storybook_generation/walker.py b/storybook_generation/walker.py
index 03e4d9f..1ec33bc 100644
--- a/storybook_generation/walker.py
+++ b/storybook_generation/walker.py
@@ -22,26 +22,35 @@ def __setup__(self):
         import torch
         from stable_diffusion_videos import StableDiffusionWalkPipeline
 
-        # load stable diffusion model from local cache
+        # Load stable diffusion model from local cache
         self.pipeline = StableDiffusionWalkPipeline.from_pretrained(
            "/root/.cache/models/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
            revision="fp16",
        ).to("cuda")
+
+        # Keep global ID to sort outputs
+        self.video_number = 0
 
     def __predict__(self, prompt_pair: tuple) -> sieve.Video:
         import torch
         from stable_diffusion_videos import StableDiffusionWalkPipeline
 
+        # Unpack prompt pair
         prompt1, prompt2 = prompt_pair[0], prompt_pair[1]
 
-        # generate and store video output
+        # Generate and store video output
         video_path = self.pipeline.walk(
             [prompt1, prompt2],
             [42, 1337],
             fps=5,
             num_interpolation_steps=15,
             height=512,
-            width=512,
+            width=768,
         )
-        yield sieve.Video(path=video_path)
+
+        # Increment global id
+        self.video_number += 1
+
+        # Return video
+        yield sieve.Video(path=video_path, video_number=self.video_number)
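Because the walker now runs its prompt pairs in parallel, the combiner restores order with the `video_number` each output carries before concatenating. A rough standalone sketch of that sort-and-concatenate step with moviepy (the clip file names and numbers are hypothetical):

```
from moviepy.editor import VideoFileClip, concatenate_videoclips

# Hypothetical (path, video_number) pairs coming back from parallel workers.
outputs = [("clip_b.mp4", 2), ("clip_a.mp4", 1), ("clip_c.mp4", 3)]

# Restore generation order, then stitch the clips end to end.
ordered = sorted(outputs, key=lambda item: item[1])
clips = [VideoFileClip(path) for path, _ in ordered]
final = concatenate_videoclips(clips)
final.write_videofile("storybook.mp4")
```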
From 2a3526bb64704d748c40897df6794e9b6c388373 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Mon, 27 Feb 2023 23:10:32 -0600
Subject: [PATCH 5/7] feat: add video captioner and README

---
 storybook_generation/README.md    | 24 +++++++++++++++-----
 storybook_generation/captioner.py | 37 +++++++++++++++++++++++++++++++
 storybook_generation/workflow.py  | 16 ++++++++-----
 3 files changed, 66 insertions(+), 11 deletions(-)
 create mode 100644 storybook_generation/captioner.py

diff --git a/storybook_generation/README.md b/storybook_generation/README.md
index 4f82932..1ebee66 100644
--- a/storybook_generation/README.md
+++ b/storybook_generation/README.md
@@ -1,6 +1,18 @@
-1. call LLM on a paragraph to generate a script
-2. split script into sentences
-3. call stable diffusion walker on each pair of sentences
-4. split each walker output into frames
-5. combine all frames into one video
-6. (optional) etch the script onto the video
\ No newline at end of file
+# Storybook Generation
+This workflow generates a video story from a paragraph of text. It uses `StableDiffusionWalker` on pairwise sentences to generate video clips, captions them with each sentence, and stitches them together into a video.
+
+## Deploying
+Follow our [getting started guide](https://www.sievedata.com/dashboard/welcome) to get your Sieve API key and install the Sieve Python client.
+
+1. Export API keys & install Python client
+```
+export SIEVE_API_KEY={YOUR_API_KEY}
+pip install https://mango.sievedata.com/v1/client_package/sievedata-0.0.1.1.2-py3-none-any.whl
+```
+
+2. Deploy a workflow to Sieve
+```
+git clone git@github.com:sieve-community/examples.git
+cd examples/yolo_object_tracking
+sieve deploy
+```
diff --git a/storybook_generation/captioner.py b/storybook_generation/captioner.py
new file mode 100644
index 0000000..617ac08
--- /dev/null
+++ b/storybook_generation/captioner.py
@@ -0,0 +1,37 @@
+import sieve
+
+@sieve.function(
+    name="video-captioner",
+    gpu = False,
+    python_packages=[
+        "moviepy==1.0.3",
+        "uuid==1.30",
+    ],
+    system_packages=["imagemagick"],
+    run_commands=["apt install -y imagemagick"],
+    iterator_input=True,
+    persist_output=True
+)
+def captioner(videos, prompt_pair) -> sieve.Video:
+    from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
+    import uuid
+
+    for v, prompt in zip(videos, prompt_pair):
+        video = VideoFileClip(v.path)
+
+        # Create caption
+        prompt = prompt[0]
+        middle = len(prompt) // 2
+        caption = prompt[:middle] + "\n" + prompt[middle:]
+        text = TextClip(caption, font='calibri', fontsize=24, color='white')
+        text = text.set_pos('bottom').set_duration(video.duration)
+
+        # Combine video and caption
+        video = CompositeVideoClip([video, text])
+        video.write_videofile("bear_with_text.mp4")
+
+    # Save video
+    video_path = f"{uuid.uuid4()}.mp4"
+    video.write_videofile(video_path)
+    return sieve.Video(path=video_path)
+    
\ No newline at end of file
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
index cb13e4d..257bb5f 100644
--- a/storybook_generation/workflow.py
+++ b/storybook_generation/workflow.py
@@ -1,12 +1,13 @@
 '''
-Storybook generation workflow
+Sieve workflow to generate a storybook video from a piece of writing.
 '''
 
 import sieve
 from walker import StableDiffusionVideo
 from combiner import combiner
+from captioner import captioner
 
-# TODO: make this use an LLM
+# Creates a cleaned up list of sentences from a piece of writing
 @sieve.function(name="prompt-to-script")
 def prompt_to_script(prompt: str) -> list:
     script = prompt.split(".")
@@ -14,17 +15,22 @@ def prompt_to_script(prompt: str) -> list:
     script = [s + "." for s in script]
     return script
 
-
+# Generates pairs of sentences from a list of sentences
 @sieve.function(name="create-prompt-pairs")
 def create_prompt_pairs(script: list) -> tuple:
     for i in range(len(script) - 1):
         yield (script[i], script[i + 1])
 
-
 @sieve.workflow(name="storybook_generation")
 def storybook_generation(prompt: str) -> sieve.Video:
+    # Create a script (list of sentences) and pair them up
     script = prompt_to_script(prompt)
     prompt_pair = create_prompt_pairs(script)
+
+    # Generate videos and caption them
     videos = StableDiffusionVideo()(prompt_pair)
-    combined_video = combiner(videos)
+    captioned_videos = captioner(videos, prompt_pair)
+
+    # Return a concatenated video
+    combined_video = combiner(captioned_videos)
     return combined_video

From 2b6aac5552965cc7800a782dcaa101c17be6c401 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Tue, 28 Feb 2023 00:31:02 -0600
Subject: [PATCH 6/7] fix: use opencv instead of moviepy to fix install error

---
 storybook_generation/caption_combine.py | 63 +++++++++++++++++++++++++
 storybook_generation/captioner.py       | 37 ---------------
 storybook_generation/combiner.py        | 29 ------------
 storybook_generation/workflow.py        | 14 +++---
 4 files changed, 69 insertions(+), 74 deletions(-)
 create mode 100644 storybook_generation/caption_combine.py
 delete mode 100644 storybook_generation/captioner.py
 delete mode 100644 storybook_generation/combiner.py

diff --git a/storybook_generation/caption_combine.py b/storybook_generation/caption_combine.py
new file mode 100644
index 0000000..20b630f
--- /dev/null
+++ b/storybook_generation/caption_combine.py
@@ -0,0 +1,63 @@
+import sieve
+
+@sieve.function(
+    name="video-captioner-combiner",
+    gpu = False,
+    python_packages=[
+        "moviepy==1.0.3",
+        "opencv-python==4.6.0.66",
+        "uuid==1.30",
+    ],
+    python_version="3.8",
+    iterator_input=True,
+    persist_output=True
+)
+def caption_and_combine(videos, prompt_pairs) -> sieve.Video:
+    from moviepy.editor import VideoFileClip, ImageClip, concatenate_videoclips
+    import cv2
+    import textwrap
+    import uuid
+
+    # Sort videos by global id
+    videos = sorted(videos, key=lambda video: video.video_number)
+
+    # Add captions
+    images = []
+    for v, prompt in zip(videos, prompt_pairs):
+        # Add caption
+        cap = cv2.VideoCapture(v.path)
+        while cap.isOpened():
+            # Capture frames in the video
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            # Add caption with textwrap
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            wrapped_text = textwrap.wrap(prompt[0], width=30)
+            x, y = 10, 40
+            font_size = 1
+            font_thickness = 2
+
+            for i, line in enumerate(wrapped_text):
+                textsize = cv2.getTextSize(line, font, font_size, font_thickness)[0]
+
+                gap = textsize[1] + 10
+
+                y = int((frame.shape[0] + textsize[1]) / 2) + i * gap
+                x = int((frame.shape[1] - textsize[0]) / 2)
+
+                cv2.putText(frame, line, (x, y), font,
+                            font_size,
+                            (255,255,0),
+                            font_thickness,
+                            lineType = cv2.LINE_AA)
+
+            images.append(frame)
+
+    clips = [ImageClip(m).set_duration(0.25) for m in images]
+    video = concatenate_videoclips(clips)
+    video_path = f"{uuid.uuid4()}.mp4"
+    video.write_videofile(video_path, fps=30)
+    return sieve.Video(path=video_path)
+    
\ No newline at end of file
diff --git a/storybook_generation/captioner.py b/storybook_generation/captioner.py
deleted file mode 100644
index 617ac08..0000000
--- a/storybook_generation/captioner.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import sieve
-
-@sieve.function(
-    name="video-captioner",
-    gpu = False,
-    python_packages=[
-        "moviepy==1.0.3",
-        "uuid==1.30",
-    ],
-    system_packages=["imagemagick"],
-    run_commands=["apt install -y imagemagick"],
-    iterator_input=True,
-    persist_output=True
-)
-def captioner(videos, prompt_pair) -> sieve.Video:
-    from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
-    import uuid
-
-    for v, prompt in zip(videos, prompt_pair):
-        video = VideoFileClip(v.path)
-
-        # Create caption
-        prompt = prompt[0]
-        middle = len(prompt) // 2
-        caption = prompt[:middle] + "\n" + prompt[middle:]
-        text = TextClip(caption, font='calibri', fontsize=24, color='white')
-        text = text.set_pos('bottom').set_duration(video.duration)
-
-        # Combine video and caption
-        video = CompositeVideoClip([video, text])
-        video.write_videofile("bear_with_text.mp4")
-
-    # Save video
-    video_path = f"{uuid.uuid4()}.mp4"
-    video.write_videofile(video_path)
-    return sieve.Video(path=video_path)
-    
\ No newline at end of file
diff --git a/storybook_generation/combiner.py b/storybook_generation/combiner.py
deleted file mode 100644
index b470191..0000000
--- a/storybook_generation/combiner.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import sieve
-
-@sieve.function(
-    name="video-combiner",
-    gpu = False,
-    python_packages=[
-        "moviepy==1.0.3",
-        "uuid==1.30",
-    ],
-    python_version="3.8",
-    iterator_input=True,
-    persist_output=True
-)
-def combiner(videos) -> sieve.Video:
-    from moviepy.editor import VideoFileClip, concatenate_videoclips
-    import uuid
-
-    # Sort videos by global id
-    videos = sorted(videos, key=lambda video: video.video_number)
-
-    # Combine videos
-    videos = [VideoFileClip(video.path) for video in videos]
-    video = concatenate_videoclips(videos)
-
-    # Save video
-    video_path = f"{uuid.uuid4()}.mp4"
-    video.write_videofile(video_path)
-    return sieve.Video(path=video_path)
-    
\ No newline at end of file
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
index 257bb5f..637ea7e 100644
--- a/storybook_generation/workflow.py
+++ b/storybook_generation/workflow.py
@@ -4,8 +4,7 @@
 
 import sieve
 from walker import StableDiffusionVideo
-from combiner import combiner
-from captioner import captioner
+from caption_combine import caption_and_combine
 
 # Creates a cleaned up list of sentences from a piece of writing
 @sieve.function(name="prompt-to-script")
@@ -25,12 +24,11 @@ def create_prompt_pairs(script: list) -> tuple:
 def storybook_generation(prompt: str) -> sieve.Video:
     # Create a script (list of sentences) and pair them up
     script = prompt_to_script(prompt)
-    prompt_pair = create_prompt_pairs(script)
+    prompt_pairs = create_prompt_pairs(script)
 
-    # Generate videos and caption them
-    videos = StableDiffusionVideo()(prompt_pair)
-    captioned_videos = captioner(videos, prompt_pair)
+    # Generate videos with StableDiffusionWalker
+    videos = StableDiffusionVideo()(prompt_pairs)
 
-    # Return a concatenated video
-    combined_video = combiner(captioned_videos)
+    # Return a captioned and concatenated video
+    combined_video = caption_and_combine(videos, prompt_pairs)
     return combined_video
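The captioning in `caption_and_combine` is plain OpenCV: wrap the sentence with `textwrap`, then draw each line roughly centered with `cv2.putText`. A reduced sketch of that drawing step on a single frame (the input and output image paths are placeholders):

```
import cv2
import textwrap

def draw_caption(frame, caption, width=30):
    # Wrap the caption and draw each line roughly centered on the frame.
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_size, font_thickness = 1, 2
    for i, line in enumerate(textwrap.wrap(caption, width=width)):
        textsize = cv2.getTextSize(line, font, font_size, font_thickness)[0]
        gap = textsize[1] + 10
        y = int((frame.shape[0] + textsize[1]) / 2) + i * gap
        x = int((frame.shape[1] - textsize[0]) / 2)
        cv2.putText(frame, line, (x, y), font, font_size,
                    (255, 255, 0), font_thickness, lineType=cv2.LINE_AA)
    return frame

frame = cv2.imread("frame.jpg")  # placeholder input frame
cv2.imwrite("captioned.jpg", draw_caption(frame, "Poppy lands on a flower."))
```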
From e8b52f0bde8daa2b7382c658dc25aa11ff820507 Mon Sep 17 00:00:00 2001
From: Ishan Shah
Date: Tue, 28 Feb 2023 00:34:24 -0600
Subject: [PATCH 7/7] fix: finishing touches and logging

---
 storybook_generation/README.md          |  3 +++
 storybook_generation/caption_combine.py | 13 ++++++++-----
 storybook_generation/walker.py          |  1 +
 storybook_generation/workflow.py        |  3 +++
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/storybook_generation/README.md b/storybook_generation/README.md
index 1ebee66..742e2eb 100644
--- a/storybook_generation/README.md
+++ b/storybook_generation/README.md
@@ -1,6 +1,9 @@
 # Storybook Generation
 This workflow generates a video story from a paragraph of text. It uses `StableDiffusionWalker` on pairwise sentences to generate video clips, captions them with each sentence, and stitches them together into a video.
 
+## Examples
+Here's a good starter prompt: `Once upon a time, there was a small bird named Poppy. Poppy was a curious bird who loved to explore the world around her. One day, as she was flying over the fields, she noticed a beautiful flower in the distance. Poppy flew closer to the flower and was amazed by its vibrant colors and sweet fragrance. She landed on the flower and started to sip the nectar from its center.`
+
 ## Deploying
 Follow our [getting started guide](https://www.sievedata.com/dashboard/welcome) to get your Sieve API key and install the Sieve Python client.
 
diff --git a/storybook_generation/caption_combine.py b/storybook_generation/caption_combine.py
index 20b630f..e8c4bb9 100644
--- a/storybook_generation/caption_combine.py
+++ b/storybook_generation/caption_combine.py
@@ -13,18 +13,18 @@
     persist_output=True
 )
 def caption_and_combine(videos, prompt_pairs) -> sieve.Video:
-    from moviepy.editor import VideoFileClip, ImageClip, concatenate_videoclips
+    from moviepy.editor import ImageClip, concatenate_videoclips
     import cv2
     import textwrap
     import uuid
 
-    # Sort videos by global id
+    # Sort videos by global ID
     videos = sorted(videos, key=lambda video: video.video_number)
 
     # Add captions
     images = []
     for v, prompt in zip(videos, prompt_pairs):
-        # Add caption
+        print("Creating video with caption: ", prompt[0])
         cap = cv2.VideoCapture(v.path)
         while cap.isOpened():
             # Capture frames in the video
             ret, frame = cap.read()
@@ -52,9 +52,12 @@ def caption_and_combine(videos, prompt_pairs) -> sieve.Video:
                             (255,255,0),
                             font_thickness,
                             lineType = cv2.LINE_AA)
-
+
+            # Add the frame to the list of images
             images.append(frame)
-
+
+    # Combine the images into a video
+    print("Combining all frames into video...")
     clips = [ImageClip(m).set_duration(0.25) for m in images]
     video = concatenate_videoclips(clips)
     video_path = f"{uuid.uuid4()}.mp4"
diff --git a/storybook_generation/walker.py b/storybook_generation/walker.py
index 1ec33bc..f76b8c1 100644
--- a/storybook_generation/walker.py
+++ b/storybook_generation/walker.py
@@ -40,6 +40,7 @@ def __predict__(self, prompt_pair: tuple) -> sieve.Video:
         prompt1, prompt2 = prompt_pair[0], prompt_pair[1]
 
         # Generate and store video output
+        print("Generating video with prompts: " + prompt1 + " | " + prompt2)
         video_path = self.pipeline.walk(
             [prompt1, prompt2],
             [42, 1337],
diff --git a/storybook_generation/workflow.py b/storybook_generation/workflow.py
index 637ea7e..c4184d2 100644
--- a/storybook_generation/workflow.py
+++ b/storybook_generation/workflow.py
@@ -23,12 +23,15 @@ def create_prompt_pairs(script: list) -> tuple:
 @sieve.workflow(name="storybook_generation")
 def storybook_generation(prompt: str) -> sieve.Video:
     # Create a script (list of sentences) and pair them up
+    print("Generating script and prompt pairs...")
     script = prompt_to_script(prompt)
     prompt_pairs = create_prompt_pairs(script)
 
     # Generate videos with StableDiffusionWalker
+    print("Generating videos...")
     videos = StableDiffusionVideo()(prompt_pairs)
 
     # Return a captioned and concatenated video
+    print("Generating storybook...")
     combined_video = caption_and_combine(videos, prompt_pairs)
     return combined_video
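As a rough sanity check of the finished pipeline's shape: the five-sentence starter prompt from the README fans out into four walker calls, and the captioner re-times every captured frame at 0.25 s. A small sketch of that bookkeeping, assuming each walk yields roughly `num_interpolation_steps` frames:

```
# Rough bookkeeping for the README's five-sentence starter prompt.
num_sentences = 5
num_pairs = num_sentences - 1     # one walker call per consecutive sentence pair
frames_per_pair = 15              # num_interpolation_steps in walker.py (approximate)
seconds_per_frame = 0.25          # ImageClip duration in caption_combine.py
print(num_pairs, "walker calls ->",
      num_pairs * frames_per_pair * seconds_per_frame, "seconds of video")
# 4 walker calls -> 15.0 seconds of video
```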