Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 6 additions & 26 deletions video2numpy/read_vids_cv2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@
from .utils import handle_url


MAX_RETRY = 2 # TODO: do this better, maybe param for this


def read_vids(vid_refs, worker_id, take_every_nth, resize_size, batch_size, queue_export):
"""
Reads list of videos, saves frames to Shared Queue
Expand All @@ -28,23 +25,17 @@ def read_vids(vid_refs, worker_id, take_every_nth, resize_size, batch_size, queu
t0 = time.perf_counter()
print(f"Worker #{worker_id} starting processing {len(vid_refs)} videos")

def get_frames(vid, ref, retry=0):
def get_frames(vid, ref):
# TODO: better way of testing if vid is url
if vid.startswith("http://") or vid.startswith("https://"):
load_vid, file, dst_name = handle_url(vid, retry)
load_vid, file, dst_name = handle_url(vid)
else:
load_vid, file, dst_name = vid, None, vid[:-4].split("/")[-1] + ".npy"

video_frames = []
time_0 = time.time()

cap = cv2.VideoCapture(load_vid) # pylint: disable=I1101

fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
minutes = (frame_count / fps) / 60
timeout = max(minutes, 0.5) # acceptable reading speed is 1 [min downloaded/s]

if not cap.isOpened():
print(f"Error: {vid} not opened")
return
Expand All @@ -59,14 +50,11 @@ def get_frames(vid, ref, retry=0):
ind = 0
while ret:
ret = cap.grab()
if time.time() - time_0 > timeout: # timeout if taking too long (maybe try another format)
raise TimeoutError
if ret and (ind % take_every_nth == 0):
ret, frame = cap.retrieve()
frame = resizer(frame)
video_frames.append(frame)
ind += 1

if len(video_frames) == 0:
print(f"Warning: {vid} contained 0 frames")
return
Expand All @@ -91,17 +79,9 @@ def get_frames(vid, ref, retry=0):

random.Random(worker_id).shuffle(vid_refs)
for vid, ref in vid_refs:
retry = 0
while retry < MAX_RETRY:
try:
get_frames(vid, ref, retry)
break
except TimeoutError as _:
print(f"TimeoutError: {vid} timed out")
retry += 1
except Exception as e: # pylint: disable=broad-except
print(f"Error: Video {vid} failed with message - {e}")
break
print("retrying...")
try:
get_frames(vid, ref)
except Exception as e: # pylint: disable=broad-except
print(f"Error: Video {vid} failed with message - {e}")
tf = time.perf_counter()
print(f"Worker #{worker_id} done processing {len(vid_refs)} videos in {tf-t0}[s]")
83 changes: 47 additions & 36 deletions video2numpy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,52 +3,63 @@
import tempfile
import yt_dlp

from timeout_decorator import timeout, TimeoutError

QUALITY = "360p"

# TODO make this better / audio support
def get_format_selector(retry):
"""
Gets format selector based on retry number.
"""
def get_fast_format(formats, find_format_timeout):
    """Return the id of the first format whose probe finishes in time.

    Each candidate's URL is handed to ``handle_mp4_link`` and the file it
    returns is read back completely.  The whole probe runs under the
    ``timeout`` decorator, so a candidate that takes longer than
    ``find_format_timeout`` is abandoned and the next one is tried.

    Args:
        formats: iterable of format dicts; only the ``"url"`` and
            ``"format_id"`` entries are read (both via ``.get``).
        find_format_timeout: limit passed straight to the ``timeout``
            decorator (seconds, per timeout_decorator's convention).

    Returns:
        The ``"format_id"`` of the first format that was probed without
        timing out, or ``None`` if every candidate timed out (or
        ``formats`` is empty).
    """

    @timeout(find_format_timeout)
    def check_speed(fmt):
        ntf, _ = handle_mp4_link(fmt.get("url"))
        try:
            with open(ntf.name, "rb") as vid_file:
                vid_file.read()
        finally:
            # Close even when the timeout fires mid-read, otherwise the
            # temporary file handle leaks once per slow candidate.
            ntf.close()

    format_id = None
    for fmt in formats:
        try:
            check_speed(fmt)
        except TimeoutError:
            # Too slow: fall through to the next candidate.
            continue
        format_id = fmt.get("format_id")
        break

    return format_id

def format_selector(ctx):
formats = ctx.get("formats")
if retry == 0:
for f in formats:
if f.get("format_note", None) != QUALITY:
continue
break
else:
for f in formats: # take WORST video format available
if f.get("vcodec", None) == "none":
continue
break
yield {
"format_id": f["format_id"],
"ext": f["ext"],
"requested_formats": [f],
"protocol": f["protocol"],
}

return format_selector


def handle_youtube(youtube_url, retry):
"""returns file and destination name from youtube url."""

def handle_youtube(youtube_url):
    """Resolve a YouTube URL to a direct stream URL that downloads quickly.

    The available formats are listed with yt_dlp (nothing is downloaded by
    the ``extract_info`` calls), filtered to those that are not labelled
    "DASH video" and are at least 360 pixels high, and the first ten
    candidates are probed with ``get_fast_format``.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        ``(stream_url, dst_name)`` where ``stream_url`` is the ``"url"`` of
        the chosen format and ``dst_name`` is ``"<video id>.npy"``.
        ``(None, "")`` when no candidate passed the speed probe or the
        chosen format id is missing from the second format listing.
    """
    min_height = 360  # const 360p
    max_candidates = 10
    probe_timeout = 4

    # Probe download speed:
    # NOTE(review): these two keys use the command-line spelling; yt_dlp's
    # Python API appears to expect `external_downloader` /
    # `external_downloader_args` - confirm they have any effect here.
    ydl_opts = {
        "quiet": True,
        "external-download": "ffmpeg",
        "external-downloader-args": "ffmpeg_i:-ss 0 -t 2",  # download 2 seconds
    }
    ydl = yt_dlp.YoutubeDL(ydl_opts)
    info = ydl.extract_info(youtube_url, download=False)
    # Use .get throughout: not every format dict carries "format_note" or
    # "height", and "formats" itself may be absent.
    candidates = [
        fmt
        for fmt in info.get("formats") or []
        if fmt.get("format_note") != "DASH video" and (fmt.get("height") or 0) >= min_height
    ]

    # TODO: how do we drop the video when format_id is None (all retries timed out)
    format_id = get_fast_format(candidates[:max_candidates], probe_timeout)
    if format_id is None:
        return None, ""

    # Get actual video:
    # TODO: figure out a way of just requesting the format by format_id
    ydl = yt_dlp.YoutubeDL({"quiet": True})
    info = ydl.extract_info(youtube_url, download=False)
    chosen = next(
        (fmt for fmt in info.get("formats") or [] if fmt.get("format_id") == format_id),
        None,
    )
    if chosen is None:
        # Same "nothing usable" signal as the failed-probe case above.
        return None, ""

    cv2_vid = chosen.get("url", None)
    dst_name = info.get("id") + ".npy"

    return cv2_vid, dst_name


Expand All @@ -61,7 +72,7 @@ def handle_mp4_link(mp4_link):
return ntf, dst_name


def handle_url(url, retry=0):
def handle_url(url):
"""
Input:
url: url of video
Expand All @@ -72,8 +83,8 @@ def handle_url(url, retry=0):
name - numpy fname to save frames to.
"""
if "youtube" in url: # youtube link
load_file, name = handle_youtube(url, retry)
return load_file, None, name
file, name = handle_youtube(url)
return file, None, name
elif url.endswith(".mp4"): # mp4 link
file, name = handle_mp4_link(url)
return file.name, file, name
Expand Down