From acb2b207dd28f2a1c6cbfd67e6c21973aebccc03 Mon Sep 17 00:00:00 2001 From: iejMac Date: Mon, 21 Nov 2022 07:42:27 +0000 Subject: [PATCH] youtube downloading: update format selection --- video2numpy/read_vids_cv2.py | 32 +++----------- video2numpy/utils.py | 83 ++++++++++++++++++++---------------- 2 files changed, 53 insertions(+), 62 deletions(-) diff --git a/video2numpy/read_vids_cv2.py b/video2numpy/read_vids_cv2.py index 6be9a1a..8132092 100644 --- a/video2numpy/read_vids_cv2.py +++ b/video2numpy/read_vids_cv2.py @@ -9,9 +9,6 @@ from .utils import handle_url -MAX_RETRY = 2 # TODO: do this better, maybe param for this - - def read_vids(vid_refs, worker_id, take_every_nth, resize_size, batch_size, queue_export): """ Reads list of videos, saves frames to Shared Queue @@ -28,23 +25,17 @@ def read_vids(vid_refs, worker_id, take_every_nth, resize_size, batch_size, queu t0 = time.perf_counter() print(f"Worker #{worker_id} starting processing {len(vid_refs)} videos") - def get_frames(vid, ref, retry=0): + def get_frames(vid, ref): # TODO: better way of testing if vid is url if vid.startswith("http://") or vid.startswith("https://"): - load_vid, file, dst_name = handle_url(vid, retry) + load_vid, file, dst_name = handle_url(vid) else: load_vid, file, dst_name = vid, None, vid[:-4].split("/")[-1] + ".npy" video_frames = [] - time_0 = time.time() cap = cv2.VideoCapture(load_vid) # pylint: disable=I1101 - fps = cap.get(cv2.CAP_PROP_FPS) - frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - minutes = (frame_count / fps) / 60 - timeout = max(minutes, 0.5) # acceptable reading speed is 1 [min downloaded/s] - if not cap.isOpened(): print(f"Error: {vid} not opened") return @@ -59,14 +50,11 @@ def get_frames(vid, ref, retry=0): ind = 0 while ret: ret = cap.grab() - if time.time() - time_0 > timeout: # timeout if taking too long (maybe try another format) - raise TimeoutError if ret and (ind % take_every_nth == 0): ret, frame = cap.retrieve() frame = resizer(frame) 
video_frames.append(frame) ind += 1 - if len(video_frames) == 0: print(f"Warning: {vid} contained 0 frames") return @@ -91,17 +79,9 @@ def get_frames(vid, ref, retry=0): random.Random(worker_id).shuffle(vid_refs) for vid, ref in vid_refs: - retry = 0 - while retry < MAX_RETRY: - try: - get_frames(vid, ref, retry) - break - except TimeoutError as _: - print(f"TimeoutError: {vid} timed out") - retry += 1 - except Exception as e: # pylint: disable=broad-except - print(f"Error: Video {vid} failed with message - {e}") - break - print("retrying...") + try: + get_frames(vid, ref) + except Exception as e: # pylint: disable=broad-except + print(f"Error: Video {vid} failed with message - {e}") tf = time.perf_counter() print(f"Worker #{worker_id} done processing {len(vid_refs)} videos in {tf-t0}[s]") diff --git a/video2numpy/utils.py b/video2numpy/utils.py index 5cd6f1e..7ac7251 100644 --- a/video2numpy/utils.py +++ b/video2numpy/utils.py @@ -3,52 +3,63 @@ import tempfile import yt_dlp +from timeout_decorator import timeout, TimeoutError -QUALITY = "360p" -# TODO make this better / audio support -def get_format_selector(retry): - """ - Gets format selector based on retry number. 
- """ +def get_fast_format(formats, find_format_timeout): + """returns the closest format that downloads quickly""" + + @timeout(find_format_timeout) + def check_speed(f): + url = f.get("url") + ntf, _ = handle_mp4_link(url) + with open(ntf.name, "rb") as vid_file: + _ = vid_file.read() + ntf.close() + + format_id = None + for fmt in formats: + try: + check_speed(fmt) + format_id = fmt.get("format_id") + break + except TimeoutError as _: + pass + + return format_id - def format_selector(ctx): - formats = ctx.get("formats") - if retry == 0: - for f in formats: - if f.get("format_note", None) != QUALITY: - continue - break - else: - for f in formats: # take WORST video format available - if f.get("vcodec", None) == "none": - continue - break - yield { - "format_id": f["format_id"], - "ext": f["ext"], - "requested_formats": [f], - "protocol": f["protocol"], - } - - return format_selector - - -def handle_youtube(youtube_url, retry): - """returns file and destination name from youtube url.""" +def handle_youtube(youtube_url): + """returns file and destination name from youtube url.""" + # Probe download speed: ydl_opts = { "quiet": True, - "format": get_format_selector(retry), + "external-download": "ffmpeg", + "external-downloader-args": "ffmpeg_i:-ss 0 -t 2", # download 2 seconds } + ydl = yt_dlp.YoutubeDL(ydl_opts) + info = ydl.extract_info(youtube_url, download=False) + formats = info.get("formats", None) + filtered_formats = [ + f for f in formats if f["format_note"] != "DASH video" and f["height"] is not None and f["height"] >= 360 # const 360p + ] + # TODO: how do we drop the video when format_id is None (all retries timed out) + format_id = get_fast_format(filtered_formats[:10], 4) + if format_id is None: + return None, "" + + # Get actual video: + # TODO: figure out a way of just requesting the format by format_id + ydl_opts = {"quiet": True} ydl = yt_dlp.YoutubeDL(ydl_opts) info = ydl.extract_info(youtube_url, download=False) - formats = 
info.get("requested_formats", None) - f = formats[0] + formats = info.get("formats", None) + f = [f for f in formats if f["format_id"] == format_id][0] cv2_vid = f.get("url", None) dst_name = info.get("id") + ".npy" + return cv2_vid, dst_name @@ -61,7 +72,7 @@ def handle_mp4_link(mp4_link): return ntf, dst_name -def handle_url(url, retry=0): +def handle_url(url): """ Input: url: url of video @@ -72,8 +83,8 @@ def handle_url(url, retry=0): name - numpy fname to save frames to. """ if "youtube" in url: # youtube link - load_file, name = handle_youtube(url, retry) - return load_file, None, name + file, name = handle_youtube(url) + return file, None, name elif url.endswith(".mp4"): # mp4 link file, name = handle_mp4_link(url) return file.name, file, name