From a166c49e9f8946129ea68f86045120450c116b2a Mon Sep 17 00:00:00 2001
From: 7ae <26288920+7ae@users.noreply.github.com>
Date: Sat, 17 Feb 2024 08:14:08 -0800
Subject: [PATCH] Initial release

---
 .gitignore       |  1 +
 README.md        | 68 ++++++++++++++++++++++++++++++++++++++++++++
 out/.gitkeep     |  1 +
 queue.txt        |  3 ++
 requirements.txt | 10 +++++++
 utube.py         | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 157 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 out/.gitkeep
 create mode 100644 queue.txt
 create mode 100644 requirements.txt
 create mode 100755 utube.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c137003
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# local virtualenv and scratch files; anchored to the repository root
+/tmp/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0da9e05
--- /dev/null
+++ b/README.md
@@ -0,0 +1,68 @@
+# Utube
+
+Utube is a Python script that automates downloading YouTube videos and playlists using yt-dlp.
+
+## Features
+
+- Metadata and descriptions are exported.
+- Thumbnail images are downloaded and embedded in the file.
+- Metadata is embedded within the downloaded file.
+- Subtitles (excluding auto-generated ones) and live chat are downloaded, with subtitles embedded.
+- Segment information is included in the downloaded file.
+- Video IDs are logged in `archive.txt` to prevent re-downloading.
+- Geographic restrictions due to copyright are bypassed.
+- Playlist downloads are handled, with an option to set a maximum number of downloads.
+- The top 100 comments, including at most 10 replies in total, are saved in the video's `.info.json` file.
+
+## Installation
+
+```
+virtualenv -p python ./tmp/venv
+source tmp/venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Usage
+
+```
+python utube.py [-h] [--format {video,audio}] [--max-downloads NUMBER] [--path PATH] input
+```
+
+### Folder Structure
+
+Utube creates folders for playlists or channels based on their IDs. Downloaded videos are stored inside these folders. Each video is named to include details such as its title and upload date.
+
+-&nbsp;`./out/`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;- `{Playlist or Channel ID}/`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `{Upload Date}.{Uploader}.{Video Title}.({Resolution}).{Format ID}.[{Video ID}].{File Extension}`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `.info.json`: This file contains metadata and other information about the video.
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `.description`: This file contains the description section of the YouTube page, as provided by the uploader.
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `.{Thumbnail Extension}`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `.{Subtitle Language}.vtt`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `.live_chat.json`
+<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;- `archive.txt`: This file logs downloaded video IDs to prevent re-downloading.
+
+### Options
+
+- `input`: Accepts a file path or YouTube link. If a file path is provided, it should contain YouTube links separated by line breaks.
+- `--format`: Specifies the download format (audio or video). Default is video.
+- `--max-downloads`: Stops downloading once the maximum limit is reached. Default is unlimited.
+- `--path`: Sets the output path for downloaded files. Default is "./out".
+
+### Example
+
+Download videos from a file:
+
+`python utube.py queue.txt --max-downloads 5`
+
+- queue.txt:
+
+  ```txt
+  https://www.youtube.com/watch?v=xxxx&t=60s
+  https://www.youtube.com/shorts/xxxx
+  https://www.youtube.com/playlist?list=xxxx
+  ```
+
+Download audio from a playlist:
+
+`python utube.py "https://www.youtube.com/playlist?list=xxxx" --format audio`
diff --git a/out/.gitkeep b/out/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/out/.gitkeep
@@ -0,0 +1 @@
+
diff --git a/queue.txt b/queue.txt
new file mode 100644
index 0000000..776f563
--- /dev/null
+++ b/queue.txt
@@ -0,0 +1,3 @@
+https://www.youtube.com/watch?v=xxxx&t=60s
+https://www.youtube.com/shorts/xxxx
+https://www.youtube.com/playlist?list=xxxx
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ef9adb3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+Brotli==1.1.0
+certifi==2023.7.22
+charset-normalizer==3.3.2
+idna==3.4
+mutagen==1.47.0
+pycryptodomex==3.19.0
+requests==2.31.0
+urllib3==2.1.0
+websockets==12.0
+yt-dlp==2023.11.16
diff --git a/utube.py b/utube.py
new file mode 100755
index 0000000..7c9f024
--- /dev/null
+++ b/utube.py
@@ -0,0 +1,74 @@
+import argparse
+import os
+import subprocess
+import sys
+
+# define command-line arguments
+parser = argparse.ArgumentParser(description='Download YouTube videos and playlists.')
+parser.add_argument('input', help='File path or YouTube link. If a file path is provided, it should contain YouTube links line by line.')
+parser.add_argument('--format', choices=['video', 'audio'], default='video', help='Specify the download format (audio or video). Default is video.')
+parser.add_argument('--max-downloads', type=int, default=sys.maxsize, help='Maximum number of downloads. Default is unlimited.')
+parser.add_argument('--path', help='Output path for downloaded files. Default is "./out".')
+
+# parse the command-line arguments
+args = parser.parse_args()
+
+# use the provided output path or default to "./out"
+output_path = args.path or './out'
+
+# extract URLs from the input file or use the provided URL
+urls = [line.strip() for line in open(args.input)] if os.path.isfile(args.input) else [args.input.strip()]
+for u in urls:
+    # extract the base video URL without any additional parameters
+    base_url = u.split('&')[0]
+    print(f"\033[36mSYSTEM:\033[0m [{__file__}] URL: {base_url}")
+
+    # check if url is a video url or a playlist url
+    is_playlist = 'list=' in base_url
+
+    # download the best quality video file
+    video_format = ['bestvideo+bestaudio/best']
+
+    # download the best quality audio file and convert to flac
+    audio_format = ['bestaudio/best', '-x', '--audio-format', 'flac']
+
+    format_option = video_format if args.format == 'video' else audio_format
+
+    # extract channel id or playlist id from url, and use the id as folder name
+    channel_id = f'{base_url.split("list=")[1] if is_playlist else subprocess.check_output(["yt-dlp", "--print", "channel_url", "--playlist-items", "1", base_url], stderr=subprocess.DEVNULL, text=True).strip().split("/")[-1]}'
+    print(f"\033[36mSYSTEM:\033[0m [{__file__}] ID: {channel_id}")
+
+    try:
+        # create subdirectory using id as name (return to the initial directory after completing the download)
+        path = os.path.join(output_path, channel_id)
+        os.makedirs(path, exist_ok=True)
+        os.chdir(path)
+
+    except Exception as e:
+        # skip loop if id is invalid
+        print(f"\033[31mERROR:\033[0m [{__file__}] '{path}' is an invalid ID.")
+        continue
+
+    # construct filename with limited 'uploader' and 'title' to 60 bytes each to prevent exceeding maximum filename length (255)
+    filename = f'%(upload_date)s.%(uploader).60B.%(title).60B.(%(resolution)s).%(format_id)s.[%(id)s].%(ext)s'
+
+    options = [
+        '--write-info-json', '--write-description',  # export metadata and description
+        '--write-thumbnail', '--embed-thumbnail',  # download thumbnail image file
+        '--embed-metadata',  # embed metadata to the downloaded file
+        '--sub-langs', 'all', '--write-subs', '--embed-subs',  # download captions (except auto-generated) and live chat
+        '--embed-chapters',  # include segment information
+        '--download-archive', 'archive.txt',  # record id of downloaded video to prevent redownloading
+        '--geo-bypass',  # bypass geographic restriction due to copyright
+        '--yes-playlist' if is_playlist else '--no-playlist',
+        '--max-downloads', str(args.max_downloads),  # limit max download count
+        # retrieve up to 100 comments, with a maximum of 10 replies total, sorted by the top comments
+        '--get-comments', '--extractor-args', 'youtube:comment_sort=top;max_comments=100,all,10'
+    ]
+
+    # execute the download command
+    subprocess.run(['yt-dlp', '-f'] + format_option + ['-o', filename, base_url] + options)
+
+    # return to the initial directory once the download is complete
+    os.chdir('../..')
+