diff --git a/.gitignore b/.gitignore
index bcb31d6..58f559e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 venv
 .secrets
 downloads
-.idea
\ No newline at end of file
+.idea
+__pycache__
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..17e15f2
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/main.py b/main.py
index c3749c8..d21cf95 100644
--- a/main.py
+++ b/main.py
@@ -1,13 +1,18 @@
-import requests
 import os
+import re
+import shutil
+import subprocess
 import sys
+import time
+from pathlib import Path
+from threading import Thread
 from typing import List, Tuple
-from utils import SecretsManager
+
+import requests
 from bs4 import BeautifulSoup
+
 from models import *
-import re
-import shutil
-from pathlib import Path
+from utils import SecretsManager
 
 
 class ETLDownloader:
@@ -26,8 +31,8 @@ def _get_soup(html):
     def _get_tmp_dir(self):
         return os.path.join(self.DOWNLOAD_PATH, self.selected_course.title, "tmp")
 
-    def _get_video_dir(self, video: Video, safe_filename=False):
-        filename = video.title + ".ts"
+    def _get_video_dir(self, video: Video, safe_filename=False, ext: str = "ts"):
+        filename = f"{video.title}.{ext}"
         if safe_filename:
             filename = filename.replace("/", "-")
         return os.path.join(self.DOWNLOAD_PATH, self.selected_course.title, filename)
@@ -94,27 +99,54 @@ def _parse_stream_endpoint(self, url: str) -> Tuple[str, str]:
         media_id = m2.group(1)
         return endpoint, media_id
 
-    def download_vod(self, video: Video):
+    def get_last_index(self, endpoint: str, media_id: str) -> int:
+        """Return the first chunk index whose HEAD request is not 200 (= chunk count)."""
         index = 0
-        endpoint, media_id = self._parse_stream_endpoint(video.player_url)
-        video.media_id = media_id
-        directory = self._get_tmp_dir()
-        print(f"\t[*] {video.title} 다운로드 중.", end="")
         while True:
             chunk_url = f"{endpoint}/media_{media_id}_{index}.ts"
-            res = self.s.get(chunk_url)
+            res = self.s.head(chunk_url)
             if res.status_code != 200:
-                print("")
-                break
-            Path(directory).mkdir(parents=True, exist_ok=True)
-            with open(
-                os.path.join(directory, f"{index}_{media_id}.ts"),
-                "wb",
-            ) as f:
-                f.write(res.content)
+                return index
             index += 1
-            print(".", end="")
-        video.num_files = index
+
+    def download_proc(self, endpoint: str, media_id: str, index: int, directory: str):
+        """Fetch chunk *index* and write it to *directory* as ``{index}_{media_id}.ts``."""
+        chunk_url = f"{endpoint}/media_{media_id}_{index}.ts"
+        res = self.s.get(chunk_url)
+        with open(
+            os.path.join(directory, f"{index}_{media_id}.ts"),
+            "wb",
+        ) as f:
+            f.write(res.content)
+        print(".", end="", flush=True)
+
+    def download_vod(self, video: Video):
+        """Download every chunk of *video* into the temp directory, one thread each."""
+        endpoint, media_id = self._parse_stream_endpoint(video.player_url)
+        # get_last_index returns the first missing index, i.e. the chunk count.
+        chunk_count = self.get_last_index(endpoint, media_id)
+
+        video.media_id, video.num_files = media_id, chunk_count
+
+        directory = self._get_tmp_dir()
+        Path(directory).mkdir(parents=True, exist_ok=True)
+        print(f"\t[*] {video.title} 다운로드 중.", end="", flush=True)
+
+        threads = []
+        for index in range(video.num_files):
+            thread = Thread(
+                target=self.download_proc, args=(endpoint, media_id, index, directory)
+            )
+            thread.start()
+            threads.append(thread)
+            time.sleep(0.05)  # stagger request starts
+
+        # join() instead of polling a shared counter: no lost updates, and a
+        # worker that dies with an exception cannot hang this loop forever.
+        for thread in threads:
+            thread.join()
+
+        print(flush=True)
 
     def concat_files(self, video: Video):
         directory = self._get_tmp_dir()
@@ -133,9 +165,50 @@ def _delete_tmp_folder(self):
         except FileNotFoundError:
             return
 
+    def convert_to_mp4(self, video):
+        """Convert the downloaded .ts of *video* to .mp4 with ffmpeg.
+
+        The .ts source is deleted only after ffmpeg exits successfully.
+        """
+        print(f"\t[*] {video.title} 변환 시작")
+        infile = self._get_video_dir(video, safe_filename=True)
+        outfile = self._get_video_dir(video, safe_filename=True, ext="mp4")
+        result = subprocess.run(["ffmpeg", "-i", infile, outfile])
+        if result.returncode != 0:
+            # Keep the .ts and drop any partial .mp4, so a rerun retries the
+            # conversion instead of skipping a broken output file.
+            try:
+                os.remove(outfile)
+            except FileNotFoundError:
+                pass
+            print(f"\t[!] {video.title} 변환 실패 (ffmpeg 종료 코드: {result.returncode})")
+            return
+        os.remove(infile)
+        print(f"\t[*] {video.title} 변환 완료")
+
     def download_all_videos(self):
         self._delete_tmp_folder()
         videos = self.get_course_vods()
+
+        def download_video(video: Video):
+            """Download and convert one video, skipping work already on disk."""
+            if Path(self._get_video_dir(video, safe_filename=True, ext="mp4")).exists():
+                print(f"\t[*] {video.title}.mp4 파일이 이미 존재하므로 건너뜁니다.")
+                return
+            if Path(self._get_video_dir(video, safe_filename=True, ext="ts")).exists():
+                print(f"\t[*] {video.title}.ts 파일이 이미 존재하므로 mp4로 변환 후 건너뜁니다.")
+                self.convert_to_mp4(video)
+                return
+            self.download_vod(video)
+            self.concat_files(video)
+            self.convert_to_mp4(video)
+
+        for video in videos:
+            download_video(video)
+
+        # NOTE(review): the legacy loop below is now unreachable; delete it in a
+        # follow-up once this path is confirmed.
+        return
         for video in videos:
             if Path(self._get_video_dir(video, safe_filename=True)).exists():
                 print(f"\t[*] {video.title}.ts 파일이 이미 존재하므로 건너뜁니다.")