From d8c4a16920534541c8dcd55601c3e588fc27e483 Mon Sep 17 00:00:00 2001 From: shp7724 Date: Sun, 31 Oct 2021 01:29:39 +0900 Subject: [PATCH 1/5] =?UTF-8?q?pycache=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bcb31d6..58f559e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ venv .secrets downloads -.idea \ No newline at end of file +.idea +__pycache__ \ No newline at end of file From fdd60cf9e808e36183ded4eb82e36ee02465e14f Mon Sep 17 00:00:00 2001 From: shp7724 Date: Sun, 31 Oct 2021 03:15:17 +0900 Subject: [PATCH 2/5] =?UTF-8?q?vscode=20=EC=84=B8=ED=8C=85=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..17e15f2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file From 189a09defb6533043c5ee7ddb96a9402fceff207 Mon Sep 17 00:00:00 2001 From: shinhong_park Date: Tue, 8 Mar 2022 20:18:40 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=EB=B3=91=EB=A0=AC=ED=99=94=20=EB=B0=8F=20m?= =?UTF-8?q?p4=20=EB=B3=80=ED=99=98=20=EA=B8=B0=EB=8A=A5=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index c3749c8..76fec79 100644 --- a/main.py +++ b/main.py @@ -1,13 +1,18 @@ -import requests import os +import re +import shutil import sys +import time +from pathlib import Path +from threading import Thread from typing import List, Tuple -from utils import SecretsManager + +import requests +import subprocess from bs4 import BeautifulSoup + from models import * -import re -import shutil -from pathlib import Path +from utils import SecretsManager class ETLDownloader: @@ -26,8 +31,8 @@ def _get_soup(html): def _get_tmp_dir(self): return os.path.join(self.DOWNLOAD_PATH, self.selected_course.title, "tmp") - def _get_video_dir(self, video: Video, safe_filename=False): - filename = video.title + ".ts" + def _get_video_dir(self, video: Video, safe_filename=False, ext: str = "ts"): + filename = f"{video.title}.{ext}" if safe_filename: filename = filename.replace("/", "-") return os.path.join(self.DOWNLOAD_PATH, self.selected_course.title, filename) @@ -99,7 +104,7 @@ def download_vod(self, video: Video): endpoint, media_id = self._parse_stream_endpoint(video.player_url) video.media_id = media_id directory = self._get_tmp_dir() - print(f"\t[*] {video.title} 다운로드 중.", end="") + print(f"\t[*] {video.title} 다운로드 중.", end="", flush=True) while True: chunk_url = f"{endpoint}/media_{media_id}_{index}.ts" res = self.s.get(chunk_url) @@ -113,7 +118,7 @@ def download_vod(self, video: Video): ) as f: f.write(res.content) index += 1 - print(".", end="") + print(".", end="", flush=True) video.num_files = index def concat_files(self, video: Video): @@ -133,9 +138,33 @@ def _delete_tmp_folder(self): except FileNotFoundError: return + def convert_to_mp4(self, video): + infile = self._get_video_dir(video, safe_filename=True) + outfile = self._get_video_dir(video, safe_filename=True, ext="mp4") + subprocess.run(["ffmpeg", "-i", infile, outfile]) + os.remove(infile) + def download_all_videos(self): self._delete_tmp_folder() videos = self.get_course_vods() + + def download_video(video: Video): + if Path(self._get_video_dir(video, safe_filename=True, ext="mp4")).exists(): + print(f"\t[*] {video.title}.mp4 파일이 이미 존재하므로 건너뜁니다.") + return + if Path(self._get_video_dir(video, safe_filename=True, ext="ts")).exists(): + print(f"\t[*] {video.title}.ts 파일이 이미 존재하므로 mp4로 변환 후 건너뜁니다.") + self.convert_to_mp4(video) + return + self.download_vod(video) + self.concat_files(video) + self.convert_to_mp4(video) + + for video in videos: + Thread(target=download_video, args=(video,)).start() + time.sleep(2) + + return for video in videos: if Path(self._get_video_dir(video, safe_filename=True)).exists(): print(f"\t[*] {video.title}.ts 파일이 이미 존재하므로 건너뜁니다.") From 6467f5d83ccdf75fe1dd1da67809082d22443e59 Mon Sep 17 00:00:00 2001 From: shinhong_park Date: Tue, 8 Mar 2022 23:34:54 +0900 Subject: [PATCH 4/5] =?UTF-8?q?=EB=A1=9C=EA=B7=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 76fec79..6ea0d36 100644 --- a/main.py +++ b/main.py @@ -139,10 +139,12 @@ def _delete_tmp_folder(self): return def convert_to_mp4(self, video): + print(f"\t[*] {video.title} 변환 시작") infile = self._get_video_dir(video, safe_filename=True) outfile = self._get_video_dir(video, safe_filename=True, ext="mp4") subprocess.run(["ffmpeg", "-i", infile, outfile]) os.remove(infile) + print(f"\t[*] {video.title} 변환 완료") def download_all_videos(self): self._delete_tmp_folder() From 4e3326fc7345aa8605707d43019b2bec46325e7b Mon Sep 17 00:00:00 2001 From: shinhong_park Date: Sun, 1 May 2022 01:27:05 +0900 Subject: [PATCH 5/5] temporary changes for EEC --- main.py | 59 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 6ea0d36..d21cf95 100644 --- a/main.py +++ b/main.py @@ -99,27 +99,48 @@ def _parse_stream_endpoint(self, url: str) -> Tuple[str, str]: media_id = m2.group(1) return endpoint, media_id - def download_vod(self, video: Video): + def get_last_index(self, endpoint: str, media_id: str) -> int: index = 0 - endpoint, media_id = self._parse_stream_endpoint(video.player_url) - video.media_id = media_id - directory = self._get_tmp_dir() - print(f"\t[*] {video.title} 다운로드 중.", end="", flush=True) while True: chunk_url = f"{endpoint}/media_{media_id}_{index}.ts" - res = self.s.get(chunk_url) + res = self.s.head(chunk_url) if res.status_code != 200: - print("") - break - Path(directory).mkdir(parents=True, exist_ok=True) - with open( - os.path.join(directory, f"{index}_{media_id}.ts"), - "wb", - ) as f: - f.write(res.content) + return index index += 1 - print(".", end="", flush=True) - video.num_files = index + + def download_proc(self, endpoint: str, media_id: str, index: str, directory: str): + chunk_url = f"{endpoint}/media_{media_id}_{index}.ts" + res = self.s.get(chunk_url) + with open( + os.path.join(directory, f"{index}_{media_id}.ts"), + "wb", + ) as f: + f.write(res.content) + print(".", end="", flush=True) + self.done_num += 1 + + def download_vod(self, video: Video): + endpoint, media_id = self._parse_stream_endpoint(video.player_url) + last_index = self.get_last_index(endpoint, media_id) + + video.media_id, video.num_files = media_id, last_index + 1 + + directory = self._get_tmp_dir() + Path(directory).mkdir(parents=True, exist_ok=True) + print(f"\t[*] {video.title} 다운로드 중.", end="", flush=True) + + self.done_num = 0 + for index in range(video.num_files): + thread = Thread( + target=self.download_proc, args=(endpoint, media_id, index, directory) + ) + thread.start() + time.sleep(0.05) + + while self.done_num < video.num_files: + time.sleep(1) + + print(flush=True) def concat_files(self, video: Video): directory = self._get_tmp_dir() @@ -163,8 +184,10 @@ def download_video(video: Video): self.convert_to_mp4(video) for video in videos: - Thread(target=download_video, args=(video,)).start() - time.sleep(2) + if "Lecture 11" in video.title or "digital" in video.title: + download_video(video) + # Thread(target=download_video, args=(video,)).start() + # time.sleep(2) return for video in videos: