From 9a7c8df6b03cd1e62b24ef07856e44a9d9eedea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Setni=C4=8Dka?= Date: Mon, 8 Aug 2022 02:36:59 +0200 Subject: [PATCH] Frontend separated from the download core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This humongous patch separates the download core from the frontend. It will make any new frontends (web, …) and batch usage of the ulozto-downloader much easier to make. Previously every worker wrote to the console independently (only targeting the right console row). Now every worker saves data in the DownloadPart struct, from where the data is taken by some frontend. The frontend is injected as a dependency using the Frontend abstract class, which defines the API interface between the core and the frontend. It is not yet stable; some changes will probably be needed in the future. Now only the ConsoleFrontend is implemented. It would be nice to refactor it using ncurses (it is possible now, because only one process writes to the console). Also, the CaptchaSolver abstract class was developed, primarily to allow easy passing of the log func for the CAPTCHA part. 
As a side effect, it also provides a nicely defined API :) --- uldlib/captcha.py | 137 +++++++++++++++----------- uldlib/cmd.py | 11 ++- uldlib/downloader.py | 227 +++++++++++++++---------------------------- uldlib/frontend.py | 216 ++++++++++++++++++++++++++++++++++++++++ uldlib/page.py | 73 ++++++-------- uldlib/part.py | 54 ++++++++++ uldlib/torrunner.py | 18 +--- uldlib/utils.py | 33 ++----- 8 files changed, 475 insertions(+), 294 deletions(-) create mode 100644 uldlib/frontend.py diff --git a/uldlib/captcha.py b/uldlib/captcha.py index 0d0e037..ab9db64 100644 --- a/uldlib/captcha.py +++ b/uldlib/captcha.py @@ -1,74 +1,96 @@ +from abc import abstractmethod import threading import time +from typing import Dict import requests from PIL import Image from io import BytesIO +from uldlib.frontend import Frontend +from uldlib.utils import LogLevel -def tkinter_user_prompt(img_url, print_func, stop_event: threading.Event = None): - """Display captcha from given URL and ask user for input in GUI window. - Arguments: - img_url (str): URL of the image with CAPTCHA +class CaptchaSolver(): + frontend: Frontend - Returns: - str: User answer to the CAPTCHA - """ - import tkinter as tk - from PIL import ImageTk + def __init__(self, frontend: Frontend): + self.frontend = frontend - root = tk.Tk() - root.focus_force() - root.title("Opiš kód z obrázku") - # use width x height + x_offset + y_offset (no spaces!) 
- root.geometry("300x140") + def log(self, msg: str, level: LogLevel = LogLevel.INFO): + self.frontend.captcha_log(msg, level) - def disable_event(): + def stats(self, stats: Dict[str, int]): + self.frontend.captcha_stats(stats) + + @abstractmethod + def solve(self, img_url: str, stop_event: threading.Event = None) -> str: pass - root.protocol("WM_DELETE_WINDOW", disable_event) - u = requests.get(img_url) - raw_data = u.content +class ManualInput(CaptchaSolver): + """Display captcha from given URL and ask user for input in GUI window.""" + + def __init__(self, frontend): + super().__init__(frontend) + + def solve(self, img_url: str, stop_event: threading.Event = None) -> str: + import tkinter as tk + from PIL import ImageTk + + root = tk.Tk() + root.focus_force() + root.title("Opiš kód z obrázku") + # use width x height + x_offset + y_offset (no spaces!) + root.geometry("300x140") + + def disable_event(): + pass + + root.protocol("WM_DELETE_WINDOW", disable_event) + + u = requests.get(img_url) + raw_data = u.content + + im = Image.open(BytesIO(raw_data)) + photo = ImageTk.PhotoImage(im) + label = tk.Label(image=photo) + label.image = photo + label.pack() - im = Image.open(BytesIO(raw_data)) - photo = ImageTk.PhotoImage(im) - label = tk.Label(image=photo) - label.image = photo - label.pack() + entry = tk.Entry(root) + entry.pack() + entry.bind('', lambda event: root.quit()) + entry.focus() - entry = tk.Entry(root) - entry.pack() - entry.bind('', lambda event: root.quit()) - entry.focus() + tk.Button(root, text='Send', command=root.quit).pack() - tk.Button(root, text='Send', command=root.quit).pack() + # Closing of the window separated to thread because it can be closed by + # the user input (done==True) or by the terminating application (stop_event) + done = False - # Closing of the window separated to thread because it can be closed by - # the user input (done==True) or by the terminating application (stop_event) - done = False + def stop_func(): + while True: + if 
done or (stop_event and stop_event.is_set()): + break + time.sleep(0.1) + self.log("Closing tkinter window, wait…") + root.quit() - def stop_func(): - while True: - if done or (stop_event and stop_event.is_set()): - break - time.sleep(0.1) - print_func("Closing tkinter window, wait…") - root.quit() + stop_thread = threading.Thread(target=stop_func) + stop_thread.start() + root.mainloop() # Wait for user input - stop_thread = threading.Thread(target=stop_func) - stop_thread.start() - root.mainloop() # Wait for user input + value = entry.get() + done = True + stop_thread.join() + root.destroy() + return value - value = entry.get() - done = True - stop_thread.join() - root.destroy() - return value +class AutoReadCaptcha(CaptchaSolver): + def __init__(self, model_path, model_url, frontend): + super().__init__(frontend) -class AutoReadCaptcha: - def __init__(self, model_path, model_url, print_func=print): from urllib.request import urlretrieve import os import tflite_runtime.interpreter as tflite @@ -80,20 +102,17 @@ def reporthook(blocknum, block_size, total_size): readsofar = blocknum * block_size if total_size > 0: percent = readsofar * 1e2 / total_size - s = "\r%5.1f%% %*d / %d" % ( - percent, len(str(total_size)), readsofar, total_size) - print_func(s, end="") - if readsofar >= total_size: # near the end - print_func(flush=True) + self.log("Downloading model from %s: %5.1f%% %*d / %d" % ( + model_url, percent, len(str(total_size)), readsofar, total_size)) else: # total size is unknown - print_func("read %d" % (readsofar,), flush=True) + self.log("Downloading model from %s: read %d" % (model_url, readsofar)) if not os.path.exists(model_path): - print_func(f"Downloading model from {model_url}") + self.log(f"Downloading model from {model_url}") # download into temp model in order to detect incomplete downloads model_temp_path = f"{model_path}.tmp" urlretrieve(model_url, model_temp_path, reporthook) - print_func("Downloading of the model finished") + 
self.log("Downloading of the model finished") # rename temp model os.rename(model_temp_path, model_path) @@ -101,13 +120,13 @@ def reporthook(blocknum, block_size, total_size): model_content = open(model_path, "rb").read() self.interpreter = tflite.Interpreter(model_content=model_content) - def __call__(self, img_url, print_func, stop_event=None): + def solve(self, img_url, stop_event=None) -> str: # stop_event not used, because tflite interpreter is hard to cancel (but is is quick) import numpy as np interpreter = self.interpreter - print_func("Auto solving CAPTCHA") + self.log("Auto solving CAPTCHA") u = requests.get(img_url) raw_data = u.content @@ -149,5 +168,5 @@ def decode(li): return "".join(result) decoded_label = [decode(x) for x in labels_indices][0] - print_func(f"CAPTCHA auto solved as '{decoded_label}'") + self.log(f"CAPTCHA auto solved as '{decoded_label}'") return decoded_label diff --git a/uldlib/cmd.py b/uldlib/cmd.py index 22b57f0..2619122 100644 --- a/uldlib/cmd.py +++ b/uldlib/cmd.py @@ -4,6 +4,7 @@ from os import path from uldlib import downloader, captcha, __version__, __path__ from uldlib.const import DEFAULT_CONN_TIMEOUT +from uldlib.frontend import ConsoleFrontend def run(): @@ -25,15 +26,17 @@ def run(): args = parser.parse_args() + # TODO: implement other frontends and allow to choose from them + frontend = ConsoleFrontend() + if args.auto_captcha: model_path = path.join(__path__[0], "model.tflite") model_download_url = "https://github.com/JanPalasek/ulozto-captcha-breaker/releases/download/v2.2/model.tflite" - captcha_solve_fnc = captcha.AutoReadCaptcha( - model_path, model_download_url) + solver = captcha.AutoReadCaptcha(model_path, model_download_url, frontend) else: - captcha_solve_fnc = captcha.tkinter_user_prompt + solver = captcha.ManualInput(frontend) - d = downloader.Downloader(captcha_solve_fnc) + d = downloader.Downloader(frontend, solver) # Register sigint handler def sigint_handler(sig, frame): diff --git 
a/uldlib/downloader.py b/uldlib/downloader.py index 1c8c5de..5c8013c 100644 --- a/uldlib/downloader.py +++ b/uldlib/downloader.py @@ -1,72 +1,66 @@ -from .const import CLI_STATUS_STARTLINE, DOWNPOSTFIX, DOWN_CHUNK_SIZE, DEFAULT_CONN_TIMEOUT -from . import utils -from .torrunner import TorRunner -from .segfile import SegFileLoader, SegFileMonitor -from .page import Page -from .part import DownloadPart -import colors -import requests import os from queue import Queue +import requests import sys import threading import time -from datetime import timedelta -from types import FunctionType -from typing import List +from typing import List, Type +from uldlib.captcha import CaptchaSolver +from uldlib.const import DOWNPOSTFIX, DOWN_CHUNK_SIZE, DEFAULT_CONN_TIMEOUT +from uldlib.frontend import DownloadInfo, Frontend +from uldlib.page import Page +from uldlib.part import DownloadPart +from uldlib.segfile import SegFileLoader +from uldlib.torrunner import TorRunner +from uldlib.utils import LogLevel class Downloader: - cli_initialized: bool terminating: bool + threads: List[threading.Thread] + stop_download: threading.Event + + frontend: Type[Frontend] + frontend_thread: threading.Thread = None + stop_frontend: threading.Event + + captcha_solver: Type[CaptchaSolver] captcha_thread: threading.Thread = None - monitor: threading.Thread = None - captcha_solve_func: FunctionType + stop_captcha: threading.Event + download_url_queue: Queue parts: int - stop_download: threading.Event - stop_captcha: threading.Event - stop_monitor: threading.Event - def __init__(self, captcha_solve_func): - self.captcha_solve_func = captcha_solve_func + def __init__(self, frontend: Type[Frontend], captcha_solver: Type[CaptchaSolver]): + self.frontend = frontend + self.log = frontend.main_log + self.captcha_solver = captcha_solver + self.cli_initialized = False - self.monitor = None self.conn_timeout = None self.stop_download = threading.Event() self.stop_captcha = threading.Event() - self.stop_monitor 
= threading.Event() + self.stop_frontend = threading.Event() def terminate(self): + if self.terminating: + return self.terminating = True - if self.cli_initialized: - sys.stdout.write("\033[{};{}H".format( - self.parts + CLI_STATUS_STARTLINE + 2, 0)) - sys.stdout.write("\033[?25h") # show cursor - self.cli_initialized = False - print('Terminating download. Please wait for stopping all threads.') + self.log('Terminating download. Please wait for stopping all threads.') self.stop_captcha.set() self.stop_download.set() - for p in self.threads: - p.join() - print('Download terminated.') - self.stop_monitor.set() if self.captcha_thread: self.captcha_thread.join() - if self.monitor: - self.monitor.join() - print('End download monitor') - - def _captcha_print_func_wrapper(self, text): - if not self.cli_initialized: - sys.stdout.write(colors.blue( - "[Link solve]\t") + text + "\033[K\r") - else: - utils.print_captcha_status(text, self.parts) + for p in self.threads: + p.join() + self.log('Download terminated.') + self.stop_frontend.set() + if self.frontend_thread: + self.frontend_thread.join() def _captcha_breaker(self, page, parts): msg = "" @@ -75,51 +69,12 @@ def _captcha_breaker(self, page, parts): else: msg = "Solve CAPTCHA dlink .." 
- # utils.print_captcha_status(msg, parts) for url in self.captcha_download_links_generator: if self.stop_captcha.is_set(): break - utils.print_captcha_status(msg, parts) + self.captcha_solver.log(msg) self.download_url_queue.put(url) - def _save_progress(self, filename, parts, size, interval_sec): - - m = SegFileMonitor(filename) - - t_start = time.time() - s_start = m.size() - last_bps = [(s_start, t_start)] - - while True: - time.sleep(interval_sec) - - if self.stop_monitor.is_set(): - m.clean() - break - - s = m.size() - t = time.time() - - total_bps = (s - s_start) / (t - t_start) - - # Average now bps for last 10 measurements - if len(last_bps) >= 10: - last_bps = last_bps[1:] - (s_last, t_last) = last_bps[0] - now_bps = (s - s_last) / (t - t_last) - last_bps.append((s, t)) - - remaining = (size - s) / total_bps if total_bps > 0 else 0 - - utils.print_saved_status( - f"{(s / 1024 ** 2):.2f} MB" - f" ({(s / size * 100):.2f} %)" - f"\tavg. speed: {(total_bps / 1024 ** 2):.2f} MB/s" - f"\tcurr. speed: {(now_bps / 1024 ** 2):.2f} MB/s" - f"\tremaining: {timedelta(seconds=round(remaining))}", - parts - ) - def _download_part(self, part: DownloadPart): try: self._download_part_internal(part) @@ -135,7 +90,6 @@ def _download_part_internal(self, part: DownloadPart): """ writer = part.writer - id = part.id part.lock.acquire() part.started = True @@ -210,13 +164,10 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO self.isLimited = False self.isCaptcha = False - started = time.time() - previously_downloaded = 0 - # 1. 
Prepare downloads - print("Starting downloading for url '{}'".format(url)) + self.log("Starting downloading for url '{}'".format(url)) # 1.1 Get all needed information - print("Getting info (filename, filesize, ...)") + self.log("Getting info (filename, filesize, …)") try: tor = TorRunner() @@ -224,38 +175,41 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO page.parse() except RuntimeError as e: - print(colors.red('Cannot download file: ' + str(e))) + self.log('Cannot download file: ' + str(e), error=True) sys.exit(1) # Do check - only if .udown status file not exists get question output_filename = os.path.join(target_dir, page.filename) if os.path.isfile(output_filename) and not os.path.isfile(output_filename+DOWNPOSTFIX): - print(colors.yellow( - "WARNING: File '{}' already exists, overwrite it? [y/n] ".format(output_filename)), end="") - if input().strip() != 'y': + answer = self.frontend.prompt( + "WARNING: File '{}' already exists, overwrite it? [y/n] ".format(output_filename), + level=LogLevel.WARNING + ) + if answer != 'y': sys.exit(1) + info = DownloadInfo() + info.filename = page.filename + info.url = page.url + if page.quickDownloadURL is not None: - print("You are VERY lucky, this is QUICK direct download without CAPTCHA, downloading as 1 quick part :)") - self.download_type = "fullspeed direct download (without CAPTCHA)" + self.log("You are VERY lucky, this is QUICK direct download without CAPTCHA, downloading as 1 quick part :)") + info.download_type = "fullspeed direct download (without CAPTCHA)" download_url = page.quickDownloadURL self.captcha_solve_func = None if page.slowDownloadURL is not None: self.isLimited = True if page.isDirectDownload: - print("You are lucky, this is slow direct download without CAPTCHA :)") - self.download_type = "slow direct download (without CAPTCHA)" + self.log("You are lucky, this is slow direct download without CAPTCHA :)") + info.download_type = "slow direct download (without 
CAPTCHA)" else: self.isCaptcha = True - print( - "CAPTCHA protected download - CAPTCHA challenges will be displayed\n") - self.download_type = "CAPTCHA protected download" + self.log("CAPTCHA protected download - CAPTCHA challenges will be displayed") + info.download_type = "CAPTCHA protected download" self.captcha_download_links_generator = page.captcha_download_links_generator( - captcha_solve_func=self.captcha_solve_func, - print_func=self._captcha_print_func_wrapper, - stop_event=self.stop_captcha, + solver=self.captcha_solver, stop_event=self.stop_captcha, ) download_url = next(self.captcha_download_links_generator) @@ -266,26 +220,18 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO file_data = SegFileLoader(output_filename, total_size, parts) writers = file_data.make_writers() except Exception as e: - print(colors.red( - f"Failed: Can not create '{output_filename}' error: {e} ")) - self.terminate() - sys.exit() - - # 2. Initialize cli status table interface - # if windows, use 'cls', otherwise use 'clear' - os.system('cls' if os.name == 'nt' else 'clear') - sys.stdout.write("\033[?25l") # hide cursor - self.cli_initialized = True - page.cli_initialized = True # for tor in Page - print(colors.blue("File:\t\t") + colors.bold(page.filename)) - print(colors.blue("URL:\t\t") + page.url) - print(colors.blue("Download type:\t") + self.download_type) - print(colors.blue("Size / parts: \t") + - colors.bold(f"{round(total_size / 1024**2, 2)}MB => " + - f"{file_data.parts} x {round(file_data.part_size / 1024**2, 2)}MB")) + self.log(f"Failed: Can not create '{output_filename}' error: {e} ", level=LogLevel.ERROR) + sys.exit(1) + + info.total_size = total_size + info.part_size = file_data.part_size + info.parts = file_data.parts downloads: List[DownloadPart] = [DownloadPart(w) for w in writers] + # 2. 
All info gathered, initialize frontend + + self.log("Download in progress") # fill placeholder before download started for part in downloads: if page.isDirectDownload: @@ -293,6 +239,12 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO else: part.set_status("Waiting for CAPTCHA…") + self.frontend_thread = threading.Thread( + target=self.frontend.run, + args=(info, downloads, self.stop_frontend) + ) + self.frontend_thread.start() + # Prepare queue for recycling download URLs self.download_url_queue = Queue(maxsize=0) @@ -309,16 +261,10 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO cpb_started = False page.alreadyDownloaded = 0 - # save status monitor - self.monitor = threading.Thread(target=self._save_progress, args=( - file_data.filename, file_data.parts, file_data.size, 1/3)) - self.monitor.start() - # 3. Start all downloads fill self.threads for part in downloads: if self.terminating: return - id = part.id if part.writer.written == part.writer.size: part.completed = True @@ -343,11 +289,9 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO # no need for another CAPTCHAs self.stop_captcha.set() if self.isCaptcha: - utils.print_captcha_status( - "All downloads started, no need to solve another CAPTCHAs..", self.parts) + self.captcha_solver.log("All downloads started, no need to solve another CAPTCHAs…") else: - utils.print_captcha_status( - "All downloads started, no need to solve another direct links..", self.parts) + self.captcha_solver.log("All downloads started, no need to solve another direct links…") # 4. 
Wait for all downloads to finish success = True @@ -357,27 +301,16 @@ def download(self, url, parts=10, target_dir="", conn_timeout=DEFAULT_CONN_TIMEO if part.error: success = False - # clear cli - sys.stdout.write("\033[{};{}H".format( - parts + CLI_STATUS_STARTLINE + 2, 0)) - sys.stdout.write("\033[K") - sys.stdout.write("\033[?25h") # show cursor - self.cli_initialized = False + self.stop_captcha.set() + self.stop_frontend.set() + if self.captcha_thread: + self.captcha_thread.join() + if self.frontend_thread: + self.frontend_thread.join() # result end status if not success: - print(colors.red("Failure of one or more downloads, exiting")) + self.log("Failure of one or more downloads, exiting", level=LogLevel.ERROR) sys.exit(1) - elapsed = time.time() - started - # speed in bytes per second: - speed = (total_size - previously_downloaded) / elapsed if elapsed > 0 else 0 - print(colors.green("All downloads finished")) - print("Stats: Downloaded {}{} MB in {} (average speed {} MB/s)".format( - round((total_size - previously_downloaded) / 1024**2, 2), - "" if previously_downloaded == 0 else ( - "/"+str(round(total_size / 1024**2, 2)) - ), - str(timedelta(seconds=round(elapsed))), - round(speed / 1024**2, 2) - )) + self.log("All downloads successfully finished", level=LogLevel.SUCCESS) diff --git a/uldlib/frontend.py b/uldlib/frontend.py new file mode 100644 index 0000000..d9b1610 --- /dev/null +++ b/uldlib/frontend.py @@ -0,0 +1,216 @@ + +from abc import abstractmethod +from datetime import timedelta +import colors +import os +import sys +import time +import threading +from typing import Dict, List, Tuple + +from uldlib.const import CLI_STATUS_STARTLINE +from uldlib.part import DownloadPart +from uldlib.utils import LogLevel + + +class DownloadInfo: + filename: str + url: str + download_type: str + total_size: int + part_size: int + parts: int + + +class Frontend(): + + @abstractmethod + def captcha_log(self, msg: str, level: LogLevel = LogLevel.INFO): + pass + + 
@abstractmethod + def captcha_stats(self, stats: Dict[str, int]): + pass + + @abstractmethod + def main_log(self, msg: str, level: LogLevel = LogLevel.INFO): + pass + + @abstractmethod + def prompt(self, msg: str, level: LogLevel = LogLevel.INFO) -> str: + pass + + @abstractmethod + def run(self, parts: List[DownloadPart], stop_event: threading.Event): + pass + + +class ConsoleFrontend(Frontend): + cli_initialized: bool + + last_log: Tuple[str, LogLevel] + + last_captcha_log: Tuple[str, LogLevel] + last_captcha_stats: Dict[str, int] + + def __init__(self): + self.cli_initialized = False + self.last_log = ("", LogLevel.INFO) + self.last_captcha_log = ("", LogLevel.INFO) + self.last_captcha_stats = None + + def captcha_log(self, msg: str, level: LogLevel = LogLevel.INFO): + self.last_captcha_log = (msg, level) + if not self.cli_initialized: + sys.stdout.write(colors.blue( + "[Link solve]\t") + self._color(msg, level) + "\033[K\r") + + def captcha_stats(self, stats: Dict[str, int]): + self.last_captcha_stats = stats + + def main_log(self, msg: str, level: LogLevel = LogLevel.INFO): + self.last_log = (msg, level) + + if self.cli_initialized: + return + print(self._color(msg, level)) + + def prompt(self, msg: str, level: LogLevel = LogLevel.INFO) -> str: + print(self._color(msg, level), end="") + return input().strip() + + @staticmethod + def _stat_fmt(stats: Dict[str, int]): + count = colors.blue(stats['all']) + ok = colors.green(stats['ok']) + bad = colors.red(stats['bad']) + lim = colors.red(stats['lim']) + blo = colors.red(stats['block']) + net = colors.red(stats['net']) + return f"[Ok: {ok} / {count}] :( [Badcp: {bad} Limited: {lim} Censored: {blo} NetErr: {net}]" + + @staticmethod + def _print(text, x=0, y=0): + sys.stdout.write("\033[{};{}H".format(y, x)) + sys.stdout.write("\033[K") + sys.stdout.write(text) + sys.stdout.flush() + + @staticmethod + def _color(text: str, level: LogLevel) -> str: + if level == LogLevel.WARNING: + return colors.yellow(text) + if 
level == LogLevel.ERROR: + return colors.red(text) + if level == LogLevel.SUCCESS: + return colors.green(text) + return text + + def run(self, info: DownloadInfo, parts: List[DownloadPart], stop_event: threading.Event): + os.system('cls' if os.name == 'nt' else 'clear') + sys.stdout.write("\033[?25l") # hide cursor + self.cli_initialized = True + + print(colors.blue("File:\t\t") + colors.bold(info.filename)) + print(colors.blue("URL:\t\t") + info.url) + print(colors.blue("Download type:\t") + info.download_type) + print(colors.blue("Size / parts: \t") + + colors.bold(f"{round(info.total_size / 1024**2, 2)}MB => " + + f"{info.parts} x {round(info.part_size / 1024**2, 2)}MB")) + + t_start = time.time() + s_start = 0 + for part in parts: + (_, _, size) = part.get_frontend_status() + s_start += size + last_bps = [(s_start, t_start)] + + while True: + if stop_event.is_set(): + break + + t = time.time() + # Get parts info + lines = [] + s = 0 + for part in parts: + (line, level, size) = part.get_frontend_status() + lines.append(self._color(line, level)) + s += size + + # Print parts + for (line, part) in zip(lines, parts): + self._print( + colors.blue(f"[Part {part.id}]") + f"\t{line}", + y=(part.id + CLI_STATUS_STARTLINE)) + + y = info.parts + CLI_STATUS_STARTLINE + + # Print CAPTCHA/TOR status + (msg, level) = self.last_captcha_log + self._print( + colors.yellow("[Link solve]\t") + + self._color(msg, level), + y=y + ) + y += 1 + if self.last_captcha_stats is not None: + self._print( + colors.yellow("\t\t") + self._stat_fmt(self.last_captcha_stats), + y=y + ) + y += 1 + + # Print overall progress line + if t == t_start: + total_bps = 0 + now_bps = 0 + else: + total_bps = (s - s_start) / (t - t_start) + # Average now bps for last 10 measurements + if len(last_bps) >= 10: + last_bps = last_bps[1:] + (s_last, t_last) = last_bps[0] + now_bps = (s - s_last) / (t - t_last) + last_bps.append((s, t)) + + remaining = (info.total_size - s) / total_bps if total_bps > 0 else 0 + + 
self._print(colors.yellow( + f"[Progress]\t" + f"{(s / 1024 ** 2):.2f} MB" + f" ({(s / info.total_size * 100):.2f} %)" + f"\tavg. speed: {(total_bps / 1024 ** 2):.2f} MB/s" + f"\tcurr. speed: {(now_bps / 1024 ** 2):.2f} MB/s" + f"\tremaining: {timedelta(seconds=round(remaining))}"), + y=y + ) + y += 1 + + # Print last log message + (msg, level) = self.last_log + self._print( + colors.yellow("[STATUS]\t") + + self._color(msg, level), + y=y + ) + y += 1 + + time.sleep(0.5) + + if self.cli_initialized: + sys.stdout.write("\033[{};{}H".format(y + 2, 0)) + sys.stdout.write("\033[?25h") # show cursor + self.cli_initialized = False + + elapsed = time.time() - t_start + # speed in bytes per second: + speed = (s - s_start) / elapsed if elapsed > 0 else 0 + print(colors.blue("Statistics:\t") + "Downloaded {}{} MB in {} (average speed {} MB/s)".format( + round((s - s_start) / 1024**2, 2), + "" if s_start == 0 else ( + "/"+str(round(info.total_size / 1024**2, 2)) + ), + str(timedelta(seconds=round(elapsed))), + round(speed / 1024**2, 2) + )) diff --git a/uldlib/page.py b/uldlib/page.py index f6712e9..67e6b75 100644 --- a/uldlib/page.py +++ b/uldlib/page.py @@ -1,11 +1,15 @@ import re import shutil import threading +from typing import Type from urllib.parse import urlparse, urljoin from os import path import sys import requests -import colors + +from uldlib.captcha import CaptchaSolver +from uldlib.frontend import LogLevel +from uldlib.torrunner import TorRunner from .const import XML_HEADERS, DEFAULT_CONN_TIMEOUT from .linkcache import LinkCache @@ -41,7 +45,7 @@ class Page: numTorLinks: int alreadyDownloaded: int - def __init__(self, url, target_dir, parts, tor, conn_timeout=DEFAULT_CONN_TIMEOUT): + def __init__(self, url, target_dir, parts, tor: TorRunner, conn_timeout=DEFAULT_CONN_TIMEOUT): """Check given url and if it looks ok GET the Uloz.to page and save it. 
Arguments: @@ -142,22 +146,13 @@ def parse(self): raise RuntimeError(f"Cannot parse {self.pagename} page to get download information," + " no direct download URL and no CAPTCHA challenge URL found") - def _stat_fmt(self): - count = colors.blue(self.stats['all']) - ok = colors.green(self.stats['ok']) - bad = colors.red(self.stats['bad']) - lim = colors.red(self.stats['lim']) - blo = colors.red(self.stats['block']) - net = colors.red(self.stats['net']) - return f":) [Ok: {ok} / {count}] :( [Badcp: {bad} Limited: {lim} Censored: {blo} NetErr: {net}]" - # print TOR network error and += stats - def _error_net_stat(self, err, print_func): + def _error_net_stat(self, err, log_func): self.stats["all"] += 1 - print_func(colors.red(f"Network error get new TOR connection: {err}")) + log_func(f"Network error get new TOR connection: {err}", level=LogLevel.ERROR) self.stats["net"] += 1 - def _link_validation_stat(self, resp, print_func): + def _link_validation_stat(self, resp, log_func): linkdata = resp.text self.stats["all"] += 1 ok = False @@ -180,29 +175,25 @@ def _link_validation_stat(self, resp, print_func): elif lim_str in linkdata: self.stats["lim"] += 1 if not self.isDirectDownload: - print_func(colors.red(lim_msg)) + log_func(lim_msg, level=LogLevel.ERROR) elif blk_str in linkdata: self.stats["block"] += 1 if not self.isDirectDownload: - print_func(colors.red(blk_msg)) + log_func(blk_msg, level=LogLevel.ERROR) elif bcp_str in linkdata: self.stats["bad"] += 1 - print_func(colors.red(bcp_msg)) + log_func(bcp_msg, level=LogLevel.ERROR) reload = False # bad captcha same IP again return (ok, reload) - def _captcha_send_print_stat(self, answ, print_func): - print_func(f"Send CAP: '{answ}' {self._stat_fmt()} timeout: {colors.blue(self.conn_timeout)}") - - def captcha_download_links_generator(self, captcha_solve_func, print_func=print, stop_event: threading.Event = None): + def captcha_download_links_generator(self, solver: Type[CaptchaSolver], stop_event: threading.Event = 
None): """ Generator for CAPTCHA download links using Tor sessions. Get download link by solving CAPTCHA, calls CAPTCHA related functions.. Arguments: - captcha_solve_func (func): Function which gets CAPTCHA challenge URL and returns CAPTCHA answer - print_func (func): Function used for printing log (default is bultin 'print') + solver (CaptchaSolver): Class with solve method which gets CAPTCHA challenge URL and returns CAPTCHA answer stop_event: Threading event to check when to stop Returns: @@ -230,8 +221,7 @@ def captcha_download_links_generator(self, captcha_solve_func, print_func=print, print("Starting TOR...") # tor started after cli initialized try: - self.tor.start( - cli_initialized=self.cli_initialized, parts=self.parts) + self.tor.start(log_func=solver.log) self.torRunning = True proxies = { 'http': 'socks5://127.0.0.1:' + str(self.tor.tor_ports[0]), @@ -240,7 +230,7 @@ def captcha_download_links_generator(self, captcha_solve_func, print_func=print, except OSError as e: self._error_net_stat( - f"Tor start failed: {e}, exiting.. try run program again..", print_func) + f"Tor start failed: {e}, exiting.. 
try run program again..", solver.log) # remove tor data if path.exists(self.tor.ddir): shutil.rmtree(self.tor.ddir, ignore_errors=True) @@ -262,23 +252,21 @@ def captcha_download_links_generator(self, captcha_solve_func, print_func=print, resp = requests.Response() if self.isDirectDownload: - print_func( - f"TOR get downlink {self._stat_fmt()} timeout: {colors.blue(self.conn_timeout)}") + solver.log(f"TOR get downlink (timeout {self.conn_timeout})") resp = s.get(self.captchaURL, headers=XML_HEADERS, proxies=proxies, timeout=self.conn_timeout) else: - print_func( - f"TOR post captcha {self._stat_fmt()} timeout: {colors.blue(self.conn_timeout)}") + solver.log(f"TOR get new CAPTCHA (timeout {self.conn_timeout})") r = s.get(self.captchaURL, headers=XML_HEADERS) # captcha_image_url = parse_single( r.text, r'') if captcha_image_url is None: - print_func( - "ERROR: Cannot parse CAPTCHA image URL from the page. Changing Tor circuit.") + solver.log("ERROR: Cannot parse CAPTCHA image URL from the page. 
Changing Tor circuit.", level=LogLevel.ERROR) self.stats["all"] += 1 self.stats["net"] += 1 + solver.stats(self.stats) reload = True continue @@ -289,20 +277,19 @@ def captcha_download_links_generator(self, captcha_solve_func, print_func=print, # https://github.com/setnicka/ulozto-downloader/issues/82 captcha_image_url = urljoin("https:", captcha_image_url) - print_func("Image URL obtained, trying to solve") - captcha_answer = captcha_solve_func( - captcha_image_url, print_func=print_func, - stop_event=stop_event) + solver.log("Image URL obtained, trying to solve") + captcha_answer = solver.solve(captcha_image_url, stop_event) captcha_data["captcha_value"] = captcha_answer - self._captcha_send_print_stat( - captcha_answer, print_func) + solver.log(f"CAPTCHA answer '{captcha_answer}' (timeout {self.conn_timeout})") + resp = s.post(self.captchaURL, data=captcha_data, headers=XML_HEADERS, proxies=proxies, timeout=self.conn_timeout) # generate result or break - result = self._link_validation_stat(resp, print_func) + result = self._link_validation_stat(resp, solver.log) + solver.stats(self.stats) # for noreload (bad captcha no need reload TOR) reload = result[1] if result[0]: @@ -314,10 +301,12 @@ def captcha_download_links_generator(self, captcha_solve_func, print_func=print, except requests.exceptions.ConnectionError: self._error_net_stat( - "Connection error, try new TOR session.", print_func) + "Connection error, try new TOR session.", solver.log) except requests.exceptions.ChunkedEncodingError: self._error_net_stat( - "Error while communicating over Tor, try new TOR session", print_func) + "Error while communicating over Tor, try new TOR session", solver.log) except requests.exceptions.ReadTimeout: self._error_net_stat( - "ReadTimeout error, try new TOR session.", print_func) + "ReadTimeout error, try new TOR session.", solver.log) + + solver.stats(self.stats) diff --git a/uldlib/part.py b/uldlib/part.py index 2a8ed49..69a9162 100644 --- a/uldlib/part.py +++ 
b/uldlib/part.py @@ -1,5 +1,10 @@ import threading +import time +from datetime import timedelta +from typing import Tuple + +from uldlib.utils import LogLevel from uldlib.segfile import SegFileWriter @@ -44,3 +49,52 @@ def set_status(self, status: str, error: bool = False, warning: bool = False): self.error = error self.warning = warning self.lock.release() + + def get_frontend_status(self) -> Tuple[str, LogLevel, int]: + """ + Returns status line for given part + """ + + level = LogLevel.INFO + self.lock.acquire() + downloaded = self.d_total + + if self.error: + msg = self.status if self.status else "ERROR: Unknown error" + level = LogLevel.ERROR + elif self.warning: + msg = self.status if self.status else "WARNING: Unknown warning" + level = LogLevel.WARNING + elif self.status and self.completed: + msg = self.status + level = LogLevel.SUCCESS + elif self.status: + msg = self.status + elif self.completed: + elapsed = self.completion_time - self.start_time + speed = self.d_now / elapsed if elapsed > 0 else 0 + msg = "Successfully downloaded {}{} MB in {} (speed {} KB/s)".format( + round(self.d_now / 1024**2, 2), + "" if self.d_now == self.d_total else ( + "/"+str(round(self.d_total / 1024**2, 2)) + ), + str(timedelta(seconds=round(elapsed))), + round(speed / 1024, 2) + ) + level = LogLevel.SUCCESS + else: + elapsed = time.time() - self.start_time + speed = self.d_now / elapsed if elapsed > 0 else 0 + # remaining time in seconds: + remaining = (self.size - self.d_total) / speed if speed > 0 else 0 + + msg = "{:.2f}%\t{:.2f}/{:.2f} MB\tspeed: {:.2f} KB/s\telapsed: {}\tremaining: {}".format( + round(self.d_total / self.size * 100, 2), + round(self.d_total / 1024**2, 2), round(self.size / 1024**2, 2), + round(speed / 1024, 2), + str(timedelta(seconds=round(elapsed))), + str(timedelta(seconds=round(remaining))), + ) + + self.lock.release() + return (msg, level, downloaded) diff --git a/uldlib/torrunner.py b/uldlib/torrunner.py index e529e6f..04c9d27 100644 --- 
a/uldlib/torrunner.py +++ b/uldlib/torrunner.py @@ -1,7 +1,6 @@ import socket import stem.process from stem.control import Controller -from .utils import print_tor_status import os import uuid import shutil @@ -32,7 +31,7 @@ def _two_free_ports(self, at): at += 1 return (ports[0], ports[1]) - def start(self, cli_initialized=False, parts=0): + def start(self, log_func): os.mkdir(self.ddir) self.tor_ports = self._two_free_ports(41000) config = "SocksPort " + str(self.tor_ports[0]) + "\n" @@ -44,25 +43,14 @@ def start(self, cli_initialized=False, parts=0): c.write(config) c.close() - def print_cli_wrapper(line): - return print_tor_status(line, parts) - - def print_no_cli(line): - return print(line, end="\r") - - if cli_initialized: - print_func = print_cli_wrapper - else: - print_func = print_no_cli - def get_tor_ready(line): p = re.compile(r'Bootstrapped \d+%') msg = re.findall(p, line) if len(msg) > 0: - print_func(f"Tor: {msg[0]}") # log + log_func(f"Tor: {msg[0]}") # log if "Bootstrapped 100%" in line: - print_func("TOR is ready, download links started") + log_func("TOR is ready, download links started") self.process = stem.process.launch_tor( torrc_path=os.path.join(self.ddir, "torrc"), diff --git a/uldlib/utils.py b/uldlib/utils.py index cdabdd9..9dd594d 100644 --- a/uldlib/utils.py +++ b/uldlib/utils.py @@ -1,29 +1,8 @@ -import sys -import colors -from . 
import const +from enum import Enum -def _print(text, x=0, y=0): - sys.stdout.write("\033[{};{}H".format(y, x)) - sys.stdout.write("\033[K") - sys.stdout.write(text) - sys.stdout.flush() - -def print_part_status(id, text): - _print(colors.blue(f"[Part {id}]") + f"\t{text}", - y=(id + const.CLI_STATUS_STARTLINE)) - - -def print_captcha_status(text, parts): - _print(colors.yellow("[Link solve]") + - f"\t{text}", y=(parts + 0 + const.CLI_STATUS_STARTLINE)) - - -def print_tor_status(text, parts): - _print(colors.yellow("[Tor start]") + - f"\t{text}", y=(parts + 0 + const.CLI_STATUS_STARTLINE)) - - -def print_saved_status(text, parts): - _print(colors.yellow(f"[Progress]\t {text}"), - y=(parts + 1 + const.CLI_STATUS_STARTLINE)) +class LogLevel(Enum): + INFO = 1 + WARNING = 2 + ERROR = 3 + SUCCESS = 4