From 8df5c314a34190163b8e4710dfd04e04c27c4cf3 Mon Sep 17 00:00:00 2001
From: Sharon Brizinov
Date: Sat, 16 Jan 2021 18:15:04 +0200
Subject: [PATCH] Added HTTP index storage provider

---
 packaging/requirements.txt          |   4 +-
 src/providers/__init__.py           |   0
 .../aws_provider.py}                |  38 +-
 src/providers/base_provider.py      |  30 ++
 src/providers/httpindex_provider.py | 357 ++++++++++++++++++
 src/providers/providers.py          |  16 +
 src/s3viewer.py                     |  14 +-
 7 files changed, 414 insertions(+), 45 deletions(-)
 create mode 100644 src/providers/__init__.py
 rename src/{providers.py => providers/aws_provider.py} (80%)
 create mode 100644 src/providers/base_provider.py
 create mode 100644 src/providers/httpindex_provider.py
 create mode 100644 src/providers/providers.py

diff --git a/packaging/requirements.txt b/packaging/requirements.txt
index 20631f5..1379497 100644
--- a/packaging/requirements.txt
+++ b/packaging/requirements.txt
@@ -1,2 +1,4 @@
 PyQt5
-PyInstaller
\ No newline at end of file
+PyInstaller
+beautifulsoup4
+html5lib
\ No newline at end of file
diff --git a/src/providers/__init__.py b/src/providers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/providers.py b/src/providers/aws_provider.py
similarity index 80%
rename from src/providers.py
rename to src/providers/aws_provider.py
index 2006ed1..0a62485 100644
--- a/src/providers.py
+++ b/src/providers/aws_provider.py
@@ -5,41 +5,7 @@
 from urllib.parse import urlparse
 
 from utils import show_message_box
-
-
-def find_provider_class_by_url(url):
-    if S3StorageProvider.is_provider(url):
-        return S3StorageProvider
-    return None
-
-class StorageProvider():
-    NODE_BATCH_UPDATE_COUNT = 1
-
-    def __init__(self, url):
-        self.url = url
-        self.should_stop = False
-
-    @staticmethod
-    def is_provider(url):
-        return False
-
-    def check(self):
-        return True
-
-    def get_download_url(self, relative_path):
-        return relative_path
-
-    def hostname(self):
-        return self.url
-
-    def yield_dirlist(self):
-        pass
-
-    def get_default_error_message(self):
-        pass
-
-    def stop(self):
-        self.should_stop = True
+from providers.base_provider import StorageProvider
 
 class S3StorageProvider(StorageProvider):
     NODE_BATCH_UPDATE_COUNT = 1000
@@ -52,7 +18,7 @@ def is_provider(url):
         if scheme and "http" in scheme:
            return ".amazonaws.com" in url
         # If we don't have HTTP we assume it's just a AWS S3 bucket name
-        return url
+        return True
 
     # We accept a couple of formats. For example:
     # - BUCKET_NAME
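[editor's note -- illustration, not part of the patch] The "- return url" / "+ return True" change above makes S3StorageProvider.is_provider() return a real boolean instead of leaking the (always truthy) URL string, which matters now that several providers are probed in turn. A minimal sketch of the intended contract, assuming the unshown top of is_provider() stays as in the repository; the host and bucket names are made up:

    from providers.aws_provider import S3StorageProvider

    assert S3StorageProvider.is_provider("http://my-bucket.s3.eu-west-1.amazonaws.com/") is True
    assert S3StorageProvider.is_provider("my-bucket") is True   # bare bucket name, now an explicit bool
    assert S3StorageProvider.is_provider("http://files.example.com/pub/") is False  # handed to the HTTP provider instead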
diff --git a/src/providers/base_provider.py b/src/providers/base_provider.py
new file mode 100644
index 0000000..1c0cc4f
--- /dev/null
+++ b/src/providers/base_provider.py
@@ -0,0 +1,30 @@
+
+class StorageProvider():
+    NODE_BATCH_UPDATE_COUNT = 1
+
+    def __init__(self, url):
+        self.url = url
+        self.should_stop = False
+
+    @staticmethod
+    def is_provider(url):
+        return False
+
+    def check(self):
+        return True
+
+    def get_download_url(self, relative_path):
+        return relative_path
+
+    def hostname(self):
+        return self.url
+
+    def yield_dirlist(self):
+        pass
+
+    def get_default_error_message(self):
+        pass
+
+    def stop(self):
+        self.should_stop = True
+
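[editor's note -- illustration, not part of the patch] base_provider.py above is the contract every storage provider implements; the HTTP index provider added below overrides the same hooks. A hypothetical minimal subclass, only to show the shape of the interface (DummyStorageProvider and its single hard-coded entry are invented; the line format mirrors what yield_fetch_dir() produces further down):

    from providers.base_provider import StorageProvider

    class DummyStorageProvider(StorageProvider):
        NODE_BATCH_UPDATE_COUNT = 10

        @staticmethod
        def is_provider(url):
            return url.startswith("dummy://")

        def yield_dirlist(self):
            # one dirlist line per file: "<modified> <size> <relative path>"
            yield "2021-01-16 18:15:04           42 docs/readme.txt\n"

        def get_default_error_message(self):
            return "Could not list {}".format(self.hostname())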
diff --git a/src/providers/httpindex_provider.py b/src/providers/httpindex_provider.py
new file mode 100644
index 0000000..11c86ba
--- /dev/null
+++ b/src/providers/httpindex_provider.py
@@ -0,0 +1,357 @@
+import sys
+import os
+import re
+import time
+import argparse
+import requests
+import collections
+import urllib.parse
+import bs4
+
+from utils import show_message_box
+from providers.base_provider import StorageProvider
+
+##################################################################################
+### Most of the code here is from https://github.com/gumblex/htmllisting-parser ##
+##################################################################################
+MAX_RECURSE_LEVEL = 50
+USER_AGENT = "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"
+HEADERS = {"User-Agent": USER_AGENT}
+
+RE_ISO8601 = re.compile(r'\d{4}-\d+-\d+T\d+:\d{2}:\d{2}Z')
+DATETIME_FMTs = (
+(re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}:\d{2}'), "%d-%b-%Y %H:%M:%S"),
+(re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}'), "%d-%b-%Y %H:%M"),
+(re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}:\d{2}'), "%Y-%m-%d %H:%M:%S"),
+(RE_ISO8601, "%Y-%m-%dT%H:%M:%SZ"),
+(re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}'), "%Y-%m-%d %H:%M"),
+(re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}:\d{2}'), "%Y-%b-%d %H:%M:%S"),
+(re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}'), "%Y-%b-%d %H:%M"),
+(re.compile(r'[F-W][a-u]{2} [A-S][a-y]{2} +\d+ \d{2}:\d{2}:\d{2} \d{4}'), "%a %b %d %H:%M:%S %Y"),
+(re.compile(r'[F-W][a-u]{2}, \d+ [A-S][a-y]{2} \d{4} \d{2}:\d{2}:\d{2} .+'), "%a, %d %b %Y %H:%M:%S %Z"),
+(re.compile(r'\d{4}-\d+-\d+'), "%Y-%m-%d"),
+(re.compile(r'\d+/\d+/\d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}'), "%d/%m/%Y %H:%M:%S %z"),
+(re.compile(r'\d{2} [A-S][a-y]{2} \d{4}'), "%d %b %Y")
+)
+
+RE_FILESIZE = re.compile(r'\d+(\.\d+)? ?[BKMGTPEZY]|\d+|-', re.I)
+RE_ABSPATH = re.compile(r'^((ht|f)tps?:/)?/')
+RE_COMMONHEAD = re.compile('Name|(Last )?modifi(ed|cation)|date|Size|Description|Metadata|Type|Parent Directory', re.I)
+RE_HASTEXT = re.compile('.+')
+RE_HEAD_NAME = re.compile('name$|^file|^download')
+RE_HEAD_MOD = re.compile('modifi|^uploaded|date|time')
+RE_HEAD_SIZE = re.compile('size|bytes$')
+
+FileEntry = collections.namedtuple('FileEntry', 'name modified size description')
+
+def human2bytes(s):
+    """
+    >>> human2bytes('1M')
+    1048576
+    >>> human2bytes('1G')
+    1073741824
+    """
+    if s is None:
+        return None
+    try:
+        return int(s)
+    except ValueError:
+        symbols = 'BKMGTPEZY'
+        letter = s[-1:].strip().upper()
+        num = float(s[:-1])
+        prefix = {symbols[0]: 1}
+        for i, s in enumerate(symbols[1:]):
+            prefix[s] = 1 << (i+1)*10
+        return int(num * prefix[letter])
+
+def aherf2filename(a_href):
+    isdir = ('/' if a_href[-1] == '/' else '')
+    return os.path.basename(urllib.parse.unquote(a_href.rstrip('/'))) + isdir
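+# --- Editor's illustrative note (not part of the original patch) ----------------
+# Worked examples for the two helpers above, values checked by hand:
+#   human2bytes('1024') -> 1024        (plain byte count)
+#   human2bytes('2.5K') -> 2560        (2.5 * 1024)
+#   human2bytes('1M')   -> 1048576
+#   aherf2filename('files/report%202021.pdf') -> 'report 2021.pdf'
+#   aherf2filename('files/logs/')              -> 'logs/'  (trailing slash marks a directory)
+# ---------------------------------------------------------------------------------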
+
+def parse(soup):
+    '''
+    Try to parse apache/nginx-style directory listing with all kinds of tricks.
+
+    Exceptions or an empty listing suggest a failure.
+    We strongly recommend generating the `soup` with 'html5lib'.
+
+    Returns: Current directory, Directory listing
+    '''
+    cwd = None
+    listing = []
+    if soup.title and soup.title.string and soup.title.string.startswith('Index of '):
+        cwd = soup.title.string[9:]
+    elif soup.h1:
+        title = soup.h1.get_text().strip()
+        if title.startswith('Index of '):
+            cwd = title[9:]
+    [img.decompose() for img in soup.find_all('img')]
+    file_name = file_mod = file_size = file_desc = None
+    pres = [x for x in soup.find_all('pre') if
+            x.find('a', string=RE_HASTEXT)]
+    tables = [x for x in soup.find_all('table') if
+              x.find(string=RE_COMMONHEAD)] if not pres else ()
+    heads = []
+    if pres:
+        pre = pres[0]
+        started = False
+        for element in (pre.hr.next_siblings if pre.hr else pre.children):
+            if element.name == 'a':
+                if not element.string or not element.string.strip():
+                    continue
+                elif started:
+                    if file_name:
+                        listing.append(FileEntry(
+                            file_name, file_mod, file_size, file_desc))
+                    file_name = aherf2filename(element['href'])
+                    file_mod = file_size = file_desc = None
+                elif (element.string in ('Parent Directory', '..', '../') or
+                      element['href'][0] not in '?/'):
+                    started = True
+            elif not element.name:
+                line = element.string.replace('\r', '').split('\n', 1)[0].lstrip()
+                for regex, fmt in DATETIME_FMTs:
+                    match = regex.match(line)
+                    if match:
+                        file_mod = time.strptime(match.group(0), fmt)
+                        line = line[match.end():].lstrip()
+                        break
+                match = RE_FILESIZE.match(line)
+                if match:
+                    sizestr = match.group(0)
+                    if sizestr == '-':
+                        file_size = None
+                    else:
+                        file_size = human2bytes(sizestr.replace(' ', '').replace(',', ''))
+                    line = line[match.end():].lstrip()
+                if line:
+                    file_desc = line.rstrip()
+                    if file_name and file_desc == '/':
+                        file_name += '/'
+                        file_desc = None
+            else:
+                continue
+        if file_name:
+            listing.append(FileEntry(file_name, file_mod, file_size, file_desc))
+    elif tables:
+        started = False
+        for tr in tables[0].find_all('tr'):
+            status = 0
+            file_name = file_mod = file_size = file_desc = None
+            if started:
+                if tr.parent.name in ('thead', 'tfoot') or tr.th:
+                    continue
+                for td in tr.find_all('td'):
+                    if status >= len(heads):
+                        raise AssertionError("can't detect table column number")
+                    if td.get('colspan'):
+                        continue
+                    elif heads[status] == 'name':
+                        if not td.a:
+                            continue
+                        a_str = td.a.get_text().strip()
+                        a_href = td.a['href']
+                        if not a_str or not a_href or a_href[0] == '#':
+                            continue
+                        elif a_str == 'Parent Directory' or a_href == '../':
+                            break
+                        else:
+                            file_name = aherf2filename(a_href)
+                            status = 1
+                    elif heads[status] == 'modified':
+                        if td.time:
+                            timestr = td.time.get('datetime', '')
+                            if RE_ISO8601.match(timestr):
+                                file_mod = time.strptime(timestr, "%Y-%m-%dT%H:%M:%SZ")
+                                status += 1
+                                continue
+                        timestr = td.get_text().strip()
+                        if timestr:
+                            for regex, fmt in DATETIME_FMTs:
+                                if regex.match(timestr):
+                                    file_mod = time.strptime(timestr, fmt)
+                                    break
+                            else:
+                                if td.get('data-sort-value'):
+                                    file_mod = time.gmtime(int(td['data-sort-value']))
+                                # else:
+                                #     raise AssertionError(
+                                #         "can't identify date/time format")
+                        status += 1
+                    elif heads[status] == 'size':
+                        sizestr = td.get_text().strip().replace(',', '')
+                        if sizestr == '-' or not sizestr:
+                            file_size = None
+                        elif td.get('data-sort-value'):
+                            file_size = int(td['data-sort-value'])
+                        else:
+                            match = RE_FILESIZE.match(sizestr)
+                            if match:
+                                file_size = human2bytes(
+                                    match.group(0).replace(' ', ''))
+                            else:
+                                file_size = None
+                        status += 1
+                    elif heads[status] == 'description':
+                        file_desc = file_desc or ''.join(map(str, td.children)
+                                                         ).strip(' \t\n\r\x0b\x0c\xa0') or None
+                        status += 1
+                    elif status:
+                        # unknown header
+                        status += 1
+                if file_name:
+                    listing.append(FileEntry(
+                        file_name, file_mod, file_size, file_desc))
+            elif tr.hr:
+                started = True
+                continue
+            elif tr.find(string=RE_COMMONHEAD):
+                namefound = False
+                colspan = False
+                for th in (tr.find_all('th') if tr.th else tr.find_all('td')):
+                    if th.get('colspan'):
+                        colspan = True
+                        continue
+                    name = th.get_text().strip(' \t\n\r\x0b\x0c\xa0↑↓').lower()
+                    if not name:
+                        continue
+                    elif not namefound and RE_HEAD_NAME.search(name):
+                        heads.append('name')
+                        namefound = True
+                    elif name in ('size', 'description'):
+                        heads.append(name)
+                    elif RE_HEAD_MOD.search(name):
+                        heads.append('modified')
+                    elif RE_HEAD_SIZE.search(name):
+                        heads.append('size')
+                    elif name.endswith('signature'):
+                        heads.append('signature')
+                    else:
+                        heads.append('description')
+                if colspan:
+                    continue
+                if not heads:
+                    heads = ('name', 'modified', 'size', 'description')
+                elif not namefound:
+                    heads[0] = 'name'
+                started = True
+                continue
+    elif soup.ul:
+        for li in soup.ul.find_all('li'):
+            a = li.a
+            if not a or not a.get('href'):
+                continue
+            file_name = urllib.parse.unquote(a['href'])
+            if (file_name in {'Parent Directory', '.', './', '..', '../', '#'}
+                    or RE_ABSPATH.match(file_name)):
+                continue
+            else:
+                listing.append(FileEntry(file_name, None, None, None))
+    return cwd, listing
+
+def fetch_listing(url, timeout=30):
+    req = requests.get(url, headers=HEADERS, timeout=timeout)
+    req.raise_for_status()
+    soup = bs4.BeautifulSoup(req.content, 'html5lib')
+    return parse(soup)
+
+def is_directory(entry):
+    return entry.description == "Directory" or (not entry.description and not entry.size)
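+# --- Editor's illustrative note (not part of the original patch) ----------------
+# is_directory() above is a heuristic: Apache/nginx fancy indexes usually show a
+# directory either with a "Directory" description or with no size and no
+# description at all, so for example:
+#   FileEntry('logs/', mod, None, None)       -> treated as a directory (recursed into)
+#   FileEntry('a.iso', mod, 734003200, None)  -> treated as a file
+# ---------------------------------------------------------------------------------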
+
+def print_fetch_dir(url, max_recurse_level=MAX_RECURSE_LEVEL, recurse_level=0):
+    if recurse_level == 0:
+        print(url)
+        print("-----------------------")
+    if recurse_level == max_recurse_level:
+        return
+    recurse_level += 1
+    cwd, listing = fetch_listing(url)
+    # Fix cwd to support inner starting point
+    # cwd shouldn't start with /, but it should end with one
+    if cwd:
+        cwd = cwd.strip("/") + "/"
+    else:
+        cwd = ""
+    for f in listing:
+        filename_print = cwd + f.name
+        if is_directory(f):
+            if not filename_print.endswith("/"):
+                filename_print = filename_print + "/"
+        date_format = time.strftime('%Y-%m-%d %H:%M:%S', f.modified)
+        size_format = f.size or "0"
+        print("{}{:>13} {}".format(date_format, size_format, filename_print))
+        if is_directory(f):
+            print_fetch_dir(url=url + f.name, max_recurse_level=max_recurse_level, recurse_level=recurse_level)
+
+# BFS and DFS mixture - output entire content of each directory met
+def yield_fetch_dir(url, max_recurse_level=MAX_RECURSE_LEVEL, recurse_level=0):
+    if recurse_level == max_recurse_level:
+        return
+    queue_process = []
+    recurse_level += 1
+    cwd, listing = fetch_listing(url)
+    # Fix cwd to support inner starting point
+    # cwd shouldn't start with /, but it should end with one
+    if cwd:
+        cwd = cwd.strip("/") + "/"
+    else:
+        cwd = ""
+    for f in listing:
+        filename_output = cwd + f.name
+        if is_directory(f):
+            if not filename_output.endswith("/"):
+                filename_output = filename_output + "/"
+        date_format = time.strftime('%Y-%m-%d %H:%M:%S', f.modified)
+        size_format = f.size or "0"
+        yield "{}{:>13} {}".format(date_format, size_format, filename_output) + os.linesep
+        queue_process.append(f)
+    for f in queue_process:
+        if is_directory(f):
+            yield from yield_fetch_dir(url=url + f.name, max_recurse_level=max_recurse_level, recurse_level=recurse_level)
+
+
+class HTTPIndexStorageProvider(StorageProvider):
+    NODE_BATCH_UPDATE_COUNT = 40
+
+    @staticmethod
+    def is_provider(url):
+        url = url.lower()
+        scheme = urllib.parse.urlparse(url).scheme
+        if scheme and "http" in scheme:
+            return True
+        return False
+
+    def check(self):
+        try:
+            cwd, listing = fetch_listing(self.url)
+            return len(listing) > 0
+        except Exception as e:
+            show_message_box(self.get_default_error_message())
+            return False
+
+    def get_download_url(self, relative_path):
+        uri_obj = urllib.parse.urlparse(self.url)
+        return '{uri.scheme}://{uri.netloc}/{relative_path}'.format(uri=uri_obj, relative_path=relative_path)
+
+    def yield_dirlist(self):
+        for dirlist_line in yield_fetch_dir(self.url):
+            # Stop
+            if self.should_stop:
+                break
+            yield dirlist_line
+
+    def get_default_error_message(self):
+        return "Could not parse an Apache/nginx-style directory listing at '{}'. Are you sure it's a valid HTTP dir index?".format(self.hostname())
+
+    def hostname(self):
+        return urllib.parse.urlparse(self.url).netloc
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='DirLister IndexOf/')
+    parser.add_argument('-u', '--url', dest='url', help='URL')
+    parser.add_argument('--max_level', dest='max_level', type=int, default=MAX_RECURSE_LEVEL, help='Max recurse level')
+
+    args = parser.parse_args()
+    print_fetch_dir(url=args.url, max_recurse_level=args.max_level)
\ No newline at end of file
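[editor's note -- illustration, not part of the patch] A rough sketch of driving the new provider on its own, outside the GUI; the mirror URL is a placeholder, and the commented line format follows yield_fetch_dir() above:

    from providers.httpindex_provider import HTTPIndexStorageProvider

    provider = HTTPIndexStorageProvider("http://mirror.example.com/pub/")
    for line in provider.yield_dirlist():          # "YYYY-mm-dd HH:MM:SS         SIZE path/to/entry"
        print(line, end="")
    print(provider.get_download_url("pub/somefile.iso"))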
diff --git a/src/providers/providers.py b/src/providers/providers.py
new file mode 100644
index 0000000..3ea629a
--- /dev/null
+++ b/src/providers/providers.py
@@ -0,0 +1,16 @@
+import shutil
+import subprocess
+import codecs
+from distutils.spawn import find_executable
+from urllib.parse import urlparse
+
+from utils import show_message_box
+from providers.aws_provider import S3StorageProvider
+from providers.httpindex_provider import HTTPIndexStorageProvider
+
+def find_provider_class_by_url(url):
+    if S3StorageProvider.is_provider(url):
+        return S3StorageProvider
+    elif HTTPIndexStorageProvider.is_provider(url):
+        return HTTPIndexStorageProvider
+    return None
diff --git a/src/s3viewer.py b/src/s3viewer.py
index 0cb5746..0911492 100644
--- a/src/s3viewer.py
+++ b/src/s3viewer.py
@@ -11,7 +11,7 @@
 from nodefs import *
 from dirlist import *
 from consts import *
-from providers import *
+from providers.providers import *
 
 
 class Mode():
@@ -195,7 +195,7 @@ def setupUi(self, MainWindow):
         # Label statistics
         self.labelStatistics = QtWidgets.QLabel(self.centralwidget)
         self.labelStatistics.setObjectName("labelStatistics")
-        self.labelStatistics.setText("Please load storage provider")
+        self.labelStatistics.setText("Please load a storage provider")
         self.verticalLayout.addWidget(self.labelStatistics)
         # Progress bar
         self.progressBar = QtWidgets.QProgressBar(self.centralwidget)
@@ -369,16 +369,12 @@ def update_progress_bar(self, blocknum, blocksize, totalsize):
         if totalsize > 0:
             download_percentage = readed_data * 100 / totalsize
             self.progressBar.setValue(download_percentage)
-        # Finished downloading
-        if self.progressBar.value() >= 100:
-            self.mode.finished_downloading()
-            self.update_ui()
         QApplication.processEvents()
 
     def prepare_dirs_for_download(self, node):
         path_download = node.full_path.lstrip("/") # remove the first / if any
         # Make dirs
-        path_save_to = os.path.join(self.working_dir, self.current_url, path_download)
+        path_save_to = os.path.join(self.working_dir, self.current_provider.hostname(), path_download)
         path_save_to_dir = os.path.dirname(path_save_to)
         try:
             os.makedirs(path_save_to_dir, exist_ok=True)
@@ -402,7 +398,6 @@ def download_node(self, node):
             # Download
             urllib.request.urlretrieve(url_download_encoded, path_save_to, self.update_progress_bar)
             # Update node
-            self.node_processing = node
             self.node_processing.is_downloaded = True
             self.node_processing.download_path = path_save_to
         except Exception as e:
@@ -413,12 +408,15 @@ def download_node_with_gui_update(self, node):
         if not self.selected_tree_node or not self.selected_tree_node.is_file:
             return
 
+        self.node_processing = node
         # Update UI
         self.mode.starting_downloading()
+        self.update_ui()
         # Download
         if self.download_node(node):
             self.selected_tree_item.setText(3, " V ")
             self.mode.finished_downloading()
+            self.update_ui()
 
 ################################################
 ################## Actions #####################
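[editor's note -- illustration, not part of the patch] Because find_provider_class_by_url() asks S3StorageProvider first, *.amazonaws.com URLs and bare bucket names keep resolving to the S3 provider, while any other http(s) URL now falls through to the new HTTP index provider. A small dispatch sketch (URLs are placeholders):

    from providers.providers import find_provider_class_by_url

    provider_cls = find_provider_class_by_url("http://files.example.com/pub/")
    # -> HTTPIndexStorageProvider
    provider_cls = find_provider_class_by_url("my-bucket")
    # -> S3StorageProvider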