Add Project files

tylerjrbuell · Nov 16, 2021 · 3eb8bfa · 3eb8bfa
1 parent 49c7209
commit 3eb8bfa
Show file tree

Hide file tree

Showing 7 changed files with 166 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+/aws-config
+/Extractions
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.8-slim-buster
+WORKDIR /aws-s3
+
+# Install the Python dependencies (boto3, tqdm) without keeping pip's download cache
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install the OS packages needed to fetch and unpack the AWS CLI installer,
+# then drop the apt package lists so they do not end up in the image layer
+RUN apt-get update \
+    && apt-get install -yq curl unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install AWS CLI v2 and remove the installer artifacts in the same layer
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip -q awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
+# Exec form runs python directly instead of wrapping it in /bin/sh
+ENTRYPOINT ["python", "main.py"]
diff --git a/README.md b/README.md
@@ -0,0 +1,32 @@
+# AWS S3 Bucket Extractor
+![s3-Extractor](demo_gif.gif)
+## Prerequisites
+
+- Docker
+- AWS Credentials (Access Key ID, Secret Access Key)
+
+## Existing AWS CLI credentials
+- If you have previously installed and configured the AWS CLI on your host, copy your configuration files (`config` & `credentials`) from `%UserProfile%/.aws` (Windows) or `$HOME/.aws` (Linux) into the `aws-config` folder
+
+## Quick Start
+1. Run `docker-compose run --rm s3-extract-cli`
+2. Enter the AWS CLI configuration details (follow the interactive prompts)
+    - AWS Access Key ID
+    - AWS Secret Access Key
+    - Default region name (ex: us-east-1)
+    - Default output format (ex: json)
+3. Enter the target S3 URI address (ex: s3://bucket_name/subfolder) `NOTE: This can be copied to your clipboard on the AWS S3 Web Console`
+
+## SubFolders
+- The Extractor preserves the last subfolder in the path and extracts recursively
+  so all files beneath the given bucket/subdirectory will be downloaded
+
+## Extracting All Files From a Bucket
+- To extract all files from a single s3 bucket use the following s3 URI syntax (ex: s3://bucket_name/*)
+- Nested Subdirectory structure is not currently preserved
+- *DISCLAIMER* - Downloading all files from a bucket is not recommended and can take a long time depending
+                on the number of objects and their sizes. I recommend isolating files into a subfolder and targeting that folder, along with any subfolders that fall beneath it.
+## Output
+- By default files will be downloaded into a normal directory in the `Extractions` folder
+- Extractor will give the option to `compress` the folder to a `.zip` format
+
diff --git a/demo_gif.gif b/demo_gif.gif
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '2.1'
+services:
+  s3-extract-cli:
+    build: .
+    container_name: s3-extractor
+    working_dir: /aws-s3
+    stdin_open: true
+    volumes:
+      - .:/aws-s3
+      - ./aws-config:/root/.aws/
diff --git a/main.py b/main.py
@@ -0,0 +1,106 @@
+import boto3
+import os
+from tqdm import tqdm
+from shutil import make_archive
+
+def confirm(prompt):
+    answer = ""
+    while answer not in ["y", "n"]:
+        answer = input(prompt).lower()
+    return answer == "y"
+
+#Set AWS cli config
+def aws_login():
+    """Prompts user for aws credentials and default config
+    via the aws cli 
+
+    Returns:
+        boolean: status of login
+    """
+    if(not os.path.exists('./aws-config/credentials')):
+        while not os.path.exists('./aws-config/credentials'):
+            print('\n|-----⚙️  AWS Configuration⚙️-----|\n')
+            os.system('aws configure')
+        return True
+    print('\nCongrats you have pre-configured credentials 🔥👏\n')
+    os.system('aws configure list')
+    if confirm('\nWould you like to logout and re-setup your configuration? [Y/N] -> '):
+        os.unlink('./aws-config/credentials')
+        aws_login()
+    return True
+
+#Authenticate
+aws_login()
+
+# Init s3 resource
+s3 = boto3.resource('s3')
+
+#set output directory
+output_dir = 'Extractions'
+
+#set base directory
+base_dir = os.path.join(os.path.abspath(os.curdir),output_dir)
+
+
+
+def extract_bucket_contents(bucket_name,folder_name):
+    """Extracts the contents of a given AWS s3 bucket
+
+    Args:
+        bucket_name (string): Name of the bucket
+        folder_name (string): Subfolder name within the bucket
+    """
+    bucket =  s3.Bucket(bucket_name)
+    objects = bucket.objects.filter(Prefix=folder_name)
+    total_objects = sum(1 for _ in objects)
+    if total_objects > 0:
+        os.makedirs(f'{base_dir}/{bucket_name}/{folder_name}',exist_ok=True)
+        os.chdir(f'{base_dir}/{bucket_name}/{folder_name}')
+        print(f'\nExtracting {total_objects} object(s) from s3://{bucket_name}/{folder_name}...\n')
+        with tqdm(total=total_objects,ncols=100,desc="Download Progress") as pbar:
+            for obj in objects:
+                pbar.update(1)
+                path, filename = os.path.split(obj.key)
+                if filename:
+                    if not os.path.exists(filename):
+                        bucket.download_file(obj.key,filename)
+        os.chdir(base_dir)
+        if confirm("\nWould you also like to compress the bucket contents to a zip file? [Y/N] -> "):
+            print(f'\nWriting zip file {os.path.join(base_dir,bucket_name)}.zip...')
+            make_archive(bucket_name,'zip',os.path.join(os.curdir,bucket_name))
+    else:
+        print(f'\nNo objects found at the given location: s3://{bucket_name}/{folder_name}')
+
+def get_s3_target():
+    """Retrives a s3 URI address from the user and 
+    parses it to return the associated bucket name and folder\prefix path
+
+    Returns:
+        tuple: bucket name | folder path
+    """
+    s3_uri = input('\nEnter s3 URI (ex: s3://bucket_name/subfolder) -> ')
+    while s3_uri == '':
+        print('\n**s3 URI is required!**')
+        s3_uri = input('\nEnter s3 URI (ex: s3://bucket_name/subfolder) -> ')
+    try:
+        bucket_name, folder_name = s3_uri.replace("s3://", "").split("/", 1)
+        return bucket_name,folder_name.strip('*')
+    except ValueError:
+        print(f'\n** Error parsing s3 URI please try again **')
+
+
+def main():
+    try:
+        print("\n\n|------🪣  S3 Bucket Extractor🪣------|\n")
+        while True:
+            bucket_name, folder_name = get_s3_target()
+            extract_bucket_contents(bucket_name,folder_name)
+    except Exception as e:
+        print(f'\n** Error extracting bucket contents: {e} **')
+        main()
+
+
+
+if __name__ == '__main__':
+    main()
+
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+boto3
+tqdm