
Commit ae9b4e9

added arguments in prep for credential usage
1 parent c789ebd commit ae9b4e9

Showing 6 changed files with 60 additions and 70 deletions.

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,8 @@
 # VS Code config
 .vscode/
 # extracted info
-domains/
+domains/
+# personal login credentials
+*/keys.py
+# python
+__pycache__/
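
The new */keys.py rule keeps per-site credential files out of version control, in line with the commit message. A minimal sketch of what such a file might hold (the constant names are assumptions; this commit defines no keys.py):

# keys.py -- ignored via the */keys.py rule above; never commit real credentials.
# USERNAME/PASSWORD are illustrative names, not defined anywhere in this commit.
USERNAME = "your-username"
PASSWORD = "your-password"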

sites-google-com_imgs.txt

Lines changed: 0 additions & 13 deletions
This file was deleted.

sites-google-com_pages.txt

Lines changed: 0 additions & 20 deletions
This file was deleted.

src/arguments.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import argparse
+
+def make_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url",
+                        help="URL from domain to scrape")
+    parser.add_argument("-l", "--login",
+                        help="specify you require logging into website to scrape",
+                        action="store_true")
+    return parser.parse_args()
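
For reference, how make_args() behaves when main.py is driven from the command line (the example URL is arbitrary):

import arguments

# e.g. invoked as: python src/main.py https://example.com --login
args = arguments.make_args()
print(args.url)    # "https://example.com" -- required positional
print(args.login)  # True when -l/--login is passed, False otherwise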

src/main.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import sys, os
+from urllib.request import urlopen
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+import util
+import arguments
+
+if __name__ == "__main__":
+
+    args = arguments.make_args()
+    main_url = args.url
+
+    # check if credentials needed
+    if args.login:
+        print("logging in")
+
+    # get domain
+    domain = urlparse(main_url).netloc
+    scheme = urlparse(main_url).scheme
+
+    # make soup
+    soup = util.make_soup(main_url)
+
+    # get href links
+    link_tags = soup.find_all("a")
+
+    # create main output if not existing
+    main_folder_path = os.path.join(os.getcwd(), "domains")
+    if not os.path.isdir(main_folder_path):
+        os.mkdir(main_folder_path)
+
+    # create instance folder
+    folder_name = "{}".format(domain.replace(".","-"))
+    folder_path = util.make_folder(folder_name, main_folder_path)
+
+    # write domain links to file
+    domain_urls = util.get_domain_links(link_tags, main_url)
+    util.write_to_file(domain_urls, "pages", folder_path, folder_name)
+
+    # write image links to file
+    img_urls = util.get_img_links(domain_urls, main_url)
+    util.write_to_file(img_urls, "imgs", folder_path, folder_name)
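
The args.login branch is only a stub so far, consistent with "in prep for credential usage". One hedged sketch of where it could go, assuming a keys.py matching the new .gitignore rule and a requests-based session login (neither the form field names nor the requests dependency are established by this commit):

import requests
import keys  # hypothetical credentials module, ignored via */keys.py

def login(session: requests.Session, login_url: str) -> None:
    # POST credentials to a login form; the field names are assumptions
    session.post(login_url, data={
        "username": keys.USERNAME,
        "password": keys.PASSWORD,
    })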

soup_script.py renamed to src/util.py

Lines changed: 3 additions & 36 deletions
@@ -9,10 +9,10 @@
 
 def make_soup(url):
     """
-    returns beatiful soup representation of webpage
+    returns soup representation of webpage
     """
     # get HTTPResponse into page
-    page = urlopen(main_url)
+    page = urlopen(url)
 
     # read
     html_bytes = page.read()
@@ -78,37 +78,4 @@ def get_img_links(domain_urls, main_url) -> List[str]:
         if full_img_url not in img_urls:
             img_urls.append(full_img_url)
 
-    return img_urls
-
-if __name__ == "__main__":
-
-    try:
-        sys.argv[1]
-    except IndexError: # test value
-        main_url = "http://olympus.realpython.org/profiles/aphrodite"
-    else:
-        main_url = sys.argv[1]
-
-    # get domain
-    domain = urlparse(main_url).netloc
-    scheme = urlparse(main_url).scheme
-
-    soup = make_soup(main_url)
-
-    # get href links
-    link_tags = soup.find_all("a")
-    # create main folder if not existing
-    main_folder_path = os.path.join(os.getcwd(), "domains")
-    if not os.path.isdir(main_folder_path):
-        os.mkdir(main_folder_path)
-    # create folder
-    folder_name = "{}".format(domain.replace(".","-"))
-    folder_path = make_folder(folder_name, main_folder_path)
-
-    # write links to file
-    domain_urls = get_domain_links(link_tags, main_url)
-    write_to_file(domain_urls, "pages", folder_path, folder_name)
-
-    # get image links
-    img_urls = get_img_links(domain_urls, main_url)
-    write_to_file(img_urls, "imgs", folder_path, folder_name)
+    return img_urls
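
For context, the first hunk shows only the top of make_soup; based on the visible lines, the full function plausibly continues along these lines (the decode step and the html.parser choice are assumptions):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def make_soup(url):
    """
    returns soup representation of webpage
    """
    # get HTTPResponse into page
    page = urlopen(url)

    # read the raw bytes and decode to text
    html_bytes = page.read()
    html = html_bytes.decode("utf-8")

    # parse into a BeautifulSoup tree
    return BeautifulSoup(html, "html.parser")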
