
Commit ae9b4e9

added arguments in prep for credential usage
1 parent c789ebd commit ae9b4e9

Showing 6 changed files with 60 additions and 70 deletions.

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,8 @@
 # VS Code config
 .vscode/
 # extracted info
-domains/
+domains/
+# personal login credentials
+*/keys.py
+# python
+__pycache__/
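
The new */keys.py rule keeps per-site credential files out of version control, in line with the commit message. A minimal sketch of what such a file might hold (the constant names are assumptions; this commit defines no keys.py):

# keys.py -- ignored via the */keys.py rule above; never commit real credentials.
# USERNAME/PASSWORD are illustrative names, not defined anywhere in this commit.
USERNAME = "your-username"
PASSWORD = "your-password"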

sites-google-com_imgs.txt

Lines changed: 0 additions & 13 deletions
This file was deleted.

sites-google-com_pages.txt

Lines changed: 0 additions & 20 deletions
This file was deleted.

src/arguments.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import argparse
+
+def make_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("url",
+                        help="URL from domain to scrape")
+    parser.add_argument("-l", "--login",
+                        help="specify you require logging into website to scrape",
+                        action="store_true")
+    return parser.parse_args()
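
For reference, how make_args() behaves when main.py is driven from the command line (the example URL is arbitrary):

import arguments

# e.g. invoked as: python src/main.py https://example.com --login
args = arguments.make_args()
print(args.url)    # "https://example.com" -- required positional
print(args.login)  # True when -l/--login is passed, False otherwise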

src/main.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import sys, os
+from urllib.request import urlopen
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+import util
+import arguments
+
+if __name__ == "__main__":
+
+    args = arguments.make_args()
+    main_url = args.url
+
+    # check if credentials needed
+    if args.login:
+        print("logging in")
+
+    # get domain
+    domain = urlparse(main_url).netloc
+    scheme = urlparse(main_url).scheme
+
+    # make soup
+    soup = util.make_soup(main_url)
+
+    # get href links
+    link_tags = soup.find_all("a")
+
+    # create main output if not existing
+    main_folder_path = os.path.join(os.getcwd(), "domains")
+    if not os.path.isdir(main_folder_path):
+        os.mkdir(main_folder_path)
+
+    # create instance folder
+    folder_name = "{}".format(domain.replace(".","-"))
+    folder_path = util.make_folder(folder_name, main_folder_path)
+
+    # write domain links to file
+    domain_urls = util.get_domain_links(link_tags, main_url)
+    util.write_to_file(domain_urls, "pages", folder_path, folder_name)
+
+    # write image links to file
+    img_urls = util.get_img_links(domain_urls, main_url)
+    util.write_to_file(img_urls, "imgs", folder_path, folder_name)
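
The args.login branch is only a stub so far, consistent with "in prep for credential usage". One hedged sketch of where it could go, assuming a keys.py matching the new .gitignore rule and a requests-based session login (neither the form field names nor the requests dependency are established by this commit):

import requests
import keys  # hypothetical credentials module, ignored via */keys.py

def login(session: requests.Session, login_url: str) -> None:
    # POST credentials to a login form; the field names are assumptions
    session.post(login_url, data={
        "username": keys.USERNAME,
        "password": keys.PASSWORD,
    })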

soup_script.py renamed to src/util.py

Lines changed: 3 additions & 36 deletions
@@ -9,10 +9,10 @@
 
 def make_soup(url):
     """
-    returns beatiful soup representation of webpage
+    returns soup representation of webpage
     """
     # get HTTPResponse into page
-    page = urlopen(main_url)
+    page = urlopen(url)
 
     # read
     html_bytes = page.read()
@@ -78,37 +78,4 @@ def get_img_links(domain_urls, main_url) -> List[str]:
         if full_img_url not in img_urls:
             img_urls.append(full_img_url)
 
-    return img_urls
-
-if __name__ == "__main__":
-
-    try:
-        sys.argv[1]
-    except IndexError: # test value
-        main_url = "http://olympus.realpython.org/profiles/aphrodite"
-    else:
-        main_url = sys.argv[1]
-
-    # get domain
-    domain = urlparse(main_url).netloc
-    scheme = urlparse(main_url).scheme
-
-    soup = make_soup(main_url)
-
-    # get href links
-    link_tags = soup.find_all("a")
-    # create main folder if not existing
-    main_folder_path = os.path.join(os.getcwd(), "domains")
-    if not os.path.isdir(main_folder_path):
-        os.mkdir(main_folder_path)
-    # create folder
-    folder_name = "{}".format(domain.replace(".","-"))
-    folder_path = make_folder(folder_name, main_folder_path)
-
-    # write links to file
-    domain_urls = get_domain_links(link_tags, main_url)
-    write_to_file(domain_urls, "pages", folder_path, folder_name)
-
-    # get image links
-    img_urls = get_img_links(domain_urls, main_url)
-    write_to_file(img_urls, "imgs", folder_path, folder_name)
+    return img_urls
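
For context, the first hunk shows only the top of make_soup; based on the visible lines, the full function plausibly continues along these lines (the decode step and the html.parser choice are assumptions):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def make_soup(url):
    """
    returns soup representation of webpage
    """
    # get HTTPResponse into page
    page = urlopen(url)

    # read the raw bytes and decode to text
    html_bytes = page.read()
    html = html_bytes.decode("utf-8")

    # parse into a BeautifulSoup tree
    return BeautifulSoup(html, "html.parser")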
