Webscraper #4
New file, +125 lines:

```python
import re
import requests
import urllib.request  # currently unused
import time            # currently unused
# from test_data import *
from bs4 import BeautifulSoup

# Started by Jeremy Lim on 20/07/2019

# TO-DO LIST
# - Get timetable done
# - Get GenEd done
# - Majors
# - Degrees
# - Add to database


##### COURSES #####

# Prints the info of each course
def printInfo(code, link, name, cred):
    print("CODE: " + code)      # the code of the course
    print("LINK: " + link)      # the link of the course
    print("NAME: " + name)      # the name of the course
    print("CREDITS: " + cred)   # the credits of the course


# Goes through each letter's search results and scrapes the courses
def run_course():
    alphabet = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']

    for letter in alphabet[0:2]:  # limited to the first two letters for now
        # the search URL for this letter
        course_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter

        response = requests.get(course_url)
        course_soup = BeautifulSoup(response.text, "html.parser")
        # soup = BeautifulSoup(test_url, "html.parser")

        tr = course_soup.find_all('tr')  # all table rows on the results page

        for i in range(1, 3):  # hard-coded to the first two data rows for now
```

Review comment: `for i in range(1, len(tr))`, but again it will probably change with a regex implementation.
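A minimal sketch of that suggestion, assuming the results table has one header row followed by one `<tr>` per course (the URL and row layout are taken from the diff above):

```python
import requests
from bs4 import BeautifulSoup

url = ('http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp'
       '?StudyLevel=Undergraduate&descr=A')
soup = BeautifulSoup(requests.get(url).text, "html.parser")

for row in soup.find_all('tr')[1:]:  # every data row, skipping the header
    cells = row.find_all('td')
    if len(cells) < 3:               # guard against rows that hold no course data
        continue
    print(cells[0].text, cells[2].text)
```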
```python
            counter = 0
            td = tr[i].find_all('td')  # the td cells within this row
            # print(td)
            code = td[0].text                       # course code
            link = td[1].find_all('a')[0]['href']   # link to the course page
            name = td[1].find_all('a')[0].text      # course name from the a tag
            cred = td[2].text                       # units of credit

            # DEBUGGING FOR MAIN INFO
            printInfo(code, link, name, cred)

            # goes to the course link and scrapes the data
            link_url = requests.get(link)
            link_soup = BeautifulSoup(link_url.text, "html.parser")

            p_data = link_soup.find_all('p')
            h_data = link_soup.find_all('h2')

            # collects course data
            for p_instance in p_data:
                search = p_instance.findChildren()
                # print("LENGTH: " + str(len(search)))
                if len(search) > 0 and len(search[0].contents) > 0:
                    if search[0].text == 'Faculty:':
                        if len(search) > 1:
                            print("FACULTY: " + search[1].text)
                        else:
                            print("FACULTY: " + p_instance.contents[1].strip())

                    if search[0].text == 'School:':
                        if len(search) > 1:
                            print("SCHOOL: " + search[1].text)
                        else:
                            print("SCHOOL: " + p_instance.contents[1].strip())

                    if search[0].text == 'Career:':
                        if len(search) > 1:
                            print("CAREER: " + search[1].text)
                        else:
                            print("CAREER: " + p_instance.contents[1].strip())

                    # GenEd not working yet
                    if search[0].text == "Available for General Education:":
                        if len(search) > 1:
                            counter += 1
                            break

            # collects the course description
            for h_instance in h_data:
                if h_instance.text == "Description":
                    desc_tags = str(h_instance.find_next_siblings()[0])
                    desc = re.sub("<.*?>", "", desc_tags)  # strips the HTML tags
                    print("DESCRIPTION: " + desc)

            # checks whether General Education was found on the course page
            if counter == 0:
                print("GENED: False")
                print("")
            else:
                print("GENED: True")
                print("")


##### SPECIALISATIONS (WIP) #####

def run_spec():
    alphabet = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','R','S','T','V','W']

    for letter in alphabet[0:2]:  # limited to the first two letters for now
        spec_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter

        response = requests.get(spec_url)
        spec_soup = BeautifulSoup(response.text, "html.parser")

        spec_tr = spec_soup.find_all('tr')  # all table rows on the results page

        for i in range(1, 3):  # hard-coded to the first two data rows for now
            counter = 0  # not used yet (WIP)
            spec_td = spec_tr[i].find_all('td')  # the td cells within this row
            spec_name = spec_td[0].text                      # specialisation name
            spec_link = spec_td[0].find_all('a')[0]['href']  # link to the specialisation page
            print(spec_name)
            print(spec_link)
            print("")
```

Review comment: it needs to collect the courses inside the specialisation links, but they use different structures, so that's why I got a bit stuck.
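One possible way around the differing page structures, sketched here as an untested idea rather than part of the PR, is to skip structural parsing and pull anything shaped like a UNSW course code (four letters, four digits) out of the raw HTML:

```python
import re
import requests

def spec_courses(spec_link):
    # Hypothetical helper: returns every course-code-shaped token on the page,
    # ignoring the page layout entirely.
    html = requests.get(spec_link).text
    return sorted(set(re.findall(r'\b[A-Z]{4}\d{4}\b', html)))
```

This over-matches (any course code mentioned anywhere on the page counts), so it is a starting point, not a substitute for parsing the real structure.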
```python

run_course()  # run_spec() is not called yet (still WIP)
```
Review comment: hard-coded constants should preferably go at the top of the code.
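A sketch of that suggestion; the constant names are illustrative, not part of the PR:

```python
# Module-level constants: the URLs and letter lists live in one place
BASE_URL = 'http://legacy.handbook.unsw.edu.au/vbook2018'
COURSE_URL = BASE_URL + '/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr='
SPEC_URL = BASE_URL + '/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr='
COURSE_LETTERS = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']
```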
Review comment: you can import this: https://docs.python.org/3/library/string.html#string.ascii_uppercase
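That is, the standard-library constant could replace the hand-typed lists:

```python
from string import ascii_uppercase  # the string 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

letters = list(ascii_uppercase)  # would replace the hand-typed alphabet lists
print(letters[:5])               # ['A', 'B', 'C', 'D', 'E']
```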
Review comment: I think I brought this up with him in person some time ago; he is not using the whole alphabet for some reason, which has eluded me now.
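If the omitted letters are simply ones with no undergraduate courses (an assumption; the thread leaves the reason open), the list could be derived instead of hand-typed:

```python
import requests
from bs4 import BeautifulSoup
from string import ascii_uppercase

URL = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr='

def letters_with_courses():
    # Keep only letters whose results page contains at least one data row.
    found = []
    for letter in ascii_uppercase:
        soup = BeautifulSoup(requests.get(URL + letter).text, "html.parser")
        if len(soup.find_all('tr')) > 1:  # header row plus at least one course row
            found.append(letter)
    return found
```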