-
Couldn't load subscription status.
- Fork 1
Webscraper #4
Webscraper #4
Changes from all commits
81a1869
b14af68
1617691
a4511e9
5c209f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,216 @@ | ||
| ''' | ||
| Copyright 2019 UNSW CSESoc | ||
|
|
||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||
| you may not use this file except in compliance with the License. | ||
| You may obtain a copy of the License at | ||
|
|
||
| http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| Unless required by applicable law or agreed to in writing, software | ||
| distributed under the License is distributed on an "AS IS" BASIS, | ||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| See the License for the specific language governing permissions and | ||
| limitations under the License. | ||
| ''' | ||
|
|
||
| import codecs, os, re, requests, sys | ||
| from requests.exceptions import HTTPError | ||
|
|
||
| COURSE_LIST_RE = re.compile(r'<TD class="(?:evenTableCell)?" align="left">([A-Z]{4}[0-9]{4})</TD>') | ||
| PREREQS_RE = re.compile(r"Pre-?req(?:uisites?)?:(.*?)(?:</p>|;)") | ||
| EXCLUSIONS_RE = re.compile(r"((?:Excluded|Exclusion|Exclusions|(?:and )?Excludes)[: ](.*?))(?:</p>|<br />)", re.IGNORECASE) | ||
| COREQS_RE = re.compile(r"Co-?requisite:(.*?)</p>", re.IGNORECASE) | ||
| NAME_RE = re.compile(r"<title>UNSW Handbook Course - (.*?) - [A-Z]{4}[0-9]{4}</title>", re.DOTALL) | ||
| DESC_RE = re.compile(r"<!-- Start Course Description -->(.*?)<!-- End Course description -->", re.DOTALL | re.IGNORECASE) | ||
| GENED_RE = re.compile(r"Available for General Education:") | ||
| OUTLINE_RE = re.compile(r"Course Outline:.*?<a .*?href=[\"'](.*?)[\"']") | ||
| UOC_RE = re.compile(r"Units of Credit:.*?([0-9]+)") | ||
| COURSE_RE = re.compile(r"[A-Z]{4}[0-9]{4}", re.IGNORECASE) | ||
| BR_RE = re.compile(r"<br ?/?>", re.IGNORECASE) | ||
| TAG_RE = re.compile(r"</?.*?>") | ||
|
|
||
| TYPE_PREREQUISITE = "prerequisite" | ||
| TYPE_COREQUISITE = "corequisite" | ||
| TYPE_EXCLUSION = "exclusion" | ||
|
|
||
| COURSES_DIR = "courses" | ||
| UG = "undergraduate" | ||
| PG = "postgraduate" | ||
| CURRENT_YEAR = "2018" | ||
|
|
||
def scrape_list(url):
    """Fetch *url* and return the course codes listed on that page.

    Returns a list of code strings (possibly empty), or None when the
    request or the parse fails.
    """
    print("Fetching page data")
    try:
        response = requests.get(url)
        # BUG FIX: requests does not raise on 4xx/5xx responses by itself,
        # so without raise_for_status() the HTTPError handler below was
        # dead code and error pages were parsed as if they were listings.
        response.raise_for_status()
        data = response.text
        print("Finding course codes")
        codes = re.findall(COURSE_LIST_RE, data)
        print("Done")
        return codes
    except HTTPError:
        print("HTTP error")
    except Exception as err:
        print(err)
    return None
|
|
||
def scrape_area(area, level=UG):
    """Return every course code listed for one subject area at one level."""
    print("Finding all courses for %s" % area)
    listing_url = (
        "http://legacy.handbook.unsw.edu.au/vbook"
        + CURRENT_YEAR
        + "/brCoursesBySubjectArea.jsp?studyArea="
        + str(area)
        + "&StudyLevel="
        + str(level)
    )
    return scrape_list(listing_url)
|
|
||
def scrape_everything(level):
    """Crawl every subject area for *level* and download each course page.

    *level* is UG or PG; side effect is one cached HTML file per course
    (written by scrape()).
    """
    url = "http://legacy.handbook.unsw.edu.au/vbook%s/brCoursesBySubjectArea.jsp?StudyLevel=%s&descr=A" % (CURRENT_YEAR, level)
    print("Reading area list")
    data = requests.get(url).text
    codes = re.findall(r'>([A-Z]{4}): .*?</A></TD>', data)
    print(codes)
    for code in codes:
        # BUG FIX: scrape_area returns None on failure; guard so one bad
        # area page does not abort the whole crawl with a TypeError.
        for course in scrape_area(code, level) or []:
            scrape(course, level)
|
|
||
def scrape(course, level=UG):
    """Download one course's handbook page into COURSES_DIR.

    Skips the download when the file is already cached; prints and
    returns on any request failure.
    """
    url = "http://legacy.handbook.unsw.edu.au/%s/courses/%s/%s.html" % (level, CURRENT_YEAR, course)
    filename = "%s/%s.html" % (COURSES_DIR, course)
    if os.path.exists(filename):
        print("Skipping " + course)
        return
    print("Fetching " + course)
    try:
        data = requests.get(url).text
    except Exception as e:
        # BUG FIX: exceptions have no .message attribute in Python 3 —
        # the old code raised AttributeError inside this handler.
        print("FAILED: " + str(e))
        return
    with open(filename, "w") as f:
        f.write(data)
|
|
||
if __name__ == "__main__":
    # Phase 1: download (or refresh) the local cache of handbook pages.
    if not os.path.exists(COURSES_DIR):
        os.mkdir(COURSES_DIR)
    scrape_everything(UG)
    scrape_everything(PG)

    # Database Construction
    # TODO: Connect to our postgres server and populate in same fashion
    # Take following steps...
    # Check for DB, error exit on non-existence
    # Create tables if they dont exist
    # start by just making unique tables for pathways, eventually should have one course table
    #cur.execute("CREATE TABLE courses (code text primary key, name text, description text, prerequisites text, corequisites text, exclusions text, gened integer, outline text, uoc integer)")
    #cur.execute("CREATE TABLE relationships (source text, destination text, type text)")

    # Phase 2: parse every cached page into structured fields.
    print("\nLoading course list")
    filenames = os.listdir(COURSES_DIR)
    for i, filename in enumerate(filenames, 1):
        # BUG FIX: filename.rstrip(".html") strips any trailing run of the
        # characters {., h, t, m, l}, not the literal suffix — slice instead.
        code = filename[:-len(".html")] if filename.endswith(".html") else filename
        print("Reading %s (%d/%d)" % (code, i, len(filenames)))

        # open with unicode support
        with codecs.open("%s/%s" % (COURSES_DIR, filename), encoding="utf-8", mode="r") as f:
            data = f.read()

        # strip &nbsp;'s and <strong> tags
        # NOTE(review): the entity literal was lost in a paste; the comment
        # above indicates "&nbsp;" — confirm against the original source.
        data = data.replace("&nbsp;", " ")
        data = data.replace("<strong>", "")
        data = data.replace("</strong>", "")

        # find name (skip the file entirely if the page has no title)
        match = re.search(NAME_RE, data)
        if match:
            name = match.group(1).strip().replace("\n", "")
            print("Found name:", name)
        else:
            name = None
            print("Failed to find name for course: " + str(filename))
            continue

        # find exclusions. all of them.
        # Each match is deleted from the page so the next search finds the
        # following occurrence.
        exclusions = ""
        exclusions_list = []
        while True:
            match = re.search(EXCLUSIONS_RE, data)
            if not match:
                break
            found = match.group(2).strip()
            print("Found exclusions:", found)
            data = data.replace(match.group(1), "")
            # BUG FIX: accumulate every match — the old code overwrote
            # exclusions/exclusions_list each iteration and kept only the
            # last occurrence despite scanning for all of them.
            exclusions = (exclusions + "; " + found) if exclusions else found
            exclusions_list.extend(re.findall(COURSE_RE, found))
            print("Exclusions list:", exclusions_list)

        # find corequisites (removed from the page so the prerequisite
        # search cannot match inside the coreq sentence)
        match = re.search(COREQS_RE, data)
        if match:
            coreqs = match.group(1).strip()
            print("Found corequisites: ", coreqs)
            data = data.replace(match.group(0), "")
            coreqs_list = list(map(str.upper, re.findall(COURSE_RE, coreqs)))
            print("Corequisites list: ", coreqs_list)
        else:
            coreqs = None
            coreqs_list = []
            print("Couldn't find corequisites")

        # find prerequisites
        match = re.search(PREREQS_RE, data)
        if match:
            prereqs = match.group(1).strip()
            print("Found prerequisites: ", prereqs)
            data = data.replace(match.group(0), "")
            prereqs_list = list(map(str.upper, re.findall(COURSE_RE, prereqs)))
            print("Prerequisites list: ", prereqs_list)
        else:
            prereqs = None
            prereqs_list = []
            print("Couldn't find prerequisites")

        # find description
        match = re.search(DESC_RE, data)
        if match:
            desc = match.group(1).strip()
            print("Found description: " + str(desc))
        else:
            desc = None
            print("Couldn't find description")

        # find general education statement (presence of the label == GenEd)
        gened = 1 if re.search(GENED_RE, data) else 0

        # find course outline
        match = re.search(OUTLINE_RE, data)
        if match:
            outline = match.group(1).strip()
            print("Found course outline: ", outline)
        else:
            outline = None
            print("Couldn't find course outline")

        # find uoc
        match = re.search(UOC_RE, data)
        if match:
            uoc = match.group(1).strip()
            try:
                uoc = int(uoc)
                print("Found UoC: ", uoc)
            # BUG FIX: was a bare except, which also swallowed
            # KeyboardInterrupt etc.; only the int() parse can fail here.
            except ValueError:
                print("UoC was not an integer: '%s'" % uoc)
                uoc = None
        else:
            uoc = None
            print("Couldn't find UoC")

        #cur.execute("INSERT INTO courses (code, name, description, prerequisites, corequisites, exclusions, gened, outline, uoc) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (code, name, desc, prereqs, coreqs, exclusions, gened, outline, uoc))
        #for prereq in prereqs_list:
        #    cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, prereq, TYPE_PREREQUISITE))
        #for coreq in coreqs_list:
        #    cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, coreq, TYPE_COREQUISITE))
        #for exclusion in exclusions_list:
        #    cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, exclusion, TYPE_EXCLUSION))
        #print()
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| #!/usr/bin/python3 | ||
|
|
||
| import re | ||
| import requests | ||
| import urllib.request | ||
| import time | ||
| from bs4 import BeautifulSoup | ||
|
|
||
| # Started by Jeremy Lim on 20/07/2019 | ||
|
|
||
| # TO-DO LIST | ||
| # - Get timetable done | ||
| # - Get GenEd done | ||
| # - Majors | ||
| # - Degrees | ||
| # - Add to database | ||
|
|
||
# Stripped down alphabet
# NOTE(review): the missing letters (e.g. 'J', 'K', 'Q' for courses)
# presumably have no handbook listings — confirm against the live index.
course_alphabet = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']
spec_alphabet = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','R','S','T','V','W']
|
|
||
| ##### COURSES ##### | ||
|
|
||
| # To print the info of each course | ||
def print_course(code, link, name, cred):
    """Print one scraped course's fields to stdout, one labelled line each."""
    fields = (("CODE", code), ("LINK", link), ("NAME", name), ("CREDITS", cred))
    for label, value in fields:
        print(label + ": " + value)
|
|
||
# To go through each letter's links for courses
def run_course():
    """Walk the A-Z undergraduate course index and print each course's details.

    For every letter page: read the listing table, print the code/link/
    name/credits of each course, then follow the course link and print
    faculty, school, career, description and GenEd availability.
    """
    for letter in course_alphabet:
        course_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter
        response = requests.get(course_url)
        course_soup = BeautifulSoup(response.text, "html.parser")

        # Each course sits in its own table row; row 0 is the header.
        tr = course_soup.find_all('tr')

        # BUG FIX: was range(1, 3), which only processed the first two
        # courses of each letter (confirmed in review).
        for i in range(1, len(tr)):
            counter = 0  # incremented when the GenEd label is found
            td = tr[i].find_all('td')
            code = td[0].text
            link = td[1].find_all('a')[0]['href']
            name = td[1].find_all('a')[0].text
            cred = td[2].text

            print_course(code, link, name, cred)

            # Go to course link and scrape the data
            link_url = requests.get(link)
            link_soup = BeautifulSoup(link_url.text, "html.parser")

            p_data = link_soup.find_all('p')
            h_data = link_soup.find_all('h2')
            for p_instance in p_data:
                search = p_instance.findChildren()
                if len(search) > 0 and len(search[0].contents) > 0:
                    # The value is either in the second child tag or, when
                    # the markup has no second tag, in the raw text content.
                    for label, heading in (("FACULTY", "Faculty:"),
                                           ("SCHOOL", "School:"),
                                           ("CAREER", "Career:")):
                        if search[0].text == heading:
                            if len(search) > 1:
                                print(label + ": " + search[1].text)
                            else:
                                print(label + ": " + p_instance.contents[1].strip())

                    # GenEd not working yet
                    if search[0].text == "Available for General Education:":
                        if len(search) > 1:
                            counter += 1
                        break

            for h_instance in h_data:
                if h_instance.text == "Description":
                    # Flatten the description HTML by stripping every tag.
                    desc_tags = str(h_instance.find_next_siblings()[0])
                    desc = str(re.sub("<.*?>", "", desc_tags))
                    print("DESCRIPTION: " + desc)

            # checks for General Education existence in course link
            if counter == 0:
                print("GENED: False\n")
            else:
                print("GENED: True\n")
|
|
||
| ##### SPECIALISATIONS (WIP) ##### | ||
|
|
||
def run_spec():
    """Walk the A-Z undergraduate specialisation index, printing name + link.

    Work in progress: the specialisation detail pages still need to be
    followed to collect their courses.
    """
    for letter in spec_alphabet:
        spec_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter
        response = requests.get(spec_url)
        spec_soup = BeautifulSoup(response.text, "html.parser")

        spec_tr = spec_soup.find_all('tr')  # one row per specialisation; row 0 is the header
        # BUG FIX: was range(1, 3), which only processed the first two
        # specialisations of each letter; also dropped the unused counter.
        for i in range(1, len(spec_tr)):
            spec_td = spec_tr[i].find_all('td')  # this finds each of the td in tr
            spec_name = spec_td[0].text
            spec_link = spec_td[0].find_all('a')[0]['href']
            print(spec_name)
            print(spec_link)
            print("")
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It still needs to collect the courses inside each specialisation link, but those pages use different structures — that's where I got stuck. |
||
|
|
||
if __name__ == "__main__":
    # Entry point: scrape all undergraduate courses.
    # run_spec() is still work-in-progress and is not invoked here.
    run_course()
Uh oh!
There was an error while loading. Please reload this page.