This repository was archived by the owner on Jan 23, 2022. It is now read-only.
Merged
216 changes: 216 additions & 0 deletions server/scripts/pathways_scraper.py
@@ -0,0 +1,216 @@
'''
Copyright 2019 UNSW CSESoc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

import codecs, os, re, requests
from requests.exceptions import HTTPError

COURSE_LIST_RE = re.compile(r'<TD class="(?:evenTableCell)?" align="left">([A-Z]{4}[0-9]{4})</TD>')
PREREQS_RE = re.compile(r"Pre-?req(?:uisites?)?:(.*?)(?:</p>|;)")
EXCLUSIONS_RE = re.compile(r"((?:Excluded|Exclusion|Exclusions|(?:and )?Excludes)[: ](.*?))(?:</p>|<br />)", re.IGNORECASE)
COREQS_RE = re.compile(r"Co-?requisite:(.*?)</p>", re.IGNORECASE)
NAME_RE = re.compile(r"<title>UNSW Handbook Course - (.*?) - [A-Z]{4}[0-9]{4}</title>", re.DOTALL)
DESC_RE = re.compile(r"<!-- Start Course Description -->(.*?)<!-- End Course description -->", re.DOTALL | re.IGNORECASE)
GENED_RE = re.compile(r"Available for General Education:")
OUTLINE_RE = re.compile(r"Course Outline:.*?<a .*?href=[\"'](.*?)[\"']")
UOC_RE = re.compile(r"Units of Credit:.*?([0-9]+)")
COURSE_RE = re.compile(r"[A-Z]{4}[0-9]{4}", re.IGNORECASE)
BR_RE = re.compile(r"<br ?/?>", re.IGNORECASE)
TAG_RE = re.compile(r"</?.*?>")

TYPE_PREREQUISITE = "prerequisite"
TYPE_COREQUISITE = "corequisite"
TYPE_EXCLUSION = "exclusion"

COURSES_DIR = "courses"
UG = "undergraduate"
PG = "postgraduate"
CURRENT_YEAR = "2018"

def scrape_list(url):

print("Fetching page data")
    try:
        response = requests.get(url)
        response.raise_for_status()  # requests won't raise HTTPError for bad statuses unless asked
        data = response.text
        print("Finding course codes")
        codes = re.findall(COURSE_LIST_RE, data)
        print("Done")
        return codes
    except HTTPError as http_err:
        print("HTTP error: " + str(http_err))
    except Exception as err:
        print(err)
    return None

def scrape_area(area, level=UG):
print("Finding all courses for " + str(area))
return scrape_list("http://legacy.handbook.unsw.edu.au/vbook" + CURRENT_YEAR + "/brCoursesBySubjectArea.jsp?studyArea=" + str(area) + "&StudyLevel=" + str(level))

def scrape_everything(level):
url = "http://legacy.handbook.unsw.edu.au/vbook%s/brCoursesBySubjectArea.jsp?StudyLevel=%s&descr=A" % (CURRENT_YEAR, level)
print("Reading area list")
data = requests.get(url).text
codes = re.findall(r'>([A-Z]{4}): .*?</A></TD>', data)
print(codes)
for code in codes:
        for course in scrape_area(code, level) or []:  # scrape_area returns None on error
scrape(course, level)

def scrape(course, level=UG):
url = "http://legacy.handbook.unsw.edu.au/%s/courses/%s/%s.html" % (level, CURRENT_YEAR, course)
filename = "%s/%s.html" % (COURSES_DIR, course)
if os.path.exists(filename):
print("Skipping " + course)
return
print("Fetching " + course)
try:
data = requests.get(url).text
    except Exception as e:
        print("FAILED: " + str(e))  # Exception has no .message attribute in Python 3
return
with open(filename, "w") as f:
f.write(data)

if __name__ == "__main__":
if not os.path.exists(COURSES_DIR):
os.mkdir(COURSES_DIR)
scrape_everything(UG)
scrape_everything(PG)

# Database Construction
# TODO: Connect to our postgres server and populate in same fashion
# Take following steps...
# Check for DB, error exit on non-existence
    # Create tables if they don't exist
# start by just making unique tables for pathways, eventually should have one course table
#cur.execute("CREATE TABLE courses (code text primary key, name text, description text, prerequisites text, corequisites text, exclusions text, gened integer, outline text, uoc integer)")
#cur.execute("CREATE TABLE relationships (source text, destination text, type text)")

print("\nLoading course list")
filenames = os.listdir(COURSES_DIR)
i = 0
for filename in filenames:
i += 1
        code = filename[:-len(".html")]  # not rstrip: rstrip strips a character set, not a suffix
print("Reading %s (%d/%d)" % (code, i, len(filenames)))

# open with unicode support
f = codecs.open("%s/%s" % (COURSES_DIR, filename), encoding="utf-8", mode="r")
data = f.read()
f.close()

# strip &nbsp;'s and <strong> tags
data = data.replace("&nbsp;", " ")
data = data.replace("<strong>", "")
data = data.replace("</strong>", "")

# find name
match = re.search(NAME_RE, data)
if match:
name = match.group(1).strip().replace("\n", "")
print("Found name:", name)
else:
name = None
print("Failed to find name for course: " + str(filename))
continue

        # find exclusions. all of them.
        exclusions = ""
        exclusions_list = []
        while True:
            match = re.search(EXCLUSIONS_RE, data)
            if not match:
                break
            found = match.group(2).strip()
            print("Found exclusions:", found)
            data = data.replace(match.group(1), "")
            exclusions = (exclusions + "; " + found).strip("; ")  # accumulate, don't overwrite
            exclusions_list.extend(re.findall(COURSE_RE, found))
            print("Exclusions list:", exclusions_list)
# find corequisites
match = re.search(COREQS_RE, data)
if match:
coreqs = match.group(1).strip()
print("Found corequisites: ", coreqs)
data = data.replace(match.group(0), "")
coreqs_list = list(map(str.upper, re.findall(COURSE_RE, coreqs)))
print("Corequisites list: ", coreqs_list)
else:
coreqs = None
coreqs_list = []
print("Couldn't find corequisites")

# find prerequisites
match = re.search(PREREQS_RE, data)
if match:
prereqs = match.group(1).strip()
print("Found prerequisites: ", prereqs)
data = data.replace(match.group(0), "")
prereqs_list = list(map(str.upper, re.findall(COURSE_RE, prereqs)))
print("Prerequisites list: ", prereqs_list)
else:
prereqs = None
prereqs_list = []
print("Couldn't find prerequisites")

# find description
match = re.search(DESC_RE, data)
if match:
desc = match.group(1).strip()
print("Found description: " + str(desc))
else:
desc = None
print("Couldn't find description")

# find general education statement
match = re.search(GENED_RE, data)
if match:
gened = 1
else:
gened = 0

# find course outline
match = re.search(OUTLINE_RE, data)
if match:
outline = match.group(1).strip()
print("Found course outline: ", outline)
else:
outline = None
print("Couldn't find course outline")

# find uoc
match = re.search(UOC_RE, data)
if match:
uoc = match.group(1).strip()
            try:
                uoc = int(uoc)
                print("Found UoC: ", uoc)
            except ValueError:
                print("UoC was not an integer: '%s'" % uoc)
                uoc = None
else:
uoc = None
print("Couldn't find UoC")


#cur.execute("INSERT INTO courses (code, name, description, prerequisites, corequisites, exclusions, gened, outline, uoc) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", (code, name, desc, prereqs, coreqs, exclusions, gened, outline, uoc))
#for prereq in prereqs_list:
# cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, prereq, TYPE_PREREQUISITE))
#for coreq in coreqs_list:
# cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, coreq, TYPE_COREQUISITE))
#for exclusion in exclusions_list:
# cur.execute("INSERT INTO relationships (source, destination, type) VALUES (?, ?, ?)", (code, exclusion, TYPE_EXCLUSION))
#print()

117 changes: 117 additions & 0 deletions server/scripts/webscraper.py
@@ -0,0 +1,117 @@
#!/usr/bin/python3

import re
import requests
from bs4 import BeautifulSoup

# Started by Jeremy Lim on 20/07/2019

# TO-DO LIST
# - Get timetable done
# - Get GenEd done
# - Majors
# - Degrees
# - Add to database

# Stripped-down alphabets (letters with no handbook listings are skipped)
course_alphabet = ['A','B','C','D','E','F','G','H','I','L','M','N','O','P','R','S','T','V','Y','Z']
spec_alphabet = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','R','S','T','V','W']

##### COURSES #####

# Print the info of one course
def print_course(code, link, name, cred):
    print("CODE: " + code)
    print("LINK: " + link)
    print("NAME: " + name)
    print("CREDITS: " + cred)

# Go through each letter's course listing page
def run_course():

for letter in course_alphabet:
course_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brCoursesByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter
response = requests.get(course_url)
course_soup = BeautifulSoup(response.text, "html.parser")

# Do webscraping
tr = course_soup.find_all('tr')
Review comment (Contributor): searches for the table rows because all the courses share a pattern; tbh this should use regex to make it more efficient.

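A rough sketch of that regex approach (the row pattern here is guessed from the legacy handbook markup, so treat it as a starting point rather than the final pattern):

ROW_RE = re.compile(
    r'<td[^>]*>([A-Z]{4}[0-9]{4})</td>\s*'                 # course code cell
    r'<td[^>]*><a href="([^"]+)"[^>]*>(.*?)</a></td>\s*'   # link + name cell
    r'<td[^>]*>([0-9.]+)</td>',                            # units of credit cell
    re.IGNORECASE | re.DOTALL)

for code, link, name, cred in ROW_RE.findall(response.text):
    print_course(code, link, name, cred)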
for i in range(1,3):
Review comment (Contributor): for i in range(1, len(tr)) but again it will probably change with a regex implementation

counter = 0
td = tr[i].find_all('td')
code = td[0].text
link = td[1].find_all('a')[0]['href']
name = td[1].find_all('a')[0].text
cred = td[2].text

print_course(code, link, name, cred)

# Go to course link and scrape the data
Review comment (Contributor): this enters each course's link and scrapes all the relevant data; some stuff like term offerings is still not done because it requires some extra tests

            link_response = requests.get(link)
            link_soup = BeautifulSoup(link_response.text, "html.parser")

p_data = link_soup.find_all('p')
h_data = link_soup.find_all('h2')
for p_instance in p_data:
search = p_instance.findChildren()
if (len(search) > 0 and len(search[0].contents) > 0):
if (search[0].text == 'Faculty:'):
if (len(search) > 1):
print("FACULTY: " + search[1].text)
else:
print("FACULTY: " + p_instance.contents[1].strip())

if (search[0].text == 'School:'):
if (len(search) > 1):
print("SCHOOL: " + search[1].text)
else:
print("SCHOOL: " + p_instance.contents[1].strip())

if (search[0].text == 'Career:'):
if (len(search) > 1):
print("CAREER: " + search[1].text)
else:
print("CAREER: " + p_instance.contents[1].strip())

# GenEd not working yet

if (search[0].text == "Available for General Education:"):
if (len(search) > 1):
counter += 1
break

for h_instance in h_data:
if (h_instance.text == "Description"):
desc_tags = str(h_instance.find_next_siblings()[0])
desc = str(re.sub("<.*?>", "", desc_tags))
print("DESCRIPTION: " + desc)

# checks for General Education existence in course link
if (counter == 0):
print("GENED: False\n")
else:
print("GENED: True\n")

##### SPECIALISATIONS (WIP) #####

def run_spec():

for letter in spec_alphabet:
spec_url = 'http://legacy.handbook.unsw.edu.au/vbook2018/brSpecialisationsByAtoZ.jsp?StudyLevel=Undergraduate&descr=' + letter
response = requests.get(spec_url)
spec_soup = BeautifulSoup(response.text, "html.parser")

        spec_tr = spec_soup.find_all('tr')  # all the table rows on the page
        for i in range(1,3):
            spec_td = spec_tr[i].find_all('td')  # each of the td in this tr
spec_name = spec_td[0].text
spec_link = spec_td[0].find_all('a')[0]['href']
print(spec_name)
print(spec_link)
print("")
Review comment (Contributor): it needs to collect courses inside the specialisation links, but they use different structures, so that's why I got a bit stuck
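
One possible starting point for that, assuming the course codes at least appear as plain text somewhere in each specialisation page (this just greps for code-shaped strings rather than parsing the varying structures):

COURSE_CODE_RE = re.compile(r'[A-Z]{4}[0-9]{4}')

def scrape_spec_courses(spec_link):
    # Collect every course-code-shaped string on the specialisation page.
    page = requests.get(spec_link)
    return sorted(set(COURSE_CODE_RE.findall(page.text)))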


if __name__ == "__main__":
run_course()