Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ NEBULA_API_URL=
NEBULA_API_STORAGE_BUCKET=
NEBULA_API_KEY=
NEBULA_API_STORAGE_KEY=
# Budgets
NEBULA_API_BUDGET_STORAGE_BUCKET=

# Uploader
MONGODB_URI=
13 changes: 8 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,20 @@ Project maintained by [Nebula Labs](https://about.utdnebula.com).

### Design

#### - The `grade-data` directory contains .csv files of UTD grade data.
- Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively.
- This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds to the 2018 Summer semester.
- This grade data is collected independently from the scrapers, and is used during the parsing process.
#### - The `scrapers` directory contains the scrapers for various UTD data sources. This is where the data pipeline begins.
- The scrapers are concerned solely with data collection, not necessarily validation or processing of said data. Those responsibilities are left to the parsing stage.
#### - The `parser` directory contains the files and methods that parse the scraped data. This is the 'middle man' of the data pipeline.
- The parsing stage is responsible for 'making sense' of the scraped data; this consists of reading, validating, and merging/intermixing of various data sources.
- The input data is considered **immutable** by the parsing stage. This means the parsers should never modify the data being fed into them.
#### - The `uploader` directory contains the uploader that sends the parsed data to the Nebula API MongoDB database. This is the final stage of the data pipeline.
- The uploader(s) are concerned solely with pushing parsed data to the database. Data, at this point, is assumed to be valid and ready for use.
#### - The `static-data/grades` directory contains .csv files of UTD grade data.
- Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively.
- This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds to the 2018 Summer semester.
- This grade data is collected independently from the scrapers, and is used during the parsing process.
#### - The `static-data/budgets` directory contains .pdf files of UTD budget data.
- Files are named by fiscal year.
- This budget data serves as a backup for the scraped data, since some years have been removed from the website.

### Contributing

Expand Down Expand Up @@ -79,7 +82,7 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r
| `./api-tools -parse -academicCalendars` | Parses academic calendar PDFs. |
| `./api-tools -parse -astra` | Parses Astra data. |
| `./api-tools -parse -cometCalendar` | Parses Comet Calendar data. |
| `./api-tools -parse -csv [directory]` | Outputs grade data CSVs (default: `./grade-data`). |
| `./api-tools -parse -gradesDir [directory]` | Parses using the grade data CSVs in the given directory (default: `./static-data/grades`). |
| `./api-tools -parse -discounts` | Parses discount programs HTML. |
| `./api-tools -parse -degrees` | Parses degrees from HTML. |
| `./api-tools -parse -map` | Parses UTD Map data. |
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.26

require (
github.com/PuerkitoBio/goquery v1.12.0
github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244
github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe
github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc
github.com/chromedp/chromedp v0.15.1
github.com/dongri/phonenumber v0.1.12
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052 h1:bN/JW1
github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052/go.mod h1:vWwnuoXFE/Lo9yW6Z6DJguCtAHu0xMym+6r2IEru1v0=
github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 h1:vp2hsJiJwxpgYCTGd3hxWPQay7g7MvtYbLINDmN1+p4=
github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244/go.mod h1:lp0oZHhVmqAqm0gf6Ald2jZXepZ0xFheTsW76T9wC7I=
github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3Up3U7PKvWV7yyZ7ouvNd8081Zwmd4p5NFD3kk4=
github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I=
github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
Expand Down
14 changes: 12 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,14 @@ func main() {
academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.")
// Flag for degree scraping and parsing
degrees := flag.Bool("degrees", false, "Alongside -scrape, -parse, or -upload. Signifies that the degrees should be scraped/parsed/uploaded.")
// Flag for budget scraping
budgets := flag.Bool("budgets", false, "Alongside -scrape, -parse, or -upload, signifies that the budgets should be scraped/parsed/uploaded.")

// Flags for parsing
parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
csvDir := flag.String("csv", "./grade-data", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.")
gradesDir := flag.String("gradesDir", "./static-data/grades", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.")
useBackupBudgets := flag.Bool("useBackupBudgets", false, "Alongside -parse, specifies that backup budget data should also be parsed.")
budgetsDir := flag.String("budgetsDir", "./static-data/budgets", "Alongside -parse, specifies the path to the directory of PDF files containing budget data.")
skipValidation := flag.Bool("skipv", false, "Alongside -parse, signifies that the post-parsing validation should be skipped. Be careful with this!")

// Flags for uploading data
Expand Down Expand Up @@ -122,6 +126,8 @@ func main() {
scrapers.ScrapeAcademicCalendars(*outDir)
case *degrees:
scrapers.ScrapeDegrees(*outDir)
case *budgets:
scrapers.ScrapeBudgets(*outDir)
default:
log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
}
Expand All @@ -141,8 +147,10 @@ func main() {
parser.ParseDiscounts(*inDir, *outDir)
case *degrees:
parser.ParseDegrees(*inDir, *outDir)
case *budgets:
parser.ParseBudgets(*inDir, *outDir, *budgetsDir, *useBackupBudgets)
default:
parser.Parse(*inDir, *outDir, *csvDir, *skipValidation)
parser.Parse(*inDir, *outDir, *gradesDir, *skipValidation)
}
case *upload:
switch {
Expand All @@ -156,6 +164,8 @@ func main() {
uploader.UploadDiscounts(*inDir)
case *degrees:
uploader.UploadDegrees(*inDir)
case *budgets:
uploader.UploadBudgets(*inDir)
default:
uploader.Upload(*inDir, *replace, *staticOnly)
}
Expand Down
197 changes: 29 additions & 168 deletions parser/academicCalendars.go → parser/academicCalendarsParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,15 @@ complicated installation process, or errored on one of the PDFs.
package parser

import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"io/fs"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"reflect"
"strings"
"sync"
"time"
Expand All @@ -30,12 +26,8 @@ import (
"google.golang.org/genai"
)

// Store client to only create once
var once sync.Once
var geminiClient *genai.Client

// What gets sent to Gemini, with the PDF content added
var prompt = `Parse this PDF content and generate the following JSON schema.
var academicCalendarPrompt = `Parse this PDF content and generate the following JSON schema.

{
_id: %s,
Expand Down Expand Up @@ -99,14 +91,14 @@ func ParseAcademicCalendars(inDir string, outDir string) {
for path := range jobs {
log.Printf("Parsing %s...", filepath.Base(path))

academicCalendar, err := parsePdf(path)
academicCalendar, err := parseAcademicCalendarPdf(path)
if err != nil {
if strings.Contains(err.Error(), "429") {
// Exponential-ish backoff up to 60s for 429 rate limiting
backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second}
for _, delay := range backoffs {
time.Sleep(delay)
academicCalendar, err = parsePdf(path)
academicCalendar, err = parseAcademicCalendarPdf(path)
if err == nil || !strings.Contains(err.Error(), "429") {
break
}
Expand Down Expand Up @@ -149,7 +141,12 @@ func ParseAcademicCalendars(inDir string, outDir string) {
}

// Read a PDF, build a prompt for Gemini to parse it, check if it has already been asked in the cache, and ask Gemini if not
func parsePdf(path string) (schema.AcademicCalendar, error) {
func parseAcademicCalendarPdf(path string) (schema.AcademicCalendar, error) {
apiBucket, err := getAcademicCalendarBucket()
if err != nil {
return schema.AcademicCalendar{}, err
}

// "Fall 2025" to "25F"
filename := filepath.Base(path)
filename = filename[0 : len(filename)-4]
Expand All @@ -165,18 +162,18 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
timeline := filenameParts[0]

// Read PDF
content, err := readPdf(path)
content, err := utils.ReadPdf(path, 1)
if err != nil {
return schema.AcademicCalendar{}, err
}

// Build prompt
promptFilled := fmt.Sprintf(prompt, name, timeline, content)
promptFilled := fmt.Sprintf(academicCalendarPrompt, name, timeline, content)

// Check cache
hashByte := sha256.Sum256([]byte(promptFilled))
hash := hex.EncodeToString(hashByte[:]) + ".json"
result, err := checkCache(hash)
result, err := utils.CheckCache(hash, apiBucket)
if err != nil {
return schema.AcademicCalendar{}, err
}
Expand All @@ -189,23 +186,30 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
log.Printf("No cache for %s, asking Gemini.", filename)

// AI
geminiClient := getGeminiClient()
geminiClient := utils.GetGeminiClient()

// Response schema
calendarSchema := utils.StructToSchema(reflect.TypeOf(schema.AcademicCalendar{}))

// Send request with default config
response, err := geminiClient.Models.GenerateContent(context.Background(),
"gemini-2.5-pro",
genai.Text(promptFilled),
&genai.GenerateContentConfig{},
// Enforce response schema
&genai.GenerateContentConfig{
ResponseMIMEType: "application/json",
ResponseSchema: calendarSchema,
},
)
if err != nil {
return schema.AcademicCalendar{}, err
}

// Get response, remove backtick formatting if present
result = strings.ReplaceAll(strings.ReplaceAll(response.Candidates[0].Content.Parts[0].Text, "```json", ""), "```", "")
// Get response
result = response.Candidates[0].Content.Parts[0].Text

// Set cache for next time
err = setCache(hash, result)
err = utils.SetCache(hash, result, apiBucket)
if err != nil {
return schema.AcademicCalendar{}, err
}
Expand All @@ -221,154 +225,11 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
return academicCalendar, nil
}

// readPdf returns the text of the first page of the PDF at path.
//
// The text is produced by running the external program pdftotext with the
// arguments "-l 1 -raw <path> -", capturing its standard output.
//
// On success it returns the captured output and a nil error. If the command
// cannot be run or exits unsuccessfully, it returns an empty string and an
// error that wraps the underlying failure and includes whatever pdftotext
// wrote to standard error.
func readPdf(path string) (string, error) {
	cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")

	var out bytes.Buffer
	var stderr bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// %w (rather than %v) keeps the underlying error reachable through
		// errors.Is / errors.As while producing the same message text.
		return "", fmt.Errorf("failed to run pdftotext: %w (%s)", err, stderr.String())
	}

	return out.String(), nil
}

// checkCache looks in the Nebula API storage bucket for a stored response
// saved under hash.
//
// It works in two steps: it first requests the object's metadata from the
// storage route (authenticated with the x-api-key and x-storage-key headers),
// then downloads the object itself from the media link in that metadata.
//
// It returns the object's contents on a hit. When the metadata response
// cannot be decoded, it returns ("", nil) to signal that nothing is cached.
// Any other failure (missing keys, request construction, transport, or body
// reads) is returned as an error. HTTP status codes are not inspected.
func checkCache(hash string) (string, error) {
	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
	if err != nil {
		return "", err
	}

	httpClient := &http.Client{}

	// Step 1: fetch the object's metadata from the storage route.
	infoReq, err := http.NewRequest(http.MethodGet, apiUrl+"storage/"+apiBucket+"/"+hash, nil)
	if err != nil {
		return "", err
	}
	infoReq.Header.Add("x-api-key", apiKey)
	infoReq.Header.Add("x-storage-key", apiStorageKey)

	infoResp, err := httpClient.Do(infoReq)
	if err != nil {
		return "", err
	}
	defer infoResp.Body.Close()

	rawInfo, err := io.ReadAll(infoResp.Body)
	if err != nil {
		return "", err
	}

	var info schema.APIResponse[schema.ObjectInfo]
	if decodeErr := json.Unmarshal(rawInfo, &info); decodeErr != nil {
		// An undecodable metadata response is treated as "not found".
		return "", nil
	}

	// Step 2: download the object from its media link (no auth headers are sent).
	objectReq, err := http.NewRequest(http.MethodGet, info.Data.MediaLink, nil)
	if err != nil {
		return "", err
	}
	objectResp, err := httpClient.Do(objectReq)
	if err != nil {
		return "", err
	}
	defer objectResp.Body.Close()

	contents, err := io.ReadAll(objectResp.Body)
	if err != nil {
		return "", err
	}

	return string(contents), nil
}

// setCache stores result in the Nebula API storage bucket under hash so that
// a later lookup for the same hash can reuse it.
//
// The result is sent as the body of a POST request with Content-Type
// application/json, authenticated with the x-api-key and x-storage-key
// headers. It returns an error if the keys cannot be read, the request cannot
// be built, or the request fails in transit. The response status code is not
// inspected.
func setCache(hash string, result string) error {
	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
	if err != nil {
		return err
	}

	// Build the upload request with the result as its payload.
	target := apiUrl + "storage/" + apiBucket + "/" + hash
	uploadReq, err := http.NewRequest(http.MethodPost, target, bytes.NewBufferString(result))
	if err != nil {
		return err
	}
	uploadReq.Header.Set("Content-Type", "application/json")
	uploadReq.Header.Add("x-api-key", apiKey)
	uploadReq.Header.Add("x-storage-key", apiStorageKey)

	httpClient := &http.Client{}
	uploadResp, err := httpClient.Do(uploadReq)
	if err != nil {
		return err
	}
	defer uploadResp.Body.Close()

	return nil
}

// Get all the keys to access the Nebula API storage routes
func getNebulaKeys() (string, string, string, string, error) {
apiUrl, err := utils.GetEnv("NEBULA_API_URL")
if err != nil {
return "", "", "", "", err
}
// Get the storage bucket for the academic calendar cache
func getAcademicCalendarBucket() (string, error) {
apiBucket, err := utils.GetEnv("NEBULA_API_STORAGE_BUCKET")
if err != nil {
return "", "", "", "", err
}
apiKey, err := utils.GetEnv("NEBULA_API_KEY")
if err != nil {
return "", "", "", "", err
}
apiStorageKey, err := utils.GetEnv("NEBULA_API_STORAGE_KEY")
if err != nil {
return "", "", "", "", err
return "", err
}

return apiUrl, apiBucket, apiKey, apiStorageKey, nil
}

// Create client only once
// Auth is from GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS environment variables and service account JSON which is created from GEMINI_SERVICE_ACCOUNT
func getGeminiClient() *genai.Client {
once.Do(func() {
// Create JSON file
serviceAccount, err := utils.GetEnv("GEMINI_SERVICE_ACCOUNT")
if err != nil {
panic(err)
}
jsonFile, err := utils.GetEnv("GOOGLE_APPLICATION_CREDENTIALS")
if err != nil {
panic(err)
}
err = os.WriteFile(jsonFile, []byte(serviceAccount), 0644)
if err != nil {
panic(err)
}

// Create client
geminiClient, err = genai.NewClient(context.Background(),
&genai.ClientConfig{
Project: "api-tools-451421",
Location: "us-central1",
Backend: genai.BackendVertexAI,
})
if err != nil {
panic(err)
}
})
return geminiClient
return apiBucket, nil
}
Comment thread
TyHil marked this conversation as resolved.
Loading
Loading