UTDNebula · mikehquan19 · May 2, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
@@ -14,6 +14,8 @@ NEBULA_API_URL=
 NEBULA_API_STORAGE_BUCKET=
 NEBULA_API_KEY=
 NEBULA_API_STORAGE_KEY=
+# Budgets
+NEBULA_API_BUDGET_STORAGE_BUCKET=
 
 # Uploader
 MONGODB_URI=
@@ -6,17 +6,20 @@ Project maintained by [Nebula Labs](https://about.utdnebula.com).
 
 ### Design
 
-#### - The `grade-data` directory contains .csv files of UTD grade data. 
-  - Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively.
-  - This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds with the 2018 Summer semester.
-  - This grade data is collected independently from the scrapers, and is used during the parsing process.
 #### - The `scrapers` directory contains the scrapers for various UTD data sources. This is where the data pipeline begins.
   - The scrapers are concerned solely with data collection, not necessarily validation or processing of said data. Those responsibilities are left to the parsing stage.
 #### - The `parser` directory contains the files and methods that parse the scraped data. This is the 'middle man' of the data pipeline.
   - The parsing stage is responsible for 'making sense' of the scraped data; this consists of reading, validating, and merging/intermixing of various data sources.
   - The input data is considered **immutable** by the parsing stage. This means the parsers should never modify the data being fed into them.
 #### - The `uploader` directory contains the uploader that sends the parsed data to the Nebula API MongoDB database. This is the final stage of the data pipeline.
   - The uploader(s) are concerned solely with pushing parsed data to the database. Data, at this point, is assumed to be valid and ready for use.
+#### - The `static-data/grades` directory contains .csv files of UTD grade data. 
+  - Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively.
+  - This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds with the 2018 Summer semester.
+  - This grade data is collected independently from the scrapers, and is used during the parsing process.
+#### - The `static-data/budgets` directory contains .pdf files of UTD budget data. 
+  - Files are named by fiscal year.
+  - This budget data serves as a backup for the scraped data, since some fiscal years have been removed from the website.
 
 ### Contributing
 
@@ -79,7 +82,7 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r
 | `./api-tools -parse -academicCalendars` | Parses academic calendar PDFs. |
 | `./api-tools -parse -astra` | Parses Astra data. |
 | `./api-tools -parse -cometCalendar` | Parses Comet Calendar data. |
-| `./api-tools -parse -csv [directory]` | Outputs grade data CSVs (default: `./grade-data`). |
+| `./api-tools -parse -gradesDir [directory]` | Specifies the directory of grade data CSVs used during parsing (default: `./static-data/grades`). |
 | `./api-tools -parse -discounts` | Parses discount programs HTML. |
 | `./api-tools -parse -degrees` | Parses degrees from HTML. |
 | `./api-tools -parse -map` | Parses UTD Map data. |

@@ -4,7 +4,7 @@ go 1.26
 
 require (
 	github.com/PuerkitoBio/goquery v1.12.0
-	github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244
+	github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe
 	github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc
 	github.com/chromedp/chromedp v0.15.1
 	github.com/dongri/phonenumber v0.1.12

@@ -62,6 +62,8 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052 h1:bN/JW1
 github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052/go.mod h1:vWwnuoXFE/Lo9yW6Z6DJguCtAHu0xMym+6r2IEru1v0=
 github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 h1:vp2hsJiJwxpgYCTGd3hxWPQay7g7MvtYbLINDmN1+p4=
 github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244/go.mod h1:lp0oZHhVmqAqm0gf6Ald2jZXepZ0xFheTsW76T9wC7I=
+github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3Up3U7PKvWV7yyZ7ouvNd8081Zwmd4p5NFD3kk4=
+github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I=
 github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
 github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
 github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=

@@ -50,10 +50,14 @@ func main() {
 	academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.")
 	// Flag for degree scraping and parsing
 	degrees := flag.Bool("degrees", false, "Alongside -scrape, -parse, or -upload. Signifies that the degrees should be scraped/parsed/uploaded.")
+	// Flag for budget scraping, parsing, and uploading
+	budgets := flag.Bool("budgets", false, "Alongside -scrape, -parse, or -upload, signifies that the budgets should be scraped/parsed/uploaded.")
 
 	// Flags for parsing
 	parse := flag.Bool("parse", false, "Puts the tool into parsing mode.")
-	csvDir := flag.String("csv", "./grade-data", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.")
+	gradesDir := flag.String("gradesDir", "./static-data/grades", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.")
+	useBackupBudgets := flag.Bool("useBackupBudgets", false, "Alongside -parse, specifies that backup budget data should also be parsed.")
+	budgetsDir := flag.String("budgetsDir", "./static-data/budgets", "Alongside -parse, specifies the path to the directory of PDF files containing budget data.")
 	skipValidation := flag.Bool("skipv", false, "Alongside -parse, signifies that the post-parsing validation should be skipped. Be careful with this!")
 
 	// Flags for uploading data
@@ -122,6 +126,8 @@ func main() {
 			scrapers.ScrapeAcademicCalendars(*outDir)
 		case *degrees:
 			scrapers.ScrapeDegrees(*outDir)
+		case *budgets:
+			scrapers.ScrapeBudgets(*outDir)
 		default:
 			log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!")
 		}
@@ -141,8 +147,10 @@ func main() {
 			parser.ParseDiscounts(*inDir, *outDir)
 		case *degrees:
 			parser.ParseDegrees(*inDir, *outDir)
+		case *budgets:
+			parser.ParseBudgets(*inDir, *outDir, *budgetsDir, *useBackupBudgets)
 		default:
-			parser.Parse(*inDir, *outDir, *csvDir, *skipValidation)
+			parser.Parse(*inDir, *outDir, *gradesDir, *skipValidation)
 		}
 	case *upload:
 		switch {
@@ -156,6 +164,8 @@ func main() {
 			uploader.UploadDiscounts(*inDir)
 		case *degrees:
 			uploader.UploadDegrees(*inDir)
+		case *budgets:
+			uploader.UploadBudgets(*inDir)
 		default:
 			uploader.Upload(*inDir, *replace, *staticOnly)
 		}

@@ -8,19 +8,15 @@ complicated installation process, or errored on one of the PDFs.
 package parser
 
 import (
-	"bytes"
 	"context"
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
-	"io"
 	"io/fs"
 	"log"
-	"net/http"
-	"os"
-	"os/exec"
 	"path/filepath"
+	"reflect"
 	"strings"
 	"sync"
 	"time"
@@ -30,12 +26,8 @@ import (
 	"google.golang.org/genai"
 )
 
-// Store client to only create once
-var once sync.Once
-var geminiClient *genai.Client
-
 // What gets sent to Gemini, with the PDF content added
-var prompt = `Parse this PDF content and generate the following JSON schema.
+var academicCalendarPrompt = `Parse this PDF content and generate the following JSON schema.
 
 {
   _id: %s,
@@ -99,14 +91,14 @@ func ParseAcademicCalendars(inDir string, outDir string) {
 			for path := range jobs {
 				log.Printf("Parsing %s...", filepath.Base(path))
 
-				academicCalendar, err := parsePdf(path)
+				academicCalendar, err := parseAcademicCalendarPdf(path)
 				if err != nil {
 					if strings.Contains(err.Error(), "429") {
 						// Exponential-ish backoff up to 60s for 429 rate limiting
 						backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second}
 						for _, delay := range backoffs {
 							time.Sleep(delay)
-							academicCalendar, err = parsePdf(path)
+							academicCalendar, err = parseAcademicCalendarPdf(path)
 							if err == nil || !strings.Contains(err.Error(), "429") {
 								break
 							}
@@ -149,7 +141,12 @@ func ParseAcademicCalendars(inDir string, outDir string) {
 }
 
 // Read a PDF, build a prompt for Gemini to parse it, check if it has already been asked in the cache, and ask Gemini if not
-func parsePdf(path string) (schema.AcademicCalendar, error) {
+func parseAcademicCalendarPdf(path string) (schema.AcademicCalendar, error) {
+	apiBucket, err := getAcademicCalendarBucket()
+	if err != nil {
+		return schema.AcademicCalendar{}, err
+	}
+
 	// "Fall 2025" to "25F"
 	filename := filepath.Base(path)
 	filename = filename[0 : len(filename)-4]
@@ -165,18 +162,18 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
 	timeline := filenameParts[0]
 
 	// Read PDF
-	content, err := readPdf(path)
+	content, err := utils.ReadPdf(path, 1)
 	if err != nil {
 		return schema.AcademicCalendar{}, err
 	}
 
 	// Build prompt
-	promptFilled := fmt.Sprintf(prompt, name, timeline, content)
+	promptFilled := fmt.Sprintf(academicCalendarPrompt, name, timeline, content)
 
 	// Check cache
 	hashByte := sha256.Sum256([]byte(promptFilled))
 	hash := hex.EncodeToString(hashByte[:]) + ".json"
-	result, err := checkCache(hash)
+	result, err := utils.CheckCache(hash, apiBucket)
 	if err != nil {
 		return schema.AcademicCalendar{}, err
 	}
@@ -189,23 +186,30 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
 		log.Printf("No cache for %s, asking Gemini.", filename)
 
 		// AI
-		geminiClient := getGeminiClient()
+		geminiClient := utils.GetGeminiClient()
+
+		// Response schema
+		calendarSchema := utils.StructToSchema(reflect.TypeOf(schema.AcademicCalendar{}))
 
 		// Send request; the config below constrains the response to JSON matching the schema
 		response, err := geminiClient.Models.GenerateContent(context.Background(),
 			"gemini-2.5-pro",
 			genai.Text(promptFilled),
-			&genai.GenerateContentConfig{},
+			// Enforce response schema
+			&genai.GenerateContentConfig{
+				ResponseMIMEType: "application/json",
+				ResponseSchema:   calendarSchema,
+			},
 		)
 		if err != nil {
 			return schema.AcademicCalendar{}, err
 		}
 
-		// Get response, remove backtick formatting if present
-		result = strings.ReplaceAll(strings.ReplaceAll(response.Candidates[0].Content.Parts[0].Text, "```json", ""), "```", "")
+		// Get response
+		result = response.Candidates[0].Content.Parts[0].Text
 
 		// Set cache for next time
-		err = setCache(hash, result)
+		err = utils.SetCache(hash, result, apiBucket)
 		if err != nil {
 			return schema.AcademicCalendar{}, err
 		}
@@ -221,154 +225,11 @@ func parsePdf(path string) (schema.AcademicCalendar, error) {
 	return academicCalendar, nil
 }
 
-// Read the text from the first page of a PDF
-// Using external program pdftotext
-func readPdf(path string) (string, error) {
-	cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-")
-
-	var out bytes.Buffer
-	var stderr bytes.Buffer
-	cmd.Stdout = &out
-	cmd.Stderr = &stderr
-
-	if err := cmd.Run(); err != nil {
-		return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String())
-	}
-
-	return out.String(), nil
-}
-
-// Check cache for a response to the same prompt
-func checkCache(hash string) (string, error) {
-	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
-	if err != nil {
-		return "", err
-	}
-
-	client := &http.Client{}
-
-	// Make request
-	req, err := http.NewRequest("GET", apiUrl+"storage/"+apiBucket+"/"+hash, nil)
-	if err != nil {
-		return "", err
-	}
-	req.Header.Add("x-api-key", apiKey)
-	req.Header.Add("x-storage-key", apiStorageKey)
-	resp, err := client.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	// Read the response body
-	body, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", err
-	}
-	var parsedBody schema.APIResponse[schema.ObjectInfo]
-	err = json.Unmarshal([]byte(body), &parsedBody)
-	if err != nil {
-		// If this errors, return ("", nil) to indicate not found
-		return "", nil
-	}
-
-	// Fetch object
-	req, err = http.NewRequest("GET", parsedBody.Data.MediaLink, nil)
-	if err != nil {
-		return "", err
-	}
-	resp, err = client.Do(req)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	// Read the response body
-	body, err = io.ReadAll(resp.Body)
-	if err != nil {
-		return "", err
-	}
-
-	return string(body), nil
-}
-
-// Upload AI response to cache
-func setCache(hash string, result string) error {
-	apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys()
-	if err != nil {
-		return err
-	}
-
-	// Make request
-	jsonStr := []byte(result)
-	bodyReader := bytes.NewBuffer(jsonStr)
-	req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+hash, bodyReader)
-	if err != nil {
-		return err
-	}
-	req.Header.Set("Content-Type", "application/json")
-	req.Header.Add("x-api-key", apiKey)
-	req.Header.Add("x-storage-key", apiStorageKey)
-	client := &http.Client{}
-	resp, err := client.Do(req)
-	if err != nil {
-		return err
-	}
-	defer resp.Body.Close()
-
-	return nil
-}
-
-// Get all the keys to access the Nebula API storage routes
-func getNebulaKeys() (string, string, string, string, error) {
-	apiUrl, err := utils.GetEnv("NEBULA_API_URL")
-	if err != nil {
-		return "", "", "", "", err
-	}
+// Get the storage bucket for the academic calendar cache
+func getAcademicCalendarBucket() (string, error) {
 	apiBucket, err := utils.GetEnv("NEBULA_API_STORAGE_BUCKET")
 	if err != nil {
-		return "", "", "", "", err
-	}
-	apiKey, err := utils.GetEnv("NEBULA_API_KEY")
-	if err != nil {
-		return "", "", "", "", err
-	}
-	apiStorageKey, err := utils.GetEnv("NEBULA_API_STORAGE_KEY")
-	if err != nil {
-		return "", "", "", "", err
+		return "", err
 	}
-
-	return apiUrl, apiBucket, apiKey, apiStorageKey, nil
-}
-
-// Create client only once
-// Auth is from GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS environment variables and service account JSON which is created from GEMINI_SERVICE_ACCOUNT
-func getGeminiClient() *genai.Client {
-	once.Do(func() {
-		// Create JSON file
-		serviceAccount, err := utils.GetEnv("GEMINI_SERVICE_ACCOUNT")
-		if err != nil {
-			panic(err)
-		}
-		jsonFile, err := utils.GetEnv("GOOGLE_APPLICATION_CREDENTIALS")
-		if err != nil {
-			panic(err)
-		}
-		err = os.WriteFile(jsonFile, []byte(serviceAccount), 0644)
-		if err != nil {
-			panic(err)
-		}
-
-		// Create client
-		geminiClient, err = genai.NewClient(context.Background(),
-			&genai.ClientConfig{
-				Project:  "api-tools-451421",
-				Location: "us-central1",
-				Backend:  genai.BackendVertexAI,
-			})
-		if err != nil {
-			panic(err)
-		}
-	})
-	return geminiClient
+	return apiBucket, nil
 }