Generate Bloom Filters #1082

Workflow file for this run

.github/workflows/generate-bloomfilter.yml at f7c1203

	# Generate Bloom filters with submitted Hacker News stories periodically and
	# on-command for use in the accompanying browser extension.
	#
	# Created by Jacob Strieb
	# January 2021


	name: Generate Bloom Filters

	# Run once every 24 hours because the BigQuery database is updated once every
	# 24 hours. Also run when manually triggered.
	on:
	schedule:
	# This time is based loosely around when the BigQuery data updates
	- cron: "5 16 * * *"
	workflow_dispatch:


	jobs:
	create-bloomfilter:
	name: Create Bloom Filters
	runs-on: ubuntu-latest
	steps:
	- name: Install dependencies
	run: \|
	sudo apt-get install jo

	- uses: actions/checkout@v2

	- name: Set up BigQuery
	run: \|
	# This BQ_JSON key is obtained by downloading the JSON key for a
	# service account on BigQuery. It is then copied directly into a
	# GitHub Actions secret (under Settings)
	echo '${{ secrets.BQ_JSON }}' > bq.json
	gcloud auth \
	activate-service-account \
	--key-file bq.json
	# Unfortunately, bq outputs some dumb shit the first time we run it
	# with the new authentication, so we do a dummy call here to prevent
	# it from outputting the annoying welcome message directly into the
	# CSV we are attempting to process
	bq query \
	--use_legacy_sql=false \
	'SELECT COUNT(*) FROM `bigquery-public-data.hacker_news.full`
	WHERE type = "story"'

	- name: Compile Bloom filter creation program
	run: \|
	make create

	- name: Pull BigQuery data
	run: \|
	# Note that max_rows must be an unsigned, 32-bit int
	bq query \
	--format csv \
	--max_rows 99999999 \
	--use_legacy_sql=false \
	'SELECT
	url,
	score,
	descendants,
	time
	FROM
	`bigquery-public-data.hacker_news.full`
	WHERE
	type = "story"
	AND dead IS NOT TRUE
	AND deleted IS NOT TRUE
	AND url != ""' \
	> data.csv
	head -n 50 data.csv

	# Put the data in a sqlite3 database
	sqlite3 \
	data.db \
	".mode csv" \
	".import data.csv hn"

	- name: Generate Bloom Filters
	run: \|
	# Get the time of the last submission
	LATEST=$(sqlite3 \
	-csv \
	data.db \
	"SELECT CAST(time AS INT) AS inttime
	FROM hn ORDER BY inttime DESC LIMIT 1")
	# TODO: Make this accurate -- not sure why but it's inaccurate now
	printf \
	"Last story submitted %d hours ago\n" \
	$((($(date +%s) - LATEST) / 3600))

	# Generate one Bloom filter for each date range/threshold combination
	# Used as sqlite3 date modifiers:
	# https://sqlite.org/lang_datefunc.html
	DATE_RANGES=(
	# "-1 month"
	"-7 days"
	"-24 hours"
	)

	THRESHOLDS=(
	0
	10
	75
	250
	500
	)

	# Apply the threshold to points (score) or comments (descendants)
	THRESHOLD_KEY="score"

	mkdir generated

	for THRESHOLD in "${THRESHOLDS[@]}"; do
	FILENAME="hn-$THRESHOLD.bloom"

	# Make a Bloom filter for each threshold with no time restriction
	echo "Creating bloom filter for stories from all time and" \
	"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."

	sqlite3 \
	-header \
	-csv \
	data.db \
	"SELECT * FROM hn
	WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD" \
	\| python3 canonicalize.py \
	\| bin/bloom-create "generated/$FILENAME"

	# For the current threshold, make a bloom filter for each date range
	for DATE_RANGE in "${DATE_RANGES[@]}"; do
	DATE_RANGE_STR="$(echo $DATE_RANGE \| sed 's/[- ]//g')"
	FILENAME="hn-$DATE_RANGE_STR-$THRESHOLD.bloom"

	echo "Creating bloom filter for $DATE_RANGE and" \
	"$THRESHOLD+ $THRESHOLD_KEY. Writing to $FILENAME..."

	sqlite3 \
	-header \
	-csv \
	data.db \
	"SELECT * FROM hn
	WHERE CAST($THRESHOLD_KEY AS INT) >= $THRESHOLD
	AND CAST(time AS INT) > strftime('%s', (
	SELECT CAST(time AS INT) AS inttime FROM hn
	ORDER BY inttime DESC LIMIT 1
	), 'unixepoch', '$DATE_RANGE')" \
	\| python3 canonicalize.py \
	\| bin/bloom-create "generated/$FILENAME"
	done
	done

	# Output a JSON file with information about the thresholds and dates
	#
	# TODO: Fix to use $LATEST instead of now
	DATES=()
	for DATE_RANGE in "${DATE_RANGES[@]}"; do
	DATE_RANGE_STR="$(echo $DATE_RANGE \| sed 's/[- ]//g')"
	DATE="$(date -d "now ""$DATE_RANGE" +%s)"
	DATES+=("$DATE=$DATE_RANGE_STR")
	done

	# 24 hours used in next_generated since this runs every 24 hours and
	# takes 10 to 20 minutes to complete
	jo \
	-p \
	thresholds=$(jo -a "${THRESHOLDS[@]}") \
	dates=$(jo "${DATES[@]}") \
	version="0.6" \
	compressed="true" \
	date_generated="$(date +%s)" \
	next_generated="$(date -d "now +24 hours +20 minutes" +%s)" \
	last_submitted="$(sqlite3 \
	-csv \
	data.db \
	"SELECT CAST(time AS INT) AS inttime
	FROM hn ORDER BY inttime DESC LIMIT 1")" \
	> generated/info.json

	- name: Create Release
	env:
	GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	run: \|
	cd generated

	# Keep the latest extension in the next release
	curl \
	--location \
	--output "hackernews-button.xpi" \
	"https://github.com/jstrieb/hackernews-button/releases/latest/download/hackernews-button.xpi"

	# TODO: Fix fatal name collision failure if run more than once a day?
	gh release create \
	"$(date +%Y%m%d)" \
	--title "$(date '+%b %d, %Y') Bloom filters" \
	--notes "Updated Bloom filters for Hacker News submitted stories for $(date '+%B %d, %^Y')" \
	*

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Generate Bloom Filters #1082

Workflow file

Generate Bloom Filters #1082

Jobs

Run details

Workflow file for this run