#!/bin/bash
#
# Usage: ./benchmarks.sh <argument>
# where <argument> is a substring pattern of the benchmark name.
# e.g. ./benchmarks.sh sort - will run benchmarks with "sort" in the benchmark name
# if <argument> is omitted, all benchmarks are executed.
#
# if <argument> is "reset", the benchmark data will be downloaded and prepared again.
# though the results/benchmark_results.csv and results/run_info_history.tsv historical
# archives will be preserved.
# if <argument> is "clean", temporary files will be deleted.
# if <argument> is "setup", setup and install all the required tools.
# if <argument> is "help", help text is displayed.
#
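# Example invocations (per the usage notes above):
#   ./benchmarks.sh           - run all benchmarks
#   ./benchmarks.sh sort      - run only benchmarks with "sort" in the name
#                               (e.g. sort, sortcheck_sorted, extsort)
#   ./benchmarks.sh setup     - set up and install the required tools
#   ./benchmarks.sh reset     - re-download & re-prepare the benchmark data
#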
# ==============================================================================================
#
# This script benchmarks Quicksilver (qsv) using a 520MB, 41-column, 1M-row sample of
# NYC's 311 data. If it doesn't exist on your system, it will be downloaded for you.
#
# Though this script was primarily created to maintain the Benchmark page on the qsv site,
# it was also designed to be a useful tool for users to benchmark qsv on their own systems,
# so it can be run on hardware and workloads that reflect your requirements/environment.
#
# Make sure you're using a release-optimized `qsv`.
# If you can't use the prebuilt binaries at https://github.com/jqnatividad/qsv/releases/latest,
# build it to have at least the apply, geocode, luau, to and polars features enabled:
# i.e. `cargo build --release --locked -F feature_capable,apply,geocode,luau,to,polars` or
# `cargo install --locked qsv -F feature_capable,apply,geocode,luau,to,polars`
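#
# You can check which features a binary was built with by inspecting its version
# string - enabled features are listed there, and this script relies on that below
# by testing for substrings like "apply;", "Luau" and "to;". An illustrative check
# (assuming qsv is on your PATH):
#   qsv --version | grep -q 'apply;' && echo "apply feature enabled"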
#
# This shell script has been tested on Linux and macOS. It should work on other Unix-like systems,
# but will NOT run on native Windows. If you're on Windows, you can run it using Cygwin or WSL
# (see https://www.cygwin.com/ and https://docs.microsoft.com/en-us/windows/wsl/install-win10).
# It requires hyperfine (https://github.com/sharkdp/hyperfine#hyperfine) to run the benchmarks.
# It also requires 7-Zip (https://www.7-zip.org/download.html) as we need the high compression
# ratio so we don't have to deal with git-lfs to host the large compressed file on GitHub.
#
# And of course, it dogfoods `qsv` as well to prepare the benchmark data, fetch the rowcount,
# and to parse and format the benchmark results. :)
# It uses the following commands: apply, cat, count, index, luau, sample, schema, select,
# snappy, sort, sqlp, to xlsx and tojsonl. It's a good example of how qsv can be used to
# automate data preparation & analysis tasks.
arg_pat="$1"
# the version of this script
bm_version=5.3.2
# CONFIGURABLE VARIABLES ---------------------------------------
# change as needed to reflect your environment/workloads
# the path to the qsv binary, change this if you're not using the prebuilt binaries
# e.g. you compiled a tuned version of qsv with different features and/or CPU optimizations enabled
# qsv_bin=../target/release/qsvlite
# qsv_bin=../target/debug/qsv
qsv_bin=qsv
# the path to the qsv binary that we dogfood to run the benchmarks
# we use several optional features when dogfooding qsv (apply, luau & to)
# and the user may be benchmarking a qsv binary variant that doesn't have these features enabled
qsv_benchmarker_bin=qsv
benchmark_data_url=https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/NYC_311_SR_2010-2020-sample-1M.7z
# where to download the benchmark data compressed file - this could be a zip or 7z file
datazip=NYC_311_SR_2010-2020-sample-1M.7z
# where to store the benchmark data
data=NYC_311_SR_2010-2020-sample-1M.csv
# Hyperfine options - run `hyperfine --help` for more info
# number of warmup runs for each benchmark. A minimum of 2 is recommended
warmup_runs=2
# number of benchmark runs for each benchmark. A minimum of 3 is recommended
benchmark_runs=3
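# e.g. with the defaults above, hyperfine executes each benchmarked command five
# times in total (2 untimed warm-up runs followed by 3 timed runs); the reported
# mean/stddev/median are computed over the timed runs only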
# ---------------------------- end of CONFIGURABLE VARIABLES
data_filename=$(basename -- "$data")
filestem="${data_filename%.*}"
# check if qsv is installed
if ! command -v "$qsv_bin" &>/dev/null; then
echo "qsv could not be found"
echo "Please install qsv from https://qsv.dathere.com"
exit
fi
# get current version of qsv
raw_version=$("$qsv_bin" --version)
# get the version of qsv used to run this script
# we use this to determine if the user is using a different qsv binary
# than the one used to run this script
benchmarker_version=$("$qsv_benchmarker_bin" --version)
# if arg_pat is equal to "help", show usage
if [[ "$arg_pat" == "help" ]]; then
echo "qsv Benchmark Script v$bm_version"
echo ""
echo "Usage: ./benchmarks.sh <argument>"
echo ""
echo " where <argument> is a substring pattern of the benchmark name."
echo " e.g. ./benchmarks.sh sort - will run benchmarks with \"sort\" in the benchmark name"
echo " if <argument> is omitted, all benchmarks will be executed."
echo ""
echo " if <argument> is \"reset\", the benchmark data will be downloaded and prepared again."
echo " though the results/benchmark_results.csv historical archive will be preserved."
echo " if <argument> is \"clean\", temporary files will be deleted."
echo " if <argument> is \"setup\", setup and install all the required tools."
echo " if <argument> is \"help\", help text is displayed."
echo ""
echo "benchmarking: $raw_version"
echo "dogfooding: $benchmarker_version"
exit
fi
# check if required tools/dependencies are installed ---------
# check if benchmarker_bin has the apply feature enabled
if [[ "$benchmarker_version" != *"apply;"* ]]; then
echo "ERROR: $qsv_benchmarker_bin does not have the apply feature enabled."
echo "The qsv apply command is needed to format the benchmarks results."
exit
fi
# check if the benchmarker_bin has the luau feature enabled
if [[ "$benchmarker_version" != *"Luau"* ]]; then
echo "ERROR: $qsv_benchmarker_bin does not have the luau feature enabled."
echo "The qsv luau command is needed to aggregate the benchmarks results."
exit
fi
# check if the benchmarker_bin has the to feature enabled
if [[ "$benchmarker_version" != *"to;"* ]]; then
# check if benchmark_data.xlsx exists
if [ ! -r benchmark_data.xlsx ]; then
echo "ERROR: $qsv_benchmarker_bin does not have the to feature enabled."
echo "The qsv to xlsx command is needed to create an Excel spreadsheet"
echo "as benchmark_data.xlsx does not exist."
exit
fi
fi
# set sevenz_bin to "7z" on Linux/Cygwin and "7zz" on macOS
if [[ "$OSTYPE" == "darwin"* ]]; then
sevenz_bin=7zz
else
sevenz_bin=7z
fi
# if arg_pat is equal to "setup", setup and install all the required tools
if [[ "$arg_pat" == "setup" ]]; then
need_sevenz=0
need_hyperfine=0
need_awk=0
need_sed=0
need_duckdb=0
# check if 7z is installed
if ! command -v "$sevenz_bin" &>/dev/null; then
need_sevenz=1
fi
# check if hyperfine is installed
if ! command -v hyperfine &>/dev/null; then
need_hyperfine=1
fi
# check if awk is installed
if ! command -v awk &>/dev/null; then
need_awk=1
fi
# check if sed is installed
if ! command -v sed &>/dev/null; then
need_sed=1
fi
# check if duckdb is installed
if ! command -v duckdb &>/dev/null; then
need_duckdb=1
fi
# if all required tools are installed, exit
if [[ "$need_sevenz" -eq 0 && "$need_hyperfine" -eq 0 && "$need_awk" -eq 0 && "$need_sed" -eq 0 ]]; then
echo "> All required tools are installed..."
exit
fi
# check if homebrew is installed, if not, install it
# as we need it to install the required tools
if ! command -v brew &>/dev/null; then
echo "INFO: Homebrew could not be found. Installing brew first. Please enter requested info when prompted."
curl -fsSL "https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh"
fi
# if 7z is not installed, install it
if [[ "$need_sevenz" -eq 1 ]]; then
echo "INFO: 7-Zip could not be found. Installing..."
brew install 7zip
fi
# if hyperfine is not installed, install it
if [[ "$need_hyperfine" -eq 1 ]]; then
echo "INFO: hyperfine could not be found. Installing..."
brew install hyperfine
fi
# if awk is not installed, install it
if [[ "$need_awk" -eq 1 ]]; then
echo "INFO: awk could not be found. Installing..."
brew install gawk
fi
# if sed is not installed, install it
if [[ "$need_sed" -eq 1 ]]; then
echo "INFO: sed could not be found. Installing..."
brew install gnu-sed
fi
# if duckdb is not installed, install it
if [[ "$need_duckdb" -eq 1 ]]; then
echo "INFO: duckdb could not be found. Installing..."
brew install duckdb
fi
echo "> All required tools installed! You can run ./benchmarks.sh now."
exit
fi
# check if 7z is installed
if ! command -v "$sevenz_bin" &>/dev/null; then
echo "ERROR: $sevenz_bin could not be found."
echo "Please install 7-Zip v23.01 and above or run \"./benchmarks.sh setup\" to install it."
exit
fi
# check if hyperfine is installed
if ! command -v hyperfine &>/dev/null; then
echo "ERROR: hyperfine could not be found"
echo "Please install hyperfine v1.18.0 and above or run \"./benchmarks.sh setup\" to install it."
exit
fi
# check if awk is installed
if ! command -v awk &>/dev/null; then
echo "ERROR: awk could not be found"
echo "Please install awk or run \"./benchmarks.sh setup\" to install it."
exit
fi
# check if sed is installed
if ! command -v sed &>/dev/null; then
echo "ERROR: sed could not be found"
echo "Please install sed or run \"./benchmarks.sh setup\" to install it."
exit
fi
# check if duckdb is installed
if ! command -v duckdb &>/dev/null; then
echo "ERROR: duckdb could not be found"
echo "Please install duckdb or run \"./benchmarks.sh setup\" to install it."
exit
fi
# qsv version metadata ----------------
version=$(echo "$raw_version" | cut -d' ' -f2 | cut -d'-' -f1)
# get target platform from version
platform=$(echo "$raw_version" | sed 's/.*(\([a-z0-9_-]*\) compiled with Rust.*/\1/')
# get qsv kind
kind=$(echo "$raw_version" | sed 's/.* \([a-zA-Z-]*\)$/\1/')
# get num cores & memory size
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
num_cores=$(sysctl -n hw.ncpu)
mem_size=$(sysctl -n hw.memsize)
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
# Linux
num_cores=$(nproc)
mem_size=$(free -b | awk '/Mem/ {print $7}')
elif [[ "$OSTYPE" == "msys" || "$OSTYPE" == "cygwin" ]]; then
# Windows
num_cores=$(wmic cpu get NumberOfCores | grep -Eo '^[0-9]+')
mem_size=$(wmic OS get FreePhysicalMemory | grep -Eo '[0-9]+')
mem_size=$((mem_size * 1024))
else
echo "Unsupported operating system: $OSTYPE"
exit 1
fi
function cleanup_files {
# Clean up temporary files
rm -f "$filestem".csv.*
rm -f "$filestem".stats.*
rm -f results/hf_result.csv
rm -f results/hf_result_nocmd.csv
rm -f results/results_work.csv
rm -f results/run_info_work.tsv
rm -f results/entry.csv
rm -r -f split_tempdir
rm -r -f split_tempdir_chunks
rm -r -f split_tempdir_kbs
rm -r -f split_tempdir_idx
rm -r -f split_tempdir_idx_j1
rm -r -f split_tempdir_chunks_idx
rm -r -f split_tempdir_chunks_idx_j1
rm -f benchmark_work.*
rm -r -f benchmark_work
rm -f extsort_sorted.csv
}
# if arg_pat is equal to "reset", download and prepare the benchmark data again
# the results/benchmark_results.csv historical archive will be preserved
if [[ "$arg_pat" == "reset" ]]; then
rm -f "$datazip"
rm -f "$filestem".*
rm -f communityboards.csv
rm -f data_to_exclude.csv
rm -f data_unsorted.csv
rm -f data_sorted.csv
rm -f benchmark_data.xlsx
rm -f benchmark_data.jsonl
rm -f benchmark_data.json
	rm -f benchmark_data.csv.schema.json
rm -f searchset_patterns.txt
rm -f searchset_patterns_unicode.txt
echo "> Benchmark data reset..."
echo " Historical benchmarks archive preserved in results/benchmark_results.csv"
exit
fi
# if arg_pat is equal to "clean", clean up temporary files
if [[ "$arg_pat" == "clean" ]]; then
cleanup_files
echo "> Temporary files cleaned up..."
exit
fi
echo "> Setting up Benchmark environment..."
SECONDS=0
cleanup_files
if [ ! -r "$data" ]; then
echo "> Downloading Benchmark data..."
curl -sS "$benchmark_data_url" >"$datazip"
"$sevenz_bin" e -y "$datazip"
echo ""
fi
# we get the rowcount, just in case the benchmark data was modified by the user to tailor
# the benchmark to their system/workload. We use the rowcount to compute records per second
rowcount=$("$qsv_bin" count "$data")
printf " Benchmark data rowcount: %'.0f\n" "$rowcount"
qsv_absolute_path=$(which "$qsv_bin")
benchmarker_absolute_path=$(which "$qsv_benchmarker_bin")
printf " Benchmarking qsv binary: %s\n" "$qsv_absolute_path"
printf " %s\n" "$raw_version"
printf " Dogfooding qsv binary: %s\n" "$benchmarker_absolute_path"
echo ""
if [ ! -r communityboards.csv ]; then
echo "> Downloading community board data..."
curl -sS https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/communityboards.csv >communityboards.csv
echo ""
fi
if [ ! -r searchset_patterns_unicode.txt ]; then
echo "> Preparing benchmark support data..."
# create an index so benchmark data preparation commands can run faster
"$qsv_benchmarker_bin" index "$data"
echo " data_to_exclude.csv..."
"$qsv_benchmarker_bin" sample --seed 42 1000 "$data" -o data_to_exclude.csv
echo " data_unsorted.csv..."
"$qsv_benchmarker_bin" sort --seed 42 --random --faster "$data" -o data_unsorted.csv
echo " data_sorted.csv..."
"$qsv_benchmarker_bin" sort "$data" -o data_sorted.csv
echo " benchmark_data.xlsx..."
"$qsv_benchmarker_bin" to xlsx benchmark_data.xlsx "$data"
echo " benchmark_data.jsonl..."
"$qsv_benchmarker_bin" tojsonl "$data" --output benchmark_data.jsonl
echo " benchmark_data.json..."
"$qsv_benchmarker_bin" sqlp --format json "$data" -Q 'select * from _t_1' --infer-len 127000 --rnull-values 'N/A' --output benchmark_data.json
echo " benchmark_data.schema.json..."
"$qsv_benchmarker_bin" schema "$data" --stdout >benchmark_data.csv.schema.json
echo " benchmark_data.snappy..."
"$qsv_benchmarker_bin" snappy compress "$data" --output benchmark_data.snappy
echo " searchset_patterns.txt..."
printf "homeless\npark\nNoise\n" >searchset_patterns.txt
echo " searchset_patterns_unicode.txt..."
printf "homeless\n💩\nNoise\n" >searchset_patterns_unicode.txt
echo ""
fi
schema=benchmark_data.csv.schema.json
dynenum_schema=benchmark_data-dynenum.csv.schema.json
commands_without_index=()
commands_without_index_name=()
commands_with_index=()
commands_with_index_name=()
function add_command {
local dest_array="$1"
shift
local cmd="$*"
if [[ "$dest_array" == "without_index" ]]; then
commands_without_index+=("$cmd")
else
commands_with_index+=("$cmd")
fi
}
function run {
local index=
while true; do
case "$1" in
--index)
index="yes"
shift
;;
*)
break
;;
esac
done
local name="$1"
shift
if [[ "$name" == *"$arg_pat"* ]]; then
if [ -z "$index" ]; then
commands_without_index_name+=("$name")
add_command "without_index" "$@"
else
commands_with_index_name+=("$name")
add_command "with_index" "$@"
fi
fi
}
# ---------------------------------------
# Queue commands for benchmarking
# commands with an --index prefix will be benchmarked with an index and a stats cache
# template: run <benchmark name> <qsv command> <qsv command args>
# Note that several benchmarks assume that the benchmark data is the NYC 311 dataset, so
# the column names are hardcoded.
# If you're using a different dataset, you will need to modify the commands below to use the
# appropriate column names.
run apply_calcconv "$qsv_bin apply calcconv --formatstr \"{Unique Key} meters in miles\" --new-column new_col $data"
run apply_dynfmt "$qsv_bin apply dynfmt --formatstr \"{Created Date} {Complaint Type} - {BBL} {City}\" --new-column new_col $data"
run apply_emptyreplace "$qsv_bin" apply emptyreplace \"Bridge Highway Name\" --replacement Unspecified "$data"
run apply_op_eudex "$qsv_bin apply operations lower,eudex Agency --comparand Queens --new-column Agency_queens_soundex $data"
run apply_op_string "$qsv_bin apply operations lower Agency $data"
run apply_op_sentiment "$qsv_bin apply operations sentiment \"Resolution Description\" --new-column Sentiment_Score $data"
run apply_op_similarity "$qsv_bin apply operations lower,simdln Agency --comparand brooklyn --new-column Agency_sim-brooklyn_score $data"
run apply_op_similarity_batchall "$qsv_bin apply operations lower,simdln Agency --comparand brooklyn --batch 0 --new-column Agency_sim-brooklyn_score $data"
run behead "$qsv_bin" behead "$data"
run behead_flexible "$qsv_bin" behead --flexible "$data"
run cat_columns "$qsv_bin" cat columns "$data" data_unsorted.csv
run cat_rows "$qsv_bin" cat rows "$data" data_unsorted.csv
run cat_rows_flexible "$qsv_bin" cat rows --flexible "$data" data_unsorted.csv
run cat_rowskey "$qsv_bin" cat rowskey "$data" data_unsorted.csv
run count "$qsv_bin" count "$data"
run count_flexible "$qsv_bin" count --flexible "$data"
run count_polars_lowmem "$qsv_bin" count --low-memory "$data"
run count_no_polars "$qsv_bin" count --no-polars "$data"
run --index count_index "$qsv_bin" count "$data"
run count_width "$qsv_bin" count --width "$data"
run --index count_width_index "$qsv_bin" count --width "$data"
run datefmt "$qsv_bin datefmt \"Created Date\" $data"
run datefmt_multi "$qsv_bin datefmt \"Created Date,Closed Date,Due Date\" $data"
run datefmt_multi_batchall "$qsv_bin datefmt \"Created Date,Closed Date,Due Date\" --batch 0 $data"
run datefmt_multi_select "$qsv_bin datefmt '/(?i) date$/' $data"
run datefmt_formatstr_newcol "$qsv_bin datefmt --formatstr '%V' \"Created Date\" --new-column week_number $data"
run dedup "$qsv_bin" dedup "$data"
run dedup_sorted "$qsv_bin" dedup data_sorted.csv
run diff "$qsv_bin" diff "$data" data_unsorted.csv
run enum "$qsv_bin" enum "$data"
run enum_hash "$qsv_bin" enum --hash 1- "$data"
run enum_uuid "$qsv_bin" enum --uuid4 "$data"
run enum_uuid7 "$qsv_bin" enum --uuid7 "$data"
run enum_constant "$qsv_bin" enum --constant "NYC" "$data"
run enum_copy "$qsv_bin" enum --copy Agency "$data"
run excel "$qsv_bin" excel benchmark_data.xlsx
run excel_j1 "$qsv_bin" excel -j 1 benchmark_data.xlsx
run excel_trim "$qsv_bin" excel --trim benchmark_data.xlsx
run excel_trim_j1 "$qsv_bin" excel --trim -j 1 benchmark_data.xlsx
run excel_error_format_formula "$qsv_bin" excel --error-format both benchmark_data.xlsx
run excel_metadata "$qsv_bin" excel --metadata c benchmark_data.xlsx
run excel_metadata_short "$qsv_bin" excel --metadata s benchmark_data.xlsx
run exclude "$qsv_bin" exclude \'Incident Zip\' "$data" \'Incident Zip\' data_to_exclude.csv
run --index exclude_index "$qsv_bin" exclude \'Incident Zip\' "$data" \'Incident Zip\' data_to_exclude.csv
run exclude_casei "$qsv_bin" exclude --ignore-case \'Incident Zip\' "$data" \'Incident Zip\' data_to_exclude.csv
run --index exclude_casei_index "$qsv_bin" exclude --ignore-case \'Incident Zip\' "$data" \'Incident Zip\' data_to_exclude.csv
run exclude_multi "$qsv_bin" exclude \'Incident Zip,Community Board,Agency\' "$data" \'Incident Zip,Community Board,Agency\' data_to_exclude.csv
run --index exclude_multi_index "$qsv_bin" exclude \'Incident Zip,Community Board,Agency\' "$data" \'Incident Zip,Community Board,Agency\' data_to_exclude.csv
run exclude_multi_casei "$qsv_bin" exclude --ignore-case \'Incident Zip,Community Board,Agency\' "$data" \'Incident Zip,Community Board,Agency\' data_to_exclude.csv
run --index exclude_multi_casei_index "$qsv_bin" exclude --ignore-case \'Incident Zip,Community Board,Agency\' "$data" \'Incident Zip,Community Board,Agency\' data_to_exclude.csv
run explode "$qsv_bin" explode City "-" "$data"
run extdedup "$qsv_bin" extdedup "$data"
run extdedup_csv "$qsv_bin" extdedup "$data" --select 1-5
run extsort "$qsv_bin" extsort data_unsorted.csv extsort_sorted.csv
run extsort_csv env QSV_AUTOINDEX_SIZE=1 bash -c \'"$qsv_bin" extsort data_unsorted.csv --select 1-5 extsort_sorted.csv\'
run fill "$qsv_bin" fill -v Unspecified \'Address Type\' "$data"
run fixlengths "$qsv_bin" fixlengths "$data"
run flatten "$qsv_bin" flatten "$data"
run flatten_condensed "$qsv_bin" flatten "$data" --condense 50
run fmt "$qsv_bin" fmt --crlf "$data"
run fmt_no_crlf "$qsv_bin" fmt "$data"
run fmt_no_final_newline "$qsv_bin" fmt --no-final-newline "$data"
run foreach "$qsv_bin" foreach City "echo {}" "$data"
run frequency "$qsv_bin" frequency "$data"
run --index frequency_index "$qsv_bin" frequency "$data"
run --index frequency_index_stats_mode_auto "$qsv_bin" frequency --stats-mode auto "$data"
run --index frequency_index_stats_mode_force "$qsv_bin" frequency --stats-mode force "$data"
run --index frequency_index_stats_mode_none "$qsv_bin" frequency --stats-mode none "$data"
run frequency_no_limit "$qsv_bin" frequency --limit 0 "$data"
run --index frequency_no_limit_index "$qsv_bin" frequency --limit 0 "$data"
run frequency_other_sorted "$qsv_bin" frequency --other-sorted "$data"
run --index frequency_other_sorted_index "$qsv_bin" frequency --other-sorted "$data"
run frequency_selregex "$qsv_bin" frequency -s /^R/ "$data"
run frequency_j1 "$qsv_bin" frequency -j 1 "$data"
run frequency_ignorecase "$qsv_bin" frequency -i "$data"
run --index frequency_ignorecase_index "$qsv_bin" frequency -i "$data"
run frequency_selregex_ignorecase "$qsv_bin" frequency -s /^R/ -i "$data"
run frequency_j1_ignorecase "$qsv_bin" frequency -j 1 -i "$data"
run frequency_notrim "$qsv_bin" frequency --no-trim "$data"
run --index frequency_notrim_index "$qsv_bin" frequency --no-trim "$data"
run frequency_limit20 "$qsv_bin" frequency --limit 20 "$data"
run --index frequency_limit20_index "$qsv_bin" frequency --limit 20 "$data"
run frequency_sorted "$qsv_bin" frequency data_sorted.csv
run --index frequency_sorted_index "$qsv_bin" frequency data_sorted.csv
run geocode_suggest "$qsv_bin" geocode suggest City --new-column geocoded_city "$data"
run geocode_suggest_batchall "$qsv_bin" geocode suggest City --new-column geocoded_city --batch 0 "$data"
run geocode_reverse "$qsv_bin" geocode reverse Location --new-column geocoded_location "$data"
run geocode_reverse_batchall "$qsv_bin" geocode reverse Location --new-column geocoded_location --batch 0 "$data"
run index "$qsv_bin" index "$data"
run input "$qsv_bin" input "$data"
run json "$qsv_bin" json benchmark_data.json
run join "$qsv_bin" join \'Community Board\' "$data" community_board communityboards.csv
run join_casei "$qsv_bin" join \'Community Board\' "$data" community_board --ignore-case communityboards.csv
run joinp "$qsv_bin" joinp \'Community Board\' "$data" community_board communityboards.csv
run joinp_streaming "$qsv_bin" joinp \'Community Board\' "$data" community_board communityboards.csv --streaming
run jsonl "$qsv_bin" jsonl benchmark_data.jsonl
run jsonl_batchall "$qsv_bin" jsonl benchmark_data.jsonl --batch 0
run jsonl_j1 "$qsv_bin" jsonl -j 1 benchmark_data.jsonl
run luau_filter "$qsv_bin" luau filter \"Location == \'\'\" "$data"
run luau_filter_colidx "$qsv_bin" luau filter --colindex \"Location == \'\'\" "$data"
run luau_filter_no_globals "$qsv_bin" luau filter --no-globals \"Location == \'\'\" "$data"
run luau_filter_no_globals_colidx "$qsv_bin" luau filter --no-globals --colindex \"Location == \'\'\" "$data"
run luau_multi "$qsv_bin" luau map dow,hourday,weekno "file:dt_format.luau" "$data"
run luau_multi_colidx "$qsv_bin" luau map dow,hourday,weekno "file:dt_format.luau" --colindex "$data"
run luau_filter_no_globals_no_colidx "$qsv_bin" luau filter --no-globals \"Location == \'\'\" "$data"
run luau_multi_no_globals "$qsv_bin" luau map dow,hourday,weekno --no-globals "file:dt_format.luau" "$data"
run luau_multi_no_globals_colidx "$qsv_bin" luau map dow,hourday,weekno --no-globals --colindex "file:dt_format.luau" "$data"
run luau_script "$qsv_bin" luau map turnaround_time "file:turnaround_time.luau" "$data"
run luau_script_colidx "$qsv_bin" luau map turnaround_time --colindex "file:turnaround_time.luau" "$data"
run luau_script_no_globals "$qsv_bin" luau map turnaround_time --no-globals "file:turnaround_time.luau" "$data"
run luau_script_no_globals_colidx "$qsv_bin" luau map turnaround_time --no-globals --colindex "file:turnaround_time.luau" "$data"
run partition "$qsv_bin" partition \'Community Board\' /tmp/partitioned "$data"
run pseudo "$qsv_bin" pseudo \'Unique Key\' "$data"
run pseudo_formatstr "$qsv_bin" pseudo \'Unique Key\' --formatstr 'ID-{}' --increment 5 "$data"
run rename "$qsv_bin" rename \'unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,loctype,zip,addr1,street,xstreet1,xstreet2,inter1,inter2,addrtype,city,landmark,facility_type,status,due_date,res_desc,res_act_date,comm_board,bbl,boro,xcoord,ycoord,opendata_type,parkname,parkboro,vehtype,taxi_boro,taxi_loc,bridge_hwy_name,bridge_hwy_dir,ramp,bridge_hwy_seg,lat,long,loc\' "$data"
run replace "$qsv_bin" replace \'zip\' \'postal\' "$data"
run reverse "$qsv_bin" reverse "$data"
run --index reverse_index "$qsv_bin" reverse "$data"
run safenames "$qsv_bin" safenames "$data"
run sample_10 "$qsv_bin" sample 10 "$data"
run --index sample_10_index "$qsv_bin" sample 10 "$data"
run sample_1000 "$qsv_bin" sample 1000 "$data"
run --index sample_1000_index "$qsv_bin" sample 1000 "$data"
run sample_100000 "$qsv_bin" sample 100000 "$data"
run --index sample_100000_index "$qsv_bin" sample 100000 "$data"
run sample_100000_seeded "$qsv_bin" sample 100000 --seed 42 "$data"
run sample_100000_seeded_faster "$qsv_bin" sample 100000 --rng faster --seed 42 "$data"
run sample_100000_seeded_secure "$qsv_bin" sample 100000 --rng cryptosecure --seed 42 "$data"
run --index sample_100000_seeded_index "$qsv_bin" sample --seed 42 100000 "$data"
run --index sample_100000_seeded_index_faster "$qsv_bin" sample --rng faster --seed 42 100000 "$data"
run --index sample_100000_seeded_index_secure "$qsv_bin" sample --rng cryptosecure --seed 42 100000 "$data"
run --index sample_25pct_index "$qsv_bin" sample 0.25 "$data"
run --index sample_25pct_seeded_index "$qsv_bin" sample 0.25 --seed 42 "$data"
run schema "$qsv_bin" schema --force "$data"
run --index schema_index "$qsv_bin" schema "$data"
run search "$qsv_bin" search -s \'Agency Name\' "'(?i)us'" "$data"
run search_unicode "$qsv_bin" search --unicode -s \'Agency Name\' "'(?i)us'" "$data"
run search_file "$qsv_bin" search "'(?i)us'" "$data"
run search_file_literal "$qsv_bin" search --literal "'$'" "$data"
run search_file_unicode "$qsv_bin" search --unicode "'(?i)us'" "$data"
run search_file_case_sensitive "$qsv_bin" search "'us'" "$data"
run search_file_case_sensitive_unicode "$qsv_bin" search --unicode "'us'" "$data"
run search_file_flag "$qsv_bin" search "'(?i)us'" "$data" --flag flagged
run search_file_flag_matchonly "$qsv_bin" search "'(?i)us'" "$data" --flag M
run searchset "$qsv_bin" searchset searchset_patterns.txt "$data"
run searchset_ignorecase "$qsv_bin" searchset --ignore-case searchset_patterns.txt "$data"
run searchset_unicode "$qsv_bin" searchset searchset_patterns_unicode.txt --unicode "$data"
run select "$qsv_bin" select \'Agency,Community Board\' "$data"
run select_regex "$qsv_bin" select /^L/ "$data"
run slice_one_middle "$qsv_bin" slice -i 500000 "$data"
run slice_last_1k "$qsv_bin" slice -s 1000 -l 1000 "$data"
run slice_last_1k_json "$qsv_bin" slice -s 1000 -l 1000 --json "$data"
run --index slice_one_middle_index "$qsv_bin" slice -i 500000 "$data"
run --index slice_last_1k_index "$qsv_bin" slice -s 1000 -l 1000 "$data"
run --index slice_last_1k_json_index "$qsv_bin" slice -s 1000 -l 1000 --json "$data"
run snappy_compress "$qsv_bin" snappy compress "$data" --output benchmark_data.snappy
run snappy_decompress "$qsv_bin" snappy decompress benchmark_data.snappy
run snappy_validate "$qsv_bin" snappy validate benchmark_data.snappy
run sort "$qsv_bin" sort -s \'Incident Zip\' "$data"
run sort_random_seeded "$qsv_bin" sort --random --seed 42 "$data"
run sort_random_seeded_faster "$qsv_bin" sort --random --rng faster --seed 42 "$data"
run sort_random_seeded_secure "$qsv_bin" sort --random --rng cryptosecure --seed 42 "$data"
run sortcheck_sorted "$qsv_bin" sortcheck data_sorted.csv
run sortcheck_unsorted "$qsv_bin" sortcheck data_unsorted.csv
run sortcheck_unsorted_all "$qsv_bin" sortcheck --all data_unsorted.csv
run split "$qsv_bin" split --size 50000 split_tempdir "$data"
run split_chunks "$qsv_bin" split --chunks 20 split_tempdir_chunks "$data"
run split_kbsize "$qsv_bin" split --kb-size 10000 split_tempdir_kbs "$data"
run --index split_index "$qsv_bin" split --size 50000 split_tempdir_idx "$data"
run --index split_index_j1 "$qsv_bin" split --size 50000 -j 1 split_tempdir_idx_j1 "$data"
run --index split_chunks_index "$qsv_bin" split --chunks 20 split_tempdir_chunks_idx "$data"
run --index split_chunks_index_j1 "$qsv_bin" split --chunks 20 -j 1 split_tempdir_chunks_idx_j1 "$data"
run sqlp "$qsv_bin" sqlp "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_aggregations "$qsv_bin" sqlp "$data" -Q --infer-len 100000 '"select Borough, count(*) from _t_1 where \"Complaint Type\"='\''Noise'\'' group by Borough"'
run sqlp_aggregations_use_schema_cache "$qsv_bin" sqlp "$data" -Q --infer-len 100000 --cache-schema '"select Borough, count(*) from _t_1 where \"Complaint Type\"='\''Noise'\'' group by Borough"'
run sqlp_aggregations_vs_duckdb duckdb :memory: '"select Borough, count(*) from read_csv_auto('\'''$data''\'') where \"Complaint Type\"='\''Noise'\'' group by Borough"'
run sqlp_aggregations_expensive "$qsv_bin" sqlp SKIP_INPUT -Q --infer-len 100000 expensive.sql
run sqlp_aggregations_expensive_streaming "$qsv_bin" sqlp SKIP_INPUT -Q --infer-len 100000 --streaming expensive.sql
run sqlp_aggregations_expensive_vs_duckdb duckdb :memory: -c \".read expensiveduckdb.sql\"
run sqlp_format_arrow "$qsv_bin" sqlp --format arrow "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_avro "$qsv_bin" sqlp --format avro "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_json "$qsv_bin" sqlp --format json "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_jsonl "$qsv_bin" sqlp --format jsonl "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_parquet "$qsv_bin" sqlp --format parquet "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_format_parquet_statistics "$qsv_bin" sqlp --format parquet --statistics "$data" -Q --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_lowmemory "$qsv_bin" sqlp "$data" -Q --low-memory --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_nooptimizations "$qsv_bin" sqlp "$data" -Q --no-optimizations --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_tryparsedates "$qsv_bin" sqlp "$data" -Q --try-parsedates --infer-len 100000 '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run sqlp_tryparsedates_inferlen "$qsv_bin" sqlp "$data" -Q --infer-len 10000 --try-parsedates '"select * from _t_1 where \"Complaint Type\"='\''Noise'\'' and Borough='\''BROOKLYN'\''"'
run stats "$qsv_bin" stats --force "$data"
run stats_create_cache "$qsv_bin" stats --force "$data"
run --index stats_index "$qsv_bin" stats --force "$data"
run --index stats_index_with_cache "$qsv_bin" stats "$data"
run --index stats_index_j1 "$qsv_bin" stats -j 1 --force "$data"
run --index stats_index_j1_with_cache "$qsv_bin" stats -j 1 "$data"
run stats_everything "$qsv_bin" stats "$data" --force --everything
run stats_everything_create_cache "$qsv_bin" stats "$data" --force --everything
run stats_everything_infer_dates "$qsv_bin" stats "$data" --force --everything --infer-dates
run stats_everything_j1 "$qsv_bin" stats "$data" --force --everything -j 1
run stats_everything_sorted "$qsv_bin" stats data_sorted.csv --force --everything
run --index stats_everything_index "$qsv_bin" stats "$data" --force --everything
run --index stats_everything_index_with_cache "$qsv_bin" stats "$data" --everything
run --index stats_everything_infer_dates_index "$qsv_bin" stats "$data" --force --everything --infer-dates
run --index stats_everything_infer_dates_index_with_cache "$qsv_bin" stats "$data" --everything --infer-dates
run --index stats_everything_index_j1 "$qsv_bin" stats "$data" --force --everything -j 1
run --index stats_everything_index_j1_with_cache "$qsv_bin" stats "$data" --everything -j 1
run --index stats_everything_sorted_index "$qsv_bin" stats data_sorted.csv --force --everything
run table "$qsv_bin" table "$data"
run to_xlsx "$qsv_bin" to xlsx benchmark_work.xlsx "$data"
run to_sqlite "$qsv_bin" to sqlite benchmark_work.db "$data"
run to_datapackage "$qsv_bin" to datapackage benchmark_work.json "$data"
run tojsonl "$qsv_bin" tojsonl "$data"
run tojsonl_batchall "$qsv_bin" tojsonl --batch 0 "$data"
run tojsonl_j1 "$qsv_bin" tojsonl -j 1 "$data"
run tojsonl_trim "$qsv_bin" tojsonl --trim "$data"
run tojsonl_trim_j1 "$qsv_bin" tojsonl --trim -j 1 "$data"
run --index tojsonl_index "$qsv_bin" tojsonl "$data"
run --index tojsonl_index_j1 "$qsv_bin" tojsonl --jobs 1 "$data"
run transpose "$qsv_bin" transpose "$data"
run transpose_multipass "$qsv_bin" transpose --multipass "$data"
run validate "$qsv_bin" validate "$data" "$schema"
run validate_batchall "$qsv_bin" validate --batch 0 "$data" "$schema"
run validate_no_schema "$qsv_bin" validate "$data"
run validate_valid_output "$qsv_bin" validate "$data" "$schema" --valid-output -
run validate_dynenum "$qsv_bin" validate "$data" "$dynenum_schema"
run validate_dynenum_batchall "$qsv_bin" validate --batch 0 "$data" "$dynenum_schema"
run validate_dynenum_no_schema "$qsv_bin" validate "$data" "$dynenum_schema"
run validate_dynenum_valid_output "$qsv_bin" validate "$data" "$dynenum_schema" --valid-output -
run --index validate_index "$qsv_bin" validate "$data" "$schema"
run --index validate_batchall_index "$qsv_bin" validate --batch 0 "$data" "$schema"
run --index validate_no_schema_index "$qsv_bin" validate "$data"
run --index validate_valid_output_index "$qsv_bin" validate "$data" "$schema" --valid-output -
run --index validate_dynenum_index "$qsv_bin" validate "$data" "$dynenum_schema"
run --index validate_dynenum_batchall_index "$qsv_bin" validate --batch 0 "$data" "$dynenum_schema"
run --index validate_dynenum_no_schema_index "$qsv_bin" validate "$data" "$dynenum_schema"
run --index validate_dynenum_valid_output_index "$qsv_bin" validate "$data" "$dynenum_schema" --valid-output -
# show count of commands to be benchmarked
with_index_count=${#commands_with_index[@]}
wo_index_count=${#commands_without_index[@]}
total_count=$((with_index_count + wo_index_count))
printf "> Commands to benchmark: %s, w/o index: %s, with index: %s\n\n" "$total_count" "$wo_index_count" "$with_index_count"
# ---------------------------------------
# Prepare benchmark results directory
# Check if a results directory exists; if it doesn't, create it
if [ ! -d "results" ]; then
mkdir results
fi
# Init latest_results.csv. It stores the benchmark results for this run
rm -f results/latest_results.csv
echo "version,tstamp,name,mean,stddev,median,user,system,min,max" >results/latest_results.csv
# check if the file benchmark_results.csv exists; if it doesn't, create it
# by copying the empty latest_results.csv
if [ ! -f "results/benchmark_results.csv" ]; then
cp results/latest_results.csv results/benchmark_results.csv
fi
# get current time to the nearest hour
now=$(date +"%Y-%m-%d-%H")
# get current time to the nearest second
now_sec=$(date +"%Y-%m-%d-%H-%M-%S")
# ---------------------------------------
# Run hyperfine to compile benchmark results. Append each individual result to the latest_results.csv
# by dogfooding qsv's cat, luau, select & sort commands.
# first, run benchmarking without an index
# with the default settings, each command is run five times: two warm-up runs & three benchmarked runs
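# hyperfine's --export-csv output has one row per timed command with the columns
#   command,mean,stddev,median,user,system,min,max
# (times in seconds). We drop the command column with `qsv select '!command'` and
# prepend the version, tstamp & name columns with `qsv cat columns`, producing
# rows that match the latest_results.csv header initialized above.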
echo "> Benchmarking WITHOUT INDEX..."
idx=0
name_idx=1
for command_no_index in "${commands_without_index[@]}"; do
# remove the index file and the stats cache files
rm -f "$data".idx
rm -f "$filestem".stats.*
pct_complete=$(((name_idx - 1) * 100 / total_count))
echo "$name_idx. ${commands_without_index_name[$idx]} ($pct_complete%)"
hyperfine -N --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \
"$command_no_index"
# prepend version, tstamp & benchmark name to the hyperfine results
echo "version,tstamp,name" >results/results_work.csv
echo "$version,$now,${commands_without_index_name[$idx]}" >>results/results_work.csv
# remove the command column from the hyperfine results, we just need the name
"$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv
# the entry.csv file is the expanded benchmark result for this command
"$qsv_bin" cat columns results/results_work.csv results/hf_result_nocmd.csv \
-o results/entry.csv
# append the entry.csv to latest_results.csv
"$qsv_bin" cat rowskey results/latest_results.csv results/entry.csv \
-o results/results_work.csv
mv results/results_work.csv results/latest_results.csv
((idx++))
((name_idx++))
done
# ---------------------------------------
# then, run benchmarks with an index and stats cache
# an index enables random access and unlocks multi-threading in several commands
# the stats cache enables faster stats computation as it will use the cached stats
# when it's valid and available, instead of computing the stats from scratch
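# e.g. `qsv index NYC_311_SR_2010-2020-sample-1M.csv` writes an
# NYC_311_SR_2010-2020-sample-1M.csv.idx sidecar file, and the forced stats run
# below leaves behind the stats cache files that the *_with_cache benchmarks use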
if [ "$with_index_count" -gt 0 ]; then
echo "> Benchmarking WITH INDEX and STATS CACHE..."
echo " Preparing index and stats cache..."
rm -f "$data".idx
"$qsv_bin" index "$data"
"$qsv_bin" stats "$data" --everything --infer-dates --stats-jsonl --force \
--output benchmark_work.stats.csv
fi
idx=0
for command_with_index in "${commands_with_index[@]}"; do
pct_complete=$(((name_idx - 1) * 100 / total_count))
echo "$name_idx. ${commands_with_index_name[$idx]} ($pct_complete%)"
hyperfine -N --warmup "$warmup_runs" -i --runs "$benchmark_runs" --export-csv results/hf_result.csv \
"$command_with_index"
echo "version,tstamp,name" >results/results_work.csv
echo "$version,$now,${commands_with_index_name[$idx]}" >>results/results_work.csv
"$qsv_bin" select '!command' results/hf_result.csv -o results/hf_result_nocmd.csv
"$qsv_bin" cat columns results/results_work.csv results/hf_result_nocmd.csv \
-o results/entry.csv
"$qsv_bin" cat rowskey results/latest_results.csv results/entry.csv \
-o results/results_work.csv
mv results/results_work.csv results/latest_results.csv
((idx++))
((name_idx++))
done
# ---------------------------------------
# Finalize benchmark results. Sort the latest results by version, tstamp & name, then
# compute and add records per second for each benchmark using qsv's luau command.
# We compute recs_per_sec by dividing the number of rows in the benchmark data
# by the mean run time of the three runs.
# We then append the latest results to benchmark_results.csv - which is
# a historical archive, so we can track performance over multiple releases.
echo ""
# sort the benchmark results by version, tstamp & name
"$qsv_bin" sort --select version,tstamp,name results/latest_results.csv \
-o results/results_work.csv
# compute records per second for each benchmark using luau by dividing rowcount by mean
# we then round the result to a whole number. We also compute the total mean
# we set the QSVBM_ROWCOUNT environment variable to the rowcount so it can be used
# by the luau script by using the qsv.get_env() function
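# e.g. for the default 1M-row benchmark data, a command with a mean run time of
# 0.5 seconds yields recs_per_sec = 1,000,000 / 0.5 = 2,000,000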
export QSVBM_ROWCOUNT=$rowcount
# we run the benchmark_aggregations.luau script using qsv's luau command
# total_mean is the total mean of all the benchmarks
# it is computed in the END block of the script and is sent to stderr
# which we redirect to a file named total_mean.txt
"$qsv_benchmarker_bin" luau map recs_per_sec "file:benchmark_aggregations.luau" \
results/results_work.csv -o results/latest_results.csv 2>total_mean.txt
# we read the total_mean from the total_mean.txt file
total_mean=$(<total_mean.txt)
# Concatenate the final results of this run to results/benchmark_results.csv
"$qsv_bin" cat rowskey results/latest_results.csv results/benchmark_results.csv \
-o results/results_work.csv
mv results/results_work.csv results/benchmark_results.csv
# make "display" versions of the results
# i.e. the stats columns are rounded to 3 decimal places, and the column order is changed
# so it's easier to read, with recs_per_sec moved from the last column to right after mean,
# followed by the rest of the stats columns
# first - for benchmark_results_display.csv, move the recs_per_sec column after the
# mean column using the `qsv select` command
"$qsv_bin" select version,tstamp,name,mean,recs_per_sec,stddev,median,user,system,min,max \
results/benchmark_results.csv -o results/benchmark_results_display.csv
# then, round the stats columns to 3 decimal places using the `qsv apply operations round` command
# it defaults to 3 decimal places if the --formatstr option is not specified
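# e.g. a mean of 1.234567 seconds is displayed as 1.235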
"$qsv_benchmarker_bin" apply operations round mean,stddev,median,user,system,min,max \
results/benchmark_results_display.csv -o results/results_work.csv
mv results/results_work.csv results/benchmark_results_display.csv
# do the same for latest_results_display.csv
"$qsv_bin" select version,tstamp,name,mean,recs_per_sec,stddev,median,user,system,min,max \
results/latest_results.csv -o results/latest_results_display.csv
"$qsv_benchmarker_bin" apply operations round mean,stddev,median,user,system,min,max \
results/latest_results_display.csv -o results/results_work.csv
mv results/results_work.csv results/latest_results_display.csv
# Clean up temporary files
cleanup_files
# ---------------------------------------
# Finalize benchmark run info. Append the run info to results/run_info_history.tsv
# we use the TSV format as some of the data contains commas, quotes, whitespace, semicolons, etc.
# get the environment variables used by qsv
qsv_envvars=$("$qsv_bin" --envlist)
elapsed=$SECONDS
# Init latest_run_info.tsv. It stores the benchmark run info for this run
rm -f results/latest_run_info.tsv
echo -e "version\ttstamp\tlogtime\tbm_version\tplatform\tcores\tmem\tbinary\tkind\targument\ttotal_count\two_index_count\twith_index_count\twarmup_runs\tbenchmark_runs\telapsed_secs\ttotal_mean\tqsv_env\tversion_info" >results/latest_run_info.tsv
# check if the file run_info_history.tsv exists; if it doesn't, create it
# by copying the empty latest_run_info.tsv
if [ ! -f "results/run_info_history.tsv" ]; then
cp results/latest_run_info.tsv results/run_info_history.tsv
fi
# append the run info to latest_run_info.tsv
echo -e "$version\t$now\t$now_sec\t$bm_version\t$platform\t$num_cores\t$mem_size\t$qsv_bin\t$kind\t$arg_pat\t$total_count\t$wo_index_count\t$with_index_count\t$warmup_runs\t$benchmark_runs\t$elapsed\t$total_mean\t$qsv_envvars\t$raw_version" >>results/latest_run_info.tsv
# now update the run_info_history.tsv
"$qsv_bin" cat rowskey results/latest_run_info.tsv results/run_info_history.tsv \
-o results/run_info_work.tsv
mv results/run_info_work.tsv results/run_info_history.tsv
echo "> 100% DONE! $total_count benchmarks executed. Elapsed time: $elapsed seconds. Total mean: $total_mean"