-
Notifications
You must be signed in to change notification settings - Fork 167
/
Copy pathset_image_mtimes.go
178 lines (148 loc) · 5.09 KB
/
set_image_mtimes.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
package main
//
// UPDATE: This whole script turned out to not be needed because I found the
// `--size-only` option for `aws s3 sync`, which ignores mtimes and does
// exactly the right thing as far as image assets are concerned. I've left this
// script here for historical interest only.
//
// sorg pushes to S3 automatically from its GitHub Actions build so that it has
// an autodeploy mechanism on every push.
//
// Syncing is accomplished via the AWS CLI's `aws s3 sync`, which should be
// adequate except for the fact that it decides what to sync up using a file's
// modification time, and Git doesn't preserve modification times. So every
// build has the effect of cloning down a repository, having every file get a
// new mtime, then syncing everything from scratch.
//
// At some point I realized that every build was pushing 100 MB+ of images and
// running on cron every hour, which was getting expensive -- not *hugely*
// expensive, but on the order of single dollar digits a month, which is more
// than the value I was getting out of it.
//
// This script solves at least part of the problem by looking at every image in
// the repository, checking when its last commit was, and then changing the
// modification time of the file to that commit's timestamp. This has the
// effect of giving the file a stable mtime so that it's not pushed to S3 over
// and over again.
//
// Unfortunately it has a downside, which is that `git log` is not very fast,
// and there's no way I can find of efficiently batching up a lot of these
// commands for multiple files at once. As I write this, the script takes just
// over a minute to iterate every file and get its commit time.
//
// A better answer to this might be to stop storing images in the repository
// (which will be unsustainable eventually anyway) and instead put them in
// their own S3 bucket, as is already done for photographs.
//
import (
"fmt"
"math"
"os"
"os/exec"
"strings"
"sync"
"time"
"golang.org/x/xerrors"
)
const (
	// imagePath is the repository-relative directory scanned for image files
	// whose modification times should be rewritten.
	imagePath = "./content/images/"

	// Number of parallel workers to run which extract commit timestamps and
	// set modtimes on files.
	//
	// There's a very definite diminishing return to increasing this number,
	// but it does help. Some rough numbers from Mac OS:
	//
	// 1 --> 62s
	// 5 --> 28s
	// 10 --> 21s
	// 20 --> 21s
	// 100 --> 20s
	//
	parallelWorkers = 10
)
//
// ---
//
// main lists every tracked image under imagePath, splits the list into
// parallelWorkers batches, and has one goroutine per batch rewrite each
// file's mtime to the timestamp of the file's last commit. Any error aborts
// the whole program via abortOnErr.
func main() {
	allImages, err := getAllImages()
	abortOnErr(err)

	imageBatches := batchImages(allImages)

	var wg sync.WaitGroup
	wg.Add(parallelWorkers)

	for workerNum := range parallelWorkers {
		go func() {
			// Deferred so that Done fires even if an iteration panics;
			// otherwise wg.Wait below would block forever.
			defer wg.Done()

			for _, path := range imageBatches[workerNum] {
				lastCommitTime, err := getLastCommitTime(path)
				abortOnErr(err)

				fmt.Printf("%v --> %v\n", path, lastCommitTime)

				// Set both atime and mtime; `aws s3 sync` keys off mtime.
				err = os.Chtimes(path, *lastCommitTime, *lastCommitTime)
				abortOnErr(err)
			}
		}()
	}

	wg.Wait()
}
//
// ---
//
// abortOnErr prints err to stderr and exits the program with a non-zero
// status. It's a no-op for a nil error, so call sites can invoke it
// unconditionally after any fallible operation.
func abortOnErr(err error) {
	if err == nil {
		return
	}

	// Trailing newline so the message doesn't run into the next shell prompt
	// (the original format string omitted it).
	fmt.Fprintf(os.Stderr, "Error encountered: %v\n", err)
	os.Exit(1)
}
// Breaks a set of images into groups for N parallel workers roughly evenly.
//
// Worker i receives the slice [i*imagesPerWorker, (i+1)*imagesPerWorker) of
// allImages, clamped to the slice's length; workers beyond the end of the
// data get a nil batch. Every input path appears in exactly one batch.
func batchImages(allImages []string) [][]string {
	batches := make([][]string, parallelWorkers)

	// Ceiling division over the full worker count so remainder images land in
	// the earlier batches. (Previously this divided by parallelWorkers-1,
	// which oversized every batch and routinely left the last worker with no
	// work at all.)
	imagesPerWorker := int(math.Ceil(float64(len(allImages)) / float64(parallelWorkers)))

	for i := range parallelWorkers {
		startIndex := i * imagesPerWorker
		endIndex := min((i+1)*imagesPerWorker, len(allImages))

		// Thanks to our ceiling math, for cases where there are many workers
		// compared to the amount of work needing to be done, it's possible
		// for the trailing worker(s) to be beyond the limits of the slice.
		if startIndex >= len(allImages) {
			break
		}

		batches[i] = allImages[startIndex:endIndex]
	}

	return batches
}
// Gets a list of all image paths by using a `git ls-tree` command on the
// target directory.
//
// Returns nil (not a one-element slice) when the directory contains no
// tracked files.
func getAllImages() ([]string, error) {
	out, err := runCommand("git", "ls-tree", "-r", "--name-only", "HEAD", imagePath)
	if err != nil {
		return nil, xerrors.Errorf("error getting images with `git ls-tree`: %w", err)
	}

	// strings.Split("", "\n") yields [""], which downstream code would treat
	// as one real-but-empty path, so special-case empty output.
	if out == "" {
		return nil, nil
	}

	return strings.Split(out, "\n"), nil
}
// Gets the last commit time on a particular image path by using a `git log`
// command.
//
// Returns a pointer so the caller can hand the value straight to os.Chtimes.
// Parsing uses the RFC 3339 layout, which matches git's strict ISO 8601
// author date (`%aI`) and — unlike the previous hand-written layout — also
// accepts a `Z` UTC suffix.
func getLastCommitTime(path string) (*time.Time, error) {
	out, err := runCommand("git", "log", "--max-count=1", `--pretty=format:%aI`, path)
	if err != nil {
		return nil, xerrors.Errorf("error getting commit time for '%s': %w", path, err)
	}

	lastCommitTime, err := time.Parse(time.RFC3339, out)
	if err != nil {
		return nil, xerrors.Errorf("error parsing time '%s': %w", out, err)
	}

	return &lastCommitTime, nil
}
// minInt returns the smaller of two ints.
//
// Implemented with a direct comparison rather than math.Min: round-tripping
// through float64 is slower and silently loses precision for magnitudes
// above 2^53.
func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}
// runCommand executes the named program with the given arguments, streaming
// its stderr through to this process's stderr, and returns its stdout with
// surrounding whitespace trimmed.
func runCommand(name string, arg ...string) (string, error) {
	cmd := exec.Command(name, arg...)
	cmd.Stderr = os.Stderr

	rawOut, err := cmd.Output()
	if err == nil {
		return strings.TrimSpace(string(rawOut)), nil
	}

	return "", xerrors.Errorf("error executing command '%s': %w", name, err)
}