From b40957d0f3a524f92f514903dad58df1cbead1d6 Mon Sep 17 00:00:00 2001 From: Stephen J Day Date: Wed, 19 Nov 2014 13:23:01 -0800 Subject: [PATCH] Move Digest type into discrete package The Digest type will be fairly central for blob and layer management. The type presented in this package provides a number of core features that should enable reliable use within the registry. This commit will be followed by others that convert the storage layer and webapp to use this type as the primary layer/blob CAS identifier. --- digest.go | 145 ++++++++++++++++++++++++++++++++++++++++++++++ digest_test.go | 80 +++++++++++++++++++++++++ doc.go | 52 +++++++++++++++++ verifiers.go | 131 +++++++++++++++++++++++++++++++++++++++++ verifiers_test.go | 71 +++++++++++++++++++++++ 5 files changed, 479 insertions(+) create mode 100644 digest.go create mode 100644 digest_test.go create mode 100644 doc.go create mode 100644 verifiers.go create mode 100644 verifiers_test.go diff --git a/digest.go b/digest.go new file mode 100644 index 0000000..f2ce021 --- /dev/null +++ b/digest.go @@ -0,0 +1,145 @@ +package digest + +import ( + "bytes" + "crypto/sha256" + "fmt" + "hash" + "io" + "io/ioutil" + "strings" + + "github.com/docker/docker-registry/common" + "github.com/docker/docker/pkg/tarsum" +) + +// Digest allows simple protection of hex formatted digest strings, prefixed +// by their algorithm. Strings of type Digest have some guarantee of being in +// the correct format and it provides quick access to the components of a +// digest string. +// +// The following is an example of the contents of Digest types: +// +// sha256:7173b809ca12ec5dee4506cd86be934c4596dd234ee82c0662eac04a8c2c71dc +// +// More important for this code base, this type is compatible with tarsum +// digests. For example, the following would be a valid Digest: +// +// tarsum+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b +// +// This allows to abstract the digest behind this type and work only in those +// terms. +type Digest string + +// NewDigest returns a Digest from alg and a hash.Hash object. +func NewDigest(alg string, h hash.Hash) Digest { + return Digest(fmt.Sprintf("%s:%x", alg, h.Sum(nil))) +} + +var ( + // ErrDigestInvalidFormat returned when digest format invalid. + ErrDigestInvalidFormat = fmt.Errorf("invalid checksum digest format") + + // ErrDigestUnsupported returned when the digest algorithm is unsupported by registry. + ErrDigestUnsupported = fmt.Errorf("unsupported digest algorithm") +) + +// ParseDigest parses s and returns the validated digest object. An error will +// be returned if the format is invalid. +func ParseDigest(s string) (Digest, error) { + // Common case will be tarsum + _, err := common.ParseTarSum(s) + if err == nil { + return Digest(s), nil + } + + // Continue on for general parser + + i := strings.Index(s, ":") + if i < 0 { + return "", ErrDigestInvalidFormat + } + + // case: "sha256:" with no hex. + if i+1 == len(s) { + return "", ErrDigestInvalidFormat + } + + switch s[:i] { + case "md5", "sha1", "sha256": + break + default: + return "", ErrDigestUnsupported + } + + return Digest(s), nil +} + +// DigestReader returns the most valid digest for the underlying content. +func DigestReader(rd io.Reader) (Digest, error) { + + // TODO(stevvooe): This is pretty inefficient to always be calculating a + // sha256 hash to provide fallback, but it provides some nice semantics in + // that we never worry about getting the right digest for a given reader. + // For the most part, we can detect tar vs non-tar with only a few bytes, + // so a scheme that saves those bytes would probably be better here. + + h := sha256.New() + tr := io.TeeReader(rd, h) + + ts, err := tarsum.NewTarSum(tr, true, tarsum.Version1) + if err != nil { + return "", err + } + + // Try to copy from the tarsum, if we fail, copy the remaining bytes into + // hash directly. + if _, err := io.Copy(ioutil.Discard, ts); err != nil { + if err.Error() != "archive/tar: invalid tar header" { + return "", err + } + + if _, err := io.Copy(h, rd); err != nil { + return "", err + } + + return NewDigest("sha256", h), nil + } + + d, err := ParseDigest(ts.Sum(nil)) + if err != nil { + return "", err + } + + return d, nil +} + +func DigestBytes(p []byte) (Digest, error) { + return DigestReader(bytes.NewReader(p)) +} + +// Algorithm returns the algorithm portion of the digest. This will panic if +// the underlying digest is not in a valid format. +func (d Digest) Algorithm() string { + return string(d[:d.sepIndex()]) +} + +// Hex returns the hex digest portion of the digest. This will panic if the +// underlying digest is not in a valid format. +func (d Digest) Hex() string { + return string(d[d.sepIndex()+1:]) +} + +func (d Digest) String() string { + return string(d) +} + +func (d Digest) sepIndex() int { + i := strings.Index(string(d), ":") + + if i < 0 { + panic("invalid digest: " + d) + } + + return i +} diff --git a/digest_test.go b/digest_test.go new file mode 100644 index 0000000..127f787 --- /dev/null +++ b/digest_test.go @@ -0,0 +1,80 @@ +package digest + +import "testing" + +func TestParseDigest(t *testing.T) { + for _, testcase := range []struct { + input string + err error + algorithm string + hex string + }{ + { + input: "tarsum+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + algorithm: "tarsum+sha256", + hex: "e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + }, + { + input: "tarsum.dev+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + algorithm: "tarsum.dev+sha256", + hex: "e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + }, + { + input: "tarsum.v1+sha256:220a60ecd4a3c32c282622a625a54db9ba0ff55b5ba9c29c7064a2bc358b6a3e", + algorithm: "tarsum.v1+sha256", + hex: "220a60ecd4a3c32c282622a625a54db9ba0ff55b5ba9c29c7064a2bc358b6a3e", + }, + { + input: "sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + algorithm: "sha256", + hex: "e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b", + }, + { + input: "md5:d41d8cd98f00b204e9800998ecf8427e", + algorithm: "md5", + hex: "d41d8cd98f00b204e9800998ecf8427e", + }, + { + // empty hex + input: "sha256:", + err: ErrDigestInvalidFormat, + }, + { + // just hex + input: "d41d8cd98f00b204e9800998ecf8427e", + err: ErrDigestInvalidFormat, + }, + { + input: "foo:d41d8cd98f00b204e9800998ecf8427e", + err: ErrDigestUnsupported, + }, + } { + digest, err := ParseDigest(testcase.input) + if err != testcase.err { + t.Fatalf("error differed from expected while parsing %q: %v != %v", testcase.input, err, testcase.err) + } + + if testcase.err != nil { + continue + } + + if digest.Algorithm() != testcase.algorithm { + t.Fatalf("incorrect algorithm for parsed digest: %q != %q", digest.Algorithm(), testcase.algorithm) + } + + if digest.Hex() != testcase.hex { + t.Fatalf("incorrect hex for parsed digest: %q != %q", digest.Hex(), testcase.hex) + } + + // Parse string return value and check equality + newParsed, err := ParseDigest(digest.String()) + + if err != nil { + t.Fatalf("unexpected error parsing input %q: %v", testcase.input, err) + } + + if newParsed != digest { + t.Fatalf("expected equal: %q != %q", newParsed, digest) + } + } +} diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..2ce7698 --- /dev/null +++ b/doc.go @@ -0,0 +1,52 @@ +// This package provides a generalized type to opaquely represent message +// digests and their operations within the registry. The Digest type is +// designed to serve as a flexible identifier in a content-addressable system. +// More importantly, it provides tools and wrappers to work with tarsums and +// hash.Hash-based digests with little effort. +// +// Basics +// +// The format of a digest is simply a string with two parts, dubbed the +// "algorithm" and the "digest", separated by a colon: +// +// : +// +// An example of a sha256 digest representation follows: +// +// sha256:7173b809ca12ec5dee4506cd86be934c4596dd234ee82c0662eac04a8c2c71dc +// +// In this case, the string "sha256" is the algorithm and the hex bytes are +// the "digest". A tarsum example will be more illustrative of the use case +// involved in the registry: +// +// tarsum+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b +// +// For this, we consider the algorithm to be "tarsum+sha256". Prudent +// applications will favor the ParseDigest function to verify the format over +// using simple type casts. However, a normal string can be cast as a digest +// with a simple type conversion: +// +// Digest("tarsum+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b") +// +// Because the Digest type is simply a string, once a valid Digest is +// obtained, comparisons are cheap, quick and simple to express with the +// standard equality operator. +// +// Verification +// +// The main benefit of using the Digest type is simple verification against a +// given digest. The Verifier interface, modeled after the stdlib hash.Hash +// interface, provides a common write sink for digest verification. After +// writing is complete, calling the Verifier.Verified method will indicate +// whether or not the stream of bytes matches the target digest. +// +// Missing Features +// +// In addition to the above, we intend to add the following features to this +// package: +// +// 1. A Digester type that supports write sink digest calculation. +// +// 2. Suspend and resume of ongoing digest calculations to support efficient digest verification in the registry. +// +package digest diff --git a/verifiers.go b/verifiers.go new file mode 100644 index 0000000..e738026 --- /dev/null +++ b/verifiers.go @@ -0,0 +1,131 @@ +package digest + +import ( + "crypto/md5" + "crypto/sha1" + "crypto/sha256" + "hash" + "io" + "io/ioutil" + + "github.com/docker/docker/pkg/tarsum" +) + +type Verifier interface { + io.Writer + + // Verified will return true if the content written to Verifier matches + // the digest. + Verified() bool + + // Planned methods: + // Err() error + // Reset() +} + +func DigestVerifier(d Digest) Verifier { + alg := d.Algorithm() + switch alg { + case "md5", "sha1", "sha256": + return hashVerifier{ + hash: newHash(alg), + digest: d, + } + default: + // Assume we have a tarsum. + version, err := tarsum.GetVersionFromTarsum(string(d)) + if err != nil { + panic(err) // Always assume valid tarsum at this point. + } + + pr, pw := io.Pipe() + + // TODO(stevvooe): We may actually want to ban the earlier versions of + // tarsum. That decision may not be the place of the verifier. + + ts, err := tarsum.NewTarSum(pr, true, version) + if err != nil { + panic(err) + } + + // TODO(sday): Ick! A goroutine per digest verification? We'll have to + // get the tarsum library to export an io.Writer variant. + go func() { + io.Copy(ioutil.Discard, ts) + pw.Close() + }() + + return &tarsumVerifier{ + digest: d, + ts: ts, + pr: pr, + pw: pw, + } + } + + panic("unsupported digest: " + d) +} + +// LengthVerifier returns a verifier that returns true when the number of read +// bytes equals the expected parameter. +func LengthVerifier(expected int64) Verifier { + return &lengthVerifier{ + expected: expected, + } +} + +type lengthVerifier struct { + expected int64 // expected bytes read + len int64 // bytes read +} + +func (lv *lengthVerifier) Write(p []byte) (n int, err error) { + n = len(p) + lv.len += int64(n) + return n, err +} + +func (lv *lengthVerifier) Verified() bool { + return lv.expected == lv.len +} + +func newHash(name string) hash.Hash { + switch name { + case "sha256": + return sha256.New() + case "sha1": + return sha1.New() + case "md5": + return md5.New() + default: + panic("unsupport algorithm: " + name) + } +} + +type hashVerifier struct { + digest Digest + hash hash.Hash +} + +func (hv hashVerifier) Write(p []byte) (n int, err error) { + return hv.hash.Write(p) +} + +func (hv hashVerifier) Verified() bool { + return hv.digest == NewDigest(hv.digest.Algorithm(), hv.hash) +} + +type tarsumVerifier struct { + digest Digest + ts tarsum.TarSum + pr *io.PipeReader + pw *io.PipeWriter +} + +func (tv *tarsumVerifier) Write(p []byte) (n int, err error) { + return tv.pw.Write(p) +} + +func (tv *tarsumVerifier) Verified() bool { + return tv.digest == Digest(tv.ts.Sum(nil)) +} diff --git a/verifiers_test.go b/verifiers_test.go new file mode 100644 index 0000000..77b02ed --- /dev/null +++ b/verifiers_test.go @@ -0,0 +1,71 @@ +package digest + +import ( + "bytes" + "crypto/rand" + "io" + "os" + "testing" + + "github.com/docker/docker-registry/common/testutil" +) + +func TestDigestVerifier(t *testing.T) { + p := make([]byte, 1<<20) + rand.Read(p) + digest, err := DigestBytes(p) + if err != nil { + t.Fatalf("unexpected error digesting bytes: %#v", err) + } + + verifier := DigestVerifier(digest) + io.Copy(verifier, bytes.NewReader(p)) + + if !verifier.Verified() { + t.Fatalf("bytes not verified") + } + + tf, tarSum, err := testutil.CreateRandomTarFile() + if err != nil { + t.Fatalf("error creating tarfile: %v", err) + } + + digest, err = DigestReader(tf) + if err != nil { + t.Fatalf("error digesting tarsum: %v", err) + } + + if digest.String() != tarSum { + t.Fatalf("unexpected digest: %q != %q", digest.String(), tarSum) + } + + expectedSize, _ := tf.Seek(0, os.SEEK_END) // Get tar file size + tf.Seek(0, os.SEEK_SET) // seek back + + // This is the most relevant example for the registry application. It's + // effectively a read through pipeline, where the final sink is the digest + // verifier. + verifier = DigestVerifier(digest) + lengthVerifier := LengthVerifier(expectedSize) + rd := io.TeeReader(tf, lengthVerifier) + io.Copy(verifier, rd) + + if !lengthVerifier.Verified() { + t.Fatalf("verifier detected incorrect length") + } + + if !verifier.Verified() { + t.Fatalf("bytes not verified") + } +} + +// TODO(stevvooe): Add benchmarks to measure bytes/second throughput for +// DigestVerifier. We should be tarsum/gzip limited for common cases but we +// want to verify this. +// +// The relevant benchmarks for comparison can be run with the following +// commands: +// +// go test -bench . crypto/sha1 +// go test -bench . github.com/docker/docker/pkg/tarsum +//