Commit 4785a9e

feat(dsfs.CreateDataset): initial implementation of dsfs.CreateDataset

This commit renames SaveDataset to WriteDataset, and adds a new exported function on the dsfs package. CreateDataset provides a clean, canonical implementation of how to write a dataset to a cafs. I'm excited about this implementation b/c it's a sign of our platform coalescing into a complete base system. It's clear to me now that while I started this journey believing that what the world needed was 'the distributed web as a database', it now seems smarter to explain what qri is becoming as a 'global data version control system', and the first "feature" built on top of this 'dvcs' is being aware of data transformations. dope a.f.

Still to do: setting structure length & row fields, data validation.
1 parent 74e6d63 commit 4785a9e

15 files changed: +309 −170 lines
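For a sense of the change at a call site, here's a minimal sketch. The store, ds, dataFile, and privKey variables are assumed setup, not taken from this diff:

// before this commit: write straight to the store, with no validation or signing
// dskey, err := dsfs.SaveDataset(store, ds, true)

// after this commit: CreateDataset dereferences, validates, and signs the commit
// before handing off to the lower-level WriteDataset
dskey, err := dsfs.CreateDataset(store, ds, dataFile, privKey, true)
if err != nil {
	// handle the write failure
}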

commit.go

+19 −6

@@ -13,13 +13,17 @@ import (
 // to be directly analogous to the concept of a Commit Message in the
 // git version control system
 type Commit struct {
-	path datastore.Key
-	Author *User `json:"author,omitempty"`
-	Kind Kind `json:"kind,omitempty"`
+	path datastore.Key
+	Author *User `json:"author,omitempty"`
+	Kind Kind `json:"kind,omitempty"`
+	// Message is an optional
 	Message string `json:"message,omitempty"`
-	// Time this dataset was created. Required. Datasets are immutable, so no "updated"
-	Timestamp time.Time `json:"timestamp,omitempty"`
-	Title string `json:"title"`
+	// Signature is a base58 encoded privateKey signing of Title
+	Signature string `json:"signature,omitempty"`
+	// Time this dataset was created. Required.
+	Timestamp time.Time `json:"timestamp"`
+	// Title of the commit. Required.
+	Title string `json:"title"`
 }

 // NewCommitRef creates an empty struct with it's

@@ -38,6 +42,11 @@ func (cm *Commit) Path() datastore.Key {
 	return cm.path
 }

+// SignableBytes produces the portion of a commit message used for signing
+func (cm *Commit) SignableBytes() []byte {
+	return []byte(fmt.Sprintf("%s\n%s", cm.Timestamp.Format(time.RFC3339), cm.Title))
+}
+
 // Assign collapses all properties of a set of Commit onto one.
 // this is directly inspired by Javascript's Object.assign
 func (cm *Commit) Assign(msgs ...*Commit) {

@@ -61,6 +70,9 @@ func (cm *Commit) Assign(msgs ...*Commit) {
 	if m.Message != "" {
 		cm.Message = m.Message
 	}
+	if m.Signature != "" {
+		cm.Signature = m.Signature
+	}
 	if m.Kind.String() != "" {
 		cm.Kind = m.Kind
 	}

@@ -84,6 +96,7 @@ func (cm *Commit) MarshalJSON() ([]byte, error) {
 		Author: cm.Author,
 		Kind: kind,
 		Message: cm.Message,
+		Signature: cm.Signature,
 		Timestamp: cm.Timestamp,
 		Title: cm.Title,
 	}
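Since Signature is a base58 encoding of a private-key signature over SignableBytes, a peer holding the author's public key can check a commit's integrity. A minimal sketch, assuming the go-libp2p-crypto and mr-tron/base58 packages this commit imports in dsfs/dataset.go below; verifyCommit itself is a hypothetical helper, not part of this commit:

// verifyCommit (hypothetical) decodes the base58 signature and checks it
// against the commit's signable bytes using the author's public key
func verifyCommit(cm *Commit, pub crypto.PubKey) error {
	sig, err := base58.Decode(cm.Signature)
	if err != nil {
		return fmt.Errorf("invalid signature encoding: %s", err.Error())
	}
	valid, err := pub.Verify(cm.SignableBytes(), sig)
	if err != nil {
		return fmt.Errorf("error verifying signature: %s", err.Error())
	}
	if !valid {
		return fmt.Errorf("signature does not match commit")
	}
	return nil
}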

commit_test.go

+15

@@ -30,6 +30,7 @@ func TestCommitAssign(t *testing.T) {
 		Timestamp: t1,
 		Title: "expect title",
 		Message: "expect message",
+		Signature: "sig",
 	}
 	got := &Commit{
 		Author: &User{ID: "maha_id", Email: "maha@example.com"},

@@ -45,6 +46,7 @@ func TestCommitAssign(t *testing.T) {
 		path: datastore.NewKey("a"),
 		Timestamp: t1,
 		Message: "expect message",
+		Signature: "sig",
 	})

 	if err := CompareCommits(expect, got); err != nil {

@@ -63,6 +65,19 @@ func TestCommitAssign(t *testing.T) {
 	}
 }

+func TestCommitSignableBytes(t *testing.T) {
+	expect := []byte("2001-01-01T01:01:01Z\nI'm a commit message")
+	cm := &Commit{
+		Timestamp: time.Date(2001, 01, 01, 01, 01, 01, 0, time.UTC),
+		Title: "I'm a commit message",
+	}
+	got := cm.SignableBytes()
+
+	if !bytes.Equal(expect, got) {
+		t.Errorf("mismatch. expected:\n'%s',got:\n'%s'", string(expect), string(got))
+	}
+}
+
 func TestCommitMarshalJSON(t *testing.T) {
 	ts := time.Date(2001, 01, 01, 01, 01, 01, 0, time.UTC)
 	cases := []struct {

compare.go

+3

@@ -239,6 +239,9 @@ func CompareCommits(a, b *Commit) error {
 	if !a.Timestamp.Equal(b.Timestamp) {
 		return fmt.Errorf("Timestamp: %s != %s", a.Timestamp, b.Timestamp)
 	}
+	if a.Signature != b.Signature {
+		return fmt.Errorf("Signature: %s != %s", a.Signature, b.Signature)
+	}
 	if a.Message != b.Message {
 		return fmt.Errorf("Message: %s != %s", a.Message, b.Message)
 	}

compare_test.go

+1

@@ -169,6 +169,7 @@ func TestCompareCommits(t *testing.T) {
 		{&Commit{Title: "a"}, &Commit{Title: "b"}, "Title: a != b"},
 		{&Commit{Message: "a"}, &Commit{Message: "b"}, "Message: a != b"},
 		{&Commit{Kind: "a"}, &Commit{Kind: "b"}, "Kind: a != b"},
+		{&Commit{Signature: "a"}, &Commit{Signature: "b"}, "Signature: a != b"},
 	}

 	for i, c := range cases {

dsfs/dataset.go

+111 −120

@@ -3,12 +3,16 @@ package dsfs
 import (
 	"encoding/json"
 	"fmt"
+	// "io/ioutil"
+	"time"

 	"github.com/ipfs/go-datastore"
-	// "github.com/libp2p/go-libp2p-crypto"
+	"github.com/libp2p/go-libp2p-crypto"
+	"github.com/mr-tron/base58/base58"
 	"github.com/qri-io/cafs"
 	"github.com/qri-io/cafs/memfs"
 	"github.com/qri-io/dataset"
+	"github.com/qri-io/dataset/validate"
 )

 // LoadDataset reads a dataset from a cafs and dereferences structure, transform, and commitMsg if they exist,

@@ -18,17 +22,7 @@ func LoadDataset(store cafs.Filestore, path datastore.Key) (*dataset.Dataset, er
 	if err != nil {
 		return nil, fmt.Errorf("error loading dataset: %s", err.Error())
 	}
-
-	if err := DerefDatasetMetadata(store, ds); err != nil {
-		return nil, err
-	}
-	if err := DerefDatasetStructure(store, ds); err != nil {
-		return nil, err
-	}
-	if err := DerefDatasetTransform(store, ds); err != nil {
-		return nil, err
-	}
-	if err := DerefDatasetCommit(store, ds); err != nil {
+	if err := DerefDataset(store, ds); err != nil {
 		return nil, err
 	}

@@ -65,6 +59,23 @@ func LoadDatasetRefs(store cafs.Filestore, path datastore.Key) (*dataset.Dataset
 	return ds, nil
 }

+// DerefDataset attempts to fully dereference a dataset
+func DerefDataset(store cafs.Filestore, ds *dataset.Dataset) error {
+	if err := DerefDatasetMetadata(store, ds); err != nil {
+		return err
+	}
+	if err := DerefDatasetStructure(store, ds); err != nil {
+		return err
+	}
+	if err := DerefDatasetTransform(store, ds); err != nil {
+		return err
+	}
+	if err := DerefDatasetCommit(store, ds); err != nil {
+		return err
+	}
+	return nil
+}
+
 // DerefDatasetStructure derferences a dataset's structure element if required
 // should be a no-op if ds.Structure is nil or isn't a reference
 func DerefDatasetStructure(store cafs.Filestore, ds *dataset.Dataset) error {

@@ -125,111 +136,72 @@ func DerefDatasetCommit(store cafs.Filestore, ds *dataset.Dataset) error {
 	return nil
 }

-// CreateDatasetParams defines parmeters for the CreateDataset function
-// type CreateDatasetParams struct {
-// 	// Store is where we're going to
-// 	Store cafs.Filestore
-// 	//
-// 	Dataset *dataset.Dataset
-// 	DataFile cafs.File
-// 	PrivKey crypto.PrivKey
-// }
-
-// CreateDataset is the canonical method for getting a dataset pointer & it's data into a store
-// func CreateDataset(p *CreateDatasetParams) (path datastore.Key, err error) {
-// 	// TODO - need a better strategy for huge files
-// 	data, err := ioutil.ReadAll(rdr)
-// 	if err != nil {
-// 		return fmt.Errorf("error reading file: %s", err.Error())
-// 	}
-
-// 	if err = PrepareDataset(p.Store, p.Dataset, p.DataFile); err != nil {
-// 		return
-// 	}
-
-// 	// Ensure that dataset is well-formed
-// 	// format, err := detect.ExtensionDataFormat(filename)
-// 	// if err != nil {
-// 	// 	return fmt.Errorf("error detecting format extension: %s", err.Error())
-// 	// }
-// 	// if err = validate.DataFormat(format, bytes.NewReader(data)); err != nil {
-// 	// 	return fmt.Errorf("invalid data format: %s", err.Error())
-// 	// }
-
-// 	// TODO - check for errors in dataset and warn user if errors exist
-
-// 	datakey, err := store.Put(memfs.NewMemfileBytes("data."+st.Format.String(), data), false)
-// 	if err != nil {
-// 		return fmt.Errorf("error putting data file in store: %s", err.Error())
-// 	}
-
-// 	ds.Timestamp = time.Now().In(time.UTC)
-// 	if ds.Title == "" {
-// 		ds.Title = name
-// 	}
-// 	ds.Data = datakey.String()
-
-// 	if err := validate.Dataset(ds); err != nil {
-// 		return err
-// 	}
-
-// 	dskey, err := SaveDataset(store, ds, true)
-// 	if err != nil {
-// 		return fmt.Errorf("error saving dataset: %s", err.Error())
-// 	}
-// }
+// CreateDataset places a new dataset in the store. Admittedly, this isn't a simple process.
+// Store is where we're going to
+// Dataset to be saved
+// Pin the dataset if the underlying store supports the pinning interface
+func CreateDataset(store cafs.Filestore, ds *dataset.Dataset, df cafs.File, pk crypto.PrivKey, pin bool) (path datastore.Key, err error) {
+	if err = DerefDataset(store, ds); err != nil {
+		return
+	}
+	if err = validate.Dataset(ds); err != nil {
+		return
+	}
+	if err = prepareDataset(store, ds, df, pk); err != nil {
+		return
+	}
+	path, err = WriteDataset(store, ds, df, pin)
+	if err != nil {
+		err = fmt.Errorf("error writing dataset: %s", err.Error())
+	}
+	return
+}
+
+// timestamp is a function for getting commit timestamps
+// we replace this with a static function for testing purposes
+var timestamp = func() time.Time {
+	return time.Now()
+}

 // prepareDataset modifies a dataset in preparation for adding to a dsfs
-// func PrepareDataset(store cafs.Filestore, ds *dataset.Dataset, data cafs.File) error {
-
-// 	st, err := detect.FromReader(data.FileName(), data)
-// 	if err != nil {
-// 		return fmt.Errorf("error determining dataset schema: %s", err.Error())
-// 	}
-// 	if ds.Structure == nil {
-// 		ds.Structure = &dataset.Structure{}
-// 	}
-// 	ds.Structure.Assign(st, ds.Structure)
-
-// 	// Ensure that dataset contains valid field names
-// 	if err = validate.Structure(st); err != nil {
-// 		return fmt.Errorf("invalid structure: %s", err.Error())
-// 	}
-// 	if err := validate.DataFormat(st.Format, bytes.NewReader(data)); err != nil {
-// 		return fmt.Errorf("invalid data format: %s", err.Error())
-// 	}
-
-// 	// generate abstract form of dataset
-// 	ds.Abstract = dataset.Abstract(ds)
-
-// 	if ds.AbstractTransform != nil {
-// 		// convert abstract transform to abstract references
-// 		for name, ref := range ds.AbstractTransform.Resources {
-// 			// data, _ := ref.MarshalJSON()
-// 			// fmt.Println(string(data))
-// 			if ref.Abstract != nil {
-// 				ds.AbstractTransform.Resources[name] = ref.Abstract
-// 			} else {
-
-// 				absf, err := JSONFile(PackageFileAbstract.String(), dataset.Abstract(ref))
-// 				if err != nil {
-// 					return err
-// 				}
-// 				path, err := store.Put(absf, true)
-// 				if err != nil {
-// 					return err
-// 				}
-// 				ds.AbstractTransform.Resources[name] = dataset.NewDatasetRef(path)
-// 			}
-// 		}
-// 	}
-
-// 	return nil
-// }
-
-// SaveDataset writes a dataset to a cafs, replacing subcomponents of a dataset with hash references
-// during the write process. Directory structure is according to PackageFile naming conventions
-func SaveDataset(store cafs.Filestore, ds *dataset.Dataset, pin bool) (datastore.Key, error) {
+func prepareDataset(store cafs.Filestore, ds *dataset.Dataset, df cafs.File, privKey crypto.PrivKey) error {
+	if df == nil {
+		return fmt.Errorf("data file is required")
+	}
+
+	// TODO - need a better strategy for huge files. I think that strategy is to split
+	// the reader into multiple consumers that are all performing their task on a stream
+	// of byte slices
+	// data, err := ioutil.ReadAll(df)
+	// if err != nil {
+	// 	return fmt.Errorf("error reading file: %s", err.Error())
+	// }
+
+	// generate abstract form of dataset
+	ds.Abstract = dataset.Abstract(ds)
+
+	// datakey, err := store.Put(memfs.NewMemfileBytes("data."+ds.Structure.Format.String(), data), false)
+	// if err != nil {
+	// 	return fmt.Errorf("error putting data file in store: %s", err.Error())
+	// }
+
+	ds.Commit.Timestamp = timestamp()
+	signedBytes, err := privKey.Sign(ds.Commit.SignableBytes())
+	if err != nil {
+		return fmt.Errorf("error signing commit title: %s", err.Error())
+	}
+	ds.Commit.Signature = base58.Encode(signedBytes)
+
+	// TODO - make sure file ending matches
+	// "data."+ds.Structure.Format.String()
+	return nil
+}
+
+// WriteDataset writes a dataset to a cafs, replacing subcomponents of a dataset with path references
+// during the write process. Directory structure is according to PackageFile naming conventions.
+// This method is currently exported, but 99% of use cases should use CreateDataset instead of this
+// lower-level function
+func WriteDataset(store cafs.Filestore, ds *dataset.Dataset, dataFile cafs.File, pin bool) (datastore.Key, error) {
 	// assign to a new dataset instance to avoid clobbering input dataset
 	cp := &dataset.Dataset{}
 	cp.Assign(ds)

@@ -247,11 +219,14 @@ func SaveDataset(store cafs.Filestore, ds *dataset.Dataset, pin bool) (datastore
 	}

 	if ds.AbstractTransform != nil {
-		// ensure all dataset references are abstract
-		for key, r := range ds.AbstractTransform.Resources {
-			if !r.IsEmpty() {
-				return datastore.NewKey(""), fmt.Errorf("abstract transform resource '%s' is not a reference", key)
+		// convert abstract transform to abstract references
+		for name, ref := range ds.AbstractTransform.Resources {
+			absrf, err := JSONFile(fmt.Sprintf("ref_%s.json", name), dataset.Abstract(ref))
+			if err != nil {
+				return datastore.NewKey(""), fmt.Errorf("error marshaling dataset resource '%s' to json: %s", name, err.Error())
 			}
+			fileTasks++
+			adder.AddFile(absrf)
 		}
 		abstff, err := JSONFile(PackageFileAbstractTransform.String(), ds.AbstractTransform)
 		if err != nil {

@@ -279,11 +254,17 @@ func SaveDataset(store cafs.Filestore, ds *dataset.Dataset, pin bool) (datastore
 	// if err != nil {
 	// 	return datastore.NewKey(""), fmt.Errorf("error marshaling dataset to json: %s", err.Error())
 	// }
-
 	// fileTasks++
 	// adder.AddFile(dsf)
 	// addedDataset = true
 	// }
+	// data, err := store.Get(datastore.NewKey(ds.Data))
+	// if err != nil {
+	// 	return datastore.NewKey(""), fmt.Errorf("error getting dataset raw data: %s", err.Error())
+	// }
+
+	fileTasks++
+	adder.AddFile(dataFile)

 	if ds.Transform != nil {
 		// all resources must be references

@@ -348,6 +329,16 @@ func SaveDataset(store cafs.Filestore, ds *dataset.Dataset, pin bool) (datastore
 		ds.Metadata = dataset.NewMetadataRef(ao.Path)
 	case PackageFileCommit.String():
 		ds.Commit = dataset.NewCommitRef(ao.Path)
+	case dataFile.FileName():
+		ds.DataPath = ao.Path.String()
+	default:
+		if ds.AbstractTransform != nil {
+			for name := range ds.AbstractTransform.Resources {
+				if ao.Name == fmt.Sprintf("ref_%s.json", name) {
+					ds.AbstractTransform.Resources[name] = dataset.NewDatasetRef(ao.Path)
+				}
+			}
+		}
 	}

 	fileTasks--
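Putting the pieces together, a caller might exercise CreateDataset against an in-memory store roughly like this. This is a sketch under assumptions: the Structure literal and CSVDataFormat constant are illustrative, and validate.Dataset may demand more fields than shown here:

package main

import (
	"fmt"

	"github.com/libp2p/go-libp2p-crypto"
	"github.com/qri-io/cafs/memfs"
	"github.com/qri-io/dataset"
	"github.com/qri-io/dataset/dsfs"
)

func main() {
	// an in-memory content-addressed filestore
	store := memfs.NewMapstore()

	// generate a signing keypair with the libp2p crypto package
	pk, _, err := crypto.GenerateKeyPair(crypto.RSA, 1024)
	if err != nil {
		panic(err)
	}

	ds := &dataset.Dataset{
		Commit:    &dataset.Commit{Title: "initial commit"},
		Structure: &dataset.Structure{Format: dataset.CSVDataFormat},
	}
	df := memfs.NewMemfileBytes("data.csv", []byte("name,count\nfoo,1\nbar,2\n"))

	path, err := dsfs.CreateDataset(store, ds, df, pk, true)
	if err != nil {
		panic(err)
	}
	fmt.Println(path.String())
}

Because prepareDataset stamps ds.Commit.Timestamp through the package-level timestamp variable, tests can swap that variable for one returning a fixed time to get deterministic signatures.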
