@@ -6,9 +6,10 @@ import { join } from "node:path";
6
6
import { writeFile , readFile , stat , mkdir } from "node:fs/promises" ;
7
7
import type { RepoId } from "../src/types/public.js" ;
8
8
import { toRepoId } from "../src/utils/toRepoId.js" ;
9
- import { commitIter } from "../src/index.js" ;
9
+ import type { CommitOperation } from "../src/index.js" ;
10
+ import { commitIter , downloadFile } from "../src/index.js" ;
11
+ import { SplicedBlob } from "../src/utils/SplicedBlob.js" ;
10
12
import { pathToFileURL } from "node:url" ;
11
- import { WebBlob } from "../src/utils/WebBlob.js" ;
12
13
13
14
/**
14
15
* This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
@@ -38,6 +39,23 @@ const FILES_TO_DOWNLOAD = [
38
39
} ,
39
40
] ;
40
41
42
/** Text inserted at the start of the edited copy of the TFLite file. */
const EDITED_FILE_PREFIX =
	"Adding a new prefix to this TFLite file. Will xet still be efficient in deduplicating the file?";

/**
 * Files that are downloaded once and then uploaded with edits applied on top of them.
 *
 * - `url`: where the original (unedited) content is fetched from.
 * - `filename`: name used both for the local copy and for the path in the target repo.
 * - `sha256`: hash the content is expected to have once the edits have been applied.
 * - `edits`: splices to apply; each one carries a `start`/`end` offset pair and the
 *   `content` to insert (presumably replacing the bytes in that range — confirm against
 *   `SplicedBlob`).
 */
const FILES_TO_EDIT = [
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
		filename: "64-8bits.tflite.edited",
		sha256: "c2b116ccc9e5362d55dd60b344a4b93156594feeef312b5b8833151f0732aa0a",
		edits: [{ start: 0, end: 1000, content: new Blob([EDITED_FILE_PREFIX]) }],
	},
];

58
+
41
59
async function downloadFileIfNotExists ( url : string , filepath : string ) : Promise < void > {
42
60
try {
43
61
await stat ( filepath ) ;
@@ -58,13 +76,25 @@ async function downloadFileIfNotExists(url: string, filepath: string): Promise<v
58
76
console . log ( `Downloaded ${ filepath } (${ buffer . byteLength } bytes)` ) ;
59
77
}
60
78
61
- async function * createFileSource (
62
- files : Array < { filepath : string ; filename : string } >
63
- ) : AsyncGenerator < { content : Blob ; path : string ; sha256 : string } > {
79
+ async function * createFileSource ( files : Array < { filepath : string ; filename : string } > ) : AsyncGenerator < {
80
+ content : Blob ;
81
+ path : string ;
82
+ sha256 : string ;
83
+ edits ?: Array < { start : number ; end : number ; content : Blob } > ;
84
+ } > {
64
85
for ( const file of files ) {
65
86
console . log ( `Processing ${ file . filename } ...` ) ;
66
87
const buffer = await readFile ( file . filepath ) ;
67
- const blob = new Blob ( [ buffer ] ) ;
88
+ let blob = new Blob ( [ buffer ] ) ;
89
+
90
+ if ( file . filename . endsWith ( ".edited" ) ) {
91
+ const edits = FILES_TO_EDIT . find ( ( f ) => f . filename === file . filename ) ?. edits ;
92
+ if ( edits !== undefined ) {
93
+ for ( const edit of edits ) {
94
+ blob = SplicedBlob . create ( blob , [ { insert : edit . content , start : edit . start , end : edit . end } ] ) ;
95
+ }
96
+ }
97
+ }
68
98
69
99
// Calculate sha256
70
100
console . log ( `Calculating SHA256 for ${ file . filename } ...` ) ;
@@ -77,12 +107,11 @@ async function* createFileSource(
77
107
78
108
console . log ( `SHA256 for ${ file . filename } : ${ sha256Hash } ` ) ;
79
109
80
- if ( sha256Hash !== FILES_TO_DOWNLOAD . find ( ( f ) => f . filename === file . filename ) ?. sha256 ) {
81
- throw new Error (
82
- `SHA256 mismatch for ${ file . filename } : ${ sha256Hash } !== ${ FILES_TO_DOWNLOAD . find (
83
- ( f ) => f . filename === file . filename
84
- ) ?. sha256 } `
85
- ) ;
110
+ const sha256ToCheck =
111
+ FILES_TO_DOWNLOAD . find ( ( f ) => f . filename === file . filename ) ?. sha256 ||
112
+ FILES_TO_EDIT . find ( ( f ) => f . filename === file . filename ) ?. sha256 ;
113
+ if ( sha256Hash !== sha256ToCheck ) {
114
+ throw new Error ( `SHA256 mismatch for ${ file . filename } : ${ sha256Hash } !== ${ sha256ToCheck } ` ) ;
86
115
}
87
116
88
117
yield {
@@ -215,6 +244,12 @@ async function main() {
215
244
files . push ( { filepath, filename : fileInfo . filename } ) ;
216
245
}
217
246
247
+ for ( const fileInfo of FILES_TO_EDIT ) {
248
+ const filepath = join ( downloadDir , fileInfo . filename ) ;
249
+ await downloadFileIfNotExists ( fileInfo . url , filepath ) ;
250
+ files . push ( { filepath, filename : fileInfo . filename } ) ;
251
+ }
252
+
218
253
// Parse repo
219
254
const repoName = args . repo ;
220
255
@@ -302,13 +337,25 @@ async function main() {
302
337
303
338
if ( args . commit ) {
304
339
console . log ( "\n=== Committing files ===" ) ;
340
+ const operations : CommitOperation [ ] = [ ] ;
341
+ for ( const fileInfo of FILES_TO_DOWNLOAD ) {
342
+ operations . push ( {
343
+ operation : "addOrUpdate" ,
344
+ content : pathToFileURL ( join ( downloadDir , fileInfo . filename ) ) ,
345
+ path : fileInfo . filename ,
346
+ } ) ;
347
+ }
348
+ for ( const fileInfo of FILES_TO_EDIT ) {
349
+ operations . push ( {
350
+ operation : "edit" ,
351
+ originalContent : new Blob ( [ await readFile ( join ( downloadDir , fileInfo . filename ) ) ] ) ,
352
+ edits : fileInfo . edits ,
353
+ path : fileInfo . filename ,
354
+ } ) ;
355
+ }
305
356
const iterator = commitIter ( {
306
357
repo,
307
- operations : files . map ( ( file ) => ( {
308
- operation : "addOrUpdate" ,
309
- content : pathToFileURL ( file . filepath ) ,
310
- path : file . filename ,
311
- } ) ) ,
358
+ operations,
312
359
accessToken : args . token ,
313
360
title : "Upload xet files with JS lib" ,
314
361
useXet : true ,
@@ -325,7 +372,16 @@ async function main() {
325
372
326
373
console . log ( "Redownloading files and verifying SHA256 integrity" ) ;
327
374
for ( const file of FILES_TO_DOWNLOAD ) {
328
- const fileBlob = await WebBlob . create ( new URL ( file . url ) ) ;
375
+ const fileBlob = await downloadFile ( {
376
+ repo,
377
+ path : file . filename ,
378
+ accessToken : args . token ,
379
+ } ) ;
380
+
381
+ if ( ! fileBlob ) {
382
+ throw new Error ( `Failed to download ${ file . filename } ` ) ;
383
+ }
384
+
329
385
const sha256Hash = sha256 ( fileBlob , { useWebWorker : false } ) ;
330
386
let res : IteratorResult < number , string > ;
331
387
do {
@@ -335,6 +391,26 @@ async function main() {
335
391
336
392
console . log ( `${ file . filename } : ${ finalHash } === ${ file . sha256 } ${ finalHash === file . sha256 ? "✅" : "❌" } ` ) ;
337
393
}
394
+
395
+ for ( const file of FILES_TO_EDIT ) {
396
+ const fileBlob = await downloadFile ( {
397
+ repo,
398
+ path : file . filename ,
399
+ accessToken : args . token ,
400
+ } ) ;
401
+
402
+ if ( ! fileBlob ) {
403
+ throw new Error ( `Failed to download ${ file . filename } ` ) ;
404
+ }
405
+
406
+ const sha256Hash = sha256 ( fileBlob , { useWebWorker : false } ) ;
407
+ let res : IteratorResult < number , string > ;
408
+ do {
409
+ res = await sha256Hash . next ( ) ;
410
+ } while ( ! res . done ) ;
411
+ const finalHash = res . value ;
412
+ console . log ( `${ file . filename } : ${ finalHash } === ${ file . sha256 } ${ finalHash === file . sha256 ? "✅" : "❌" } ` ) ;
413
+ }
338
414
}
339
415
}
340
416
0 commit comments