Merged

30 commits
9c72f72
Fix repeated token due to wrong text preprocessing
JulienVig Apr 8, 2024
133ce56
Clean wikitext example
JulienVig Apr 8, 2024
94d664a
Refactor and comment gpt-tfjs implementation
JulienVig Apr 9, 2024
b9e1edc
Refactor gpt-tfjs to support config as init argument
JulienVig Apr 9, 2024
6265316
Fix linting errors
JulienVig Apr 9, 2024
c3134be
Use tf.CustomCallbacks rather than redefining it
JulienVig Apr 9, 2024
20700f3
Fix tensor allocation leak in training loop
JulienVig Apr 9, 2024
0b27fe7
Create a compile method to init the gpt-tfjs optimizer
JulienVig Apr 9, 2024
7fa3c12
Add tfjs-node as dev dependencies to discojs-core to run gpt-tfjs uni…
JulienVig Apr 9, 2024
37fbbed
Add support and test for gpt config serialization
JulienVig Apr 9, 2024
f9f932d
Create node.js model save and load functions
JulienVig Apr 9, 2024
1f9271a
Make tokenize preprocessing return a number[] rather than tensor
JulienVig Apr 9, 2024
0ed7a1f
Fix linting errors
JulienVig Apr 9, 2024
433959b
Fix linting error and increase gpt test timeout
JulienVig Apr 10, 2024
2e6e834
Create text preprocessing test cases
JulienVig Apr 10, 2024
2faae1c
discojs-core/package.json: fix tfjs-node major version
JulienVig Apr 10, 2024
d808d8e
Change variable function to actual function
JulienVig Apr 11, 2024
5eba6b2
Use dot notation
JulienVig Apr 11, 2024
cb40e6b
Use dot notation
JulienVig Apr 11, 2024
73300b6
Use dot notation
JulienVig Apr 11, 2024
d287fd3
Remove test error log when expected
JulienVig Apr 11, 2024
1650349
Remove test error log when expected
JulienVig Apr 11, 2024
1677af8
Remove try catch around file read
JulienVig Apr 11, 2024
a7655e6
Use async functions to save and load models from disk
JulienVig Apr 11, 2024
21f371c
Fix linting error
JulienVig Apr 11, 2024
6721717
Improve wikitext example
JulienVig Apr 15, 2024
1f0c526
Merge with develop
JulienVig Apr 15, 2024
a19fd2b
Fixup merge with develop
JulienVig Apr 15, 2024
5c5ecde
Rework training logs
JulienVig Apr 15, 2024
a80c403
Fix training logs web-client
JulienVig Apr 15, 2024
2 changes: 1 addition & 1 deletion discojs/discojs-core/package.json
@@ -31,9 +31,9 @@
"ws": "8"
},
"devDependencies": {
"@tensorflow/tfjs-node": "4",
"@types/chai": "4",
"@types/mocha": "10",
"@types/msgpack-lite": "0.1",
"@types/simple-peer": "9",
"chai": "5",
"mocha": "10",
@@ -0,0 +1,97 @@
import { TEXT_PREPROCESSING } from './index.js'
import { expect } from 'chai'

import type { Task } from '../../../index.js'
import * as tf from '@tensorflow/tfjs'

describe('text preprocessing', function () {
const [tokenize, leftPadding] = TEXT_PREPROCESSING
// Use a function to create a different task object for each test (otherwise the tokenizer gets cached)
function initMockTask(): Task {
return {
id: 'mock-task-id',
displayInformation: {},
trainingInformation: {
modelID: 'model-id',
epochs: 1,
roundDuration: 1,
validationSplit: 0,
batchSize: 8,
scheme: 'local',
dataType: 'text',
tokenizer: 'Xenova/gpt2',
}}
}

const text = "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia written and maintained by a community \n of volunteers, known as Wikipedians. Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]"
const expectedTokens = [15496, 995, 11, 257, 47125, 352, 2242, 2231, 11, 705, 30, 860, 4304, 13, 15312, 318, 257, 1479, 2695, 2691, 45352, 3194, 290, 9456, 416, 257, 2055, 220, 198, 286, 11661, 11, 1900, 355, 11145, 46647, 1547, 13, 4062, 276, 416, 12963, 11769, 290, 13633, 311, 2564, 319, 3269, 1315, 11, 5878, 11, 15312, 318, 12007, 416, 262, 44877, 5693, 11, 281, 1605, 15346, 4009, 326, 24803, 257, 3085, 286, 625, 13037, 661, 3693, 22, 60]

it('can tokenize text', async () => {
const { tokens } = await tokenize.apply(Promise.resolve(text), initMockTask()) as { tokens: number[]}
expect(tokens).to.be.deep.equal(expectedTokens)
})

it('can truncate inputs when tokenizing', async () => {
const truncationTask = initMockTask()
truncationTask.trainingInformation.maxSequenceLength = 10
const { tokens } = await tokenize.apply(Promise.resolve(text), truncationTask) as { tokens: number[] }
const expectedLength = truncationTask.trainingInformation.maxSequenceLength + 1 // + 1 because tokenization includes one extra token used as the next-token label
expect(tokens.length).to.be.equal(expectedLength)
expect(tokens).to.be.deep.equal(expectedTokens.slice(0, expectedLength))
})

it('can left pad tokens', async () => {
// Create a task where output token sequence should all have length 20
const paddingTask = initMockTask()
paddingTask.trainingInformation.maxSequenceLength = 20

// Create a token sequence of length 10
const tokens = { tokens: [0,1,2,3,4,5,6,7,8,9] }
const { xs, ys } = await leftPadding.apply(Promise.resolve(tokens), paddingTask) as { xs: tf.Tensor1D, ys: tf.Tensor2D }
const xsArray = await xs.array()
const ysArray = await ys.array()

// Output sequences should have shape (20) and (20, 50258), 50258 being the size of the vocab for gpt2
expect(xsArray.length).to.be.equal(paddingTask.trainingInformation.maxSequenceLength)
expect(ysArray.length).to.be.equal(paddingTask.trainingInformation.maxSequenceLength)
expect(ysArray[0].length).to.be.equal(50258)

// xs should be left-padded with gpt2's padding token 50256 to reach length 20.
// We expect the last token of the input sequence (9) not to be included in xs since it has no next token to predict
const paddingToken = 50256
const expectedXs = Array.from({length:11}).map(_ => paddingToken).concat(tokens.tokens.slice(0,9))
expect(xsArray).to.be.deep.equal(expectedXs)

// ys should be a one hot encoding of the next token in xs
// if the input tokens are [0,1,2,3] then the labels are [1,2,3] which are then one-hot encoded
// So the sum of each row should be equal to 1
const expectedOneHot = Array.from({ length: 20 }).map(_ => 1)
expect(await ys.sum(-1).array()).to.be.deep.equal(expectedOneHot)

// In each row, the index of the 1 should be the token id
const expectedYs = Array.from({length:10}).map(_ => paddingToken).concat(tokens.tokens)
expect(await ys.argMax(-1).array()).to.be.deep.equal(expectedYs)
})

it('throws an error if no tokenizer is specified', async () => {
const invalidTask = initMockTask()
invalidTask.trainingInformation.tokenizer = undefined;
try {
await tokenize.apply(Promise.resolve("input text doesn't matter"), invalidTask)
} catch {
return
}
throw new Error("undefined tokenizer should have thrown an error")
})
it('throws an error if the tokenizer name is invalid', async () => {
const invalidTask = initMockTask()
invalidTask['trainingInformation']['tokenizer'] = 'invalid-tokenizer-name'
try {
await tokenize.apply(Promise.resolve("input text doesn't matter"), invalidTask)
} catch {
return
}
throw new Error("invalid tokenizer name should have thrown an error")
})

})
@@ -14,43 +14,58 @@ export enum TextPreprocessing {
}

interface TokenizedEntry extends tf.TensorContainerObject {
xs: tf.Tensor1D
tokens: number []
}

/**
* LeftPadding pads all incoming inputs to be a fixed length, which should be specified
* in `task.trainingInformation.maxSequenceLength`.
*
* We are currently only implementing left padding for text generation
* https://huggingface.co/docs/transformers/en/llm_tutorial#wrong-padding-side
* The function can easily be extended to support right padding once the need arise
* The function can easily be extended to support right padding if needed
*
* Once Transformers.js supports left padding, it will be possible to pad inputs
* directly when tokenizing
* https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517
*/
const leftPadding: PreprocessingFunction = {
type: TextPreprocessing.LeftPadding,
apply: async (x: Promise<tf.TensorContainer>, task: Task): Promise<tf.TensorContainer> => {
let { xs } = await x as TokenizedEntry
if (xs === undefined || !(xs instanceof tf.tensor) ||xs.rankType !== tf.Rank.R1) {
new Error("The leftPadding preprocessing expects a 1D tensor named 'xs' as input")
if (x === undefined || !Array.isArray(x) || x.length == 0 || typeof(x[0] != 'number')) {
new Error("The leftPadding preprocessing expects a non empty 1D array of number")
Comment on lines +35 to +36 (Collaborator): this does nothing: at this point x is still a Promise, and the created Error is never thrown.

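A sketch of one possible fix following the reviewer's point (the check must run on the resolved value, and the Error must actually be thrown; variable names taken from the surrounding code):

const { tokens } = await x as TokenizedEntry
if (!Array.isArray(tokens) || tokens.length === 0 || typeof tokens[0] !== 'number') {
  throw new Error("The leftPadding preprocessing expects a non-empty 1D array of numbers")
}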
}
const { tokens } = await x as TokenizedEntry
const tokenizer = await models.getTaskTokenizer(task)


const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
// Should never happen because tokenization truncates inputs
if (xs.size > maxLength) {
xs = xs.slice([0], [maxLength])
} else if (xs.size < maxLength) {
const paddingToken = tokenizer.pad_token_id
xs = xs.pad([[Math.max(0, maxLength - xs.size), 0]], paddingToken)
}
// if xs.size == maxLength we can leave it as it is
return {
xs,
ys: tf.oneHot(xs, tokenizer.model.vocab.length + 1) // gpt-tfjs expects a one-hot encoded token label
}
return tf.tidy(() => {
// maxLength is the final length of xs
// Because ys contains the tokens in xs shifted by one (to predict the next token), we need
// to include one more token than maxSequenceLength in order to have the next-token label of the maxSequenceLength'th token
const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
const maxLengthPlusLabel = maxLength + 1

let fixedLengthTokens = tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs
if (fixedLengthTokens.size > maxLengthPlusLabel) { // Should never happen because tokenization truncates inputs
throw Error("There are more tokens than expected after tokenization and truncation")
} else if (fixedLengthTokens.size < maxLengthPlusLabel) { // Pad inputs to fixed length
const paddingToken = tokenizer.pad_token_id
fixedLengthTokens = fixedLengthTokens.pad([[Math.max(0, maxLengthPlusLabel - fixedLengthTokens.size), 0]], paddingToken)
}
// if tokens.size == maxLengthPlusLabel we can leave it as it is

// ys is a one-hot encoding of the next token (i.e. xs shifted by one)
const ys = tf.oneHot(fixedLengthTokens.slice([1]), tokenizer.model.vocab.length + 1)
// remove the extra token now that ys is created
const xs = fixedLengthTokens.slice([0], maxLength)
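// Illustration with hypothetical values: with pad token p and maxLength 4, padded tokens [p, p, t0, t1, t2]
// yield xs = [p, p, t0, t1] and ys = oneHot([p, t0, t1, t2]), so ys[i] labels the token following xs[i]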
return { xs, ys }
})
}
}

interface TokenizerOutput {
input_ids: number[]
}

/**
* Tokenizes and truncates input strings
*/
@@ -62,7 +77,10 @@ const tokenize: PreprocessingFunction = {
}
const xs = await x as string // tf.TextLineDataset yields strings
const tokenizer = await models.getTaskTokenizer(task)
const maxLength = task.trainingInformation.maxSequenceLength ?? tokenizer.model_max_length as number
// Add one to include the next-token label of the last token in the input sequence
// The inputs are truncated down to exactly maxSequenceLength in leftPadding
const maxLength = task.trainingInformation.maxSequenceLength ?? (tokenizer.model_max_length as number)
const maxLengthPlusLabel = maxLength + 1

const {input_ids: tokens} = tokenizer(xs, {
// Transformers.js currently only supports right padding while we need left for text generation
@@ -71,11 +89,9 @@
padding: false,
truncation: true,
return_tensor: false,
max_length: maxLength,
max_length: maxLengthPlusLabel,
}) as TokenizerOutput
return {
xs: tf.tensor(tokens, undefined, 'int32') // cast tokens from float to int for gpt-tfjs}
}
return { tokens }
}
}

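Taken together, the two steps chain over a text dataset roughly as in this sketch (the dataset and task values are assumed; the apply calls mirror those in the spec file above):

const preprocessed = textLineDataset
  .mapAsync((line: string) => tokenize.apply(Promise.resolve(line), task)) // -> { tokens: number[] }
  .mapAsync((entry) => leftPadding.apply(Promise.resolve(entry), task))    // -> { xs, ys }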
26 changes: 17 additions & 9 deletions discojs/discojs-core/src/models/gpt/config.ts
@@ -7,16 +7,12 @@ type ModelType =
| 'gpt-micro'
| 'gpt-nano'

export interface ModelSize {
nLayer?: number
nHead?: number
nEmbd?: number
}

export interface GPTConfig {
lr: number
blockSize: number
vocabSize: number
modelType: ModelType
name?: string,
evaluate?: boolean
maxEvalBatches?: number
evaluateEvery?: number
@@ -30,13 +30,16 @@
embdDrop?: number
tokEmb?: boolean
lmHead?: boolean
modelType: ModelType
nLayer?: number
nHead?: number
nEmbd?: number
}

export const DEFAULT_CONFIG: Required<GPTConfig> = {
name: 'transformer',
lr: 0.001,
weightDecay: 0,
maxIter: 10_000,
maxIter: 5,
verbose: 0,
modelType: 'gpt-nano',
evaluate: true,
@@ -50,7 +49,16 @@
residDrop: 0.2,
embdDrop: 0.2,
tokEmb: true,
lmHead: true
lmHead: true,
nLayer: 3,
nHead: 3,
nEmbd: 48,
}

export type ModelSize = {
Comment (Collaborator):

it's equivalent, but nicer to use an interface when creating a type for an object.

Suggested change:
export type ModelSize = {
export interface ModelSize {

Reply (Collaborator, Author):

I was actually searching for pros and cons and didn't find anything significant. Why do you prefer interface?

Reply (Collaborator):

Hmm, it's mostly a habit, I got it from one of the eslint rules. Now that I reread the documentation, there aren't many differences; the only plus for interface that I found myself agreeing with is that it shows better error messages.
nLayer: number
nHead: number
nEmbd: number
}
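For context on the trade-off discussed above, a minimal generic TypeScript sketch (not part of this PR): both declarations produce the same object shape, but interfaces additionally allow declaration merging.

interface Point { x: number }
interface Point { y: number } // merges: Point is now { x: number; y: number }
// A type alias cannot be redeclared; a second `type Point = ...` is a duplicate-identifier error.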

export function getModelSizes (modelType: ModelType): Required<ModelSize> {
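A quick usage sketch for this config module (hypothetical snippet: the import path and the exact gpt-nano sizes returned by getModelSizes are assumptions based on DEFAULT_CONFIG above):

import { DEFAULT_CONFIG, getModelSizes, type GPTConfig } from './config.js'

const sizes = getModelSizes('gpt-nano') // assumed to match DEFAULT_CONFIG: { nLayer: 3, nHead: 3, nEmbd: 48 }
const config: GPTConfig = { ...DEFAULT_CONFIG, ...sizes, vocabSize: 50258, blockSize: 128 }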
44 changes: 44 additions & 0 deletions discojs/discojs-core/src/models/gpt/gpt.spec.ts
@@ -0,0 +1,44 @@
import { expect } from 'chai'
import * as tf from '@tensorflow/tfjs-node'
import { AutoTokenizer } from '@xenova/transformers';
import { GPT } from './index.js'
import { type GPTConfig } from './config.js'

describe('gpt-tfjs', function() {
this.timeout(50_000)
const data = "Lorem ipsum dolor sit"

const config: GPTConfig = {
modelType: 'gpt-nano',
lr: 0.01,
maxIter: 10,
evaluateEvery:10,
maxEvalBatches: 10,
blockSize: 8,
vocabSize: 50258
}

it('can overfit one sentence', async () => {
const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2')
const datasetSource = new tf.data.FileDataSource(Buffer.from(data))
const textDataset = new tf.data.TextLineDataset(datasetSource)
const tokenDataset = textDataset.map((text: string) => {
const { input_ids: tokens } = tokenizer(text, {
padding: true,
truncation: true,
return_tensor: false,
max_length: config.blockSize + 1,
}) as { input_ids: number[] }
const ys = tf.oneHot(tokens.slice(1), tokenizer.model.vocab.length + 1)
const xs = tf.tensor(tokens.slice(0, config.blockSize), undefined, 'int32')
return {xs, ys}
}).repeat().batch(64)

const model = new GPT(config)
const logGenerator = model.train(tokenDataset, undefined, 5) // 5 epochs
for await (const _ of logGenerator); // Await the end of training
const generation = await model.generate("Lorem ipsum dolor", tokenizer, 1)
console.log(generation)
Comment (Collaborator):

Suggested change (delete the debug line):
console.log(generation)
expect(generation).equal(data) // Assert that the model completes 'Lorem ipsum dolor' with 'sit'
})
})