WIP on (no branch): 5da385d Fix up Tokenizer/TextEncoder inputs
saddam213 committed Apr 25, 2024
2 parents 5da385d + c3743a0 commit d54393e
Showing 4 changed files with 29 additions and 74 deletions.
36 changes: 18 additions & 18 deletions OnnxStack.Console/appsettings.json
@@ -25,29 +25,29 @@
"ExecutionProvider": "DirectML",
"SchedulerOptions": {
"InferenceSteps": 22,
"GuidanceScale": 8
"GuidanceScale": 8
},
"TokenizerConfig": {
"PadTokenId": 49407,
"BlankTokenId": 49407,
"TokenizerLimit": 77,
"TokenizerLength": 768,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\cliptokenizer.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5-onnx\\cliptokenizer.onnx"
},
"TextEncoderConfig": {
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\text_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5-onnx\\text_encoder\\model.onnx"
},
"UnetConfig": {
"ModelType": "Base",
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\unet\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5-onnx\\unet\\model.onnx"
},
"VaeDecoderConfig": {
"ScaleFactor": 0.18215,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\vae_decoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5-onnx\\vae_decoder\\model.onnx"
},
"VaeEncoderConfig": {
"ScaleFactor": 0.18215,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\vae_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5-onnx\\vae_encoder\\model.onnx"
}
},
{
@@ -70,22 +70,22 @@
"BlankTokenId": 49407,
"TokenizerLimit": 77,
"TokenizerLength": 768,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\cliptokenizer.onnx"
"OnnxModelPath": "D:\\Repositories\\LCM_Dreamshaper_v7-onnx\\tokenizer\\model.onnx"
},
"TextEncoderConfig": {
"OnnxModelPath": "D:\\Repositories\\lcm-dreamshaper-v7-f16\\text_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\LCM_Dreamshaper_v7-onnx\\text_encoder\\model.onnx"
},
"UnetConfig": {
"ModelType": "Base",
"OnnxModelPath": "D:\\Repositories\\lcm-dreamshaper-v7-f16\\unet\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\LCM_Dreamshaper_v7-onnx\\unet\\model.onnx"
},
"VaeDecoderConfig": {
"ScaleFactor": 0.18215,
"OnnxModelPath": "D:\\Repositories\\lcm-dreamshaper-v7-f16\\vae_decoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\LCM_Dreamshaper_v7-onnx\\vae_decoder\\model.onnx"
},
"VaeEncoderConfig": {
"ScaleFactor": 0.18215,
"OnnxModelPath": "D:\\Repositories\\lcm-dreamshaper-v7-f16\\vae_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\LCM_Dreamshaper_v7-onnx\\vae_encoder\\model.onnx"
}
},
{
@@ -108,32 +108,32 @@
"BlankTokenId": 49407,
"TokenizerLimit": 77,
"TokenizerLength": 768,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\cliptokenizer.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\tokenizer\\model.onnx"
},
"Tokenizer2Config": {
"PadTokenId": 1,
"BlankTokenId": 49407,
"TokenizerLimit": 77,
"TokenizerLength": 1280,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-v1-5\\cliptokenizer.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\tokenizer_2\\model.onnx"
},
"TextEncoderConfig": {
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-Olive-Onnx\\text_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\text_encoder\\model.onnx"
},
"TextEncoder2Config": {
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-Olive-Onnx\\text_encoder_2\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\text_encoder_2\\model.onnx"
},
"UnetConfig": {
"ModelType": "Base",
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-Olive-Onnx\\unet\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\unet\\model.onnx"
},
"VaeDecoderConfig": {
"ScaleFactor": 0.13025,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-Olive-Onnx\\vae_decoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\vae_decoder\\model.onnx"
},
"VaeEncoderConfig": {
"ScaleFactor": 0.13025,
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-Olive-Onnx\\vae_encoder\\model.onnx"
"OnnxModelPath": "D:\\Repositories\\stable-diffusion-xl-base-1.0-onnx\\vae_encoder\\model.onnx"
}
}
]
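
The appsettings.json changes above consolidate every model set onto its own ONNX export folder (stable-diffusion-v1-5-onnx, LCM_Dreamshaper_v7-onnx, stable-diffusion-xl-base-1.0-onnx) and stop reusing the SD 1.5 cliptokenizer.onnx for the LCM and SDXL tokenizers. Below is a minimal sketch for sanity-checking a config like this after editing the paths; it is not OnnxStack code, just a standalone snippet that walks the JSON and reports any OnnxModelPath that does not exist on disk.

```csharp
// Standalone sketch (not OnnxStack code): report missing model files referenced by the config.
using System;
using System.IO;
using System.Text.Json;

var root = JsonDocument.Parse(File.ReadAllText("appsettings.json")).RootElement;
CheckPaths(root);

static void CheckPaths(JsonElement element)
{
    if (element.ValueKind == JsonValueKind.Object)
    {
        foreach (var property in element.EnumerateObject())
        {
            if (property.Name == "OnnxModelPath" && !File.Exists(property.Value.GetString()))
                Console.WriteLine($"Missing model file: {property.Value.GetString()}");
            CheckPaths(property.Value);
        }
    }
    else if (element.ValueKind == JsonValueKind.Array)
    {
        foreach (var item in element.EnumerateArray())
            CheckPaths(item);
    }
}
```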
17 changes: 1 addition & 16 deletions OnnxStack.StableDiffusion/Pipelines/StableCascadePipeline.cs
@@ -196,26 +196,11 @@ private async Task<PromptEmbeddingsResult> GenerateEmbedsAsync(TokenizerResult i
}

var promptTensor = new DenseTensor<float>(promptEmbeddings.ToArray(), new[] { 1, promptEmbeddings.Count / _tokenizer.TokenizerLength, _tokenizer.TokenizerLength });
var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.ToArray(), new[] { 1, tokenBatches.Count, 1280 });
var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.ToArray(), new[] { 1, tokenBatches.Count, _tokenizer.TokenizerLength });
return new PromptEmbeddingsResult(promptTensor, pooledTensor);
}


/// <summary>
/// Pads the input array with blank tokens.
/// </summary>
/// <param name="inputs">The inputs.</param>
/// <param name="requiredLength">Length of the required.</param>
/// <returns></returns>
private IEnumerable<long> PadWithBlankTokens(IEnumerable<long> inputs, int requiredLength, int padTokenId)
{
var count = inputs.Count();
if (requiredLength > count)
return inputs.Concat(Enumerable.Repeat((long)padTokenId, requiredLength - count));
return inputs;
}


/// <summary>
/// Creates the pipeline from a ModelSet configuration.
/// </summary>
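
The StableCascadePipeline.cs hunk swaps the hard-coded 1280 pooled-embedding width for the tokenizer's configured length and deletes the file-local PadWithBlankTokens copy (presumably a shared implementation is used instead; the diff does not show where it lives). A small sketch of the shape arithmetic the new dims rely on, with illustrative values:

```csharp
// Illustrative values only; the point is that the dims product must equal the buffer length.
int tokenizerLength = 1280;      // _tokenizer.TokenizerLength (1280 for a CLIP-G style encoder)
int tokenBatchCount = 2;         // one pooled vector is collected per token batch
var pooledPromptEmbeddings = new float[tokenBatchCount * tokenizerLength];

var dims = new[] { 1, tokenBatchCount, tokenizerLength };   // replaces { 1, tokenBatchCount, 1280 }
System.Diagnostics.Debug.Assert(dims[0] * dims[1] * dims[2] == pooledPromptEmbeddings.Length);
```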
16 changes: 3 additions & 13 deletions OnnxStack.StableDiffusion/Pipelines/StableDiffusionPipeline.cs
@@ -16,7 +16,6 @@
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Threading;
using System.Threading.Tasks;

@@ -561,8 +560,8 @@ protected async Task<EncoderResult> EncodePromptTokensAsync(TokenizerResult toke
inferenceParameters.AddOutputBuffer(new int[] { 1, _tokenizer.TokenizerLength });

var results = await _textEncoder.RunInferenceAsync(inferenceParameters);
using (var promptEmbeds = results.Last())
using (var promptEmbedsPooled = results.First())
using (var promptEmbeds = results.First())
using (var promptEmbedsPooled = results.Last())
{
return new EncoderResult(promptEmbeds.ToDenseTensor(), promptEmbedsPooled.ToDenseTensor());
}
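
The swap above reads the per-token prompt embeddings from the first result and the pooled embeddings from the last, matching the order in which the output buffers are registered. Reading outputs by name removes that positional dependency; a sketch with the plain onnxruntime C# API is below. The model path, input name, dtype and output names follow common CLIP text-encoder export conventions and are assumptions here, not taken from this commit.

```csharp
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;

// A single padded 77-token batch of token ids (some exports expect int32 instead of int64).
var inputIds = new DenseTensor<long>(new[] { 1, 77 });
var inputs = new[] { NamedOnnxValue.CreateFromTensor("input_ids", inputIds) };

using var session = new InferenceSession(@"D:\Repositories\stable-diffusion-v1-5-onnx\text_encoder\model.onnx");
using var results = session.Run(inputs);

// Resolve outputs by name rather than by position.
var promptEmbeds = results.First(r => r.Name == "last_hidden_state").AsTensor<float>(); // { 1, 77, 768 }
var pooledEmbeds = results.First(r => r.Name == "pooler_output").AsTensor<float>();     // { 1, 768 }
```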
@@ -593,7 +592,6 @@ protected async Task<PromptEmbeddingsResult> GeneratePromptEmbedsAsync(Tokenizer
foreach (var attentionBatch in inputTokens.AttentionMask.Batch(_tokenizer.TokenizerLimit))
attentionBatches.Add(PadWithBlankTokens(attentionBatch, _tokenizer.TokenizerLimit, 1).ToArray());


var promptEmbeddings = new List<float>();
var pooledPromptEmbeddings = new List<float>();
for (int i = 0; i < tokenBatches.Count; i++)
@@ -603,16 +601,8 @@ protected async Task<PromptEmbeddingsResult> GeneratePromptEmbedsAsync(Tokenizer
pooledPromptEmbeddings.AddRange(result.PooledPromptEmbeds);
}


//var embeddingsDim = new[] { 1, promptEmbeddings.Count / _tokenizer2.TokenizerLength, _tokenizer2.TokenizerLength };
//var promptTensor = new DenseTensor<float>(promptEmbeddings.ToArray(), embeddingsDim);

////TODO: Pooled embeds do not support more than 77 tokens, just grab first set
//var pooledDim = new[] { 1, _tokenizer2.TokenizerLength };
//var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.Take(_tokenizer2.TokenizerLength).ToArray(), pooledDim);

var promptTensor = new DenseTensor<float>(promptEmbeddings.ToArray(), new[] { 1, promptEmbeddings.Count / _tokenizer.TokenizerLength, _tokenizer.TokenizerLength });
var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.ToArray(), new[] { 1, _tokenizer.TokenizerLimit, _tokenizer.TokenizerLength });
var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.ToArray(), new[] { 1, tokenBatches.Count, _tokenizer.TokenizerLength });
return new PromptEmbeddingsResult(promptTensor, pooledTensor);
}

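
In GeneratePromptEmbedsAsync the prompt ids and attention mask are split into TokenizerLimit-sized batches, each batch is padded (ids with the blank token, the attention mask with 1) and encoded, and the pooled tensor is now shaped per batch instead of per token limit. The standalone sketch below shows the split-and-pad step using Enumerable.Chunk in place of OnnxStack's Batch extension, with illustrative values.

```csharp
using System;
using System.Linq;

const int tokenizerLimit = 77;     // TokenizerLimit from the config above
const long blankTokenId = 49407;   // BlankTokenId for the CLIP tokenizer

long[] inputIds = Enumerable.Range(0, 100).Select(i => (long)i).ToArray(); // a 100-token prompt

// Split into 77-token batches, then pad the short final batch up to the limit.
var tokenBatches = inputIds
    .Chunk(tokenizerLimit)
    .Select(batch => batch.Concat(Enumerable.Repeat(blankTokenId, tokenizerLimit - batch.Length)).ToArray())
    .ToList();

Console.WriteLine($"{tokenBatches.Count} batches of {tokenizerLimit} tokens each"); // 2 batches
```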
34 changes: 7 additions & 27 deletions OnnxStack.StableDiffusion/Pipelines/StableDiffusionXLPipeline.cs
@@ -245,13 +245,17 @@ private async Task<EncoderResult> EncodeTokensAsync(TokenizerResult tokenizedInp
{
int hiddenStateIndex = metadata.Outputs.Count - 2;
inferenceParameters.AddInputTensor(inputTensor);

// text_embeds + hidden_states.31 ("31" because SDXL always indexes from the penultimate layer.)
inferenceParameters.AddOutputBuffer(new[] { 1, _tokenizer2.TokenizerLength });
inferenceParameters.AddOutputBuffer(hiddenStateIndex, new[] { 1, tokenizedInput.InputIds.Length, _tokenizer2.TokenizerLength });

var results = await _textEncoder2.RunInferenceAsync(inferenceParameters);
var promptEmbeds = results.Last().ToDenseTensor();
var promptEmbedsPooled = results.First().ToDenseTensor();
return new EncoderResult(promptEmbeds, promptEmbedsPooled);
using (var promptEmbeds = results.Last())
using (var promptEmbedsPooled = results.First())
{
return new EncoderResult(promptEmbeds.ToDenseTensor(), promptEmbedsPooled.ToDenseTensor());
}
}
}
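
The EncodeTokensAsync change above registers two output buffers for the second text encoder: the pooled text_embeds and the hidden-state output two positions from the end of the model's output list, i.e. the penultimate hidden-state layer that SDXL conditions on. A tiny illustration of that Count - 2 arithmetic with a made-up output list:

```csharp
// Made-up output list standing in for the text_encoder_2 ONNX metadata (illustrative only).
var outputNames = new[]
{
    "text_embeds",
    "last_hidden_state",
    // hidden_states.0 .. hidden_states.30 elided
    "hidden_states.31",
    "hidden_states.32",
};

int hiddenStateIndex = outputNames.Length - 2;            // second-to-last output
System.Console.WriteLine(outputNames[hiddenStateIndex]);  // hidden_states.31
```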

@@ -279,7 +283,6 @@ private async Task<PromptEmbeddingsResult> GenerateEmbedsAsync(TokenizerResult i
foreach (var attentionBatch in inputTokens.AttentionMask.Batch(_tokenizer.TokenizerLimit))
attentionBatches.Add(PadWithBlankTokens(attentionBatch, _tokenizer.TokenizerLimit, 1).ToArray());


var promptEmbeddings = new List<float>();
var pooledPromptEmbeddings = new List<float>();
for (int i = 0; i < tokenBatches.Count; i++)
@@ -288,36 +291,13 @@ private async Task<PromptEmbeddingsResult> GenerateEmbedsAsync(TokenizerResult i
promptEmbeddings.AddRange(result.PromptEmbeds);
pooledPromptEmbeddings.AddRange(result.PooledPromptEmbeds);
}


//var embeddingsDim = new[] { 1, promptEmbeddings.Count / _tokenizer2.TokenizerLength, _tokenizer2.TokenizerLength };
//var promptTensor = new DenseTensor<float>(promptEmbeddings.ToArray(), embeddingsDim);

////TODO: Pooled embeds do not support more than 77 tokens, just grab first set
//var pooledDim = new[] { 1, _tokenizer2.TokenizerLength };
//var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.Take(_tokenizer2.TokenizerLength).ToArray(), pooledDim);

var promptTensor = new DenseTensor<float>(promptEmbeddings.ToArray(), new[] { 1, promptEmbeddings.Count / _tokenizer2.TokenizerLength, _tokenizer2.TokenizerLength });
var pooledTensor = new DenseTensor<float>(pooledPromptEmbeddings.ToArray(), new[] { 1, pooledPromptEmbeddings.Count });
return new PromptEmbeddingsResult(promptTensor, pooledTensor);
}


/// <summary>
/// Pads the input array with blank tokens.
/// </summary>
/// <param name="inputs">The inputs.</param>
/// <param name="requiredLength">Length of the required.</param>
/// <returns></returns>
private IEnumerable<long> PadWithBlankTokens(IEnumerable<long> inputs, int requiredLength, int padTokenId)
{
var count = inputs.Count();
if (requiredLength > count)
return inputs.Concat(Enumerable.Repeat((long)padTokenId, requiredLength - count));
return inputs;
}


/// <summary>
/// Creates the pipeline from a ModelSet configuration.
/// </summary>
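
For the SDXL embeddings the prompt tensor is shaped against the second tokenizer's width while the pooled embeddings are flattened into a single { 1, count } row. A worked shape example under the Tokenizer2Config values shown earlier (illustrative numbers, not pipeline code):

```csharp
int tokenizerLength2 = 1280;   // Tokenizer2Config.TokenizerLength
int tokenizerLimit = 77;       // Tokenizer2Config.TokenizerLimit
int tokenBatchCount = 2;       // e.g. a long prompt split into two batches

int promptFloats = tokenBatchCount * tokenizerLimit * tokenizerLength2; // per-token hidden states
int pooledFloats = tokenBatchCount * tokenizerLength2;                  // one pooled vector per batch

var promptDims = new[] { 1, promptFloats / tokenizerLength2, tokenizerLength2 }; // { 1, 154, 1280 }
var pooledDims = new[] { 1, pooledFloats };                                      // { 1, 2560 }
```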
