Skip to content

[webgl]Add functions for parallel compilation #5826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 102 additions & 14 deletions tfjs-backend-webgl/src/backend_webgl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
import './flags_webgl';

import * as tf from '@tensorflow/tfjs-core';
import {backend_util, BackendValues, buffer, DataId, DataStorage, DataToGPUWebGLOption, DataType, DataValues, engine, env, GPUData, kernel_impls, KernelBackend, MemoryInfo, NumericDataType, Rank, RecursiveArray, scalar, ShapeMap, Tensor, Tensor2D, TensorBuffer, TensorInfo, tidy, TimingInfo, TypedArray, util} from '@tensorflow/tfjs-core';

import {backend_util, BackendValues, buffer, DataId, DataStorage, DataToGPUWebGLOption, DataType, DataValues, engine, env, GPUData, kernel_impls, KernelBackend, MemoryInfo, nextFrame, NumericDataType, Rank, RecursiveArray, scalar, ShapeMap, Tensor, Tensor2D, TensorBuffer, TensorInfo, tidy, TimingInfo, TypedArray, util} from '@tensorflow/tfjs-core';
import {getWebGLContext} from './canvas_util';
import {DecodeMatrixProgram} from './decode_matrix_gpu';
import {DecodeMatrixPackedProgram} from './decode_matrix_packed_gpu';
Expand All @@ -30,7 +29,7 @@ import {EncodeMatrixProgram} from './encode_matrix_gpu';
import {EncodeMatrixPackedProgram} from './encode_matrix_packed_gpu';
import {GPGPUContext} from './gpgpu_context';
import * as gpgpu_math from './gpgpu_math';
import {GPGPUBinary, GPGPUProgram, TensorData} from './gpgpu_math';
import {getUniformLocations, GPGPUBinary, GPGPUProgram, TensorData} from './gpgpu_math';
import {simpleAbsImplCPU} from './kernel_utils/shared';
import {PackProgram} from './pack_gpu';
import {ReshapePackedProgram} from './reshape_packed_gpu';
Expand Down Expand Up @@ -549,15 +548,16 @@ export class MathBackendWebGL extends KernelBackend {
};

return (async () => {
if (env()
.getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') > 0) {
if (env().getNumber('WEBGL_DISJOINT_QUERY_TIMER_EXTENSION_RELIABLE') >
0) {
const kernelMs = await Promise.all(flattenedActiveTimerQueries);

res['kernelMs'] = util.sum(kernelMs);
res['getExtraProfileInfo'] = () =>
kernelMs.map((d, i) => ({name: flattenedActiveTimerNames[i], ms: d}))
.map(d => `${d.name}: ${d.ms}`)
.join(', ');
kernelMs
.map((d, i) => ({name: flattenedActiveTimerNames[i], ms: d}))
.map(d => `${d.name}: ${d.ms}`)
.join(', ');
} else {
res['kernelMs'] = {
error: 'WebGL query timers are not supported in this environment.'
Expand Down Expand Up @@ -949,8 +949,10 @@ export class MathBackendWebGL extends KernelBackend {
query = this.startTimer();
}

gpgpu_math.runProgram(
this.gpgpu, binary, inputsData, outputData, customUniformValues);
if (!env().get('ENGINE_COMPILE_ONLY')) {
gpgpu_math.runProgram(
this.gpgpu, binary, inputsData, outputData, customUniformValues);
}

dataToDispose.forEach(info => this.disposeIntermediateTensorInfo(info));

Expand Down Expand Up @@ -1130,16 +1132,21 @@ export class MathBackendWebGL extends KernelBackend {

// Have the original texture assume the identity of the encoded output.
const outputTexData = this.texData.get(encodedOutputTarget.dataId);
texData.texture = outputTexData.texture;
texData.texShape = outputTexData.texShape;
texData.isPacked = outputTexData.isPacked;
texData.usage = outputTexData.usage;

if (!env().get('ENGINE_COMPILE_ONLY')) {
texData.texture = outputTexData.texture;
// Once uploaded, don't store the values on cpu.
texData.values = null;
this.texData.delete(encodedOutputTarget.dataId);
} else {
this.disposeData(encodedOutputTarget.dataId);
}

this.disposeIntermediateTensorInfo(tempDenseInputHandle);
this.texData.delete(encodedOutputTarget.dataId);

// Once uploaded, don't store the values on cpu.
texData.values = null;
if (shouldTimeProgram) {
this.uploadWaitMs += util.now() - start;
}
Expand Down Expand Up @@ -1180,6 +1187,87 @@ export class MathBackendWebGL extends KernelBackend {
/**
 * Computes the number of bytes occupied by `shape[0] * shape[1]` elements
 * of the given dtype.
 *
 * @param shape Two-element shape whose entries are multiplied together.
 * @param dtype Data type passed to `util.bytesPerElement`.
 * @returns Element count multiplied by the per-element byte size.
 */
private computeBytes(shape: [number, number], dtype: DataType) {
  const [rows, cols] = shape;
  const elementCount = rows * cols;
  return elementCount * util.bytesPerElement(dtype);
}

/**
 * Synchronously runs `checkCompletion_` on every entry of the binary cache.
 * Any error thrown by `checkCompletion_` propagates to the caller and stops
 * the iteration.
 */
checkCompileCompletion() {
  Object.values(this.binaryCache).forEach(cachedBinary => {
    this.checkCompletion_(cachedBinary);
  });
}

/**
 * Asynchronously checks every program in the binary cache.
 *
 * If the GPGPU context exposes `parallelCompilationExtension`, each program
 * is checked through `checkCompletionAsync_`. Otherwise each program is
 * checked immediately with `checkCompletion_`, and a thrown error is turned
 * into a rejected promise.
 *
 * @returns A promise resolving to one boolean per cached program. It rejects
 *     with the first failure reported by any individual check.
 */
async checkCompileCompletionAsync(): Promise<boolean[]> {
  const binaries = Object.values(this.binaryCache);

  if (this.gpgpu.parallelCompilationExtension) {
    return Promise.all(
        binaries.map(binary => this.checkCompletionAsync_(binary)));
  }

  // An async arrow still runs checkCompletion_ eagerly (there is no await
  // before it) and converts a synchronous throw into a rejection, so no
  // explicit Promise constructor or try/catch-rethrow is needed.
  return Promise.all(
      binaries.map(async binary => this.checkCompletion_(binary)));
}

/**
 * Waits until the program's COMPLETION_STATUS_KHR parameter is truthy,
 * awaiting `nextFrame()` between polls, then validates the program with
 * `checkCompletion_`.
 *
 * @param binary The cached binary whose `webGLProgram` is polled.
 * @returns The result of `checkCompletion_(binary)`; rejects if it throws.
 */
private async checkCompletionAsync_(binary: GPGPUBinary): Promise<boolean> {
  while (!this.gpgpu.gl.getProgramParameter(
      binary.webGLProgram,
      this.gpgpu.parallelCompilationExtension.COMPLETION_STATUS_KHR)) {
    await nextFrame();
  }
  return this.checkCompletion_(binary);
}

/**
 * Validates the link status of a cached program.
 *
 * When LINK_STATUS is `false`, the program info log is printed. If the
 * fragment shader's COMPILE_STATUS is also `false`, the shader source and
 * info log are logged and a compile error is thrown; otherwise a link error
 * is thrown.
 *
 * @param binary The cached binary to validate.
 * @returns `true` when the link status is not `false`.
 * @throws Error if the fragment shader failed to compile or the program
 *     failed to link.
 */
private checkCompletion_(binary: GPGPUBinary): boolean {
  const gl = this.gpgpu.gl;
  const glProgram = binary.webGLProgram;

  const linkStatus = gl.getProgramParameter(glProgram, gl.LINK_STATUS);
  if (linkStatus !== false) {
    return true;
  }

  console.log(gl.getProgramInfoLog(glProgram));

  const compileStatus =
      gl.getShaderParameter(binary.fragmentShader, gl.COMPILE_STATUS);
  if (compileStatus === false) {
    webgl_util.logShaderSourceAndInfoLog(
        binary.source, gl.getShaderInfoLog(binary.fragmentShader));
    throw new Error('Failed to compile fragment shader.');
  }

  throw new Error('Failed to link vertex and fragment shaders.');
}

/**
 * Looks up the uniform locations for every binary in the cache and stores
 * them on the binary itself.
 *
 * The bare `getUniformLocations(...)` call inside the loop refers to the
 * function imported from './gpgpu_math', not to this method.
 */
getUniformLocations() {
  for (const cachedBinary of Object.values(this.binaryCache)) {
    const locations = getUniformLocations(
        this.gpgpu, cachedBinary.program, cachedBinary.webGLProgram);

    cachedBinary.uniformLocations = locations.uniformLocations;
    cachedBinary.customUniformLocations = locations.customUniformLocations;
    cachedBinary.infLoc = locations.infLoc;
    cachedBinary.nanLoc = locations.nanLoc;
    cachedBinary.inShapesLocations = locations.inShapesLocations;
    cachedBinary.inTexShapesLocations = locations.inTexShapesLocations;
    cachedBinary.outShapeLocation = locations.outShapeLocation;
    cachedBinary.outShapeStridesLocation = locations.outShapeStridesLocation;
    cachedBinary.outTexShapeLocation = locations.outTexShapeLocation;
  }
}
}

function float32ToTypedArray<D extends NumericDataType>(
Expand Down
81 changes: 81 additions & 0 deletions tfjs-backend-webgl/src/backend_webgl_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1033,3 +1033,84 @@ describeWithFlags('custom canvas ', WEBGL_ENVS, () => {
tf.removeBackend(customBackendName);
});
});
// Checks that a compile-only pass followed by normal execution leaves the
// GPU byte count, tensor count, data-id count and binary-cache size the same
// as before.
describeWithFlags('Parallel compilation', WEBGL_ENVS, () => {
// TODO(lina128): Also test async after parallel compilation flag is
// implemented in context object. We have to keep the test sync for now:
// because it's a global flag, an async test would affect other tests.
it('does not have memory leak.', () => {
// Save the current WEBGL_CPU_FORWARD value so it can be restored at the end
// of the test, then turn it off for the duration of the test.
const savedWebGLCPUForward = tf.env().get('WEBGL_CPU_FORWARD');
tf.env().set('WEBGL_CPU_FORWARD', false);

// Baseline: run tf.add on a fresh custom backend without ENGINE_COMPILE_ONLY
// and record how many entries the binary cache holds afterwards.
const customWebGLBackendName = 'my-webgl';
tf.copyRegisteredKernels('webgl', customWebGLBackendName);
tf.registerBackend(customWebGLBackendName, () => {
return new MathBackendWebGL();
});
tf.setBackend(customWebGLBackendName);

const a0 = tf.tensor1d([1, 1, 1]);
const b0 = tf.tensor1d([1, 1, 1]);
const c0 = tf.add(a0, b0);
const data = c0.dataSync();
const numOfBinaryCacheNoParallelCompillation =
Object.keys(getBinaryCache(tf.ENV.getNumber('WEBGL_VERSION'))).length;
expectArraysClose(data, [2, 2, 2]);
tf.dispose([a0, b0, c0]);
tf.removeBackend(customWebGLBackendName);

// TODO(lina128): Also test using an existing backend after parallel
// compilation flag is implemented in context object. The current approach
// assumes there's no binary cache, and it doesn't check existing cache.
const customWebGLBackendName1 = 'my-webgl1';
tf.copyRegisteredKernels('webgl', customWebGLBackendName1);
tf.registerBackend(customWebGLBackendName1, () => {
return new MathBackendWebGL();
});
tf.setBackend(customWebGLBackendName1);
const webGLBackend = tf.backend() as MathBackendWebGL;

// Snapshot memory counters before any tensors are created on this backend.
const startNumBytes = (tf.memory() as WebGLMemoryInfo).numBytesInGPU;
const startTensor = tf.memory().numTensors;
const startDataBuckets = webGLBackend.numDataIds();

const a1 = tf.tensor1d([1, 1, 1]);
const b1 = tf.tensor1d([1, 1, 1]);

// Pre-compile round.
tf.env().set('ENGINE_COMPILE_ONLY', true);
const c1 = tf.add(a1, b1);
webGLBackend.checkCompileCompletion();
webGLBackend.getUniformLocations();

// Warm-up upload and download round.
tf.env().set('ENGINE_COMPILE_ONLY', false);
const c2 = tf.add(a1, b1);
c2.dataSync();

// Actual inference.
const c3 = tf.add(a1, b1);
expectArraysEqual(c3.dataSync(), [2, 2, 2]);

// Dispose everything created on this backend, then snapshot the counters
// again for comparison with the starting values.
tf.dispose([a1, b1, c1, c2, c3]);
const endNumBytes = (tf.memory() as WebGLMemoryInfo).numBytesInGPU;
const endTensor = tf.memory().numTensors;
const endDataBuckets = webGLBackend.numDataIds();

// We only check numBytesInGPU. For parallel compilation,
// numBytesInGPUAllocated will be more because of the two pass uploadToGPU,
// but they will all be freed, resulting in endNumBytes equal to
// startNumBytes.
expect(startNumBytes).toEqual(endNumBytes);
expect(startTensor).toEqual(endTensor);
expect(endDataBuckets).toEqual(startDataBuckets);

// The compile-only flow must leave the same number of binary-cache entries
// as the baseline run above.
const numOfBinaryCacheWithParallelCompillation =
Object.keys(getBinaryCache(tf.ENV.getNumber('WEBGL_VERSION'))).length;
expect(numOfBinaryCacheWithParallelCompillation)
.toEqual(numOfBinaryCacheNoParallelCompillation);

tf.removeBackend(customWebGLBackendName1);

// Restore the flag saved at the top of the test.
tf.env().set('WEBGL_CPU_FORWARD', savedWebGLCPUForward);
});
});
5 changes: 4 additions & 1 deletion tfjs-backend-webgl/src/gpgpu_context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import {getWebGLContext, setWebGLContext} from './canvas_util';
import * as gpgpu_util from './gpgpu_util';
import * as tex_util from './tex_util';
import {Texture, TextureConfig} from './tex_util';
import {WebGL1DisjointQueryTimerExtension, WebGL2DisjointQueryTimerExtension} from './webgl_types';
import {WebGL1DisjointQueryTimerExtension, WebGL2DisjointQueryTimerExtension, WebGLParallelCompilationExtension} from './webgl_types';
import * as webgl_util from './webgl_util';

export interface FenceContext {
Expand All @@ -37,6 +37,7 @@ export class GPGPUContext {
colorBufferHalfFloatExtension: {};
disjointQueryTimerExtension: WebGL2DisjointQueryTimerExtension|
WebGL1DisjointQueryTimerExtension;
parallelCompilationExtension: WebGLParallelCompilationExtension;
vertexBuffer: WebGLBuffer;
indexBuffer: WebGLBuffer;
framebuffer: WebGLFramebuffer;
Expand All @@ -58,6 +59,8 @@ export class GPGPUContext {
// WebGL 2.0 enables texture floats without an extension.
let COLOR_BUFFER_FLOAT = 'WEBGL_color_buffer_float';
const COLOR_BUFFER_HALF_FLOAT = 'EXT_color_buffer_half_float';
this.parallelCompilationExtension =
this.gl.getExtension('KHR_parallel_shader_compile');
if (env().getNumber('WEBGL_VERSION') === 1) {
const TEXTURE_FLOAT = 'OES_texture_float';
const TEXTURE_HALF_FLOAT = 'OES_texture_half_float';
Expand Down
72 changes: 57 additions & 15 deletions tfjs-backend-webgl/src/gpgpu_math.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ export interface GPGPUBinary {
outTexShapeLocation?: WebGLUniformLocation;
}

/**
 * The uniform locations looked up for a compiled WebGL program. This is the
 * return type of `getUniformLocations` in this file.
 */
export interface GPGPUBinaryLocations {
// Uniform locations keyed by name; filled while iterating
// program.variableNames.
uniformLocations: {[name: string]: WebGLUniformLocation};
// Indexed in the same order as program.customUniforms.
customUniformLocations?: WebGLUniformLocation[];
// Location of the 'INFINITY' uniform. Only looked up when WEBGL_VERSION is
// 1; otherwise it stays null.
infLoc: WebGLUniformLocation;
// Location of the 'NAN' uniform.
nanLoc: WebGLUniformLocation;
// NOTE(review): presumably per-input shape / texture-shape uniforms keyed by
// name; the code that fills these two maps is not visible here - confirm.
inShapesLocations?: {[name: string]: WebGLUniformLocation};
inTexShapesLocations?: {[name: string]: WebGLUniformLocation};
// The output-shape locations are assigned only inside the
// `program.enableShapeUniforms` branch ('outShape', 'outTexShape').
outShapeLocation?: WebGLUniformLocation;
// NOTE(review): presumably also set in that branch - the assignment is not
// visible here; confirm.
outShapeStridesLocation?: WebGLUniformLocation;
outTexShapeLocation?: WebGLUniformLocation;
}

export interface TensorData {
shape: number[];
texData: TextureData;
Expand Down Expand Up @@ -101,18 +113,58 @@ export function compileProgram<T extends Tensor, K extends Tensor>(
const fragmentShader = createFragmentShader(gpgpu.gl, source);
const webGLProgram = gpgpu.createProgram(fragmentShader);

// Add special uniforms (NAN, INFINITY)
if (!env().get('ENGINE_COMPILE_ONLY')) {
return {
program,
fragmentShader,
source,
webGLProgram,
inShapeInfos,
outShapeInfo,
...getUniformLocations(gpgpu, program, webGLProgram)
};
} else {
return {
program,
fragmentShader,
source,
webGLProgram,
inShapeInfos,
outShapeInfo,
uniformLocations: null,
customUniformLocations: null,
infLoc: null,
nanLoc: null,
inShapesLocations: null,
inTexShapesLocations: null,
outShapeLocation: null,
outShapeStridesLocation: null,
outTexShapeLocation: null
};
}
}

export function getUniformLocations(
gpgpu: GPGPUContext, program: GPGPUProgram,
webGLProgram: WebGLProgram): GPGPUBinaryLocations {
const uniformLocations: {[name: string]: WebGLUniformLocation} = {};
const inShapesLocations: {[name: string]: WebGLUniformLocation} = {};
const inTexShapesLocations: {[name: string]: WebGLUniformLocation} = {};
const customUniformLocations: WebGLUniformLocation[] = [];
let outShapeLocation: WebGLUniformLocation;
let outTexShapeLocation: WebGLUniformLocation;
let outShapeStridesLocation: WebGLUniformLocation;
let infLoc: WebGLUniformLocation = null;
const nanLoc = gpgpu.getUniformLocation(webGLProgram, 'NAN', false);
let nanLoc: WebGLUniformLocation = null;

// Add special uniforms (NAN, INFINITY)
nanLoc = gpgpu.getUniformLocation(webGLProgram, 'NAN', false);
if (env().getNumber('WEBGL_VERSION') === 1) {
infLoc = gpgpu.getUniformLocation(webGLProgram, 'INFINITY', false);
}

// Add user-defined uniforms
const shouldThrow = false;
const uniformLocations: {[name: string]: WebGLUniformLocation} = {};
const inShapesLocations: {[name: string]: WebGLUniformLocation} = {};
const inTexShapesLocations: {[name: string]: WebGLUniformLocation} = {};
for (let i = 0; i < program.variableNames.length; i++) {
const varName = program.variableNames[i];
uniformLocations[varName] =
Expand All @@ -127,9 +179,6 @@ export function compileProgram<T extends Tensor, K extends Tensor>(
}
}

let outShapeLocation: WebGLUniformLocation;
let outTexShapeLocation: WebGLUniformLocation;
let outShapeStridesLocation: WebGLUniformLocation;
if (program.enableShapeUniforms) {
outShapeLocation =
gpgpu.getUniformLocation(webGLProgram, 'outShape', shouldThrow);
Expand All @@ -139,7 +188,6 @@ export function compileProgram<T extends Tensor, K extends Tensor>(
gpgpu.getUniformLocation(webGLProgram, 'outTexShape', shouldThrow);
}

const customUniformLocations: WebGLUniformLocation[] = [];
if (program.customUniforms) {
program.customUniforms.forEach((d, i) => {
customUniformLocations[i] =
Expand All @@ -148,14 +196,8 @@ export function compileProgram<T extends Tensor, K extends Tensor>(
}

return {
program,
fragmentShader,
source,
webGLProgram,
uniformLocations,
customUniformLocations,
inShapeInfos,
outShapeInfo,
infLoc,
nanLoc,
inShapesLocations,
Expand Down
Loading