-
Notifications
You must be signed in to change notification settings - Fork 86
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* development based clang transpiler integration * added missing GitSubmodules.cmake * fixes for code review & OpenMP/Serial bug fix of non-polymorphic call was used * refactoring of integration, use function composition & callbacks strategy * make unchanged files unchanged * fix hipDeviceProp_t type to be the same as original HIP & revert back buildIncludes implementation * fix package build without occa-transpiler * update occa-transpiler version to v1.1 * update occa-transpiler to latest devel(fix cuda/hip intrinsics) * update occa-transpiler tagged version * move to tag v1.1 occa-transpiler * added example with occa-transpiler and C++ featured okl kernel * fixes for code review, move getTranspilerVersion from options to bin/occa.cpp as local function * update INSTALL.md & README.md documentation files * update occa-transpiler repo * add option to build new transpiler with locally installed clang * fix example of new oklt to support serial, openmp modes; remove debug print * add unsigned int to OCCA builtin types * update README and deps * update occa-transpiler to v1.1 * Remove occa-transpiler as a submodule * Make changes to link occa-transpiler as a library * Add a link to occa-transpiler README in INSTALL.md * Fix a few typos * Add a link to occa-transpiler repo --------- Co-authored-by: Viktor Yastrebov <v.yastrebov90@gmail.com> Co-authored-by: Iurii Kobein <ikobein@softserveinc.com> Co-authored-by: Thilina Ratnayaka <thilinarmtb@gmail.com> Co-authored-by: Iurii Kobein <61540607+IuriiKobein@users.noreply.github.com>
- Loading branch information
1 parent
6c2e7d3
commit 0e177a1
Showing
21 changed files
with
937 additions
and
116 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Register the moving-average example built from main.cpp through the project
# helper compile_cpp_example_with_modes (defined outside this file; presumably
# it creates the examples_cpp_oklt_v3_moving_avg target used below - confirm).
compile_cpp_example_with_modes(oklt_v3_moving_avg main.cpp) | ||
|
||
# Copy the shared header and the OKL kernel source out of the source tree on
# every default build (ALL). The destinations are relative paths, so they are
# resolved against the directory the commands run in.
add_custom_target(cpp_example_oklt_v3_moving_avg_cpy ALL | ||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/constants.h constants.h | ||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/movingAverage.okl movingAverage.okl) | ||
# Make the example target depend on the copy target above.
add_dependencies(examples_cpp_oklt_v3_moving_avg cpp_example_oklt_v3_moving_avg_cpy) | ||
# Attach the kernel and the header to the example target as private sources
# (presumably so that they are listed with the target in IDEs - confirm).
target_sources(examples_cpp_oklt_v3_moving_avg | ||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/movingAverage.okl | ||
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/constants.h | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#pragma once | ||
|
||
// Number of iterations of each @inner loop in the kernel (work-items per block).
constexpr int THREADS_PER_BLOCK = 1024;
// Width of the averaging window. It is a compile-time constant because an
// extern @shared array cannot be sized dynamically for CUDA.
constexpr int WINDOW_SIZE = 16;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

#include <occa.hpp>

#include "constants.h"
|
||
std::vector<float> buildData(std::size_t size, | ||
float initialValue, | ||
float fluctuation) | ||
{ | ||
std::vector<float> buffer(size); | ||
float currentValue = initialValue; | ||
float longIncrement = 1.0f; | ||
float fluctuationIncrement = fluctuation; | ||
for(std::size_t i = 0; i < buffer.size(); ++i) { | ||
buffer[i] = currentValue; | ||
fluctuationIncrement = -fluctuationIncrement; | ||
if(i % WINDOW_SIZE == 0) { | ||
longIncrement = -longIncrement; | ||
} | ||
currentValue += longIncrement + fluctuationIncrement; | ||
} | ||
return buffer; | ||
} | ||
|
||
std::vector<float> goldMovingAverage(const std::vector<float> &hostVector) { | ||
std::vector<float> result(hostVector.size() - WINDOW_SIZE); | ||
for(std::size_t i = 0; i < result.size(); ++i) { | ||
float value = 0.0f; | ||
for(std::size_t j = 0; j < WINDOW_SIZE; ++j) { | ||
value += hostVector[i + j]; | ||
} | ||
result[i] = value / WINDOW_SIZE; | ||
} | ||
return result; | ||
} | ||
|
||
/**
 * Tells whether `str` begins with `substring`.
 *
 * @param str       string to inspect
 * @param substring prefix to look for; an empty prefix always matches
 * @return true when the leading characters of `str` equal `substring`
 */
bool starts_with(const std::string &str, const std::string &substring) {
  // Compare only the leading substring.size() characters of str; when str is
  // shorter than the prefix the comparison is non-zero.
  return str.compare(0, substring.size(), substring) == 0;
}
|
||
/**
 * Extracts the device description from the command line.
 *
 * The first argument that starts with "-d" or "--device" and is followed by
 * another argument selects the device: that following argument is parsed as
 * JSON and returned.
 *
 * @param argc number of entries in argv
 * @param argv command-line arguments
 * @return the parsed device options, or {mode: 'Serial'} when no such pair
 *         of arguments is present
 */
occa::json getDeviceOptions(int argc, const char **argv) {
  // A flag is only usable when a value follows it, so the last argument never
  // needs to be inspected.
  for (int pos = 0; pos + 1 < argc; ++pos) {
    const std::string flag(argv[pos]);
    const bool isDeviceFlag = starts_with(flag, "-d") || starts_with(flag, "--device");
    if (!isDeviceFlag) {
      continue;
    }
    std::string deviceSpec(argv[pos + 1]);
    return occa::json::parse(deviceSpec);
  }
  return occa::json::parse("{mode: 'Serial'}");
}
|
||
int main(int argc, const char **argv) { | ||
|
||
occa::json deviceOpts = getDeviceOptions(argc, argv); | ||
auto inputHostBuffer = buildData(THREADS_PER_BLOCK * WINDOW_SIZE + WINDOW_SIZE, 10.0f, 4.0f); | ||
std::vector<float> outputHostBuffer(inputHostBuffer.size() - WINDOW_SIZE); | ||
|
||
occa::device device(deviceOpts); | ||
occa::memory deviceInput = device.malloc<float>(inputHostBuffer.size()); | ||
occa::memory deviceOutput = device.malloc<float>(outputHostBuffer.size()); | ||
|
||
occa::json buildProps({ | ||
{"transpiler-version", 3} | ||
}); | ||
|
||
occa::kernel movingAverageKernel = device.buildKernel("movingAverage.okl", "movingAverage32f", buildProps); | ||
|
||
deviceInput.copyFrom(inputHostBuffer.data(), inputHostBuffer.size()); | ||
|
||
movingAverageKernel(deviceInput, | ||
static_cast<int>(inputHostBuffer.size()), | ||
deviceOutput, | ||
static_cast<int>(deviceOutput.size())); | ||
|
||
// Copy result to the host | ||
deviceOutput.copyTo(&outputHostBuffer[0], outputHostBuffer.size()); | ||
|
||
auto goldValue = goldMovingAverage(inputHostBuffer); | ||
|
||
constexpr const float EPSILON = 0.001f; | ||
for(std::size_t i = 0; i < outputHostBuffer.size(); ++i) { | ||
bool isValid = std::abs(goldValue[i] - outputHostBuffer[i]) < EPSILON; | ||
if(!isValid) { | ||
std::cout << "Comparison with gold values has failed" << std::endl; | ||
return 1; | ||
} | ||
} | ||
std::cout << "Comparison with gold has passed" << std::endl; | ||
std::cout << "Moving average finished" << std::endl; | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#include "constants.h" | ||
|
||
// Per-block helper that computes a moving average through two caller-provided
// staging buffers.
//
// Template parameters:
//   T       - element type
//   THREADS - number of thread indices handled per block
//   WINDOW  - number of consecutive elements averaged per output value
//
// The object does not own any storage: it keeps the two pointers handed to the
// constructor. The indexing below requires the input staging buffer to hold at
// least THREADS + WINDOW elements and the result buffer at least THREADS.
template<class T, | ||
int THREADS, | ||
int WINDOW> | ||
struct MovingAverage { | ||
// inputSize / outputSize: element counts of the global input and output arrays.
// shared_input / shared_output: staging buffers for the block's input chunk and results.
MovingAverage(int inputSize, | ||
int outputSize, | ||
T *shared_input, | ||
T *shared_output) | ||
:_inputSize(inputSize) | ||
,_outputSize(outputSize) | ||
,_shared_data(shared_input) | ||
,_result_data(shared_output) | ||
{} | ||
|
||
// Stages the input needed by block `block_idx`: every thread copies one
// element of the block's chunk, and threads with thread_idx < WINDOW also copy
// one element from the start of the next chunk into the tail of the staging
// buffer. Elements beyond _inputSize are not copied, so their slots keep
// whatever they held before.
void syncCopyFrom(const T *input, int block_idx, int thread_idx) { | ||
int linearIdx = block_idx * THREADS + thread_idx; | ||
//INFO: copy base chunk | ||
if(linearIdx < _inputSize) { | ||
_shared_data[thread_idx] = input[linearIdx]; | ||
} | ||
//INFO: copy WINDOW chunk | ||
// Only the first WINDOW threads fill the tail, so the whole tail is covered
// only when WINDOW <= THREADS.
int tailIdx = (block_idx + 1) * THREADS + thread_idx; | ||
if(tailIdx < _inputSize && thread_idx < WINDOW) { | ||
_shared_data[THREADS + thread_idx] = input[tailIdx]; | ||
} | ||
@barrier; | ||
} | ||
|
||
// Averages the WINDOW staged elements starting at `thread_idx` and stores the
// result in the result buffer slot of the same index.
void process(int thread_idx) { | ||
T sum = T(); | ||
for(int i = 0; i < WINDOW; ++i) { | ||
sum += _shared_data[thread_idx + i]; | ||
} | ||
_result_data[thread_idx] = sum / WINDOW; | ||
@barrier; | ||
} | ||
|
||
// Writes this thread's result to `output` at the block-global index, skipping
// indices at or beyond _outputSize.
void syncCopyTo(T *output, int block_idx, int thread_idx) { | ||
int linearIdx = block_idx * THREADS + thread_idx; | ||
if(linearIdx < _outputSize) { | ||
output[linearIdx] = _result_data[thread_idx]; | ||
} | ||
@barrier; | ||
} | ||
private: | ||
int _inputSize; | ||
int _outputSize; | ||
|
||
//INFO: not supported | ||
// @shared T _data[THREADS_PER_BLOCK + WINDOW_SIZE]; | ||
// @shared T _result[THREADS_PER_BLOCK]; | ||
|
||
// Non-owning pointers to the staging buffers supplied by the caller.
T *_shared_data; | ||
T *_result_data; | ||
}; | ||
|
||
// Moving-average kernel for float data.
//
//   inputData  - input samples, inputSize elements
//   outputData - destination, outputSize elements
//
// Each @outer iteration handles one block of THREADS_PER_BLOCK outputs in
// three @inner passes: stage the input in @shared memory, average, write back.
@kernel void movingAverage32f(@restrict const float *inputData, | ||
int inputSize, | ||
@restrict float *outputData, | ||
int outputSize) | ||
{ | ||
// The "+ 1" yields one extra block when outputSize is a multiple of
// THREADS_PER_BLOCK; the bounds checks in MovingAverage keep that block from
// touching memory past inputSize / outputSize.
@outer(0) for (int block_idx = 0; block_idx < outputSize / THREADS_PER_BLOCK + 1; ++block_idx) { | ||
// Staging buffers: one block of input plus a WINDOW_SIZE tail, and one block of results.
@shared float blockInput[THREADS_PER_BLOCK + WINDOW_SIZE]; | ||
@shared float blockResult[THREADS_PER_BLOCK]; | ||
MovingAverage<float, THREADS_PER_BLOCK, WINDOW_SIZE> ma{ | ||
inputSize, | ||
outputSize, | ||
blockInput, | ||
blockResult | ||
}; | ||
// Pass 1: load this block's input chunk (and its tail) into blockInput.
@inner(0) for(int thread_idx = 0; thread_idx < THREADS_PER_BLOCK; ++thread_idx) { | ||
ma.syncCopyFrom(inputData, block_idx, thread_idx); | ||
} | ||
|
||
// Pass 2: compute one averaged value per thread into blockResult.
@inner(0) for(int thread_idx = 0; thread_idx < THREADS_PER_BLOCK; ++thread_idx) { | ||
ma.process(thread_idx); | ||
} | ||
|
||
// Pass 3: store the in-range results to outputData.
@inner(0) for(int thread_idx = 0; thread_idx < THREADS_PER_BLOCK; ++thread_idx) { | ||
ma.syncCopyTo(outputData, block_idx, thread_idx); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.