Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,15 @@ function copy_ops(){
return
fi

if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
if [ "$if_corex" = "True" ]; then
DEVICE_TYPE="iluvatar-gpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
return
fi

DEVICE_TYPE="cpu"
cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
cd ../../../../
Expand Down
12 changes: 11 additions & 1 deletion custom_ops/gpu_ops/get_padding_offset.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/extension.h"
#include "helper.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
Expand Down Expand Up @@ -59,7 +60,12 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &token_num,
const paddle::Tensor &seq_len) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = input_ids.stream();
#endif
std::vector<int64_t> input_ids_shape = input_ids.shape();
const int bsz = seq_len.shape()[0];
const int seq_length = input_ids_shape[1];
Expand All @@ -75,7 +81,11 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
auto cu_seqlens_k =
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
int blockSize = min((token_num_data + 32 - 1) / 32 * 32, 128);
#ifdef PADDLE_WITH_COREX
int blockSize = std::min((token_num_data + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE, 128);
#else
int blockSize = min((token_num_data + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE, 128);
#endif
GetPaddingOffsetKernel<<<bsz, 128, 0, cu_stream>>>(
padding_offset.data<int>(),
cum_offsets_out.data<int>(),
Expand Down
19 changes: 19 additions & 0 deletions custom_ops/gpu_ops/helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

#pragma once

#ifndef PADDLE_WITH_COREX
#include "glog/logging.h"
#endif
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
Expand All @@ -35,22 +37,35 @@ namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#endif
#ifndef PADDLE_WITH_COREX
#include "nlohmann/json.hpp"
#endif
#include <fstream>
#include <iostream>

#include "env.h"
#include "paddle/extension.h"
#include "paddle/phi/core/allocator.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/custom/custom_context.h"
#else
#include "paddle/phi/core/cuda_stream.h"
#endif
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/backends/gpu/gpu_info.h"

#ifdef PADDLE_WITH_COREX
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif
#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
#endif

#ifndef PADDLE_WITH_COREX
using json = nlohmann::json;
#endif

#define CUDA_CHECK(call) \
do { \
Expand Down Expand Up @@ -237,6 +252,7 @@ inline int GetBlockSize(int vocab_size) {
}
}

#ifndef PADDLE_WITH_COREX
inline json readJsonFromFile(const std::string &filePath) {
std::ifstream file(filePath);
if (!file.is_open()) {
Expand All @@ -247,6 +263,7 @@ inline json readJsonFromFile(const std::string &filePath) {
file >> j;
return j;
}
#endif

#define cudaCheckError() \
{ \
Expand Down Expand Up @@ -418,6 +435,7 @@ inline std::string base64_decode(const std::string &encoded_string) {
return ret;
}

#ifndef PADDLE_WITH_COREX
template <typename T>
inline T get_relative_best(nlohmann::json *json_data,
const std::string &target_key,
Expand All @@ -430,6 +448,7 @@ inline T get_relative_best(nlohmann::json *json_data,
return default_value;
}
}
#endif

__device__ inline bool is_in_end(const int64_t id, const int64_t *end_ids,
int length) {
Expand Down
1 change: 0 additions & 1 deletion custom_ops/gpu_ops/noaux_tc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <algorithm>
#include <optional>

#include "helper.h"
#include "noauxtc_kernel.h"

std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
Expand Down
2 changes: 1 addition & 1 deletion custom_ops/gpu_ops/noauxtc_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
#pragma once
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include "helper.h"

namespace cg = cooperative_groups;

constexpr unsigned FULL_WARP_MASK = 0xffffffff;
constexpr int32_t WARP_SIZE = 32;
constexpr int32_t BLOCK_SIZE = 512;
constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

Expand Down
9 changes: 7 additions & 2 deletions custom_ops/gpu_ops/rebuild_padding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,12 @@ std::vector<paddle::Tensor> rebuild_padding(
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;

#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(tmp_out.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = tmp_out.stream();
#endif
std::vector<int64_t> tmp_out_shape = tmp_out.shape();
const int token_num = tmp_out_shape[0];
const int dim_embed = tmp_out_shape[1];
Expand Down Expand Up @@ -125,7 +130,7 @@ std::vector<paddle::Tensor> rebuild_padding(

if (output_padding_offset) {
RebuildAppendPaddingKernel<DataType_, PackSize>
<<<grid_size, blocksize, 0, tmp_out.stream()>>>(
<<<grid_size, blocksize, 0, cu_stream>>>(
reinterpret_cast<DataType_ *>(out.data<data_t>()),
reinterpret_cast<const DataType_ *>(tmp_out.data<data_t>()),
cum_offsets.data<int>(),
Expand All @@ -138,7 +143,7 @@ std::vector<paddle::Tensor> rebuild_padding(
elem_nums);
} else {
RebuildPaddingKernel<DataType_, PackSize>
<<<grid_size, blocksize, 0, tmp_out.stream()>>>(
<<<grid_size, blocksize, 0, cu_stream>>>(
reinterpret_cast<DataType_ *>(out.data<data_t>()),
reinterpret_cast<DataType_ *>(
const_cast<data_t *>(tmp_out.data<data_t>())),
Expand Down
1 change: 0 additions & 1 deletion custom_ops/gpu_ops/sample_kernels/air_top_p_sampling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,6 @@ __global__ void air_topp_sampling(Counter<T> *counters, T *histograms,
}

// scan/find
constexpr int WARP_SIZE = 32;
constexpr int WARP_COUNT = NumBuckets / WARP_SIZE;
namespace cg = cooperative_groups;
cg::thread_block block = cg::this_thread_block();
Expand Down
8 changes: 7 additions & 1 deletion custom_ops/gpu_ops/set_value_by_flags.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/extension.h"
#include "helper.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
Expand Down Expand Up @@ -51,13 +52,18 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
const paddle::Tensor &seq_lens_decoder,
const paddle::Tensor &step_idx,
const paddle::Tensor &stop_flags) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(stop_flags.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = stop_flags.stream();
#endif
std::vector<int64_t> pre_ids_all_shape = pre_ids_all.shape();

int bs = seq_lens_this_time.shape()[0];
int length = pre_ids_all_shape[1];
int length_input_ids = input_ids.shape()[1];
int block_size = (bs + 32 - 1) / 32 * 32;
int block_size = (bs + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_flag_and_id<<<1, block_size, 0, cu_stream>>>(
stop_flags.data<bool>(),
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
Expand Down
7 changes: 6 additions & 1 deletion custom_ops/gpu_ops/step.cu
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ __global__ void free_and_dispatch_block(bool *stop_flags,
? tmp_used_len + 1
: max_decoder_block_num_this_seq;
#ifdef DEBUG_STEP
printf("#### ori_step_len:%d, ori_free_list_len:%d, used_len:%d \n",
printf("#### ori_step_len:%d, ori_free_list_len:%d, used_len:%d \n",
ori_step_len, ori_free_list_len, used_len);
#endif
while (ori_step_len > 0 && ori_free_list_len >= used_len) {
Expand Down Expand Up @@ -323,7 +323,12 @@ void StepPaddle(const paddle::Tensor &stop_flags,
const paddle::Tensor &first_token_ids,
const int block_size,
const int encoder_decoder_block_num) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(seq_lens_this_time.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = seq_lens_this_time.stream();
#endif
const int bsz = seq_lens_this_time.shape()[0];
const int block_num_per_seq = block_tables.shape()[1];
const int length = input_ids.shape()[1];
Expand Down
7 changes: 6 additions & 1 deletion custom_ops/gpu_ops/stop_generation_multi_ends.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,16 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
}
}

#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = topk_ids.stream();
#endif
std::vector<int64_t> shape = topk_ids.shape();
int64_t bs_now = shape[0];
int64_t end_length = end_ids.shape()[0];
int block_size = (bs_now + 32 - 1) / 32 * 32;
int block_size = (bs_now + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_flags<<<1, block_size, 0, cu_stream>>>(
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
Expand Down
8 changes: 7 additions & 1 deletion custom_ops/gpu_ops/stop_generation_multi_stop_seqs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <sys/types.h>
#include <unistd.h>
#include "paddle/extension.h"
#include "helper.h"

#ifndef PD_BUILD_STATIC_OP
#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
Expand Down Expand Up @@ -88,15 +89,20 @@ void GetStopFlagsMultiSeqs(const paddle::Tensor &topk_ids,
PD_CHECK(topk_ids.dtype() == paddle::DataType::INT64);
PD_CHECK(stop_flags.dtype() == paddle::DataType::BOOL);

#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = topk_ids.stream();
#endif
std::vector<int64_t> shape = topk_ids.shape();
std::vector<int64_t> stop_seqs_shape = stop_seqs.shape();
int bs_now = shape[0];
int stop_seqs_bs = stop_seqs_shape[0];
int stop_seqs_max_len = stop_seqs_shape[1];
int pre_ids_len = pre_ids.shape()[1];

int block_size = (stop_seqs_bs + 31) / 32 * 32;
int block_size = (stop_seqs_bs + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
set_value_by_stop_seqs<<<bs_now, block_size, 0, cu_stream>>>(
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
Expand Down
25 changes: 21 additions & 4 deletions custom_ops/gpu_ops/token_penalty_multi_scores.cu
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,12 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
typedef PDTraits<D> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(logits.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = logits.stream();
#endif
std::vector<int64_t> shape = logits.shape();
auto repeat_times =
paddle::full(shape, 0, paddle::DataType::INT32, pre_ids.place());
Expand All @@ -143,7 +148,7 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,

int64_t end_length = eos_token_id.shape()[0];

int block_size = (bs + 32 - 1) / 32 * 32;
int block_size = (bs + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
min_length_logits_process<<<1, block_size, 0, cu_stream>>>(
reinterpret_cast<DataType_ *>(
const_cast<data_t *>(logits.data<data_t>())),
Expand All @@ -154,8 +159,12 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
length,
end_length);

block_size = (length_id + 32 - 1) / 32 * 32;
block_size = (length_id + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
#ifdef PADDLE_WITH_COREX
block_size = std::min(block_size, 512);
#else
block_size = min(block_size, 512);
#endif
update_repeat_times<<<bs, block_size, 0, cu_stream>>>(
pre_ids.data<int64_t>(),
cur_len.data<int64_t>(),
Expand All @@ -164,8 +173,12 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
length,
length_id);

block_size = (length + 32 - 1) / 32 * 32;
block_size = (length + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
#ifdef PADDLE_WITH_COREX
block_size = std::min(block_size, 512);
#else
block_size = min(block_size, 512);
#endif
update_value_by_repeat_times<DataType_><<<bs, block_size, 0, cu_stream>>>(
repeat_times.data<int>(),
reinterpret_cast<DataType_ *>(
Expand All @@ -180,8 +193,12 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
bs,
length);

block_size = (length_bad_words + 32 - 1) / 32 * 32;
block_size = (length_bad_words + WARP_SIZE - 1) / WARP_SIZE * WARP_SIZE;
#ifdef PADDLE_WITH_COREX
block_size = std::min(block_size, 512);
#else
block_size = min(block_size, 512);
#endif
ban_bad_words<DataType_><<<bs, block_size, 0, cu_stream>>>(
reinterpret_cast<DataType_ *>(
const_cast<data_t *>(logits.data<data_t>())),
Expand Down
8 changes: 7 additions & 1 deletion custom_ops/gpu_ops/update_inputs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,17 @@ void UpdateInputes(const paddle::Tensor &stop_flags,
const paddle::Tensor &stop_nums,
const paddle::Tensor &next_tokens,
const paddle::Tensor &is_block_step) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
auto cu_stream = dev_ctx->stream();
#else
auto cu_stream = input_ids.stream();
#endif
const int max_bsz = stop_flags.shape()[0];
const int now_bsz = seq_lens_this_time.shape()[0];
const int input_ids_stride = input_ids.shape()[1];
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
update_inputs_kernel<1024><<<1, 1024, 0, input_ids.stream()>>>(
update_inputs_kernel<1024><<<1, 1024, 0, cu_stream>>>(
const_cast<bool *>(not_need_stop_gpu.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
const_cast<int *>(seq_lens_encoder.data<int>()),
Expand Down
Loading