
Commit

fix many bugs
kroggen committed Sep 12, 2023
1 parent e0d9212 commit d7079eb
Showing 4 changed files with 15 additions and 15 deletions.
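Taken together, the changes are small portability and correctness fixes: the AWQ conversion script moves each tensor to the CPU before dumping it, the CUDA vec_mat_kernel uses an explicit half-typed zero instead of an integer literal, the interactive prompt loop reads input with fgets rather than the MSVC-only gets_s, and the weight packer switches from fopen_s to plain fopen and from backslashes to forward slashes in its path templates.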
convert_awq_to_bin.py: 2 changes (1 addition & 1 deletion)
@@ -28,4 +28,4 @@
print(value.shape, value.dtype)
# Dump the tensor to a binary file with the same name as the key in the given directory
with open(os.path.join(dirname, key + '.bin'), 'wb') as f:
- f.write(value.numpy().tobytes())
+ f.write(value.cpu().numpy().tobytes())
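One note on the script change: Tensor.numpy() only works on tensors in host memory and raises an error for CUDA tensors, so the added .cpu() call copies the tensor to the host first (it is a no-op when the tensor is already on the CPU).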
llama2_q4.cu: 8 changes (4 additions & 4 deletions)
@@ -216,7 +216,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int n = start_n + threadIdx.x;
int k = threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;

float sum = 0;
// Loop over the matrix row and vector elements
@@ -226,7 +226,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int start_k = e * 32;
k = start_k + threadIdx.x;
int buf_i = e & 1;
- sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0);
+ sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0.0f);

// load for the next iteration
e++;
@@ -235,7 +235,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
n = start_n + threadIdx.x;
k = start_k + threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;
}

using WarpReduce = cub::WarpReduce<float>;
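The zero-literal change above matters because the two arms of the conditional previously had different types, half and int; on some CUDA toolkits that mixed conditional is rejected as ambiguous, since the compiler can either convert the int to half or the half to a built-in type. Casting the literal to half keeps both arms the same type. The parallel 0 -> 0.0f change in the accumulation line is mainly for consistency, as an int/float conditional is already unambiguous. A minimal sketch of the guarded-load pattern, assuming a standalone .cu file (the kernel and names are illustrative, not taken from this repository):

    #include <cuda_fp16.h>

    // Copy n halves into a buffer padded to padded_n, writing a half-typed
    // zero past the end of the source data.
    __global__ void guarded_copy(half* dst, const half* src, int n, int padded_n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < padded_n)
            // Both arms are half, so no int -> half conversion is required.
            dst[i] = (i < n) ? src[i] : (half)0.0;
    }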
@@ -881,7 +881,7 @@ int main(int argc, char *argv[]) {
printf("\nachieved tok/s: %f. Tokens: %d, seconds: %g\n", timed_tokens / time, timed_tokens, time);

printf("enter next prompt: ");
- gets_s(input_message);
+ fgets(input_message, sizeof(input_message), stdin);
}

// memory cleanup
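The other change in this file replaces gets_s, which belongs to the optional Annex K bounds-checking interfaces and is effectively MSVC-only, with the portable fgets. One behavioral difference: fgets keeps the trailing newline in the buffer, so callers typically strip it. A minimal sketch, assuming input_message is a fixed-size array (if it were a pointer, sizeof would not give the buffer length; buffer size and trimming here are illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char input_message[2048];                 // size is illustrative
        printf("enter next prompt: ");
        if (fgets(input_message, sizeof(input_message), stdin)) {
            // fgets keeps the '\n'; trim it so downstream code sees only the prompt
            input_message[strcspn(input_message, "\r\n")] = '\0';
            printf("got: %s\n", input_message);
        }
        return 0;
    }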
llama2_q4_opt.cu: 8 changes (4 additions & 4 deletions)
@@ -220,7 +220,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int n = start_n + threadIdx.x;
int k = threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;

float sum = 0;
// Loop over the matrix row and vector elements
@@ -231,7 +231,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
if (start_k >= K) break;
k = start_k + threadIdx.x;
int buf_i = e & 1;
- sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0);
+ sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0.0f);

// load for the next iteration
e++;
@@ -240,7 +240,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
n = start_n + threadIdx.x;
k = start_k + threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;
}

using WarpReduce = cub::WarpReduce<float>;
@@ -953,7 +953,7 @@ int main(int argc, char *argv[]) {
printf("\nachieved tok/s: %f. Tokens: %d, seconds: %g\n", timed_tokens / time, timed_tokens, time);

printf("enter next prompt: ");
- gets_s(input_message);
+ fgets(input_message, sizeof(input_message), stdin);
}

// memory cleanup
weight_packer.cpp: 12 changes (6 additions & 6 deletions)
@@ -197,14 +197,14 @@ int main(int argc, char *argv[])

// read the config file
FILE* fp_config;
- fopen_s(&fp_config, config_file_name, "rb");
+ fp_config = fopen(config_file_name, "rb");
if (!fp_config) { printf("unable to open config file\n"); return 0; }
if(fread(config_json, 1, sizeof(config_json), fp_config) == 0) { printf("unable to read config file\n"); return 0; }
fclose(fp_config);
getConfig(&g_config, config_json);

FILE* fp;
- fopen_s(&fp, op_file_name, "wb+");
+ fp = fopen(op_file_name, "wb+");
if (!fp) { printf("unable to open output file\n"); return 0; }

// write the header
@@ -213,20 +213,20 @@ int main(int argc, char *argv[])
char fileNameBase[512];
char filename[512];

- sprintf(filename, "%s\\model.embed_tokens.weight.bin", input_dir);
+ sprintf(filename, "%s/model.embed_tokens.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.vocab_size * g_config.dim * sizeof(uint16_t));

- sprintf(filename, "%s\\lm_head.weight.bin", input_dir);
+ sprintf(filename, "%s/lm_head.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.vocab_size * g_config.dim * sizeof(uint16_t));

- sprintf(filename, "%s\\model.norm.weight.bin", input_dir);
+ sprintf(filename, "%s/model.norm.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.dim * sizeof(uint16_t));

for (int i = 0; i < g_config.n_layers; i++)
{
printf("\nProcessing weights for layer: %d\n", i);

- sprintf(fileNameBase, "%s\\model.layers.%d", input_dir, i);
+ sprintf(fileNameBase, "%s/model.layers.%d", input_dir, i);

repackQWeightByName(fp, fileNameBase, "self_attn.q_proj", g_config.dim, g_config.dim);
repackQWeightByName(fp, fileNameBase, "self_attn.k_proj", g_config.dim, g_config.dim);
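The weight_packer.cpp changes follow the same portability theme: fopen_s is an MSVC-specific (Annex K) interface while fopen is standard everywhere, and forward slashes in the sprintf path templates work on Linux as well as Windows, since the Windows C runtime accepts either separator. A minimal sketch of the pattern with a hypothetical helper and paths (snprintf is used here for bounds safety; the file itself uses sprintf):

    #include <stdio.h>

    // Hypothetical helper: open a file with portable fopen and warn on failure.
    static FILE* open_checked(const char* path, const char* mode)
    {
        FILE* fp = fopen(path, mode);
        if (!fp) printf("unable to open %s\n", path);
        return fp;
    }

    int main()
    {
        char filename[512];
        // Forward slashes are accepted by fopen on both Linux and Windows.
        snprintf(filename, sizeof(filename), "%s/model.norm.weight.bin", "weights");
        FILE* fp = open_checked(filename, "rb");
        if (fp) fclose(fp);
        return 0;
    }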
