
Commit

fix many bugs
kroggen committed Sep 12, 2023
1 parent e0d9212 commit d7079eb
Showing 4 changed files with 15 additions and 15 deletions.
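Taken together, the changes are small portability and correctness fixes: the AWQ conversion script moves each tensor to the CPU before dumping it, the CUDA vec_mat_kernel uses an explicit half-typed zero instead of an integer literal, the interactive prompt loop reads input with fgets rather than the MSVC-only gets_s, and the weight packer switches from fopen_s to plain fopen and from backslashes to forward slashes in its path templates.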
convert_awq_to_bin.py: 2 changes (1 addition & 1 deletion)
@@ -28,4 +28,4 @@
print(value.shape, value.dtype)
# Dump the tensor to a binary file with the same name as the key in the given directory
with open(os.path.join(dirname, key + '.bin'), 'wb') as f:
- f.write(value.numpy().tobytes())
+ f.write(value.cpu().numpy().tobytes())
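One note on the script change: Tensor.numpy() only works on tensors in host memory and raises an error for CUDA tensors, so the added .cpu() call copies the tensor to the host first (it is a no-op when the tensor is already on the CPU).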
llama2_q4.cu: 8 changes (4 additions & 4 deletions)
@@ -216,7 +216,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int n = start_n + threadIdx.x;
int k = threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;

float sum = 0;
// Loop over the matrix row and vector elements
@@ -226,7 +226,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int start_k = e * 32;
k = start_k + threadIdx.x;
int buf_i = e & 1;
- sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0);
+ sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0.0f);

// load for the next iteration
e++;
@@ -235,7 +235,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
n = start_n + threadIdx.x;
k = start_k + threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;
}

using WarpReduce = cub::WarpReduce<float>;
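The zero-literal change above matters because the two arms of the conditional previously had different types, half and int; on some CUDA toolkits that mixed conditional is rejected as ambiguous, since the compiler can either convert the int to half or the half to a built-in type. Casting the literal to half keeps both arms the same type. The parallel 0 -> 0.0f change in the accumulation line is mainly for consistency, as an int/float conditional is already unambiguous. A minimal sketch of the guarded-load pattern, assuming a standalone .cu file (the kernel and names are illustrative, not taken from this repository):

    #include <cuda_fp16.h>

    // Copy n halves into a buffer padded to padded_n, writing a half-typed
    // zero past the end of the source data.
    __global__ void guarded_copy(half* dst, const half* src, int n, int padded_n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < padded_n)
            // Both arms are half, so no int -> half conversion is required.
            dst[i] = (i < n) ? src[i] : (half)0.0;
    }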
@@ -881,7 +881,7 @@ int main(int argc, char *argv[]) {
printf("\nachieved tok/s: %f. Tokens: %d, seconds: %g\n", timed_tokens / time, timed_tokens, time);

printf("enter next prompt: ");
- gets_s(input_message);
+ fgets(input_message, sizeof(input_message), stdin);
}

// memory cleanup
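The other change in this file replaces gets_s, which belongs to the optional Annex K bounds-checking interfaces and is effectively MSVC-only, with the portable fgets. One behavioral difference: fgets keeps the trailing newline in the buffer, so callers typically strip it. A minimal sketch, assuming input_message is a fixed-size array (if it were a pointer, sizeof would not give the buffer length; buffer size and trimming here are illustrative):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char input_message[2048];                 // size is illustrative
        printf("enter next prompt: ");
        if (fgets(input_message, sizeof(input_message), stdin)) {
            // fgets keeps the '\n'; trim it so downstream code sees only the prompt
            input_message[strcspn(input_message, "\r\n")] = '\0';
            printf("got: %s\n", input_message);
        }
        return 0;
    }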
llama2_q4_opt.cu: 8 changes (4 additions & 4 deletions)
@@ -220,7 +220,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
int n = start_n + threadIdx.x;
int k = threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[0][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;

float sum = 0;
// Loop over the matrix row and vector elements
@@ -231,7 +231,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
if (start_k >= K) break;
k = start_k + threadIdx.x;
int buf_i = e & 1;
- sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0);
+ sum += float(loaded_fragment[buf_i][threadIdx.x][threadIdx.y]) * ((k < K) ? (float) input[k] : 0.0f);

// load for the next iteration
e++;
@@ -240,7 +240,7 @@ __global__ void vec_mat_kernel(half* op, const half* __restrict__ ip, const half
n = start_n + threadIdx.x;
k = start_k + threadIdx.y;
int offset = k * w_row_stride + n;
- loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : 0;
+ loaded_fragment[buf_i][threadIdx.y][threadIdx.x] = ((n < N) && (k < K)) ? weight[offset] : (half)0.0;
}

using WarpReduce = cub::WarpReduce<float>;
@@ -953,7 +953,7 @@ int main(int argc, char *argv[]) {
printf("\nachieved tok/s: %f. Tokens: %d, seconds: %g\n", timed_tokens / time, timed_tokens, time);

printf("enter next prompt: ");
- gets_s(input_message);
+ fgets(input_message, sizeof(input_message), stdin);
}

// memory cleanup
weight_packer.cpp: 12 changes (6 additions & 6 deletions)
@@ -197,14 +197,14 @@ int main(int argc, char *argv[])

// read the config file
FILE* fp_config;
- fopen_s(&fp_config, config_file_name, "rb");
+ fp_config = fopen(config_file_name, "rb");
if (!fp_config) { printf("unable to open config file\n"); return 0; }
if(fread(config_json, 1, sizeof(config_json), fp_config) == 0) { printf("unable to read config file\n"); return 0; }
fclose(fp_config);
getConfig(&g_config, config_json);

FILE* fp;
- fopen_s(&fp, op_file_name, "wb+");
+ fp = fopen(op_file_name, "wb+");
if (!fp) { printf("unable to open output file\n"); return 0; }

// write the header
@@ -213,20 +213,20 @@ int main(int argc, char *argv[])
char fileNameBase[512];
char filename[512];

- sprintf(filename, "%s\\model.embed_tokens.weight.bin", input_dir);
+ sprintf(filename, "%s/model.embed_tokens.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.vocab_size * g_config.dim * sizeof(uint16_t));

- sprintf(filename, "%s\\lm_head.weight.bin", input_dir);
+ sprintf(filename, "%s/lm_head.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.vocab_size * g_config.dim * sizeof(uint16_t));

- sprintf(filename, "%s\\model.norm.weight.bin", input_dir);
+ sprintf(filename, "%s/model.norm.weight.bin", input_dir);
copyInputFileToFile(fp, filename, g_config.dim * sizeof(uint16_t));

for (int i = 0; i < g_config.n_layers; i++)
{
printf("\nProcessing weights for layer: %d\n", i);

- sprintf(fileNameBase, "%s\\model.layers.%d", input_dir, i);
+ sprintf(fileNameBase, "%s/model.layers.%d", input_dir, i);

repackQWeightByName(fp, fileNameBase, "self_attn.q_proj", g_config.dim, g_config.dim);
repackQWeightByName(fp, fileNameBase, "self_attn.k_proj", g_config.dim, g_config.dim);
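The weight_packer.cpp changes follow the same portability theme: fopen_s is an MSVC-specific (Annex K) interface while fopen is standard everywhere, and forward slashes in the sprintf path templates work on Linux as well as Windows, since the Windows C runtime accepts either separator. A minimal sketch of the pattern with a hypothetical helper and paths (snprintf is used here for bounds safety; the file itself uses sprintf):

    #include <stdio.h>

    // Hypothetical helper: open a file with portable fopen and warn on failure.
    static FILE* open_checked(const char* path, const char* mode)
    {
        FILE* fp = fopen(path, mode);
        if (!fp) printf("unable to open %s\n", path);
        return fp;
    }

    int main()
    {
        char filename[512];
        // Forward slashes are accepted by fopen on both Linux and Windows.
        snprintf(filename, sizeof(filename), "%s/model.norm.weight.bin", "weights");
        FILE* fp = open_checked(filename, "rb");
        if (fp) fclose(fp);
        return 0;
    }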
