Skip to content

[NPU]: rms_norm kernel employs two-dimensional tensors #1108

Open
TianHao324 wants to merge 1 commit into linkedin:main from
TianHao324:rms_ext
Open

[NPU]: rms_norm kernel employs two-dimensional tensors #1108
TianHao324 wants to merge 1 commit into linkedin:main from
TianHao324:rms_ext

Conversation

@TianHao324
Copy link
Contributor

Summary

Since the tile size is smaller, each program can process multiple rows at once. Use 2D vector loading to maximize UB utilization (e.g., (1,2048), (2,1024), (4,512)). Processing multiple rows at once --> fewer iterations --> shorter runtime.

Testing Done

image
  • Hardware Type: Atlas 800I A2
  • run make test to ensure correctness
  • run make checkstyle to ensure code style
  • run make test-convergence to ensure convergence

@TianHao324
Copy link
Contributor Author

TianHao324 commented Feb 25, 2026

Benchmark:

**************************************
     BENCHMARKING SPEED for RMS_NORM
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.27724000811576843,
      0.27250000834465027,
      0.2768400013446808
    ],
    "y_values_20": [
      0.26898398995399475,
      0.2648400068283081,
      0.2716279923915863
    ],
    "y_values_80": [
      0.28770798444747925,
      0.28314000368118286,
      0.2830759882926941
    ],
    "timestamp": "2026-02-25 07:56:53",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.17693999409675598,
      0.17553000152111053,
      0.1946599930524826
    ],
    "y_values_20": [
      0.17073599994182587,
      0.1723559945821762,
      0.193340003490448
    ],
    "y_values_80": [
      0.18898800015449524,
      0.18021999299526215,
      0.1972000002861023
    ],
    "timestamp": "2026-02-25 07:56:54",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.8903200030326843,
      0.8352400064468384,
      0.8312600255012512
    ],
    "y_values_20": [
      0.8820199966430664,
      0.8228960037231445,
      0.8222399950027466
    ],
    "y_values_80": [
      0.90447998046875,
      0.8753079771995544,
      0.8437600135803223
    ],
    "timestamp": "2026-02-25 07:56:56",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.7076200246810913,
      0.7546899914741516,
      0.7398599982261658
    ],
    "y_values_20": [
      0.6830400228500366,
      0.7448920011520386,
      0.7370200157165527
    ],
    "y_values_80": [
      0.7516599893569946,
      0.768392026424408,
      0.7432039976119995
    ],
    "timestamp": "2026-02-25 07:56:57",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.4216200113296509,
      0.4512999951839447,
      0.4390999972820282
    ],
    "y_values_20": [
      0.4081360101699829,
      0.44123998284339905,
      0.4342360198497772
    ],
    "y_values_80": [
      0.4346199929714203,
      0.4624920189380646,
      0.447627991437912
    ],
    "timestamp": "2026-02-25 07:56:59",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      0.4243600070476532,
      0.430620014667511,
      0.6086400151252747
    ],
    "y_values_20": [
      0.4162200093269348,
      0.4232120215892792,
      0.6060600280761719
    ],
    "y_values_80": [
      0.4380800127983093,
      0.4398840069770813,
      0.6116600036621094
    ],
    "timestamp": "2026-02-25 07:57:01",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING MEMORY for RMS_NORM
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      6.177734375,
      12.341796875,
      24.669921875
    ],
    "y_values_20": [
      6.177734375,
      12.341796875,
      24.669921875
    ],
    "y_values_80": [
      6.177734375,
      12.341796875,
      24.669921875
    ],
    "timestamp": "2026-02-25 07:57:01",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "rms_norm",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      512,
      1024,
      2048
    ],
    "y_values_50": [
      40.021484375,
      80.02734375,
      160.0390625
    ],
    "y_values_20": [
      40.021484375,
      80.02734375,
      160.0390625
    ],
    "y_values_80": [
      40.021484375,
      80.02734375,
      160.0390625
    ],
    "timestamp": "2026-02-25 07:57:01",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.0.0"
  }
]

@TianHao324 TianHao324 closed this Feb 25, 2026
@TianHao324 TianHao324 reopened this Feb 25, 2026
@TianHao324
Copy link
Contributor Author

@Tcc0403 would you mind taking a look?

Copy link
Collaborator

@Tcc0403 Tcc0403 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants