Skip to content

[NPU]: Added support for the jsd operator #1134

Merged
Tcc0403 merged 1 commit into
linkedin:main from
TianHao324:jsd
Mar 6, 2026
Merged

[NPU]: Added support for the jsd operator #1134
Tcc0403 merged 1 commit into
linkedin:main from
TianHao324:jsd

Conversation

@TianHao324

Copy link
Copy Markdown
Contributor

Summary

  • Grid-stride loop optimization: efficient multi-row processing with automatic grid size tuning
  • Memory access optimization: column-blocked processing with a configurable BLOCK_N, and dynamic block size selection based on tensor width
  • The grid size is capped at the number of NPU cores, so that the NPU's parallelism is fully utilized.

Testing Done

image
  • Hardware Type: Atlas 800I A2
  • run make test to ensure correctness
  • run make checkstyle to ensure code style
  • run make test-convergence to ensure convergence

@TianHao324

Copy link
Copy Markdown
Contributor Author

benchmark:


 BENCHMARKING MEMORY for JSD

********** Benchmark Data **********
[
{
"kernel_name": "jsd",
"kernel_provider": "liger",
"metric_name": "memory",
"metric_unit": "MB",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
192.015625,
384.015625,
768.015625,
1536.015625,
3072.015625,
6144.015625
],
"y_values_20": [
192.015625,
384.015625,
768.015625,
1536.015625,
3072.015625,
6144.015625
],
"y_values_80": [
192.015625,
384.015625,
768.015625,
1536.015625,
3072.015625,
6144.015625
],
"timestamp": "2026-03-06 08:54:55",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "torch",
"metric_name": "memory",
"metric_unit": "MB",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
352.03857421875,
704.03857421875,
1408.03857421875,
2816.03857421875,
5632.03857421875,
11264.0390625
],
"y_values_20": [
352.03857421875,
704.03857421875,
1408.03857421875,
2816.03857421875,
5632.03857421875,
11264.0390625
],
"y_values_80": [
352.03857421875,
704.03857421875,
1408.03857421875,
2816.03857421875,
5632.03857421875,
11264.0390625
],
"timestamp": "2026-03-06 08:54:55",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
}
]


 BENCHMARKING SPEED for JSD

********** Benchmark Data **********
[
{
"kernel_name": "jsd",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
0.478630006313324,
0.7067800164222717,
1.3792400360107422,
2.5661699771881104,
5.090069770812988,
9.940679550170898
],
"y_values_20": [
0.47011199593544006,
0.7052839994430542,
1.377336025238037,
2.5642600059509277,
5.085480213165283,
9.931424140930176
],
"y_values_80": [
0.4902479946613312,
0.7082239985466003,
1.3811800479888916,
2.5689198970794678,
5.094135761260986,
9.964380264282227
],
"timestamp": "2026-03-06 08:54:58",
"kernel_operation_mode": "forward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
1.16048002243042,
3.0406599044799805,
6.026619911193848,
11.739540100097656,
23.267559051513672,
46.058799743652344
],
"y_values_20": [
1.1576240062713623,
3.038520097732544,
6.0222601890563965,
11.732372283935547,
23.2564754486084,
46.049163818359375
],
"y_values_80": [
1.1626839637756348,
3.04259991645813,
6.029759883880615,
11.74675178527832,
23.27593994140625,
46.06843566894531
],
"timestamp": "2026-03-06 08:54:59",
"kernel_operation_mode": "forward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
0.5714399814605713,
0.9373999834060669,
1.6046600341796875,
2.894559860229492,
5.5177202224731445,
10.93474006652832
],
"y_values_20": [
0.5691199898719788,
0.9341719746589661,
1.6027599573135376,
2.89194393157959,
5.513200283050537,
10.917699813842773
],
"y_values_80": [
0.5740320086479187,
0.9402480125427246,
1.6085000038146973,
2.902996063232422,
5.521203994750977,
10.94302749633789
],
"timestamp": "2026-03-06 08:55:00",
"kernel_operation_mode": "backward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
1.0624200105667114,
3.004240036010742,
6.02878999710083,
11.979249954223633,
24.077869415283203,
48.287208557128906
],
"y_values_20": [
1.0607399940490723,
3.0010199546813965,
6.026700019836426,
11.975540161132812,
24.070615768432617,
48.28386688232422
],
"y_values_80": [
1.0644999742507935,
3.00711989402771,
6.031099796295166,
11.982004165649414,
24.081411361694336,
48.29055404663086
],
"timestamp": "2026-03-06 08:55:02",
"kernel_operation_mode": "backward",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "liger",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
0.844480037689209,
1.5041699409484863,
2.815340042114258,
5.351160049438477,
10.401260375976562,
20.71086883544922
],
"y_values_20": [
0.8365039825439453,
1.4844000339508057,
2.8105480670928955,
5.347079753875732,
10.392600059509277,
20.695655822753906
],
"y_values_80": [
0.8565959930419922,
1.5103880167007446,
2.8189799785614014,
5.359784126281738,
10.40552043914795,
20.7326602935791
],
"timestamp": "2026-03-06 08:55:03",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
},
{
"kernel_name": "jsd",
"kernel_provider": "torch",
"metric_name": "speed",
"metric_unit": "ms",
"gpu_name": "Ascend910B4",
"x_name": "V",
"x_label": "vocab size",
"x_values": [
1024,
2048,
4096,
8192,
16384,
32768
],
"y_values_50": [
2.1105000972747803,
5.968270301818848,
11.979690551757812,
23.64215087890625,
47.256202697753906,
94.29144287109375
],
"y_values_20": [
2.105384111404419,
5.96396017074585,
11.974987983703613,
23.636568069458008,
47.251041412353516,
94.29144287109375
],
"y_values_80": [
2.113931894302368,
5.970839977264404,
11.984211921691895,
23.64492416381836,
47.26136016845703,
94.29144287109375
],
"timestamp": "2026-03-06 08:55:05",
"kernel_operation_mode": "full",
"extra_benchmark_config_str": "{\"B\": 4, \"T\": 2048}",
"liger_version": "0.0.0"
}
]

@TianHao324

Copy link
Copy Markdown
Contributor Author

@Tcc0403 would you mind taking a look?

@Tcc0403 Tcc0403 left a comment

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@Tcc0403 Tcc0403 added this pull request to the merge queue Mar 6, 2026
Merged via the queue into linkedin:main with commit 6e1ec4a Mar 6, 2026
5 of 7 checks passed
@TianHao324 TianHao324 deleted the jsd branch March 9, 2026 02:35
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants