
Commit 457f7b8

Author: bozheng-hit
Add Qwen3-Next.

Parent: 14b89fe

File tree

16 files changed: +2898 −2 lines


docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -653,6 +653,8 @@
       title: Qwen3
     - local: model_doc/qwen3_moe
       title: Qwen3MoE
+    - local: model_doc/qwen3_next
+      title: Qwen3Next
     - local: model_doc/rag
       title: RAG
     - local: model_doc/realm
```
docs/source/en/model_doc/qwen3_next.md (new file)

Lines changed: 97 additions & 0 deletions
<!--Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
## Overview

The Qwen3-Next series represents our next-generation foundation models, optimized for extreme context length and large-scale parameter efficiency.
The series introduces a suite of architectural innovations designed to maximize performance while minimizing computational cost:
- **Hybrid Attention**: Replaces standard attention with a combination of **Gated DeltaNet** and **Gated Attention**, enabling efficient context modeling (a minimal sketch of the gated delta rule follows this list).
- **High-Sparsity MoE**: Achieves an extremely low activation ratio of 1:50 in the MoE layers, drastically reducing FLOPs per token while preserving model capacity.
- **Multi-Token Prediction (MTP)**: Boosts pretraining performance and accelerates inference.
- **Other Optimizations**: Includes techniques such as **zero-centered and weight-decayed layernorm** and other stabilizing enhancements for robust training.
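
The gated delta rule behind **Gated DeltaNet** can be written as a simple recurrence over a fixed-size "fast weight" state, which is why it scales to extreme context lengths. Below is a minimal, unoptimized sketch for intuition only: the function name and shapes are illustrative, and the real layers use chunked, hardware-efficient kernels plus additional gating and normalization.

```python
import torch

def gated_delta_rule(q, k, v, alpha, beta):
    """Naive recurrent form of the gated delta rule (illustration only).

    q, k: (seq_len, d_k); v: (seq_len, d_v)
    alpha: (seq_len,) per-step decay gate in (0, 1)
    beta:  (seq_len,) per-step write strength in (0, 1)
    """
    d_k, d_v = k.shape[1], v.shape[1]
    S = torch.zeros(d_k, d_v)  # recurrent state; O(1) memory in sequence length
    outputs = []
    for t in range(q.shape[0]):
        S = alpha[t] * S                                        # gated decay of the old state
        S = S + beta[t] * torch.outer(k[t], v[t] - S.T @ k[t])  # rank-1 delta-rule update
        outputs.append(S.T @ q[t])                              # read out with the query
    return torch.stack(outputs)
```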

Built on this architecture, we trained and open-sourced Qwen3-Next-80B-A3B: 80B total parameters with only 3B active, achieving extreme sparsity and efficiency.

Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks while requiring **less than 1/10 of the training cost**.
Moreover, it delivers over **10x higher inference throughput** than Qwen3-32B when handling contexts longer than 32K tokens.

For more details, please see the [Qwen3-Next blog post](https://qwenlm.github.io/blog/qwen3_next/).
## Usage examples

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-Next-80B-A3B-Instruct"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)

# prepare the model input
prompt = "Give me a short introduction to large language models."
messages = [
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=16384,
)
# strip the prompt tokens, keeping only the newly generated ones
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

content = tokenizer.decode(output_ids, skip_special_tokens=True)

print("content:", content)
```
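
With `max_new_tokens` this large, it can be more practical to stream tokens as they are generated instead of waiting for the full completion. A small variant using the library's `TextStreamer`, reusing `model`, `tokenizer`, and `model_inputs` from the example above:

```python
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# decoded text is printed to stdout as generation proceeds
_ = model.generate(**model_inputs, max_new_tokens=512, streamer=streamer)
```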

## Qwen3NextConfig

[[autodoc]] Qwen3NextConfig

## Qwen3NextModel

[[autodoc]] Qwen3NextModel
    - forward

## Qwen3NextForCausalLM

[[autodoc]] Qwen3NextForCausalLM
    - forward

## Qwen3NextForSequenceClassification

[[autodoc]] Qwen3NextForSequenceClassification
    - forward

## Qwen3NextForQuestionAnswering

[[autodoc]] Qwen3NextForQuestionAnswering
    - forward

## Qwen3NextForTokenClassification

[[autodoc]] Qwen3NextForTokenClassification
    - forward

src/transformers/models/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -276,6 +276,7 @@
 from .qwen2_vl import *
 from .qwen3 import *
 from .qwen3_moe import *
+from .qwen3_next import *
 from .rag import *
 from .recurrent_gemma import *
 from .reformer import *
```

src/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -322,6 +322,7 @@
         ("qwen2_vl_text", "Qwen2VLTextConfig"),
         ("qwen3", "Qwen3Config"),
         ("qwen3_moe", "Qwen3MoeConfig"),
+        ("qwen3_next", "Qwen3NextConfig"),
         ("rag", "RagConfig"),
         ("realm", "RealmConfig"),
         ("recurrent_gemma", "RecurrentGemmaConfig"),
@@ -757,6 +758,7 @@
         ("qwen2_vl_text", "Qwen2VL"),
         ("qwen3", "Qwen3"),
         ("qwen3_moe", "Qwen3MoE"),
+        ("qwen3_next", "Qwen3Next"),
         ("rag", "RAG"),
         ("realm", "REALM"),
         ("recurrent_gemma", "RecurrentGemma"),
```

src/transformers/models/auto/modeling_auto.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -316,6 +316,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_vl_text", "Qwen2VLTextModel"),
         ("qwen3", "Qwen3Model"),
         ("qwen3_moe", "Qwen3MoeModel"),
+        ("qwen3_next", "Qwen3NextModel"),
         ("recurrent_gemma", "RecurrentGemmaModel"),
         ("reformer", "ReformerModel"),
         ("regnet", "RegNetModel"),
@@ -711,6 +712,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForCausalLM"),
         ("qwen3", "Qwen3ForCausalLM"),
         ("qwen3_moe", "Qwen3MoeForCausalLM"),
+        ("qwen3_next", "Qwen3NextForCausalLM"),
         ("recurrent_gemma", "RecurrentGemmaForCausalLM"),
         ("reformer", "ReformerModelWithLMHead"),
         ("rembert", "RemBertForCausalLM"),
@@ -1260,6 +1262,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForSequenceClassification"),
         ("qwen3", "Qwen3ForSequenceClassification"),
         ("qwen3_moe", "Qwen3MoeForSequenceClassification"),
+        ("qwen3_next", "Qwen3NextForSequenceClassification"),
         ("reformer", "ReformerForSequenceClassification"),
         ("rembert", "RemBertForSequenceClassification"),
         ("roberta", "RobertaForSequenceClassification"),
@@ -1349,6 +1352,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForQuestionAnswering"),
         ("qwen3", "Qwen3ForQuestionAnswering"),
         ("qwen3_moe", "Qwen3MoeForQuestionAnswering"),
+        ("qwen3_next", "Qwen3NextForQuestionAnswering"),
         ("reformer", "ReformerForQuestionAnswering"),
         ("rembert", "RemBertForQuestionAnswering"),
         ("roberta", "RobertaForQuestionAnswering"),
@@ -1462,6 +1466,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("qwen2_moe", "Qwen2MoeForTokenClassification"),
         ("qwen3", "Qwen3ForTokenClassification"),
         ("qwen3_moe", "Qwen3MoeForTokenClassification"),
+        ("qwen3_next", "Qwen3NextForTokenClassification"),
         ("rembert", "RemBertForTokenClassification"),
         ("roberta", "RobertaForTokenClassification"),
         ("roberta-prelayernorm", "RobertaPreLayerNormForTokenClassification"),
```

src/transformers/models/auto/tokenization_auto.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -575,6 +575,13 @@
             "Qwen2TokenizerFast" if is_tokenizers_available() else None,
         ),
     ),
+    (
+        "qwen3_next",
+        (
+            "Qwen2Tokenizer",
+            "Qwen2TokenizerFast" if is_tokenizers_available() else None,
+        ),
+    ),
     ("rag", ("RagTokenizer", None)),
     ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
     (
```
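
Rather than shipping a new tokenizer, Qwen3-Next reuses the Qwen2 tokenizer classes, as the entry above shows. A small check (assuming the checkpoint from the usage example is reachable):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
# qwen3_next maps onto the Qwen2 tokenizer classes, so this prints
# Qwen2TokenizerFast (or Qwen2Tokenizer if the `tokenizers` library is absent)
print(type(tokenizer).__name__)
```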
src/transformers/models/qwen3_next/__init__.py (new file)

Lines changed: 27 additions & 0 deletions
```python
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_qwen3_next import *
    from .modeling_qwen3_next import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
```
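
This `__init__.py` follows the repository's lazy-import pattern: at import time the module replaces itself in `sys.modules` with a `_LazyModule`, so `configuration_qwen3_next.py` and `modeling_qwen3_next.py` are only imported when one of their names is first accessed. A rough illustration of the observable behavior (a sketch, not a test of internals):

```python
import transformers.models.qwen3_next as qwen3_next

# the module object is a _LazyModule proxy, not a plain module
print(type(qwen3_next))

# the first attribute access triggers the real import of the configuration file
config_cls = qwen3_next.Qwen3NextConfig
print(config_cls.model_type)  # expected: qwen3_next
```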
