
Commit 9958c21
Update TensorRT-LLM backend v0.14.0 (#637)
1 parent f80395e

29 files changed: +1046 −612 lines

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 24 additions & 45 deletions
@@ -44,6 +44,12 @@ input [
     data_type: TYPE_INT32
     dims: [ 1 ]
   },
+  {
+    name: "num_return_sequences"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "bad_words"
     data_type: TYPE_STRING
@@ -213,6 +219,11 @@ output [
     name: "batch_index"
     data_type: TYPE_INT32
     dims: [ 1 ]
+  },
+  {
+    name: "sequence_index"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
   }
 ]
 ensemble_scheduling {
@@ -388,6 +399,10 @@ ensemble_scheduling {
         key: "return_generation_logits"
         value: "return_generation_logits"
       }
+      input_map {
+        key: "num_return_sequences"
+        value: "num_return_sequences"
+      }
       input_map {
         key: "beam_width"
         value: "beam_width"
@@ -426,23 +441,27 @@ ensemble_scheduling {
       },
       output_map {
         key: "cum_log_probs"
-        value: "_CUM_LOG_PROBS"
+        value: "cum_log_probs"
       }
       output_map {
         key: "output_log_probs"
-        value: "_OUTPUT_LOG_PROBS"
+        value: "output_log_probs"
      },
      output_map {
        key: "context_logits"
-        value: "_CONTEXT_LOGITS"
+        value: "context_logits"
      },
      output_map {
        key: "generation_logits"
-        value: "_GENERATION_LOGITS"
+        value: "generation_logits"
      },
      output_map {
        key: "batch_index"
-        value: "_BATCH_INDEX"
+        value: "batch_index"
+      },
+      output_map {
+        key: "sequence_index"
+        value: "sequence_index"
      }
    },
    {
@@ -452,54 +471,14 @@ ensemble_scheduling {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
-      input_map {
-        key: "CUM_LOG_PROBS"
-        value: "_CUM_LOG_PROBS"
-      }
-      input_map {
-        key: "OUTPUT_LOG_PROBS"
-        value: "_OUTPUT_LOG_PROBS"
-      }
-      input_map {
-        key: "CONTEXT_LOGITS"
-        value: "_CONTEXT_LOGITS"
-      }
-      input_map {
-        key: "GENERATION_LOGITS"
-        value: "_GENERATION_LOGITS"
-      }
      input_map {
        key: "SEQUENCE_LENGTH"
        value: "_SEQUENCE_LENGTH"
      }
-      input_map {
-        key: "BATCH_INDEX"
-        value: "_BATCH_INDEX"
-      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
-      output_map {
-        key: "OUT_OUTPUT_LOG_PROBS"
-        value: "output_log_probs"
-      }
-      output_map {
-        key: "OUT_CUM_LOG_PROBS"
-        value: "cum_log_probs"
-      }
-      output_map {
-        key: "OUT_CONTEXT_LOGITS"
-        value: "context_logits"
-      }
-      output_map {
-        key: "OUT_GENERATION_LOGITS"
-        value: "generation_logits"
-      }
-      output_map {
-        key: "OUT_BATCH_INDEX"
-        value: "batch_index"
-      }
    }
  ]
 }
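
Taken together, the ensemble changes expose a new optional num_return_sequences input and a new sequence_index output, and rename the intermediate log-prob/logits tensors so the tensorrt_llm step maps them straight onto the ensemble's outputs. The sketch below is illustrative only, not part of this commit: it assumes a Triton server at localhost:8000 serving this ensemble under the name "ensemble", with sampling configured so multiple returned sequences are meaningful; whether the extra sequences arrive in one response or as a stream depends on the backend's decoupled setting.

# Hypothetical client sketch (not from this commit): exercises the new
# num_return_sequences input and sequence_index output of the ensemble.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

def make_input(name, arr):
    # Wrap a numpy array as a Triton InferInput with a matching dtype.
    t = httpclient.InferInput(name, list(arr.shape), np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t

client = httpclient.InferenceServerClient(url="localhost:8000")
result = client.infer("ensemble", [
    make_input("text_input", np.array([["What is Triton?"]], dtype=object)),
    make_input("max_tokens", np.array([[64]], dtype=np.int32)),
    # New optional input added by this commit: ask for 3 sampled sequences.
    make_input("num_return_sequences", np.array([[3]], dtype=np.int32)),
])
print(result.as_numpy("text_output"))
# New output added by this commit: identifies which returned sequence this is,
# alongside the existing batch_index.
print(result.as_numpy("sequence_index"))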

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 41 additions & 108 deletions
@@ -113,102 +113,53 @@ def execute(self, requests):
           be the same as `requests`
         """
 
+        tokens_batch = []
+        sequence_lengths = []
+        for idx, request in enumerate(requests):
+            for input_tensor in request.inputs():
+                if input_tensor.name() == "TOKENS_BATCH":
+                    tokens_batch.append(input_tensor.as_numpy())
+                elif input_tensor.name() == "SEQUENCE_LENGTH":
+                    sequence_lengths.append(input_tensor.as_numpy())
+                else:
+                    raise ValueError(f"unknown input {input_tensor.name}")
+
+        # batch decode
+        list_of_tokens = []
+        req_idx_offset = 0
+        req_idx_offsets = [req_idx_offset]
+        for idx, token_batch in enumerate(tokens_batch):
+            for batch_idx, beam_tokens in enumerate(token_batch):
+                for beam_idx, tokens in enumerate(beam_tokens):
+                    seq_len = sequence_lengths[idx][batch_idx][beam_idx]
+                    # Exclude fake ids in multimodal models
+                    fake_id_len = 0
+                    for i in range(seq_len):
+                        if tokens[i] < self.tokenizer.vocab_size:
+                            fake_id_len = i
+                            break
+                    list_of_tokens.append(tokens[fake_id_len:seq_len])
+                    req_idx_offset += 1
+
+            req_idx_offsets.append(req_idx_offset)
+
+        all_outputs = self.tokenizer.batch_decode(
+            list_of_tokens, skip_special_tokens=self.skip_special_tokens)
+
+        # construct responses
         responses = []
-
-        # Every Python backend must iterate over everyone of the requests
-        # and create a pb_utils.InferenceResponse for each of them.
         for idx, request in enumerate(requests):
-            # Get input tensors
-            tokens_batch = pb_utils.get_input_tensor_by_name(
-                request, 'TOKENS_BATCH').as_numpy()
-
-            # Get sequence length
-            sequence_lengths = pb_utils.get_input_tensor_by_name(
-                request, 'SEQUENCE_LENGTH').as_numpy()
-
-            # Get cum log probs
-            cum_log_probs = pb_utils.get_input_tensor_by_name(
-                request, 'CUM_LOG_PROBS')
-
-            # Get sequence length
-            output_log_probs = pb_utils.get_input_tensor_by_name(
-                request, 'OUTPUT_LOG_PROBS')
-
-            # Get context logits
-            context_logits = pb_utils.get_input_tensor_by_name(
-                request, 'CONTEXT_LOGITS')
-
-            # Get generation logits
-            generation_logits = pb_utils.get_input_tensor_by_name(
-                request, 'GENERATION_LOGITS')
-
-            # Get the batch index
-            batch_index = pb_utils.get_input_tensor_by_name(
-                request, 'BATCH_INDEX')
-
-            # Reshape Input
-            # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
-            # tokens_batch = tokens_batch.T
+            req_outputs = [
+                x.encode('utf8')
+                for x in all_outputs[req_idx_offsets[idx]:req_idx_offsets[idx +
+                                                                          1]]
+            ]
 
-            # Postprocessing output data.
-            outputs = self._postprocessing(tokens_batch, sequence_lengths)
-
-            # Create output tensors. You need pb_utils.Tensor
-            # objects to create pb_utils.InferenceResponse.
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
-                np.array(outputs).astype(self.output_dtype))
+                np.array(req_outputs).astype(self.output_dtype))
 
-            outputs = []
-            outputs.append(output_tensor)
-
-            if cum_log_probs:
-                out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
-                                                    cum_log_probs.as_numpy())
-                outputs.append(out_cum_log_probs)
-            else:
-                out_cum_log_probs = pb_utils.Tensor(
-                    'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
-                outputs.append(out_cum_log_probs)
-
-            if output_log_probs:
-                out_output_log_probs = pb_utils.Tensor(
-                    'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
-                outputs.append(out_output_log_probs)
-            else:
-                out_output_log_probs = pb_utils.Tensor(
-                    'OUT_OUTPUT_LOG_PROBS',
-                    np.array([[[0.0]]], dtype=np.float32))
-                outputs.append(out_output_log_probs)
-
-            if context_logits:
-                out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
-                                                     context_logits.as_numpy())
-                outputs.append(out_context_logits)
-            else:
-                out_context_logits = pb_utils.Tensor(
-                    'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
-                                                   dtype=np.float32))
-                outputs.append(out_context_logits)
-
-            if generation_logits:
-                out_generation_logits = pb_utils.Tensor(
-                    'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
-                outputs.append(out_generation_logits)
-            else:
-                out_generation_logits = pb_utils.Tensor(
-                    'OUT_GENERATION_LOGITS',
-                    np.array([[[[0.0]]]], dtype=np.float32))
-                outputs.append(out_generation_logits)
-
-            if batch_index:
-                out_batch_index = pb_utils.Tensor('OUT_BATCH_INDEX',
-                                                  batch_index.as_numpy())
-                outputs.append(out_batch_index)
-            else:
-                out_batch_index = pb_utils.Tensor(
-                    'OUT_BATCH_INDEX', np.array([[0]], dtype=np.int32))
-                outputs.append(out_batch_index)
+            outputs = [output_tensor]
 
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -220,7 +171,6 @@ def execute(self, requests):
             inference_response = pb_utils.InferenceResponse(
                 output_tensors=outputs)
             responses.append(inference_response)
-
         # You should return a list of pb_utils.InferenceResponse. Length
        # of this list must match the length of `requests` list.
        return responses
@@ -231,20 +181,3 @@ def finalize(self):
         the model to perform any necessary clean ups before exit.
         """
         print('Cleaning up...')
-
-    def _postprocessing(self, tokens_batch, sequence_lengths):
-        outputs = []
-        for batch_idx, beam_tokens in enumerate(tokens_batch):
-            for beam_idx, tokens in enumerate(beam_tokens):
-                seq_len = sequence_lengths[batch_idx][beam_idx]
-                # Exclude fake ids in multimodal models
-                fake_id_len = 0
-                for i in range(seq_len):
-                    if tokens[i] < len(self.tokenizer.vocab):
-                        fake_id_len = i
-                        break
-                output = self.tokenizer.decode(
-                    tokens[fake_id_len:seq_len],
-                    skip_special_tokens=self.skip_special_tokens)
-                outputs.append(output.encode('utf8'))
-        return outputs
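
The rewrite above replaces one tokenizer.decode call per beam (the deleted _postprocessing helper) with a single tokenizer.batch_decode over every sequence from every request, then slices the decoded strings back out per request using cumulative offsets; it also switches the fake-id check from len(self.tokenizer.vocab) to self.tokenizer.vocab_size. Below is a standalone sketch of the offset pattern, assuming any HuggingFace tokenizer; the "gpt2" checkpoint is an arbitrary choice, not something this commit prescribes.

# Standalone sketch of the batch-decode-then-slice pattern used above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # arbitrary example tokenizer

# Pretend request 0 produced two sequences and request 1 produced one.
list_of_tokens = [
    tokenizer.encode("hello world"),
    tokenizer.encode("hello there"),
    tokenizer.encode("goodbye"),
]
# Cumulative offsets: request i owns all_outputs[offsets[i]:offsets[i + 1]].
req_idx_offsets = [0, 2, 3]

# One tokenizer call for the whole batch instead of one per sequence.
all_outputs = tokenizer.batch_decode(list_of_tokens, skip_special_tokens=True)
for i in range(len(req_idx_offsets) - 1):
    print(f"request {i}:", all_outputs[req_idx_offsets[i]:req_idx_offsets[i + 1]])

Hoisting the work into a single batch_decode amortizes per-call overhead and lets a fast tokenizer decode all sequences in one pass, which is the point of moving it out of the per-request loop.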

all_models/inflight_batcher_llm/postprocessing/config.pbtxt

Lines changed: 0 additions & 55 deletions
@@ -38,68 +38,13 @@ input [
     name: "SEQUENCE_LENGTH"
     data_type: TYPE_INT32
     dims: [ -1 ]
-  },
-  {
-    name: "CUM_LOG_PROBS"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-    optional: true
-  },
-  {
-    name: "OUTPUT_LOG_PROBS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-    optional: true
-  },
-  {
-    name: "CONTEXT_LOGITS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-    optional: true
-  },
-  {
-    name: "GENERATION_LOGITS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1, -1 ]
-    optional: true
-  },
-  {
-    name: "BATCH_INDEX"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
-    optional: true
   }
 ]
 output [
   {
     name: "OUTPUT"
     data_type: TYPE_STRING
     dims: [ -1 ]
-  },
-  {
-    name: "OUT_CUM_LOG_PROBS"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-  },
-  {
-    name: "OUT_OUTPUT_LOG_PROBS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-  },
-  {
-    name: "OUT_CONTEXT_LOGITS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-  },
-  {
-    name: "OUT_GENERATION_LOGITS"
-    data_type: TYPE_FP32
-    dims: [ -1, -1, -1 ]
-  },
-  {
-    name: "OUT_BATCH_INDEX"
-    data_type: TYPE_INT32
-    dims: [ 1 ]
   }
 ]
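
With the pass-through tensors removed, the postprocessor now consumes only TOKENS_BATCH and SEQUENCE_LENGTH and produces only OUTPUT; cum_log_probs, output_log_probs, context_logits, generation_logits, and batch_index reach the client directly from the tensorrt_llm step via the renamed output_maps in the ensemble config above. Client-visible tensor names are unchanged, as in this sketch, which is again illustrative rather than part of the commit, under the same server and model-name assumptions as the earlier example and with return_log_probs enabled.

# Hypothetical client sketch (not from this commit): output names a client
# reads are unchanged; only the internal routing through the ensemble moved.
import numpy as np
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

def make_input(name, arr):
    t = httpclient.InferInput(name, list(arr.shape), np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t

client = httpclient.InferenceServerClient(url="localhost:8000")
result = client.infer("ensemble", [
    make_input("text_input", np.array([["Hello"]], dtype=object)),
    make_input("max_tokens", np.array([[32]], dtype=np.int32)),
    make_input("return_log_probs", np.array([[True]], dtype=bool)),
])
# These now come straight from the tensorrt_llm step, bypassing
# postprocessing entirely.
print(result.as_numpy("cum_log_probs"))
print(result.as_numpy("output_log_probs"))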
