1- <?php /** @noinspection PhpUnreachableStatementInspection */
1+ <?php
22
33declare (strict_types=1 );
44
5-
65namespace Codewithkyrian \Transformers \Tokenizers ;
76
87use ArrayObject ;
@@ -44,12 +43,19 @@ abstract class Tokenizer
4443 */
4544 protected ?string $ unkToken = null ;
4645
/**
 * Whether runs of consecutive unknown tokens are collapsed into a single one.
 *
 * When true, __invoke() passes the encoded output through fuse() using the
 * unknown-token id. Populated from the `fuse_unk` config key in the
 * constructor; defaults to false.
 */
protected bool $fuseUnk = false;
50+
/**
 * Initialise the tokenizer from its configuration array.
 *
 * Two optional keys are read from $config:
 *  - `continuing_subword_prefix`: stored on the instance; a missing or
 *    empty-string value is normalised to null.
 *  - `fuse_unk`: stored in $fuseUnk; false when the key is absent.
 *
 * @param array $config The tokenizer configuration (kept as a promoted property).
 */
public function __construct(protected array $config)
{
    $prefix = $config['continuing_subword_prefix'] ?? null;

    // An empty prefix carries no information, so treat it the same as "not set".
    $this->continuingSubwordPrefix = $prefix === '' ? null : $prefix;

    $this->fuseUnk = $config['fuse_unk'] ?? false;
}
5460
5561 /**
@@ -88,8 +94,7 @@ public static function load(
8894 string $ revision ,
8995 mixed $ legacy ,
9096 ?callable $ onProgress = null
91- ): array
92- {
97+ ): array {
9398 $ tokenizerJson = Hub::getJson (
9499 $ modelNameOrPath ,
95100 fileName: 'tokenizer.json ' ,
@@ -227,7 +232,7 @@ public function __invoke(array $tokens): array
227232 {
228233 $ ids = $ this ->encode ($ tokens );
229234
230- if ($ this ->fuseUnk () ) {
235+ if ($ this ->fuseUnk ) {
231236 $ ids = $ this ->fuse ($ ids , $ this ->unkTokenId , $ this ->tokenToIds );
232237 }
233238
@@ -242,43 +247,32 @@ public function __invoke(array $tokens): array
242247 */
243248 protected abstract function encode (array $ tokens ): array ;
244249
245- protected function fuseUnk (): bool
246- {
247- return $ this ->config ['fuse_unk ' ] ?? false ;
248- }
249-
/**
 * Collapse every run of consecutive "fusable" elements down to its first element.
 *
 * An element is fusable when `$mapping[$element] ?? $value` is identical (===)
 * to `$value`. That covers elements mapped to `$value` as well as elements that
 * have no entry (or a null entry) in `$mapping`. All other elements are copied
 * through unchanged and in their original order.
 *
 * Keys of `$arr` are ignored, so sparse or string-keyed arrays are handled too;
 * the result is always a zero-based list.
 *
 * @param array $arr     The input array.
 * @param mixed $value   The value to fuse on.
 * @param array $mapping The mapping from input domain to value.
 *
 * @return array The fused array.
 */
protected function fuse(array $arr, mixed $value, array $mapping): array
{
    $fused = [];
    $insideRun = false;

    foreach ($arr as $element) {
        $fusable = ($mapping[$element] ?? $value) === $value;

        // Only the first element of a fusable run is kept; the rest are dropped.
        if (!($fusable && $insideRun)) {
            $fused[] = $element;
        }

        $insideRun = $fusable;
    }

    return $fused;
}
@@ -343,7 +337,6 @@ public function convertTokensToIds(array $tokens): array
343337 $ ids [] = $ this ->tokenToIds [$ token ] ?? $ this ->unkTokenId ;
344338 }
345339
346-
347340 return $ ids ;
348341 }
349342
@@ -363,4 +356,4 @@ public function convertIdsToTokens(array $ids): array
363356
364357 return $ tokens ;
365358 }
366- }
359+ }
0 commit comments