From 1f9d7ee0c80b4f94946ee5650cb242aa01dd6278 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Fri, 7 Mar 2025 16:20:55 -0800
Subject: [PATCH] Fix the added token decoding issue on spm based tokenizer
 (#908)

* fix the added token decoding issue on spm based tokenizer

* skip special
---
 operators/tokenizer/bpe_streaming.hpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/operators/tokenizer/bpe_streaming.hpp b/operators/tokenizer/bpe_streaming.hpp
index 33b1cdee..dab2268e 100644
--- a/operators/tokenizer/bpe_streaming.hpp
+++ b/operators/tokenizer/bpe_streaming.hpp
@@ -47,8 +47,6 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
     return {};
   }
 
-
-
   OrtxStatus Id2Token(extTokenId_t id,
                       std::string& token,
                       bool skip_special_tokens,
@@ -95,17 +93,21 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
   }
 
   OrtxStatus SpmId2Token(extTokenId_t id, std::string& token, bool& f_special_last) const {
-
-    std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
     bool f_special = false;
-    if (piece.empty() || all_special_ids_.count(id)) {
-      token = "";
-      f_special = true;
-    } else if (IsSpmByteWord(piece)) {
-      char buf[3] = {piece[3], piece[4], 0};  // something like <0x20>
-      token = {static_cast<char>(strtol(buf, NULL, 16))};
+    if (added_tokens_.count(id)) {
+      f_special = all_special_ids_.count(id) ? true : false;
+      // special token was skipped
+      token = f_special ? "" : added_tokens_.at(id);
     } else {
-      token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
+      std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
+      if (piece.empty()) {
+        token = unk_token_;
+      } else if (IsSpmByteWord(piece)) {
+        char buf[3] = {piece[3], piece[4], 0};  // something like <0x20>
+        token = {static_cast<char>(strtol(buf, NULL, 16))};
+      } else {
+        token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
+      }
     }
 
     if (!token.empty() && token[0] == ' ' && f_special_last && add_dummy_prefix_) {
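
For readers skimming the diff, here is a minimal standalone C++ sketch of the lookup order this patch establishes: the added-token table is consulted before the base SentencePiece vocabulary, only genuinely special ids are skipped, and out-of-range ids fall back to the unknown token. Everything below (DecodeOne, the container shapes, kSpmEscapedSpace, the IsSpmByteWord/ReplaceAll stand-ins) is illustrative and is not the extension's actual API.

// decode_sketch.cc -- illustrative only; names and shapes are assumptions,
// not the ort-extensions API. Mirrors the patched SpmId2Token lookup order.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

static const std::string kSpmEscapedSpace = "\xE2\x96\x81";  // U+2581 ("▁")

// Stand-in for the decoder's byte-word test: pieces shaped like "<0x20>".
static bool IsSpmByteWord(const std::string& piece) {
  return piece.size() == 6 && piece[0] == '<' && piece[1] == '0' &&
         piece[2] == 'x' && piece[5] == '>';
}

static std::string ReplaceAll(std::string s, const std::string& from,
                              const std::string& to) {
  for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos;
       pos += to.size()) {
    s.replace(pos, from.size(), to);
  }
  return s;
}

// The patched precedence: added tokens win over the vocab; only genuinely
// special ids are skipped (emitted as ""); unknown ids map to "<unk>".
std::string DecodeOne(uint32_t id, const std::vector<std::string>& vocab,
                      const std::unordered_map<uint32_t, std::string>& added_tokens,
                      const std::unordered_set<uint32_t>& special_ids) {
  auto it = added_tokens.find(id);
  if (it != added_tokens.end()) {
    return special_ids.count(id) ? "" : it->second;  // skip only special ids
  }
  std::string piece = id < vocab.size() ? vocab[id] : "";
  if (piece.empty()) return "<unk>";
  if (IsSpmByteWord(piece)) {
    char buf[3] = {piece[3], piece[4], 0};  // the two hex digits of "<0x20>"
    return std::string(1, static_cast<char>(strtol(buf, nullptr, 16)));
  }
  return ReplaceAll(piece, kSpmEscapedSpace, " ");
}

int main() {
  std::vector<std::string> vocab = {"<unk>", kSpmEscapedSpace + "hello", "<0x21>"};
  std::unordered_map<uint32_t, std::string> added = {{3, "<|user|>"}, {4, "</s>"}};
  std::unordered_set<uint32_t> special = {4};  // only </s> is special here
  // Before the patch, id 3 hit the vocab path (empty piece, so it was
  // dropped); with the added-token table consulted first it round-trips.
  std::printf("%s%s%s\n",
              DecodeOne(3, vocab, added, special).c_str(),   // "<|user|>"
              DecodeOne(1, vocab, added, special).c_str(),   // " hello"
              DecodeOne(2, vocab, added, special).c_str());  // "!"
  return 0;
}

Compiled with, say, g++ -std=c++17 decode_sketch.cc, this prints "<|user|> hello!": the added token now survives decoding instead of being blanked out, which is the behavior change the patch makes.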