From 1f9d7ee0c80b4f94946ee5650cb242aa01dd6278 Mon Sep 17 00:00:00 2001
From: Wenbing Li <10278425+wenbingl@users.noreply.github.com>
Date: Fri, 7 Mar 2025 16:20:55 -0800
Subject: [PATCH] Fix the added token decoding issue on spm based tokenizer
 (#908)

* fix the added token decoding issue on spm based tokenizer

* skip special
---
 operators/tokenizer/bpe_streaming.hpp | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/operators/tokenizer/bpe_streaming.hpp b/operators/tokenizer/bpe_streaming.hpp
index 33b1cdee..dab2268e 100644
--- a/operators/tokenizer/bpe_streaming.hpp
+++ b/operators/tokenizer/bpe_streaming.hpp
@@ -47,8 +47,6 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
     return {};
   }
 
-
-
   OrtxStatus Id2Token(extTokenId_t id,
                       std::string& token,
                       bool skip_special_tokens,
@@ -95,17 +93,21 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
   }
 
   OrtxStatus SpmId2Token(extTokenId_t id, std::string& token, bool& f_special_last) const {
-
-    std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
     bool f_special = false;
-    if (piece.empty() || all_special_ids_.count(id)) {
-      token = "";
-      f_special = true;
-    } else if (IsSpmByteWord(piece)) {
-      char buf[3] = {piece[3], piece[4], 0};  // something like <0x20>
-      token = {static_cast<char>(strtol(buf, NULL, 16))};
+    if (added_tokens_.count(id)) {
+      f_special = all_special_ids_.count(id) ? true : false;
+      // special token was skipped
+      token = f_special ? "" : added_tokens_.at(id);
     } else {
-      token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
+      std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
+      if (piece.empty()) {
+        token = unk_token_;
+      } else if (IsSpmByteWord(piece)) {
+        char buf[3] = {piece[3], piece[4], 0};  // something like <0x20>
+        token = {static_cast<char>(strtol(buf, NULL, 16))};
+      } else {
+        token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
+      }
     }
 
     if (!token.empty() && token[0] == ' ' && f_special_last && add_dummy_prefix_) {
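
For readers skimming the diff, here is a minimal standalone C++ sketch of the lookup order this patch establishes: the added-token table is consulted before the base SentencePiece vocabulary, only genuinely special ids are skipped, and out-of-range ids fall back to the unknown token. Everything below (DecodeOne, the container shapes, kSpmEscapedSpace, the IsSpmByteWord/ReplaceAll stand-ins) is illustrative and is not the extension's actual API.

// decode_sketch.cc -- illustrative only; names and shapes are assumptions,
// not the ort-extensions API. Mirrors the patched SpmId2Token lookup order.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

static const std::string kSpmEscapedSpace = "\xE2\x96\x81";  // U+2581 ("▁")

// Stand-in for the decoder's byte-word test: pieces shaped like "<0x20>".
static bool IsSpmByteWord(const std::string& piece) {
  return piece.size() == 6 && piece[0] == '<' && piece[1] == '0' &&
         piece[2] == 'x' && piece[5] == '>';
}

static std::string ReplaceAll(std::string s, const std::string& from,
                              const std::string& to) {
  for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos;
       pos += to.size()) {
    s.replace(pos, from.size(), to);
  }
  return s;
}

// The patched precedence: added tokens win over the vocab; only genuinely
// special ids are skipped (emitted as ""); unknown ids map to "<unk>".
std::string DecodeOne(uint32_t id, const std::vector<std::string>& vocab,
                      const std::unordered_map<uint32_t, std::string>& added_tokens,
                      const std::unordered_set<uint32_t>& special_ids) {
  auto it = added_tokens.find(id);
  if (it != added_tokens.end()) {
    return special_ids.count(id) ? "" : it->second;  // skip only special ids
  }
  std::string piece = id < vocab.size() ? vocab[id] : "";
  if (piece.empty()) return "<unk>";
  if (IsSpmByteWord(piece)) {
    char buf[3] = {piece[3], piece[4], 0};  // the two hex digits of "<0x20>"
    return std::string(1, static_cast<char>(strtol(buf, nullptr, 16)));
  }
  return ReplaceAll(piece, kSpmEscapedSpace, " ");
}

int main() {
  std::vector<std::string> vocab = {"<unk>", kSpmEscapedSpace + "hello", "<0x21>"};
  std::unordered_map<uint32_t, std::string> added = {{3, "<|user|>"}, {4, "</s>"}};
  std::unordered_set<uint32_t> special = {4};  // only </s> is special here
  // Before the patch, id 3 hit the vocab path (empty piece, so it was
  // dropped); with the added-token table consulted first it round-trips.
  std::printf("%s%s%s\n",
              DecodeOne(3, vocab, added, special).c_str(),   // "<|user|>"
              DecodeOne(1, vocab, added, special).c_str(),   // " hello"
              DecodeOne(2, vocab, added, special).c_str());  // "!"
  return 0;
}

Compiled with, say, g++ -std=c++17 decode_sketch.cc, this prints "<|user|> hello!": the added token now survives decoding instead of being blanked out, which is the behavior change the patch makes.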