Skip to content

Commit

Permalink
Fix the added-token decoding issue in the SPM-based tokenizer (#908)
Browse files Browse the repository at this point in the history
* Fix the added-token decoding issue in the SPM-based tokenizer

* Skip special tokens
  • Loading branch information
wenbingl authored Mar 8, 2025
1 parent bfeb3dd commit 1f9d7ee
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions operators/tokenizer/bpe_streaming.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
return {};
}



OrtxStatus Id2Token(extTokenId_t id,
std::string& token,
bool skip_special_tokens,
Expand Down Expand Up @@ -95,17 +93,21 @@ class BpeStreamingDecoder : public KernelBpeDecoder {
}

OrtxStatus SpmId2Token(extTokenId_t id, std::string& token, bool& f_special_last) const {

std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
bool f_special = false;
if (piece.empty() || all_special_ids_.count(id)) {
token = "";
f_special = true;
} else if (IsSpmByteWord(piece)) {
char buf[3] = {piece[3], piece[4], 0}; // something like <0x20>
token = {static_cast<char>(strtol(buf, NULL, 16))};
if (added_tokens_.count(id)) {
f_special = all_special_ids_.count(id) ? true : false;
// special token was skipped
token = f_special ? "" : added_tokens_.at(id);
} else {
token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
std::string piece = id < arr_vocab_.size() ? arr_vocab_[id] : "";
if (piece.empty()) {
token = unk_token_;
} else if (IsSpmByteWord(piece)) {
char buf[3] = {piece[3], piece[4], 0}; // something like <0x20>
token = {static_cast<char>(strtol(buf, NULL, 16))};
} else {
token = ReplaceAll(piece, std::string(ort_extensions::spm_escaped_space), " ");
}
}

if (!token.empty() && token[0] == ' ' && f_special_last && add_dummy_prefix_) {
Expand Down

0 comments on commit 1f9d7ee

Please sign in to comment.