Skip to content

Commit

Permalink
Merge branch 'main' into snnn-patch-1
Browse files: browse the repository at this point in the history
  • Loading branch information
snnn authored Mar 1, 2025
2 parents: defd6e4 + ee945bb; commit e9104c3
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions operators/tokenizer/bpe_utils.hpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -397,6 +397,11 @@ class PreTokenizerWithRegEx {
return {};
}

if (j >= m_text.size()) {
auto res = m_text.substr(0, j);
m_text = m_text.substr(j);
return res;
}
i = j;
// (?i:'s|'t|'re|'ve|'m|'ll|'d)?
if ((m_text[i] == U'\'') && ((i + 1) < m_text.size())) {
Expand Down Expand Up @@ -442,6 +447,11 @@ class PreTokenizerWithRegEx {
} else {
return {};
}
if (i >= m_text.size()) {
auto res = m_text.substr(0, i);
m_text = m_text.substr(i);
return res;
}

// [\p{Ll}\p{Lm}\p{Lo}\p{M}]*
const ufal::unilib::unicode::category_t categories2 =
Expand All @@ -451,6 +461,11 @@ class PreTokenizerWithRegEx {
if (!IsCategory(m_text[i], categories2)) break;
}
}
if (i >= m_text.size()) {
auto res = m_text.substr(0, i);
m_text = m_text.substr(i);
return res;
}

// (?i:'s|'t|'re|'ve|'m|'ll|'d)?
if ((m_text[i] == U'\'') && ((i + 1) < m_text.size())) {
Expand Down

0 comments on commit e9104c3

Please sign in to comment.