Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22940 MF2 ICU4C: Error checking improvements in parser #3306

Merged
merged 1 commit into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 75 additions & 28 deletions icu4c/source/i18n/messageformat2_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ icu::UInitOnce gMF2ParseUniSetsInitOnce {};

UnicodeSet* initContentChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF
Expand All @@ -133,7 +133,7 @@ UnicodeSet* initContentChars(UErrorCode& status) {

UnicodeSet* initWhitespace(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* result = new UnicodeSet();
Expand All @@ -153,7 +153,7 @@ UnicodeSet* initWhitespace(UErrorCode& status) {
UnicodeSet* initBidiControls(UErrorCode& status) {
UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status);
if (U_FAILURE(status)) {
return {};
return nullptr;
}
result->add(0x200E, 0x200F);
result->add(0x2066, 0x2069);
Expand All @@ -164,7 +164,7 @@ UnicodeSet* initBidiControls(UErrorCode& status) {
UnicodeSet* initAlpha(UErrorCode& status) {
UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status);
if (U_FAILURE(status)) {
return {};
return nullptr;
}
result->freeze();
return result;
Expand All @@ -173,18 +173,22 @@ UnicodeSet* initAlpha(UErrorCode& status) {
UnicodeSet* initDigits(UErrorCode& status) {
UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status);
if (U_FAILURE(status)) {
return {};
return nullptr;
}
result->freeze();
return result;
}

UnicodeSet* initNameStartChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* result = new UnicodeSet(*unisets::getImpl(unisets::ALPHA));
UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status);
if (U_FAILURE(status)) {
return nullptr;
}
UnicodeSet* result = new UnicodeSet(*isAlpha);
if (result == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
Expand All @@ -209,16 +213,21 @@ UnicodeSet* initNameStartChars(UErrorCode& status) {

UnicodeSet* initNameChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status);
UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status);
if (U_FAILURE(status)) {
return nullptr;
}
UnicodeSet* result = new UnicodeSet();
if (result == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
};
result->addAll(*unisets::getImpl(unisets::NAME_START));
result->addAll(*unisets::getImpl(unisets::DIGIT));
result->addAll(*nameStart);
result->addAll(*digit);
result->add(HYPHEN);
result->add(PERIOD);
result->add(0x00B7);
Expand All @@ -230,16 +239,21 @@ UnicodeSet* initNameChars(UErrorCode& status) {

UnicodeSet* initTextChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status);
UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status);
if (U_FAILURE(status)) {
return nullptr;
}
UnicodeSet* result = new UnicodeSet();
if (result == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
};
result->addAll(*unisets::getImpl(unisets::CONTENT));
result->addAll(*unisets::getImpl(unisets::WHITESPACE));
result->addAll(*content);
result->addAll(*whitespace);
result->add(PERIOD);
result->add(AT);
result->add(PIPE);
Expand All @@ -249,16 +263,31 @@ UnicodeSet* initTextChars(UErrorCode& status) {

UnicodeSet* initQuotedChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status);
if (U_FAILURE(status)) {
return nullptr;
}
UnicodeSet* result = new UnicodeSet();
if (result == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
};
result->addAll(*unisets::getImpl(unisets::CONTENT));
result->addAll(*unisets::getImpl(unisets::WHITESPACE));
// content and whitespace were initialized by `initTextChars()`
UnicodeSet* content = unisets::getImpl(unisets::CONTENT);
if (content == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
result->addAll(*content);
UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE);
if (whitespace == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
result->addAll(*whitespace);
result->add(PERIOD);
result->add(AT);
result->add(LEFT_CURLY_BRACE);
Expand All @@ -269,7 +298,7 @@ UnicodeSet* initQuotedChars(UErrorCode& status) {

UnicodeSet* initEscapableChars(UErrorCode& status) {
if (U_FAILURE(status)) {
return {};
return nullptr;
}

UnicodeSet* result = new UnicodeSet();
Expand Down Expand Up @@ -298,26 +327,44 @@ UBool U_CALLCONV cleanupMF2ParseUniSets() {

void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) {
ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets);
/*
Each of the init functions initializes the UnicodeSets
that it depends on.

gUnicodeSets[unisets::CONTENT] = initContentChars(status);
gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status);
initBidiControls (no dependencies)

initEscapableChars (no dependencies)

initNameChars depends on
initDigits
initNameStartChars depends on
initAlpha

initQuotedChars depends on
initTextChars depends on
initContentChars
initWhitespace
*/
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in the end of this function, we should do the following

if (U_FAILURE(status)) {
    cleanupMF2ParseUniSets();
}

so in case the failure is due to memory stress for the initialization, gMF2ParseUniSetsInitOnce will be reset after all the allocated UnicodeSet inside gUnicodeSets be deleted so it has a second chance to be sucessful.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made the change in 7f19865 -- can you re-approve? Thanks!

gUnicodeSets[unisets::BIDI] = initBidiControls(status);
gUnicodeSets[unisets::ALPHA] = initAlpha(status);
gUnicodeSets[unisets::DIGIT] = initDigits(status);
gUnicodeSets[unisets::NAME_START] = initNameStartChars(status);
gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status);
gUnicodeSets[unisets::TEXT] = initTextChars(status);
gUnicodeSets[unisets::QUOTED] = initQuotedChars(status);
gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status);

if (U_FAILURE(status)) {
cleanupMF2ParseUniSets();
}
}

const UnicodeSet* get(Key key) {
UErrorCode localStatus = U_ZERO_ERROR;
umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, localStatus);
if (U_FAILURE(localStatus)) {
const UnicodeSet* get(Key key, UErrorCode& status) {
umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status);
if (U_FAILURE(status)) {
return nullptr;
}
return getImpl(key);
UnicodeSet* result = getImpl(key);
if (result == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
}
return result;
}

}
Expand Down
33 changes: 11 additions & 22 deletions icu4c/source/i18n/messageformat2_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ namespace message2 {
UNISETS_KEY_COUNT
};

U_I18N_API const UnicodeSet* get(Key key);
U_I18N_API const UnicodeSet* get(Key key, UErrorCode& status);
}

// Parser class (private)
Expand Down Expand Up @@ -110,16 +110,16 @@ namespace message2 {
StaticErrors& e,
UnicodeString& normalizedInputRef,
UErrorCode& status)
: contentChars(unisets::get(unisets::CONTENT)),
whitespaceChars(unisets::get(unisets::WHITESPACE)),
bidiControlChars(unisets::get(unisets::BIDI)),
alphaChars(unisets::get(unisets::ALPHA)),
digitChars(unisets::get(unisets::DIGIT)),
nameStartChars(unisets::get(unisets::NAME_START)),
nameChars(unisets::get(unisets::NAME_CHAR)),
textChars(unisets::get(unisets::TEXT)),
quotedChars(unisets::get(unisets::QUOTED)),
escapableChars(unisets::get(unisets::ESCAPABLE)),
: contentChars(unisets::get(unisets::CONTENT, status)),
whitespaceChars(unisets::get(unisets::WHITESPACE, status)),
bidiControlChars(unisets::get(unisets::BIDI, status)),
alphaChars(unisets::get(unisets::ALPHA, status)),
digitChars(unisets::get(unisets::DIGIT, status)),
nameStartChars(unisets::get(unisets::NAME_START, status)),
nameChars(unisets::get(unisets::NAME_CHAR, status)),
textChars(unisets::get(unisets::TEXT, status)),
quotedChars(unisets::get(unisets::QUOTED, status)),
escapableChars(unisets::get(unisets::ESCAPABLE, status)),
source(input), index(0), errors(e), normalizedInput(normalizedInputRef), dataModel(dataModelBuilder) {
(void) status;
parseError.line = 0;
Expand All @@ -129,17 +129,6 @@ namespace message2 {
parseError.postContext[0] = '\0';
}

UnicodeSet initContentChars(UErrorCode& status);
UnicodeSet initWhitespace(UErrorCode& status);
UnicodeSet initBidiControls(UErrorCode& status);
UnicodeSet initAlpha(UErrorCode& status);
UnicodeSet initDigits(UErrorCode& status);
UnicodeSet initNameStartChars(UErrorCode& status);
UnicodeSet initNameChars(UErrorCode& status);
UnicodeSet initTextChars(UErrorCode& status);
UnicodeSet initQuotedChars(UErrorCode& status);
UnicodeSet initEscapableChars(UErrorCode& status);

bool isContentChar(UChar32) const;
bool isBidiControl(UChar32) const;
bool isWhitespace(UChar32) const;
Expand Down
Loading