From af23be36d779895f805333aac233522b2494ebac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Bargull?=
Date: Fri, 15 Nov 2024 16:42:31 +0000
Subject: [PATCH] Bug 1648139 - Part 1: Add DateTimeFormat::GetTimeSeparator().
 r=platform-i18n-reviewers,dminor

ICU doesn't provide a public API to retrieve the time separator, so we have to
read it manually from the resource bundles.

Differential Revision: https://phabricator.services.mozilla.com/D152744
---
 intl/components/gtest/TestDateTimeFormat.cpp |  71 ++++++++
 intl/components/src/DateTimeFormat.cpp       | 193 +++++++++++++++++++++
 intl/components/src/DateTimeFormat.h         |  24 +++
 3 files changed, 288 insertions(+)

diff --git a/intl/components/gtest/TestDateTimeFormat.cpp b/intl/components/gtest/TestDateTimeFormat.cpp
index 6d9e0318382b..4a0b0788ed3d 100644
--- a/intl/components/gtest/TestDateTimeFormat.cpp
+++ b/intl/components/gtest/TestDateTimeFormat.cpp
@@ -633,4 +633,75 @@ TEST(IntlDateTimeFormat, SetStartTimeIfGregorian)
     ASSERT_TRUE(buffer.verboseMatches(Jan01_1583));
   }
 }
+
+TEST(IntlDateTimeFormat, GetTimeSeparator)
+{
+  struct TestData {
+    const char* locale;
+    const char* numberingSystem;
+    const char16_t* expected;
+  } testData[] = {
+      {"root", "latn", u":"},
+      {"root", "arab", u":"},
+      {"root", "thai", u":"},
+      {"root", "arabext", u"٫"},
+
+      // English uses the same data as the root locale.
+      {"en", "latn", u":"},
+      {"en", "arab", u":"},
+      {"en", "thai", u":"},
+      {"en", "arabext", u"٫"},
+
+      // Spanish uses the same data as the root locale.
+      {"es", "latn", u":"},
+      {"es", "arab", u":"},
+      {"es", "thai", u":"},
+      {"es", "arabext", u"٫"},
+
+      // German (Austria) uses the same data as the root locale.
+      {"de-AT", "latn", u":"},
+      {"de-AT", "arab", u":"},
+      {"de-AT", "thai", u":"},
+      {"de-AT", "arabext", u"٫"},
+
+      // Danish has a different time separator for "latn".
+      {"da", "latn", u"."},
+      {"da", "arab", u":"},
+      {"da", "thai", u"."},
+      {"da", "arabext", u"٫"},
+
+      // Same time separator as Danish.
+      {"en-DK", "latn", u"."},
+      {"en-DK", "arab", u":"},
+      {"en-DK", "thai", u"."},
+      {"en-DK", "arabext", u"٫"},
+
+      // Norwegian overrides time separators for "arab" and "arabext".
+      {"no", "latn", u":"},
+      {"no", "arab", u"."},
+      {"no", "thai", u":"},
+      {"no", "arabext", u"."},
+
+      // Parent locale of Bokmål is Norwegian.
+      {"nb", "latn", u":"},
+      {"nb", "arab", u"."},
+      {"nb", "thai", u":"},
+      {"nb", "arabext", u"."},
+
+      // Farsi overrides the time separator for "arabext".
+      {"fa", "latn", u":"},
+      {"fa", "arab", u":"},
+      {"fa", "thai", u":"},
+      {"fa", "arabext", u":"},
+  };
+
+  for (const auto& data : testData) {
+    TestBuffer<char16_t> timeSeparator;
+    auto timeSeparatorResult = DateTimeFormat::GetTimeSeparator(
+        MakeStringSpan(data.locale), MakeStringSpan(data.numberingSystem),
+        timeSeparator);
+    ASSERT_TRUE(timeSeparatorResult.isOk());
+    ASSERT_TRUE(timeSeparator.verboseMatches(data.expected));
+  }
+}
 }  // namespace mozilla::intl
diff --git a/intl/components/src/DateTimeFormat.cpp b/intl/components/src/DateTimeFormat.cpp
index 2c09bb2adfdc..86e77c02f1d7 100644
--- a/intl/components/src/DateTimeFormat.cpp
+++ b/intl/components/src/DateTimeFormat.cpp
@@ -2,6 +2,7 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include <algorithm>
 #include <cstring>
 
 #include "unicode/ucal.h"
@@ -12,6 +13,7 @@
 #include "DateTimeFormatUtils.h"
 #include "ScopedICUObject.h"
 
+#include "mozilla/Buffer.h"
 #include "mozilla/EnumSet.h"
 #include "mozilla/intl/Calendar.h"
 #include "mozilla/intl/DateTimeFormat.h"
@@ -804,6 +806,197 @@ DateTimeFormat::GetAllowedHourCycles(Span<const char> aLanguage,
   return result;
 }
 
+// Copy the characters of |aView| into a newly allocated, NUL-terminated
+// buffer. The length of the returned buffer is |aView.Length()|, i.e. the
+// trailing NUL character isn't counted.
+template <typename CharT>
+static Result<Buffer<char>, ICUError> DuplicateChars(Span<CharT> aView) {
+  auto chars = MakeUnique<char[]>(aView.Length() + 1);
+  std::copy_n(aView.Elements(), aView.Length(), chars.get());
+  chars[aView.Length()] = '\0';
+  return Buffer<char>{std::move(chars), aView.Length()};
+}
+
+// Return the parent locale of the locale backing |aLocaleBundle|: the explicit
+// "%%Parent" entry if present, otherwise the actual locale with its last
+// subtag removed, otherwise "root". Returns the empty string for "root".
+static Result<Buffer<char>, ICUError> GetParentLocale(
+    const UResourceBundle* aLocaleBundle) {
+  UErrorCode status = U_ZERO_ERROR;
+
+  // First check for an explicit parent locale using the "%%Parent" key.
+  int32_t length = 0;
+  const char16_t* parent =
+      ures_getStringByKey(aLocaleBundle, "%%Parent", &length, &status);
+  if (status == U_MISSING_RESOURCE_ERROR) {
+    status = U_ZERO_ERROR;
+    parent = nullptr;
+  }
+  if (U_FAILURE(status)) {
+    return Err(ToICUError(status));
+  }
+  if (parent) {
+    return DuplicateChars(Span{parent, size_t(length)});
+  }
+
+  // Retrieve the actual locale of the resource bundle.
+  const char* locale =
+      ures_getLocaleByType(aLocaleBundle, ULOC_ACTUAL_LOCALE, &status);
+  if (U_FAILURE(status)) {
+    return Err(ToICUError(status));
+  }
+
+  // Strip off the last subtag, if possible.
+  if (const char* sep = std::strrchr(locale, '_')) {
+    return DuplicateChars(Span{locale, size_t(sep - locale)});
+  }
+
+  // The parent locale of all locales is "root".
+  if (std::strcmp(locale, "root") != 0) {
+    static constexpr auto root = MakeStringSpan("root");
+    return DuplicateChars(root);
+  }
+
+  // "root" itself doesn't have a parent locale.
+  static constexpr auto empty = MakeStringSpan("");
+  return DuplicateChars(empty);
+}
+
+// Look up the time separator of |aNumberingSystem|, starting at |aLocale| and
+// walking up the chain of parent locales. |aRequestedLocale| is the locale
+// originally passed by the caller; retries for the "latn" numbering system
+// start over from it.
+static Result<Span<const char16_t>, ICUError> FindTimeSeparator(
+    Span<const char> aRequestedLocale, Span<const char> aLocale,
+    Span<const char> aNumberingSystem) {
+  static constexpr auto defaultTimeSeparator = MakeStringSpan(u":");
+  static constexpr auto latn = MakeStringSpan("latn");
+
+  // We didn't find the numbering system. Retry using the default numbering
+  // system "latn". (We don't use the default numbering system of the requested
+  // locale to match ICU.)
+  if (aLocale == MakeStringSpan("")) {
+    // Retrying can't help if "latn" itself wasn't found or if the requested
+    // locale is empty: the retry would recurse without end. Use the default
+    // time separator in that case.
+    if (aNumberingSystem == latn || aRequestedLocale == MakeStringSpan("")) {
+      return defaultTimeSeparator;
+    }
+    return FindTimeSeparator(aRequestedLocale, aRequestedLocale, latn);
+  }
+
+  // First open the resource bundle of the input locale.
+  //
+  // Note: ICU's resource API accepts both Unicode CLDR locale identifiers and
+  // Unicode BCP 47 locale identifiers, so we don't have to convert the input
+  // into a Unicode CLDR locale identifier.
+  UErrorCode status = U_ZERO_ERROR;
+  UResourceBundle* localeBundle =
+      ures_open(nullptr, AssertNullTerminatedString(aLocale), &status);
+  if (U_FAILURE(status)) {
+    return Err(ToICUError(status));
+  }
+  ScopedICUObject<UResourceBundle, ures_close> closeLocaleBundle(localeBundle);
+
+  do {
+    // Search for the "NumberElements" table. Fall back to the parent locale if
+    // no "NumberElements" table is present.
+    UResourceBundle* numberElements =
+        ures_getByKey(localeBundle, "NumberElements", nullptr, &status);
+    if (status == U_MISSING_RESOURCE_ERROR) {
+      break;
+    }
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+    ScopedICUObject<UResourceBundle, ures_close> closeNumberElements(
+        numberElements);
+
+    // Search for the table of the requested numbering system. Fall back to the
+    // parent locale if no table was found.
+    UResourceBundle* numberingSystem = ures_getByKey(
+        numberElements, AssertNullTerminatedString(aNumberingSystem), nullptr,
+        &status);
+    if (status == U_MISSING_RESOURCE_ERROR) {
+      break;
+    }
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+    ScopedICUObject<UResourceBundle, ures_close> closeNumberingSystem(
+        numberingSystem);
+
+    // Search for the "symbols" table. Fall back to the parent locale if no
+    // "symbols" table is present.
+    UResourceBundle* symbols =
+        ures_getByKey(numberingSystem, "symbols", nullptr, &status);
+    if (status == U_MISSING_RESOURCE_ERROR) {
+      break;
+    }
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+    ScopedICUObject<UResourceBundle, ures_close> closeSymbols(symbols);
+
+    // And finally look up the "timeSeparator" string in the "symbols" table. If
+    // the string isn't present, fall back to the parent locale.
+    int32_t length = 0;
+    const UChar* str =
+        ures_getStringByKey(symbols, "timeSeparator", &length, &status);
+    if (status == U_MISSING_RESOURCE_ERROR) {
+      break;
+    }
+    if (U_FAILURE(status)) {
+      return Err(ToICUError(status));
+    }
+
+    Span<const char16_t> timeSeparator{str, size_t(length)};
+
+    // Many numbering systems don't define their own symbols, but instead link
+    // to the symbols for "latn" of the requested locale. The link is performed
+    // through an alias entry like:
+    // `symbols:alias{"/LOCALE/NumberElements/latn/symbols"}`
+    //
+    // ICU doesn't provide a public API to detect these alias entries, but
+    // instead always automatically resolves the link. But that leads to
+    // incorrectly using the symbols from the "root" locale instead of the
+    // requested locale.
+    //
+    // Thankfully these alias entries are only present on the "root" locale. So
+    // we are using this heuristic to detect alias entries:
+    //
+    // - If the resolved time separator is the default time separator ":".
+    // - The current locale is "root".
+    // - And the numbering system is neither "latn" nor "arab".
+    // - Then search the time separator for "latn" of the requested locale.
+    //
+    // We have to exclude "arab", because it's also using ":" for the time
+    // separator, but doesn't use an alias link to "latn".
+    if (timeSeparator == defaultTimeSeparator &&
+        aLocale == MakeStringSpan("root") &&
+        aNumberingSystem != latn &&
+        aNumberingSystem != MakeStringSpan("arab")) {
+      return FindTimeSeparator(aRequestedLocale, aRequestedLocale, latn);
+    }
+
+    return timeSeparator;
+  } while (false);
+
+  // Fall back to the parent locale.
+  auto parent = GetParentLocale(localeBundle);
+  if (parent.isErr()) {
+    return parent.propagateErr();
+  }
+  return FindTimeSeparator(aRequestedLocale, parent.inspect().AsSpan(),
+                           aNumberingSystem);
+}
+
+/* static */
+Result<Span<const char16_t>, ICUError> DateTimeFormat::GetTimeSeparator(
+    Span<const char> aLocale, Span<const char> aNumberingSystem) {
+  return FindTimeSeparator(aLocale, aLocale, aNumberingSystem);
+}
+
 Result<DateTimeFormat::ComponentsBag, ICUError>
 DateTimeFormat::ResolveComponents() {
   // Maps an ICU pattern string to a corresponding set of date-time components
diff --git a/intl/components/src/DateTimeFormat.h b/intl/components/src/DateTimeFormat.h
index 4853d9e3b282..661ca6dfd1d9 100644
--- a/intl/components/src/DateTimeFormat.h
+++ b/intl/components/src/DateTimeFormat.h
@@ -537,6 +537,27 @@ class DateTimeFormat final {
                                        udat_getAvailable>();
   }
 
+  /**
+   * Return the time separator for the given locale and numbering system.
+   *
+   * The separator is written to |aBuffer|, whose character type must be
+   * char16_t. Returns ICUError::OutOfMemory if |aBuffer| can't be filled.
+   */
+  template <typename B>
+  static ICUResult GetTimeSeparator(Span<const char> aLocale,
+                                    Span<const char> aNumberingSystem,
+                                    B& aBuffer) {
+    static_assert(std::is_same_v<typename B::CharType, char16_t>);
+    auto separator = GetTimeSeparator(aLocale, aNumberingSystem);
+    if (separator.isErr()) {
+      return separator.propagateErr();
+    }
+    if (!FillBuffer(separator.unwrap(), aBuffer)) {
+      return Err(ICUError::OutOfMemory);
+    }
+    return Ok();
+  }
+
  private:
   explicit DateTimeFormat(UDateFormat* aDateFormat);
 
@@ -583,6 +604,9 @@ class DateTimeFormat final {
       DateTimeFormat::PatternVector& aPattern, bool aHour12,
       DateTimeFormat::SkeletonVector& aSkeleton);
 
+  static Result<Span<const char16_t>, ICUError> GetTimeSeparator(
+      Span<const char> aLocale, Span<const char> aNumberingSystem);
+
   UDateFormat* mDateFormat = nullptr;
 
   SkeletonVector mOriginalSkeleton;