Bug 1614868 - Ignore combining diacritic characters in history search. r=jfkthame,mak

IsCombiningDiacritic(-1) returns false, so there is no need to specially handle -1 in GetLowerUTF8Codepoint_inline. It is no longer necessary for GetNaked to check whether a character is a combining character because all callers now skip combining diacritics and GetNaked already makes sure that decomposition removes a diacritic and not something else. Differential Revision: https://phabricator.services.mozilla.com/D62533 --HG-- extra : moz-landing-system : lando
2024-11-27 23:02:20 +00:00 · 2020-02-17 20:42:04 +00:00 · 2020-02-17 20:42:04 +00:00 · 676b1a533d
commit 676b1a533d
parent df11faa804
3 changed files with 25 additions and 11 deletions
--- a/browser/components/urlbar/tests/unit/test_providerUnifiedComplete.js
+++ b/browser/components/urlbar/tests/unit/test_providerUnifiedComplete.js
@@ -212,7 +212,7 @@ add_task(async function test_diacritics() {
  let context = createContext(searchString, { isPrivate: false });

  await PlacesUtils.bookmarks.insert({
-    url: "https://bookmark.mozilla.org/%C3%A3gu%C4%A9",
+    url: "https://bookmark.mozilla.org/%C3%A3g%CC%83u%C4%A9",
    title: "Test bookmark with accents in path",
    parentGuid: PlacesUtils.bookmarks.unfiledGuid,
  });
--- a/intl/unicharutil/util/nsUnicharUtils.cpp
+++ b/intl/unicharutil/util/nsUnicharUtils.cpp
@@ -434,6 +434,23 @@ int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight,
  return 0;
 }

+static MOZ_ALWAYS_INLINE uint32_t  // Reads one code point from [aStr, aEnd); skips combining diacritics unless aMatchDiacritics is true.
+GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd,
+                             const char** aNext, bool aMatchDiacritics) {
+  uint32_t c;  // Result of the most recent decode; this is what gets returned.
+  for (;;) {
+    c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext);  // Three-argument overload; presumably sets *aNext past the decoded character -- confirm there.
+    if (aMatchDiacritics) {
+      break;  // Diacritics are significant: return the first decoded value unchanged.
+    }
+    if (!mozilla::unicode::IsCombiningDiacritic(c)) {
+      break;  // Not a combining diacritic. Per the commit message this also covers the -1 error value, so errors propagate to the caller.
+    }
+    aStr = *aNext;  // c was a combining diacritic: resume decoding at the position reported via *aNext.
+  }
+  return c;  // Callers in this file compare the result against uint32_t(-1) to detect a decode error.
+}
+
 bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
                                   const char* aLeftEnd, const char* aRightEnd,
                                   const char** aLeftNext,
@@ -445,14 +462,15 @@ bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

-  uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext);
+  uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext,
+                                                   aMatchDiacritics);
  if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
    *aErr = true;
    return false;
  }

-  uint32_t rightChar =
-      GetLowerUTF8Codepoint_inline(aRight, aRightEnd, aRightNext);
+  uint32_t rightChar = GetLowerUTF8Codepoint_inline(
+      aRight, aRightEnd, aRightNext, aMatchDiacritics);
  if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
    *aErr = true;
    return false;
--- a/intl/unicharutil/util/nsUnicodeProperties.cpp
+++ b/intl/unicharutil/util/nsUnicodeProperties.cpp
@@ -315,6 +315,9 @@ uint32_t GetNaked(uint32_t aCh) {
  static const UNormalizer2* normalizer;
  static HashMap<uint32_t, uint32_t> nakedCharCache;

+  NS_ASSERTION(!IsCombiningDiacritic(aCh),
+               "This character needs to be skipped");
+
  HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh);
  if (entry.found()) {
    return entry->value();
@@ -340,13 +343,6 @@ uint32_t GetNaked(uint32_t aCh) {
    return aCh;
  }

-  if (u_getIntPropertyValue(aCh, UCHAR_GENERAL_CATEGORY) & U_GC_M_MASK) {
-    // The character is itself a combining character, and we don't want to use
-    // its decomposition into multiple combining characters.
-    baseChar = aCh;
-    goto cache;
-  }
-
  if (NS_IS_HIGH_SURROGATE(decomposition[0])) {
    baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]);
    combiners = decomposition + 2;