From 00867c4809feadc80ca6f966c35d5768d9444b0b Mon Sep 17 00:00:00 2001 From: Alex Henrie Date: Mon, 9 Dec 2019 19:26:40 +0000 Subject: [PATCH] Bug 202251 - Add an option to ignore diacritics when searching. r=fluent-reviewers,mikedeboer,jfkthame,flod Differential Revision: https://phabricator.services.mozilla.com/D51841 --HG-- extra : moz-landing-system : lando --- intl/unicharutil/util/nsUnicharUtils.cpp | 26 +++++++ intl/unicharutil/util/nsUnicharUtils.h | 4 + intl/unicharutil/util/nsUnicodeProperties.cpp | 78 +++++++++++++++++++ intl/unicharutil/util/nsUnicodeProperties.h | 3 + mobile/android/app/mobile.js | 1 + .../modules/geckoview/GeckoViewContent.jsm | 1 + modules/libpref/init/all.js | 5 ++ toolkit/actors/FinderChild.jsm | 4 + toolkit/components/extensions/FindContent.jsm | 3 + toolkit/components/find/nsFind.cpp | 33 +++++++- toolkit/components/find/nsFind.h | 1 + toolkit/components/find/nsFindService.cpp | 10 +++ toolkit/components/find/nsFindService.h | 1 + toolkit/components/find/nsIFind.idl | 1 + toolkit/components/find/nsIFindService.idl | 1 + toolkit/components/find/nsIWebBrowserFind.idl | 7 ++ toolkit/components/find/nsWebBrowserFind.cpp | 15 ++++ toolkit/components/find/nsWebBrowserFind.h | 1 + .../typeaheadfind/nsITypeAheadFind.idl | 1 + .../typeaheadfind/nsTypeAheadFind.cpp | 21 ++++- .../typeaheadfind/nsTypeAheadFind.h | 2 + .../windowcreator/test/test_nsFind.html | 18 ++++- .../tests/chrome/findbar_events_window.xhtml | 19 +++++ .../content/tests/chrome/findbar_window.xhtml | 22 +++++- toolkit/content/widgets/findbar.js | 71 +++++++++++++++++ .../en-US/chrome/global/findbar.properties | 1 + .../en-US/toolkit/main-window/findbar.ftl | 5 ++ toolkit/modules/Finder.jsm | 10 +++ toolkit/modules/FinderHighlighter.jsm | 1 + toolkit/modules/FinderIterator.jsm | 24 +++++- toolkit/modules/FinderParent.jsm | 6 ++ .../tests/xpcshell/test_FinderIterator.js | 34 +++++++- 32 files changed, 419 insertions(+), 11 deletions(-) diff --git 
a/intl/unicharutil/util/nsUnicharUtils.cpp b/intl/unicharutil/util/nsUnicharUtils.cpp index c0e23f1a9f64..a3baa7561056 100644 --- a/intl/unicharutil/util/nsUnicharUtils.cpp +++ b/intl/unicharutil/util/nsUnicharUtils.cpp @@ -162,6 +162,32 @@ void ToFoldedCase(const char16_t* aIn, char16_t* aOut, uint32_t aLen) { } } +uint32_t ToNaked(uint32_t aChar) { + if (IS_ASCII(aChar)) { + return aChar; + } + return mozilla::unicode::GetNaked(aChar); +} + +void ToNaked(nsAString& aString) { + char16_t* buf = aString.BeginWriting(); + ToNaked(buf, buf, aString.Length()); +} + +void ToNaked(const char16_t* aIn, char16_t* aOut, uint32_t aLen) { + for (uint32_t i = 0; i < aLen; i++) { + uint32_t ch = aIn[i]; + if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { + ch = mozilla::unicode::GetNaked(SURROGATE_TO_UCS4(ch, aIn[i + 1])); + NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!"); + aOut[i++] = H_SURROGATE(ch); + aOut[i] = L_SURROGATE(ch); + continue; + } + aOut[i] = ToNaked(ch); + } +} + int32_t nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs, const char16_t* rhs, uint32_t lLength, diff --git a/intl/unicharutil/util/nsUnicharUtils.h b/intl/unicharutil/util/nsUnicharUtils.h index 1f1d1a949ae1..ea02c5dd5aa9 100644 --- a/intl/unicharutil/util/nsUnicharUtils.h +++ b/intl/unicharutil/util/nsUnicharUtils.h @@ -56,6 +56,10 @@ uint32_t ToFoldedCase(uint32_t aChar); void ToFoldedCase(nsAString& aString); void ToFoldedCase(const char16_t* aIn, char16_t* aOut, uint32_t aLen); +uint32_t ToNaked(uint32_t aChar); +void ToNaked(nsAString& aString); +void ToNaked(const char16_t* aIn, char16_t* aOut, uint32_t aLen); + class nsCaseInsensitiveStringComparator : public nsStringComparator { public: nsCaseInsensitiveStringComparator() = default; diff --git a/intl/unicharutil/util/nsUnicodeProperties.cpp b/intl/unicharutil/util/nsUnicodeProperties.cpp index a03ffc5d5452..a8c710b71ba1 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.cpp +++ 
b/intl/unicharutil/util/nsUnicodeProperties.cpp @@ -8,8 +8,12 @@ #include "nsUnicodePropertyData.cpp" #include "mozilla/ArrayUtils.h" +#include "mozilla/HashTable.h" #include "nsCharTraits.h" +#include "unicode/uchar.h" +#include "unicode/unorm2.h" + #define UNICODE_BMP_LIMIT 0x10000 #define UNICODE_LIMIT 0x110000 @@ -305,6 +309,80 @@ uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength) { return result; } +uint32_t GetNaked(uint32_t aCh) { + using namespace mozilla; + + static const UNormalizer2* normalizer; + static HashMap<uint32_t, uint32_t> nakedCharCache; + + HashMap<uint32_t, uint32_t>::Ptr entry = nakedCharCache.lookup(aCh); + if (entry.found()) { + return entry->value(); + } + + UErrorCode error = U_ZERO_ERROR; + if (!normalizer) { + normalizer = unorm2_getNFDInstance(&error); + if (U_FAILURE(error)) { + return aCh; + } + } + + static const size_t MAX_DECOMPOSITION_SIZE = 16; + UChar decomposition[MAX_DECOMPOSITION_SIZE]; + UChar* combiners; + int32_t decompositionLen; + uint32_t baseChar, nextChar; + decompositionLen = unorm2_getDecomposition(normalizer, aCh, decomposition, + MAX_DECOMPOSITION_SIZE, &error); + if (decompositionLen < 1) { + // The character does not decompose. + return aCh; + } + + if (u_getIntPropertyValue(aCh, UCHAR_GENERAL_CATEGORY_MASK) & U_GC_M_MASK) { + // The character is itself a combining mark (General_Category M*), and we + // don't want to use its decomposition into multiple combining characters. + baseChar = aCh; + goto cache; + } + + if (NS_IS_HIGH_SURROGATE(decomposition[0])) { + baseChar = SURROGATE_TO_UCS4(decomposition[0], decomposition[1]); + combiners = decomposition + 2; + } else { + baseChar = decomposition[0]; + combiners = decomposition + 1; + } + + if (IS_IN_BMP(baseChar) != IS_IN_BMP(aCh)) { + // Mappings that would change the length of a UTF-16 string are not + // currently supported. 
+ baseChar = aCh; + goto cache; + } + + if (decompositionLen > 1) { + if (NS_IS_HIGH_SURROGATE(combiners[0])) { + nextChar = SURROGATE_TO_UCS4(combiners[0], combiners[1]); + } else { + nextChar = combiners[0]; + } + if (u_getCombiningClass(nextChar) == 0) { + // Hangul syllables decompose but do not actually have diacritics. + baseChar = aCh; + } + } + +cache: + if (!nakedCharCache.putNew(aCh, baseChar)) { + // We're out of memory, so delete the cache to free some up. + nakedCharCache.clearAndCompact(); + } + + return baseChar; +} + } // end namespace unicode } // end namespace mozilla diff --git a/intl/unicharutil/util/nsUnicodeProperties.h b/intl/unicharutil/util/nsUnicodeProperties.h index 8922b1d0b436..b34a527e360a 100644 --- a/intl/unicharutil/util/nsUnicodeProperties.h +++ b/intl/unicharutil/util/nsUnicodeProperties.h @@ -229,6 +229,9 @@ class ClusterIterator { // Count the number of grapheme clusters in the given string uint32_t CountGraphemeClusters(const char16_t* aText, uint32_t aLength); +// Remove diacritics from a character +uint32_t GetNaked(uint32_t aCh); + // A simple reverse iterator for a string of char16_t codepoints that // advances by Unicode grapheme clusters class ClusterReverseIterator { diff --git a/mobile/android/app/mobile.js b/mobile/android/app/mobile.js index 07bdc3c94a74..f0fb3aab3b49 100644 --- a/mobile/android/app/mobile.js +++ b/mobile/android/app/mobile.js @@ -236,6 +236,7 @@ pref("accessibility.typeaheadfind.flashBar", 1); pref("accessibility.typeaheadfind.linksonly", false); pref("accessibility.typeaheadfind.casesensitive", 0); pref("accessibility.browsewithcaret_shortcut.enabled", false); +pref("findbar.matchdiacritics", 0); // Whether the character encoding menu is under the main Firefox button. This // preference is a string so that localizers can alter it. 
diff --git a/mobile/android/modules/geckoview/GeckoViewContent.jsm b/mobile/android/modules/geckoview/GeckoViewContent.jsm index b52a161861e7..684f165ea013 100644 --- a/mobile/android/modules/geckoview/GeckoViewContent.jsm +++ b/mobile/android/modules/geckoview/GeckoViewContent.jsm @@ -332,6 +332,7 @@ class GeckoViewContent extends GeckoViewModule { finder.caseSensitive = !!aData.matchCase; finder.entireWord = !!aData.wholeWord; + finder.matchDiacritics = !!aData.matchDiacritics; finder.addResultListener(this._finderListener); const drawOutline = diff --git a/modules/libpref/init/all.js b/modules/libpref/init/all.js index 861ccd2a916d..4b0372ff5f03 100644 --- a/modules/libpref/init/all.js +++ b/modules/libpref/init/all.js @@ -773,6 +773,11 @@ pref("accessibility.typeaheadfind.matchesCountLimit", 1000); pref("findbar.highlightAll", false); pref("findbar.entireword", false); pref("findbar.iteratorTimeout", 100); +// matchdiacritics: controls the find bar's diacritic matching +// 0 - "never" (ignore diacritics) +// 1 - "always" (match diacritics) +// other - "auto" (match diacritics if input has diacritics, ignore otherwise) +pref("findbar.matchdiacritics", 0); // use Mac OS X Appearance panel text smoothing setting when rendering text, disabled by default pref("gfx.use_text_smoothing_setting", false); diff --git a/toolkit/actors/FinderChild.jsm b/toolkit/actors/FinderChild.jsm index b2a084394098..7fb6f39b0f7e 100644 --- a/toolkit/actors/FinderChild.jsm +++ b/toolkit/actors/FinderChild.jsm @@ -28,6 +28,10 @@ class FinderChild extends JSWindowActorChild { this.finder.caseSensitive = data.caseSensitive; break; + case "Finder:MatchDiacritics": + this.finder.matchDiacritics = data.matchDiacritics; + break; + case "Finder:EntireWord": this.finder.entireWord = data.entireWord; break; diff --git a/toolkit/components/extensions/FindContent.jsm b/toolkit/components/extensions/FindContent.jsm index 7ab732a138a4..930a3aea84c4 100644 --- 
a/toolkit/components/extensions/FindContent.jsm +++ b/toolkit/components/extensions/FindContent.jsm @@ -51,6 +51,7 @@ class FindContent { * @param {string} queryphrase - the text to search for. * @param {boolean} caseSensitive - whether to use case sensitive matches. * @param {boolean} includeRangeData - whether to collect and return range data. + * @param {boolean} matchDiacritics - whether diacritics must match. * @param {boolean} searchString - whether to collect and return rect data. * * @returns {object} that includes: @@ -66,6 +67,7 @@ class FindContent { entireWord, includeRangeData, includeRectData, + matchDiacritics, } = params; this.iterator.reset(); @@ -77,6 +79,7 @@ class FindContent { entireWord: !!entireWord, finder: this.finder, listener: this.finder, + matchDiacritics: !!matchDiacritics, useSubFrames: false, }); diff --git a/toolkit/components/find/nsFind.cpp b/toolkit/components/find/nsFind.cpp index f02296eaf613..fb04cde0f721 100644 --- a/toolkit/components/find/nsFind.cpp +++ b/toolkit/components/find/nsFind.cpp @@ -60,7 +60,10 @@ NS_IMPL_CYCLE_COLLECTING_RELEASE(nsFind) NS_IMPL_CYCLE_COLLECTION(nsFind) nsFind::nsFind() - : mFindBackward(false), mCaseSensitive(false), mWordBreaker(nullptr) {} + : mFindBackward(false), + mCaseSensitive(false), + mMatchDiacritics(false), + mWordBreaker(nullptr) {} nsFind::~nsFind() = default; @@ -396,6 +399,22 @@ nsFind::SetEntireWord(bool aEntireWord) { return NS_OK; } +NS_IMETHODIMP +nsFind::GetMatchDiacritics(bool* aMatchDiacritics) { + if (!aMatchDiacritics) { + return NS_ERROR_NULL_POINTER; + } + + *aMatchDiacritics = mMatchDiacritics; + return NS_OK; +} + +NS_IMETHODIMP +nsFind::SetMatchDiacritics(bool aMatchDiacritics) { + mMatchDiacritics = aMatchDiacritics; + return NS_OK; +} + // Here begins the find code. A ten-thousand-foot view of how it works: Find // needs to be able to compare across inline (but not block) nodes, e.g. find // for "abc" should match a<b>b</b>c. 
So after we've searched a node, we're not @@ -506,6 +525,9 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange, if (!mCaseSensitive) { ToFoldedCase(patAutoStr); } + if (!mMatchDiacritics) { + ToNaked(patAutoStr); + } // Ignore soft hyphens in the pattern static const char kShy[] = {char(CH_SHY), 0}; @@ -684,8 +706,13 @@ nsFind::Find(const nsAString& aPatText, nsRange* aSearchRange, } if (!inWhitespace && IsSpace(patc)) { inWhitespace = true; - } else if (!inWhitespace && !mCaseSensitive) { - c = ToFoldedCase(c); + } else if (!inWhitespace) { + if (!mCaseSensitive) { + c = ToFoldedCase(c); + } + if (!mMatchDiacritics) { + c = ToNaked(c); + } } if (c == CH_SHY) { diff --git a/toolkit/components/find/nsFind.h b/toolkit/components/find/nsFind.h index 76cc4537d717..37668f702709 100644 --- a/toolkit/components/find/nsFind.h +++ b/toolkit/components/find/nsFind.h @@ -42,6 +42,7 @@ class nsFind : public nsIFind { // Parameters set from the interface: bool mFindBackward; bool mCaseSensitive; + bool mMatchDiacritics; // Use "find entire words" mode by setting to a word breaker or null, to // disable "entire words" mode. 
diff --git a/toolkit/components/find/nsFindService.cpp b/toolkit/components/find/nsFindService.cpp index 1d1731787f94..7499132c2c2c 100644 --- a/toolkit/components/find/nsFindService.cpp +++ b/toolkit/components/find/nsFindService.cpp @@ -79,3 +79,13 @@ NS_IMETHODIMP nsFindService::SetMatchCase(bool aMatchCase) { mMatchCase = aMatchCase; return NS_OK; } + +NS_IMETHODIMP nsFindService::GetMatchDiacritics(bool* aMatchDiacritics) { + NS_ENSURE_ARG_POINTER(aMatchDiacritics); + *aMatchDiacritics = mMatchDiacritics; + return NS_OK; +} +NS_IMETHODIMP nsFindService::SetMatchDiacritics(bool aMatchDiacritics) { + mMatchDiacritics = aMatchDiacritics; + return NS_OK; +} diff --git a/toolkit/components/find/nsFindService.h b/toolkit/components/find/nsFindService.h index 595b797728a7..5204076f5d91 100644 --- a/toolkit/components/find/nsFindService.h +++ b/toolkit/components/find/nsFindService.h @@ -40,4 +40,5 @@ class nsFindService : public nsIFindService { bool mWrapFind; bool mEntireWord; bool mMatchCase; + bool mMatchDiacritics; }; diff --git a/toolkit/components/find/nsIFind.idl b/toolkit/components/find/nsIFind.idl index 6cae5dff63c7..941ed56cacc3 100644 --- a/toolkit/components/find/nsIFind.idl +++ b/toolkit/components/find/nsIFind.idl @@ -15,6 +15,7 @@ interface nsIFind : nsISupports attribute boolean findBackwards; attribute boolean caseSensitive; attribute boolean entireWord; + attribute boolean matchDiacritics; /** * Find some text in the current context. 
The implementation is diff --git a/toolkit/components/find/nsIFindService.idl b/toolkit/components/find/nsIFindService.idl index 0c1b0e215e62..f5a5e18ce653 100644 --- a/toolkit/components/find/nsIFindService.idl +++ b/toolkit/components/find/nsIFindService.idl @@ -22,5 +22,6 @@ interface nsIFindService : nsISupports attribute boolean wrapFind; attribute boolean entireWord; attribute boolean matchCase; + attribute boolean matchDiacritics; }; diff --git a/toolkit/components/find/nsIWebBrowserFind.idl b/toolkit/components/find/nsIWebBrowserFind.idl index 634d39973acc..1a78ea7269a0 100644 --- a/toolkit/components/find/nsIWebBrowserFind.idl +++ b/toolkit/components/find/nsIWebBrowserFind.idl @@ -79,6 +79,13 @@ interface nsIWebBrowserFind : nsISupports */ attribute boolean matchCase; + /** + * matchDiacritics + * + * Whether to match diacritics when searching. Default is false. + */ + attribute boolean matchDiacritics; + /** * searchFrames * diff --git a/toolkit/components/find/nsWebBrowserFind.cpp b/toolkit/components/find/nsWebBrowserFind.cpp index d764d0c635e8..b3dc41d1133e 100644 --- a/toolkit/components/find/nsWebBrowserFind.cpp +++ b/toolkit/components/find/nsWebBrowserFind.cpp @@ -50,6 +50,7 @@ nsWebBrowserFind::nsWebBrowserFind() mWrapFind(false), mEntireWord(false), mMatchCase(false), + mMatchDiacritics(false), mSearchSubFrames(true), mSearchParentFrames(true) {} @@ -286,6 +287,19 @@ nsWebBrowserFind::SetMatchCase(bool aMatchCase) { return NS_OK; } +NS_IMETHODIMP +nsWebBrowserFind::GetMatchDiacritics(bool* aMatchDiacritics) { + NS_ENSURE_ARG_POINTER(aMatchDiacritics); + *aMatchDiacritics = mMatchDiacritics; + return NS_OK; +} + +NS_IMETHODIMP +nsWebBrowserFind::SetMatchDiacritics(bool aMatchDiacritics) { + mMatchDiacritics = aMatchDiacritics; + return NS_OK; +} + void nsWebBrowserFind::SetSelectionAndScroll(nsPIDOMWindowOuter* aWindow, nsRange* aRange) { RefPtr<Document> doc = aWindow->GetDoc(); @@ -622,6 +636,7 @@ nsresult 
nsWebBrowserFind::SearchInFrame(nsPIDOMWindowOuter* aWindow, NS_ENSURE_SUCCESS(rv, rv); (void)find->SetCaseSensitive(mMatchCase); + (void)find->SetMatchDiacritics(mMatchDiacritics); (void)find->SetFindBackwards(mFindBackwards); (void)find->SetEntireWord(mEntireWord); diff --git a/toolkit/components/find/nsWebBrowserFind.h b/toolkit/components/find/nsWebBrowserFind.h index f886b66fd847..d6d5596e2ce5 100644 --- a/toolkit/components/find/nsWebBrowserFind.h +++ b/toolkit/components/find/nsWebBrowserFind.h @@ -79,6 +79,7 @@ class nsWebBrowserFind : public nsIWebBrowserFind, bool mWrapFind; bool mEntireWord; bool mMatchCase; + bool mMatchDiacritics; bool mSearchSubFrames; bool mSearchParentFrames; diff --git a/toolkit/components/typeaheadfind/nsITypeAheadFind.idl b/toolkit/components/typeaheadfind/nsITypeAheadFind.idl index 9bfb39874921..2f94136e72bf 100644 --- a/toolkit/components/typeaheadfind/nsITypeAheadFind.idl +++ b/toolkit/components/typeaheadfind/nsITypeAheadFind.idl @@ -70,6 +70,7 @@ interface nsITypeAheadFind : nsISupports readonly attribute AString searchString; // Most recent search string attribute boolean caseSensitive; // Searches are case sensitive + attribute boolean matchDiacritics; // Searches preserve diacritics attribute boolean entireWord; // Search for whole words only readonly attribute Element foundLink; // Most recent elem found, if a link diff --git a/toolkit/components/typeaheadfind/nsTypeAheadFind.cpp b/toolkit/components/typeaheadfind/nsTypeAheadFind.cpp index 7d204a7f8cde..ee0fc644626a 100644 --- a/toolkit/components/typeaheadfind/nsTypeAheadFind.cpp +++ b/toolkit/components/typeaheadfind/nsTypeAheadFind.cpp @@ -79,7 +79,8 @@ nsTypeAheadFind::nsTypeAheadFind() mLastFindLength(0), mIsSoundInitialized(false), mCaseSensitive(false), - mEntireWord(false) {} + mEntireWord(false), + mMatchDiacritics(false) {} nsTypeAheadFind::~nsTypeAheadFind() { nsCOMPtr<nsIPrefBranch> prefInternal( @@ -204,6 +205,24 @@ nsTypeAheadFind::GetEntireWord(bool* isEntireWord) { 
return NS_OK; } +NS_IMETHODIMP +nsTypeAheadFind::SetMatchDiacritics(bool matchDiacritics) { + mMatchDiacritics = matchDiacritics; + + if (mFind) { + mFind->SetMatchDiacritics(mMatchDiacritics); + } + + return NS_OK; +} + +NS_IMETHODIMP +nsTypeAheadFind::GetMatchDiacritics(bool* matchDiacritics) { + *matchDiacritics = mMatchDiacritics; + + return NS_OK; +} + NS_IMETHODIMP nsTypeAheadFind::SetDocShell(nsIDocShell* aDocShell) { mDocShell = do_GetWeakReference(aDocShell); diff --git a/toolkit/components/typeaheadfind/nsTypeAheadFind.h b/toolkit/components/typeaheadfind/nsTypeAheadFind.h index 6dbbeac9fcc3..ffe450874390 100644 --- a/toolkit/components/typeaheadfind/nsTypeAheadFind.h +++ b/toolkit/components/typeaheadfind/nsTypeAheadFind.h @@ -118,6 +118,7 @@ class nsTypeAheadFind : public nsITypeAheadFind, bool mCaseSensitive; bool mEntireWord; + bool mMatchDiacritics; bool EnsureFind() { if (mFind) { @@ -131,6 +132,7 @@ class nsTypeAheadFind : public nsITypeAheadFind, mFind->SetCaseSensitive(mCaseSensitive); mFind->SetEntireWord(mEntireWord); + mFind->SetMatchDiacritics(mMatchDiacritics); return true; } diff --git a/toolkit/components/windowcreator/test/test_nsFind.html b/toolkit/components/windowcreator/test/test_nsFind.html index 9689d21a219c..0e6ebcacab1f 100644 --- a/toolkit/components/windowcreator/test/test_nsFind.html +++ b/toolkit/components/windowcreator/test/test_nsFind.html @@ -1,6 +1,7 @@